Just using the quanteda package at this point
library(quanteda)
## Package version: 1.3.14
## Parallel computing: 2 of 4 threads used.
## See https://quanteda.io for tutorials and examples.
##
## Attaching package: 'quanteda'
## The following object is masked from 'package:utils':
##
## View
Read in both datasets
mission <- read.csv( "https://github.com/Nonprofit-Open-Data-Collective/machine_learning_mission_codes/blob/master/DATA/MISSION.csv?raw=true", stringsAsFactors=F )
programs <- read.csv( "https://github.com/Nonprofit-Open-Data-Collective/machine_learning_mission_codes/blob/master/DATA/PROGRAMS.csv?raw=true", stringsAsFactors=F )
Reducing to the necessary columns and dropping any duplicated observations
mission <- mission[!duplicated(mission[c('EIN', 'TAXYR', 'F9_03_PZ_MISSION')]),]
programs <- programs[!duplicated(programs[c('EIN', 'TAXYR', 'DESCRIPTION')]),]
programs <- programs[, c("EIN", "TAXYR", "DESCRIPTION")]
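As a quick sanity check, we can confirm that no duplicates remain on those keys; both of these should now return zero:
# count any remaining duplicate rows on the de-duplication keys
sum(duplicated(mission[c('EIN', 'TAXYR', 'F9_03_PZ_MISSION')]))
sum(duplicated(programs[c('EIN', 'TAXYR', 'DESCRIPTION')]))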
Creating a single column with all of the text from name, mission, and programs before turning the data into a corpus
mp <- merge(mission, programs, by=c( "EIN", "TAXYR"))
mp$text <- paste(mp$NAME, mp$F9_03_PZ_MISSION, mp$DESCRIPTION)
mp.lim <- mp[, c("EIN", "text")]
In addition, we need to ensure all variables are characters in order to convert the data to a corpus
mp.lim <- data.frame(lapply(mp.lim, as.character), stringsAsFactors=FALSE)
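To verify the conversion, a quick check that every column now reports "character":
# every column should be a character vector before building the corpus
sapply(mp.lim, class)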
Converting the data to a corpus using the corpus() function from quanteda; text_field indicates which column holds the text data we want to analyze. This also creates a label for each listing to ensure the data stays labeled through to the end of the analysis.
mp.corp <- corpus(mp.lim, text_field = "text")
We can look at the corpus to see how it’s structured
mp.corp
## Corpus consisting of 4,278 documents and 1 docvar.
mp.corp[2]
## text2
## "CREEKSIDE ELEMENTARY PTO INC SUPPORT HARTLAND CREEKSIDE ELEMENTARY SCHOOL. PROVIDE CLASSROOM SUPPLIES FOR BENEFIT OF STUDENTS AND TEACHERS. PAID FOR EDUCATIONAL ASSEMBLIES AND PROGRAMS FOR STUDENTS. PROVIDED FUNDING FOR FIELD TRIP TRANSPORTATION BENEFITING STUDENTS."
summary(mp.corp)[1:10,]
## Text Types Tokens Sentences EIN
## 1 text1 132 272 8 10716217
## 2 text2 27 38 4 10842551
## 3 text3 34 45 2 113087078
## 4 text4 34 45 2 113087078
## 5 text5 34 45 2 113087078
## 6 text6 77 138 5 113187592
## 7 text7 7 10 1 113731446
## 8 text8 7 10 1 113731446
## 9 text9 7 10 1 113731446
## 10 text10 7 10 1 113731446
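The EIN column was stored as a document variable, so it travels with each text; we can inspect it with docvars():
# view the document variables retained alongside each text
head(docvars(mp.corp))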
Preprocessing steps before identifying ngrams. We could do many of these steps quickly while converting to a document-feature matrix later, but we want to do them explicitly before identifying ngrams. We make the text lower case, break it into tokens, and remove stopwords.
mp.corp2 <- tolower(mp.corp)
mp.corp2[2]
## text2
## "creekside elementary pto inc support hartland creekside elementary school. provide classroom supplies for benefit of students and teachers. paid for educational assemblies and programs for students. provided funding for field trip transportation benefiting students."
mp.corp3 <- tokens(mp.corp2, remove_punct = TRUE)
mp.corp3[2]
## tokens from 1 document.
## text2 :
## [1] "creekside" "elementary" "pto" "inc"
## [5] "support" "hartland" "creekside" "elementary"
## [9] "school" "provide" "classroom" "supplies"
## [13] "for" "benefit" "of" "students"
## [17] "and" "teachers" "paid" "for"
## [21] "educational" "assemblies" "and" "programs"
## [25] "for" "students" "provided" "funding"
## [29] "for" "field" "trip" "transportation"
## [33] "benefiting" "students"
mp.corp4 <- tokens_remove(mp.corp3, c(stopwords("english"), "nbsp"), padding = FALSE)
mp.corp4[2]
## tokens from 1 document.
## text2 :
## [1] "creekside" "elementary" "pto" "inc"
## [5] "support" "hartland" "creekside" "elementary"
## [9] "school" "provide" "classroom" "supplies"
## [13] "benefit" "students" "teachers" "paid"
## [17] "educational" "assemblies" "programs" "students"
## [21] "provided" "funding" "field" "trip"
## [25] "transportation" "benefiting" "students"
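One way to see the effect of stopword removal is to compare token counts for the same document before and after; for document 2 the count drops from 34 to 27:
# token counts for document 2 before and after removing stopwords
ntoken(mp.corp3)[2]
ntoken(mp.corp4)[2]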
Now looking at ngrams, specifically combinations of 2 and 3 words. I’ve exported the lists that were produced (see the export code below) for us all to look over to decide what we want to capture in a dictionary. This code can be updated once we have a larger list.
myNgram2 <- tokens(mp.corp4) %>%
tokens_ngrams(n = 2) %>%
dfm()
myNgram3 <- tokens(mp.corp4) %>%
tokens_ngrams(n = 3) %>%
dfm()
myNgram2miss.df <- textstat_frequency(myNgram2)
myNgram3miss.df <- textstat_frequency(myNgram3)
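The exported lists mentioned above can be written out with write.csv(); the file names here are placeholders:
# export the ngram frequency tables for review (file names are illustrative)
write.csv(myNgram2miss.df, "ngram2_frequencies.csv", row.names = FALSE)
write.csv(myNgram3miss.df, "ngram3_frequencies.csv", row.names = FALSE)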
topfeatures(myNgram2)
## high_school inc_provide foundation_inc 501_c c_3
## 243 215 209 137 137
## quality_life united_states mental_health new_york inc_mission
## 112 111 104 102 100
topfeatures(myNgram3)
## 501_c_3 section_501_c
## 137 55
## internal_revenue_code maxcen_housing_society
## 53 48
## improve_quality_life provide_financial_assistance
## 44 42
## housing_society_inc high_school_students
## 42 39
## 3_internal_revenue see_schedule_o
## 38 36
With the frequency data created, we can see the top candidates and others. Now we create a dictionary in order to identify and transform those combinations of words.
my_dict_prog <- dictionary(list(five01_c_3= c("501 c 3","section 501 c 3") ,
jesus_christ=c("jesus christ"),
high_school=c("high school"),
non_profit=c("non-profit", "non profit"),
stem=c("science technology engineering math", "science technology engineering mathematics"),
steam=c("science technology engineering art math", "science technology engineering art mathematics")))
my_dict_place <- dictionary(list(ny_city=c("new york city"),
ny_state=c("new york state"),
ny=c("new york"),
sf=c("san francisco"),
san_diego=c("san diego"),
santa_barbara=c("santa barbara"),
new_hampshire=c("new hampshire"),
new_orleans=c("new orleans"),
san_antonio=c("san antonio"),
san_gabriel=c("san gabriel"),
santa_monica=c("santa monica"),
santa_clarita=c("santa clarita"),
los_angeles=c("los angeles"),
united_states = c("united states")))
mp.corp5 <- tokens_compound(mp.corp4, pattern = my_dict_prog)
mp.corp6 <- tokens_compound(mp.corp5, pattern = my_dict_place)
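To spot-check that the compounding worked, we can search the tokens for one of the new compound terms:
# confirm that compound tokens such as "high_school" now appear
head(kwic(mp.corp6, "high_school"))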
mp.corp7 <- sapply(mp.corp6, paste, collapse = " ")
Converting to a document-feature matrix as a final step, and stemming the words.
mp.dfm <- dfm(mp.corp7, stem = TRUE)
mp.dfm
## Document-feature matrix of: 4,278 documents, 9,587 features (99.7% sparse).
topfeatures(mp.dfm, 20)
## provid educ communiti organ inc support program
## 3438 2521 2387 1994 1917 1738 1700
## servic foundat famili children promot help youth
## 1254 1080 1050 1007 1003 994 956
## develop school need mission student activ
## 944 911 855 848 799 796
Now converting the DFM to a data frame and combining it with the corpus and the original data
mp.dfm.df <- convert(mp.dfm, to = "data.frame")
mp.corpus.df <- as.data.frame(mp.corp7)
colnames(mp.corpus.df) <- "Corpus"
full.data <- cbind(mp, mp.corpus.df)
full.data2 <- cbind(full.data, mp.dfm.df)
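Since cbind() relies on row order rather than a key, a quick final check that the pieces line up; both comparisons should return TRUE:
# row counts must match for the column binds above to be valid
nrow(mp) == nrow(mp.corpus.df)
nrow(full.data) == nrow(mp.dfm.df)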