Just using the quanteda package at this point

library(quanteda)
## Package version: 1.3.14
## Parallel computing: 2 of 4 threads used.
## See https://quanteda.io for tutorials and examples.
## 
## Attaching package: 'quanteda'
## The following object is masked from 'package:utils':
## 
##     View

Preprocessing data on organization name, mission, and programs

Read in both datasets

mission <- read.csv( "https://github.com/Nonprofit-Open-Data-Collective/machine_learning_mission_codes/blob/master/DATA/MISSION.csv?raw=true", stringsAsFactors=F )
programs <- read.csv( "https://github.com/Nonprofit-Open-Data-Collective/machine_learning_mission_codes/blob/master/DATA/PROGRAMS.csv?raw=true", stringsAsFactors=F )

Dropping any duplicate observations and reducing to the necessary columns

mission <- mission[!duplicated(mission[c('EIN', 'TAXYR', 'F9_03_PZ_MISSION')]),]
programs <- programs[!duplicated(programs[c('EIN', 'TAXYR', 'DESCRIPTION')]),]

programs <- programs[, c("EIN", "TAXYR", "DESCRIPTION")]

Creating a single column with all of the text from name, mission, and programs before turning it into a corpus

mp <- merge(mission, programs, by=c( "EIN", "TAXYR"))
mp$text <- paste(mp$NAME, mp$F9_03_PZ_MISSION, mp$DESCRIPTION)
mp.lim <- mp[, c("EIN", "text")]

In addition, we need to ensure all variables are character type in order to convert to a corpus

mp.lim <- data.frame(lapply(mp.lim, as.character), stringsAsFactors=FALSE)

Converting the data to a corpus using the ‘corpus’ command from quanteda; text_field indicates which column holds the text data we want to analyze. Each listing also keeps a label in order to ensure the data stays labeled through to the end of the analysis.

mp.corp <- corpus(mp.lim,  text_field = "text")

We can look at the corpus to see how it’s structured

mp.corp
## Corpus consisting of 4,278 documents and 1 docvar.
mp.corp[2]
##                                                                                                                                                                                                                                                                        text2 
## "CREEKSIDE ELEMENTARY PTO INC SUPPORT HARTLAND CREEKSIDE ELEMENTARY SCHOOL. PROVIDE CLASSROOM SUPPLIES FOR BENEFIT OF STUDENTS AND TEACHERS. PAID FOR EDUCATIONAL ASSEMBLIES AND PROGRAMS FOR STUDENTS. PROVIDED FUNDING FOR FIELD TRIP TRANSPORTATION BENEFITING STUDENTS."
summary(mp.corp)[1:10,]
##      Text Types Tokens Sentences       EIN
## 1   text1   132    272         8  10716217
## 2   text2    27     38         4  10842551
## 3   text3    34     45         2 113087078
## 4   text4    34     45         2 113087078
## 5   text5    34     45         2 113087078
## 6   text6    77    138         5 113187592
## 7   text7     7     10         1 113731446
## 8   text8     7     10         1 113731446
## 9   text9     7     10         1 113731446
## 10 text10     7     10         1 113731446
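
We can also confirm that the EIN identifier travels with each document as a document variable (a quick check using quanteda’s docvars()):

head( docvars(mp.corp) )  # EIN is carried as a docvar alongside each text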

Preprocessing steps before identifying ngrams. We could do many of these steps quickly while converting to a document-feature matrix later, but we want to do them explicitly before identifying ngrams. We make the text lower case, break it into tokens, and remove stopwords.

mp.corp2 <- tolower(mp.corp)
mp.corp2[2]
##                                                                                                                                                                                                                                                                        text2 
## "creekside elementary pto inc support hartland creekside elementary school. provide classroom supplies for benefit of students and teachers. paid for educational assemblies and programs for students. provided funding for field trip transportation benefiting students."
mp.corp3 <- tokens(mp.corp2, remove_punct = TRUE)
mp.corp3[2]
## tokens from 1 document.
## text2 :
##  [1] "creekside"      "elementary"     "pto"            "inc"           
##  [5] "support"        "hartland"       "creekside"      "elementary"    
##  [9] "school"         "provide"        "classroom"      "supplies"      
## [13] "for"            "benefit"        "of"             "students"      
## [17] "and"            "teachers"       "paid"           "for"           
## [21] "educational"    "assemblies"     "and"            "programs"      
## [25] "for"            "students"       "provided"       "funding"       
## [29] "for"            "field"          "trip"           "transportation"
## [33] "benefiting"     "students"
mp.corp4 <- tokens_remove(mp.corp3, c(stopwords("english"), "nbsp"), padding = F)
mp.corp4[2]
## tokens from 1 document.
## text2 :
##  [1] "creekside"      "elementary"     "pto"            "inc"           
##  [5] "support"        "hartland"       "creekside"      "elementary"    
##  [9] "school"         "provide"        "classroom"      "supplies"      
## [13] "benefit"        "students"       "teachers"       "paid"          
## [17] "educational"    "assemblies"     "programs"       "students"      
## [21] "provided"       "funding"        "field"          "trip"          
## [25] "transportation" "benefiting"     "students"

Now looking at ngrams: combinations of 2 and 3 words. I’ve exported the lists that were produced (the export step is sketched below) for us all to look over and decide what we want to capture in a dictionary. This code can be updated once we have a larger list.

myNgram2 <- tokens(mp.corp4) %>%
  tokens_ngrams(n = 2) %>%
  dfm()
myNgram3 <- tokens(mp.corp4) %>%
  tokens_ngrams(n = 3) %>%
  dfm()

myNgram2miss.df <- textstat_frequency(myNgram2)
myNgram3miss.df <- textstat_frequency(myNgram3)
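
The export itself can be done with write.csv; the file names below are placeholders, not necessarily the ones used in the repository:

write.csv( myNgram2miss.df, "ngrams-2-word.csv", row.names=F )  # placeholder file name
write.csv( myNgram3miss.df, "ngrams-3-word.csv", row.names=F )  # placeholder file name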

topfeatures(myNgram2)
##    high_school    inc_provide foundation_inc          501_c            c_3 
##            243            215            209            137            137 
##   quality_life  united_states  mental_health       new_york    inc_mission 
##            112            111            104            102            100
topfeatures(myNgram3)
##                      501_c_3                section_501_c 
##                          137                           55 
##        internal_revenue_code       maxcen_housing_society 
##                           53                           48 
##         improve_quality_life provide_financial_assistance 
##                           44                           42 
##          housing_society_inc         high_school_students 
##                           42                           39 
##           3_internal_revenue               see_schedule_o 
##                           38                           36

We can see the top candidates and others with the data created. Now we create a dictionary in order to identify and transform those combinations of words

my_dict_prog <- dictionary(list(five01_c_3= c("501 c 3","section 501 c 3") ,
                           jesus_christ=c("jesus christ"),
                           high_school=c("high school"),
                           non_profit=c("non-profit", "non profit"),
                           stem=c("science technology engineering math", "science technology engineering mathematics"), 
                           steam=c("science technology engineering art math", "science technology engineering art mathematics")))
my_dict_place <- dictionary(list(ny_city=c("new york city"),
                            ny_state=c("new york state"),
                            ny=c("new york"),
                            sf=c("san francisco"),
                            san_diego=c("san diego"),
                            santa_barbara=c("santa barbara"),
                            new_hampshire=c("new hampshire"),
                            new_orleans=c("new orleans"),
                            san_antonio=c("san antonio"),
                            san_gabriel=c("san gabriel"),
                            santa_monica=c("santa monica"),
                            santa_clarita=c("santa clarita"),
                            los_angeles=c("los angeles"),
                            united_states = c("united states")))
                           
mp.corp5 <- tokens_compound(mp.corp4, pattern = my_dict_prog)
mp.corp6 <- tokens_compound(mp.corp5, pattern = my_dict_place)
mp.corp7 <- sapply(mp.corp6, paste, collapse=" ")  # collapse takes a single separator string
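
To spot-check that the compounding worked before collapsing back to text, kwic() on the tokens object will show the new underscore terms in context:

kwic( mp.corp6, pattern = "high_school", window = 3 )  # inspect a compounded term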

Converting to a document-feature matrix as a final step, and stemming the words.

mp.dfm <- dfm(mp.corp7, stem = T)
mp.dfm
## Document-feature matrix of: 4,278 documents, 9,587 features (99.7% sparse).
topfeatures(mp.dfm, 20)
##    provid      educ communiti     organ       inc   support   program 
##      3438      2521      2387      1994      1917      1738      1700 
##    servic   foundat    famili  children    promot      help     youth 
##      1254      1080      1050      1007      1003       994       956 
##   develop    school      need   mission   student     activ 
##       944       911       855       848       799       796
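
Note that in quanteda v3 and later the stem argument to dfm() is defunct; an equivalent under newer versions (a sketch, not run here) would be:

mp.dfm <- dfm_wordstem( dfm( tokens( mp.corp7 ) ) )  # dfm() now expects tokens input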

Now converting the DFM to a data frame and combining it with the corpus and the original data

mp.dfm.df <- convert(mp.dfm, to = "data.frame")
mp.corpus.df <- as.data.frame(mp.corp7)

colnames(mp.corpus.df) <- "Corpus"


full.data <- cbind(mp, mp.corpus.df)
full.data2 <- cbind(full.data, mp.dfm.df)
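
Because cbind() matches rows by position only, it is worth verifying that the pieces still line up (a simple sanity check):

stopifnot( nrow(mp) == nrow(mp.corpus.df) )        # corpus rows match original data
stopifnot( nrow(full.data) == nrow(mp.dfm.df) )    # dfm rows match combined data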