library(e1071)
library(caret) # confusion matrix
## Loading required package: lattice
## Loading required package: ggplot2
library(quanteda) # naive Bayes
## Package version: 1.3.14
## Parallel computing: 2 of 4 threads used.
## See https://quanteda.io for tutorials and examples.
## 
## Attaching package: 'quanteda'
## The following object is masked from 'package:utils':
## 
##     View
library(pander) # attractive tables in R Markdown

Calling in the data from GitHub. The second step drops all of the columns that precede the document-feature matrix, since we don't want to include them

dat <- read.csv( "~/Dropbox (ASU)/USC Mission Paper/Data and Analysis/github_files/machine_learning_mission_codes/DATA/Name Mission Program w Corpus and DFM.csv" )
dat2 <- dat[, 119:ncol(dat)]
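
Hard-coding column 119 as the start of the document-feature matrix is brittle if the source file changes. A quick sanity check (the indices here are illustrative, straddling the assumed cutoff):

dim(dat)             # rows and columns before the split
names(dat)[117:121]  # confirm where the metadata ends and the DFM begins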

Selecting one purpose variable as the outcome to classify and adding it back to the DFM. We could also add additional variables of interest at this stage

others <- dat[,c("Incorporatedstate","Donatefundsyes","Onethirdsupportpublic","Onethirdsupportgifts",
                 "Disasterreliefyes", "Orgpurposecharitable")]
dat2 <- cbind(dat2, others)
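
Before modeling, it is worth checking how balanced the outcome is; a minimal check on the charitable-purpose variable:

table(dat2$Orgpurposecharitable)              # raw class counts
prop.table(table(dat2$Orgpurposecharitable))  # class proportions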

Selecting a subset of observations as a training set

set.seed(300)
id_train <- sample(1:nrow(dat2), 1000, replace = FALSE)
head(id_train, 10)
##  [1] 3916 3265 3446 3137 2916   52 3241 2132 1991 3847
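
sample() ignores the outcome, so a rare class can end up underrepresented in the training set. If that is a concern, caret's createDataPartition() draws a stratified sample instead; a sketch, approximating the 1,000-row training size with a proportion:

id_train_strat <- createDataPartition(dat2$Orgpurposecharitable,
                                      p = 1000 / nrow(dat2),
                                      list = FALSE)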

Converting our data to a corpus as an intermediary step before converting it to a dfm

dat2$id_numeric <- 1:nrow(dat2)
dat.corpus <- data.frame(lapply(dat2, as.character), stringsAsFactors=FALSE)
dat.corpus <- corpus(dat.corpus, text_field = "Corpus")
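
A quick check that the corpus was built correctly and that the purpose variables came along as document variables (not part of the original workflow, just a sanity check):

ndoc(dat.corpus)             # number of documents
head(docvars(dat.corpus), 3) # metadata attached to each document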

With a corpus, we can use corpus_subset to select the observations that are and are not in the training set. The final bit of code then converts each subset to a dfm, which the naive Bayes command requires

training_nb <- corpus_subset(dat.corpus, id_numeric %in% id_train) %>%
    dfm(stem = TRUE)

test_nb <- corpus_subset(dat.corpus, !id_numeric %in% id_train) %>%
    dfm(stem = TRUE)
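
It can be useful to inspect the resulting dfm before modeling; quanteda's ndoc(), nfeat(), and topfeatures() cover the basics:

ndoc(training_nb)            # documents in the training dfm
nfeat(training_nb)           # unique (stemmed) features
topfeatures(training_nb, 10) # most frequent features

Note that calling dfm(stem = TRUE) directly on a corpus works in quanteda 1.x, as used here; in quanteda 3.x the corpus must first be tokenized, e.g. tokens() %>% tokens_wordstem() %>% dfm().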

textmodel_nb fits the naive Bayes model on the training set

train_results <- textmodel_nb(training_nb, docvars(training_nb, "Orgpurposecharitable"))
summary(train_results)
## 
## Call:
## textmodel_nb.dfm(x = training_nb, y = docvars(training_nb, "Orgpurposecharitable"))
## 
## Class Priors:
## (showing first 2 elements)
##   0   1 
## 0.5 0.5 
## 
## Estimated Feature Scores:
##   worship  doric  grand   lodg charit  organ purpos corpor   form   rais
## 0  0.7529 0.6037 0.4324 0.6037 0.2983 0.3704 0.4243   0.41 0.2381 0.2758
## 1  0.2471 0.3963 0.5676 0.3963 0.7017 0.6296 0.5757   0.59 0.7619 0.7242
##    money  donat worthi chariti   also    set scholarship  fund recipi
## 0 0.1382 0.2862 0.2758   0.117 0.3647 0.1899      0.3672 0.277 0.3786
## 1 0.8618 0.7138 0.7242   0.883 0.6353 0.8101      0.6328 0.723 0.6214
##   communiti metropolitan    ny  right   life promot   educ  human  radio
## 0    0.4174       0.5333 0.859 0.3786 0.2779 0.4567 0.4406 0.6535 0.5039
## 1    0.5826       0.4667 0.141 0.6214 0.7221 0.5433 0.5594 0.3465 0.4961
##   program respect
## 0  0.3217  0.4726
## 1  0.6783  0.5274
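
The 0.5/0.5 class priors in the output reflect textmodel_nb's default prior = "uniform". Since charitable organizations appear to be far more common than non-charitable ones, weighting the priors by observed class frequencies may be worth trying; a sketch:

train_results_df <- textmodel_nb(training_nb,
                                 docvars(training_nb, "Orgpurposecharitable"),
                                 prior = "docfreq")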

We need to make sure the features in the two data sets match before using the trained model on the test set

dfmat_matched <- dfm_select(test_nb, pattern=training_nb, selection = "keep")
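
dfm_select() with a dfm as the pattern keeps only the features seen in training. Later quanteda releases (1.4 and up) added dfm_match(), which additionally pads features missing from the test set; assuming one of those versions, the equivalent call is:

dfmat_matched <- dfm_match(test_nb, features = featnames(training_nb))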

Here we use the trained model to predict classes for the test data and see how well it performed

actual_class <- docvars(dfmat_matched, "Orgpurposecharitable")
predicted_class <- predict(train_results, newdata = dfmat_matched)
tab_class <- prop.table(table(actual_class, predicted_class))
tab_class
##             predicted_class
## actual_class          0          1
##            0 0.09823063 0.11012813
##            1 0.05369128 0.73794997
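
prop.table() reports shares of all test documents; the raw counts behind those proportions can be easier to read:

table(actual_class, predicted_class) # raw confusion counts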

A confusion matrix from the caret package can give us additional detail to evaluate the outcome

confusionMatrix(tab_class, mode = "everything")
## Confusion Matrix and Statistics
## 
##             predicted_class
## actual_class          0          1
##            0 0.09823063 0.11012813
##            1 0.05369128 0.73794997
##                                   
##                Accuracy : 0.8362  
##                  95% CI : (NA, NA)
##     No Information Rate : NA      
##     P-Value [Acc > NIR] : NA      
##                                   
##                   Kappa : 0.4484  
##  Mcnemar's Test P-Value : 0.01974 
##                                   
##             Sensitivity : 0.64659 
##             Specificity : 0.87014 
##          Pos Pred Value : 0.47145 
##          Neg Pred Value : 0.93218 
##               Precision : 0.47145 
##                  Recall : 0.64659 
##                      F1 : 0.54530 
##              Prevalence : 0.15192 
##          Detection Rate : 0.09823 
##    Detection Prevalence : 0.20836 
##       Balanced Accuracy : 0.75837 
##                                   
##        'Positive' Class : 0       
## 

We can convert the confusion matrix statistics to a data frame and then combine them with the results for other organization purposes to compare the models across classifications

cm <- confusionMatrix(tab_class, mode = "everything")
tocsv <- as.data.frame(t(data.frame(cbind(t(cm$byClass), t(cm$overall)))))
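
As the name tocsv suggests, the combined statistics can also be written out for later comparison (the file name here is just an example):

write.csv(tocsv, "nb_results_charitable.csv")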

We can rerun that code for each of the other organization purposes in the data and combine the results into a single data frame to compare the success of the naive Bayes models, as sketched below
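
A sketch of that loop, wrapping the steps above in a function (only Orgpurposecharitable appears in the code above; the other purpose variable names are assumed placeholders, and each would need to be included in the others selection earlier on):

nb_metrics <- function(outcome) {
  model   <- textmodel_nb(training_nb, docvars(training_nb, outcome))
  matched <- dfm_select(test_nb, pattern = training_nb, selection = "keep")
  tab     <- prop.table(table(docvars(matched, outcome),
                              predict(model, newdata = matched)))
  cm <- confusionMatrix(tab, mode = "everything")
  c(cm$byClass, cm$overall)
}
purposes <- c("Orgpurposecharitable", "Orgpurposereligious") # etc.
tocsv <- as.data.frame(sapply(purposes, nb_metrics))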

pandoc.table(tocsv)
## 
## -------------------------------------------------------------------------
##           &nbsp;            Charity   Religious   Education   Scientific 
## -------------------------- --------- ----------- ----------- ------------
##      **Sensitivity**        0.6466     0.9354      0.7729       0.9543   
## 
##      **Specificity**        0.8701     0.7633      0.7223       0.5385   
## 
##     **Pos.Pred.Value**      0.4714     0.9717      0.7784       0.9768   
## 
##     **Neg.Pred.Value**      0.9322     0.5759      0.7159       0.3665   
## 
##       **Precision**         0.4714     0.9717      0.7784       0.9768   
## 
##         **Recall**          0.6466     0.9354      0.7729       0.9543   
## 
##           **F1**            0.5453     0.9532      0.7757       0.9654   
## 
##       **Prevalence**        0.1519     0.8969       0.558       0.9532   
## 
##     **Detection.Rate**      0.09823    0.8389      0.4312       0.9096   
## 
##  **Detection.Prevalence**   0.2084     0.8633       0.554       0.9312   
## 
##   **Balanced.Accuracy**     0.7584     0.8493      0.7476       0.7464   
## 
##        **Accuracy**         0.8362     0.9176      0.7505       0.9348   
## 
##         **Kappa**           0.4484     0.6107      0.4948       0.4029   
## 
##     **AccuracyLower**         NA         NA          NA           NA     
## 
##     **AccuracyUpper**         NA         NA          NA           NA     
## 
##      **AccuracyNull**         NA         NA          NA           NA     
## 
##     **AccuracyPValue**        NA         NA          NA           NA     
## 
##     **McnemarPValue**       0.01974   0.0007587    0.04613    0.0001273  
## -------------------------------------------------------------------------
## 
## Table: Table continues below
## 
##  
## -------------------------------------------------------------------------
##           &nbsp;            Literary     Safety      Sports     Cruelty  
## -------------------------- ----------- ----------- ----------- ----------
##      **Sensitivity**         0.9695      0.9906      0.9765      0.9759  
## 
##      **Specificity**         0.4068      0.1538      0.6519      0.6627  
## 
##     **Pos.Pred.Value**       0.9869       0.996      0.9758      0.9785  
## 
##     **Neg.Pred.Value**       0.2243      0.07143     0.6592      0.6358  
## 
##       **Precision**          0.9869       0.996      0.9758      0.9785  
## 
##         **Recall**           0.9695      0.9906      0.9765      0.9759  
## 
##           **F1**             0.9781      0.9933      0.9761      0.9772  
## 
##       **Prevalence**         0.9788      0.9953      0.9348      0.9402  
## 
##     **Detection.Rate**       0.9489       0.986      0.9129      0.9176  
## 
##  **Detection.Prevalence**    0.9615      0.9899      0.9356      0.9377  
## 
##   **Balanced.Accuracy**      0.6881      0.5722      0.8142      0.8193  
## 
##        **Accuracy**          0.9575      0.9867      0.9554      0.9572  
## 
##         **Kappa**            0.2691      0.09176     0.6317      0.6262  
## 
##     **AccuracyLower**          NA          NA          NA          NA    
## 
##     **AccuracyUpper**          NA          NA          NA          NA    
## 
##      **AccuracyNull**          NA          NA          NA          NA    
## 
##     **AccuracyPValue**         NA          NA          NA          NA    
## 
##     **McnemarPValue**       1.859e-06   6.805e-18   2.247e-06   1.44e-06 
## -------------------------------------------------------------------------

Credit to the tutorials on the quanteda website (https://quanteda.io), from which much of this was originally learned