To demonstrate the method we will turn two small corpora into semantic networks and compare them. These are frivolous examples, but interesting in that the two letters contain an identical set of words with different punctuation. We will create network ties between all words that occur together within a sentence.
Version 1:

Dear John:
I want a man who knows what love is all about.
You are generous, kind, thoughtful.
People who are not like you admit to being useless and inferior.
You have ruined me for other men.
I yearn for you.
I have no feelings whatsoever when we’re apart.
I can be forever happy.
Will you let me be yours?
Gloria
Version 2:

Dear John:
I want a man who knows what love is.
All about you are generous, kind, thoughtful people, who are not like you.
Admit to being useless and inferior.
You have ruined me.
For other men, I yearn.
For you, I have no feelings whatsoever.
When we’re apart, I can be forever happy.
Will you let me be?
Yours,
Gloria
# load the packages used throughout this tutorial
library( igraph )      # network objects, set operations, and plotting
library( dplyr )       # the %>% pipe and bind_rows()
library( networkD3 )   # simpleNetwork() interactive plots
library( quanteda )    # corpus construction and tokenization

# the letter is also available locally:
# x <- readLines( "assets/dear_john_letter_1.txt", warn=FALSE )
x <- readLines( "https://raw.githubusercontent.com/Nonprofit-Open-Data-Collective/machine_learning_mission_codes/master/docs/tutorials/assets/dear_john_letter_1.txt", warn=FALSE )
x <- toupper( x )                   # standardize case
x <- gsub( ",", "", x )             # remove commas
x <- gsub( "WE’RE", "WE ARE", x )   # expand the contraction
x <- gsub( "\\:", "", x )           # remove colons
x <- gsub( "\\?", "", x )           # remove question marks
x
## [1] "DEAR JOHN"
## [2] ""
## [3] "I WANT A MAN WHO KNOWS WHAT LOVE IS ALL ABOUT. YOU ARE GENEROUS KIND THOUGHTFUL. PEOPLE WHO ARE NOT LIKE YOU ADMIT TO BEING USELESS AND INFERIOR. YOU HAVE RUINED ME FOR OTHER MEN. I YEARN FOR YOU. I HAVE NO FEELINGS WHATSOEVER WHEN WE ARE APART. I CAN BE FOREVER HAPPY."
## [4] ""
## [5] "WILL YOU LET ME BE YOURS"
## [6] ""
## [7] "GLORIA"
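Remove the blank lines:

x <- x[ x != "" ]   # drop empty strings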
## [1] "DEAR JOHN"
## [2] "I WANT A MAN WHO KNOWS WHAT LOVE IS ALL ABOUT. YOU ARE GENEROUS KIND THOUGHTFUL. PEOPLE WHO ARE NOT LIKE YOU ADMIT TO BEING USELESS AND INFERIOR. YOU HAVE RUINED ME FOR OTHER MEN. I YEARN FOR YOU. I HAVE NO FEELINGS WHATSOEVER WHEN WE ARE APART. I CAN BE FOREVER HAPPY."
## [3] "WILL YOU LET ME BE YOURS"
## [4] "GLORIA"
Break paragraphs into sentences:
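sentences <- strsplit( x, split="\\." )   # split each paragraph on periods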
## [[1]]
## [1] "DEAR JOHN"
##
## [[2]]
## [1] "I WANT A MAN WHO KNOWS WHAT LOVE IS ALL ABOUT"
## [2] " YOU ARE GENEROUS KIND THOUGHTFUL"
## [3] " PEOPLE WHO ARE NOT LIKE YOU ADMIT TO BEING USELESS AND INFERIOR"
## [4] " YOU HAVE RUINED ME FOR OTHER MEN"
## [5] " I YEARN FOR YOU"
## [6] " I HAVE NO FEELINGS WHATSOEVER WHEN WE ARE APART"
## [7] " I CAN BE FOREVER HAPPY"
##
## [[3]]
## [1] "WILL YOU LET ME BE YOURS"
##
## [[4]]
## [1] "GLORIA"
Now turn the sentences into networks. First, split each sentence into word tokens:
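Taking the second sentence of the body paragraph as an example (a minimal sketch; the object name tokens is illustrative):

tokens <- strsplit( sentences[[2]][2], split=" " )[[1]]
tokens <- tokens[ tokens != "" ]   # drop the empty string left by the leading space
tokens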
## [1] "YOU" "ARE" "GENEROUS" "KIND" "THOUGHTFUL"
Then turn tokens into word pairs:
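t( combn( tokens, 2 ) )   # sketch: every unique pair from the token vector above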
## [,1] [,2]
## [1,] "YOU" "ARE"
## [2,] "YOU" "GENEROUS"
## [3,] "YOU" "KIND"
## [4,] "YOU" "THOUGHTFUL"
## [5,] "ARE" "GENEROUS"
## [6,] "ARE" "KIND"
## [7,] "ARE" "THOUGHTFUL"
## [8,] "GENEROUS" "KIND"
## [9,] "GENEROUS" "THOUGHTFUL"
## [10,] "KIND" "THOUGHTFUL"
Now apply this to all sentences, one at a time:
textToNet <- function( x )
{
  x <- unlist( x )    # flatten the list of sentences
  word.pairs <- NULL
  for( i in 1:length(x) )
  {
    # split the sentence into tokens and drop empty strings
    x.split <- strsplit( x[i], split=" " )[[1]]
    x.split <- x.split[ x.split != "" ]
    # append every unique word pair from this sentence
    if( length(x.split) > 1 ) { word.pairs <- rbind( word.pairs, t( combn( x.split, 2 ) ) ) }
  }
  return( word.pairs )
}
textToNet( sentences ) %>% head()
##      [,1]   [,2]
## [1,] "DEAR" "JOHN"
## [2,] "I"    "WANT"
## [3,] "I"    "A"
## [4,] "I"    "MAN"
## [5,] "I"    "WHO"
## [6,] "I"    "KNOWS"
mat1 <- textToNet( sentences )
g1 <- graph.edgelist( mat1, directed=FALSE )   # undirected semantic network
par( mar=c(0,0,3,0) )
plot( g1,
      layout = layout.kamada.kawai( g1 ),
      edge.arrow.size = 0.4,
      vertex.size = 10,
      vertex.label.cex = 1.5,
      vertex.frame.color = "gray",
      vertex.color = "gray" )
title( main="Dear John Version 1" )
# repeat the same cleaning steps for the second letter
# x <- readLines( "assets/dear_john_letter_2.txt", warn=FALSE )
x <- readLines( "https://raw.githubusercontent.com/Nonprofit-Open-Data-Collective/machine_learning_mission_codes/master/docs/tutorials/assets/dear_john_letter_2.txt", warn=FALSE )
x <- toupper( x )
x <- gsub( ",", "", x )
x <- gsub( "WE’RE", "WE ARE", x )
x <- gsub( "\\:", "", x )
x <- gsub( "\\?", "", x )
x <- x[ x != "" ]                         # drop blank lines
sentences <- strsplit( x, split="\\." )   # split paragraphs on periods
mat2 <- textToNet( sentences )
g2 <- graph.edgelist( mat2, directed=FALSE )
par( mar=c(0,0,3,0) )
plot( g2,
      layout = layout.kamada.kawai( g2 ),
      edge.arrow.size = 0.4,
      vertex.size = 10,
      vertex.label.cex = 1.5,
      vertex.frame.color = "gray",
      vertex.color = "gray" )
title( main="Dear John Version 2" )
Which semantic ties do the two letters share? The igraph intersection operator %s% keeps only the edges that appear in both networks:

g.intersect <- g1 %s% g2
# remove isolates
g.intersect <- delete.vertices( g.intersect, degree(g.intersect)==0 )
# plot( g.intersect,
# layout = layout.kamada.kawai(g.intersect),
# edge.arrow.size =0.4,
# vertex.size = 10,
# vertex.label.cex = 1.5,
# vertex.frame.color = "gray",
# vertex.color = "gray"
# )
#
# title( main="Shared Semantic Ties" )
simpleNetwork( as_data_frame( g.intersect ),
               linkDistance = 70, charge = -50,
               fontSize = 10, fontFamily = "serif",
               linkColour = "#D3D3D3",
               zoom = TRUE )
The difference operator %m% returns the ties that appear in the first network but not the second:

g1.unique <- g1 %m% g2
# remove isolates
g1.unique <- delete.vertices( g1.unique, degree(g1.unique)==0 )
# plot( g1.unique,
# layout = layout.kamada.kawai(g1.unique),
# edge.arrow.size =0.4,
# vertex.size = 10,
# vertex.label.cex = 1.5,
# vertex.frame.color = "gray",
# vertex.color = "gray"
# )
#
# title( main="Unique Semantic Ties G1" )
simpleNetwork( as_data_frame( g1.unique ),
               linkDistance = 70, charge = -50,
               fontSize = 10, fontFamily = "serif",
               linkColour = "#D3D3D3",
               zoom = TRUE )
g2.unique <- g2 %m% g1   # ties unique to the second letter
# remove isolates
g2.unique <- delete.vertices( g2.unique, degree(g2.unique)==0 )
# plot( g2.unique,
# layout = layout.kamada.kawai(g2.unique),
# edge.arrow.size =0.4,
# vertex.size = 10,
# vertex.label.cex = 1.5,
# vertex.frame.color = "gray",
# vertex.color = "gray"
# )
#
# title( main="Unique Semantic Ties G2" )
simpleNetwork( as_data_frame( g2.unique ),
               linkDistance = 70, charge = -50,
               fontSize = 10, fontFamily = "serif",
               linkColour = "#D3D3D3",
               zoom = TRUE )
# load the nonprofit mission statement data
mission <- readRDS( gzcon( url( "https://github.com/Nonprofit-Open-Data-Collective/machine_learning_mission_codes/blob/master/DATA/MISSION.rds?raw=true" )))
head( mission[ c("NAME","F9_03_PZ_MISSION") ] )
# create a unique document ID and drop duplicate filings
mission$docid <- paste( mission$EIN, mission$TAXYR, sep="-" )
mission <- mission[ ! duplicated( mission$docid ) , ]
names(mission) <- toupper( names(mission ) )
mission <- mission[ c("EIN","TAXYR","NAME","F9_03_PZ_MISSION","NTEECODE","NTEE",
"ORGPURPOSECHARITABLE","ORGPURPOSERELIGIOUS",
"ORGPURPOSEEDUCATIONAL","ORGPURPOSESCIENTIFIC",
"ORGPURPOSELITERARY","ORGPURPOSEPUBLICSAFETY",
"ORGPURPOSEAMATEURSPORTS","ORGPURPOSECRUELTYPREVENTION") ]
mission.corp <- corpus( mission, text_field = "F9_03_PZ_MISSION")
# add document IDs
docid <- paste( mission$EIN, mission$TAXYR, sep="-" )
docnames( mission.corp ) <- docid
summary( mission.corp, 5)
mission.corp2 <- tolower( mission.corp )                                               # standardize case
mission.corp3 <- tokens( mission.corp2, remove_punct = TRUE )                          # tokenize, dropping punctuation
mission.corp4 <- tokens_remove( mission.corp3, stopwords("english"), padding = TRUE )  # drop stop words, keep placeholders
mission.corp5 <- tokens_wordstem( mission.corp4 )                                      # stem each word
head( mission.corp5, 3 )
## tokens from 3 documents.
## 10716217-2016 :
## [1] "" "corpor" "specif" "purpos" ""
## [6] "" "support" "afford" "hous" "communiti"
## [11] "develop" "" "econom" "develop" ""
## [16] "" "citi" "" "counti" ""
## [21] "san" "francisco" "econom" "disadvantag" "individu"
## [26] "" "communiti" "" "lend" ""
## [31] "invest" "" "" "direct" "acquir"
## [36] "" "afford" "hous" "" "relat"
## [41] "communiti" "develop" "real" "estat" "asset"
##
## 10842551-2015 :
## [1] "support" "hartland" "creeksid" "elementari" "school"
##
## 20792368-2011 :
## [1] "communiti" "sport" "program"
token.list <- as.list( mission.corp5 )
# drop the empty placeholder tokens left by padding
token.list <- lapply( token.list, function(x){ x[ ! grepl( "^$", x ) ] } )
token.list[[1]]
## [1] "corpor" "specif" "purpos" "support" "afford"
## [6] "hous" "communiti" "develop" "econom" "develop"
## [11] "citi" "counti" "san" "francisco" "econom"
## [16] "disadvantag" "individu" "communiti" "lend" "invest"
## [21] "direct" "acquir" "afford" "hous" "relat"
## [26] "communiti" "develop" "real" "estat" "asset"
listToNet <- function( x )
{
  # one element per document, so results stay aligned with the rows of the data
  word.pairs <- vector( "list", length(x) )
  for( i in 1:length(x) )
  {
    x.i <- x[[i]]
    if( length( x.i ) > 1 )
    {
      word.pairs[[i]] <- data.frame( t( combn( x.i, 2 ) ) )
      names( word.pairs[[i]] ) <- c("from","to")
    }
  }
  return( word.pairs )
}
g.list <- listToNet( token.list )
head( g.list[[1]] )
Sanity check: a document with n tokens should produce n * (n-1) / 2 word pairs (edges).
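The checks below stack every document's word pairs into one edge list, build the full network, and tabulate the religious-purpose flag used to split the sample (a sketch; the object names m and g are illustrative):

m <- bind_rows( g.list )   # combine all word-pair data frames
dim( m )
g <- graph.edgelist( as.matrix( m ), directed=FALSE )
summary( g )
table( mission$ORGPURPOSERELIGIOUS )   # group sizes for the comparison below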
## [1] 793658 2
## IGRAPH 57993a6 UN-- 4826 793658 --
## + attr: name (v/c)
##
## 0 1
## 2961 447
# word pairs from missions with a religious purpose
g.list.1 <- g.list[ mission$ORGPURPOSERELIGIOUS == 1 ]
m1 <- bind_rows( g.list.1 )
length( g.list.1 )
## [1] 447
# word pairs from all other missions
g.list.2 <- g.list[ mission$ORGPURPOSERELIGIOUS == 0 ]
m2 <- bind_rows( g.list.2 )
length( g.list.2 )
## [1] 2961
g.religious.yes <- graph.edgelist( as.matrix(m1), directed=FALSE )
g.religious.no <- graph.edgelist( as.matrix(m2), directed=FALSE )
summary( g.religious.yes )
## IGRAPH 5801fef UN-- 1452 104179 --
## + attr: name (v/c)
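summary( g.religious.no )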
## IGRAPH 582dea7 UN-- 4438 689479 --
## + attr: name (v/c)
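The same word pair can occur in many different missions, so these graphs contain duplicate edges. The lower edge counts in the next two summaries are consistent with collapsing duplicates (and any self-loops from repeated words), a step sketched here with simplify():

g.religious.yes <- simplify( g.religious.yes )
g.religious.no  <- simplify( g.religious.no )
summary( g.religious.yes )
summary( g.religious.no )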
## IGRAPH 584eb40 UN-- 1452 45910 --
## + attr: name (v/c)
## IGRAPH 58634a7 UN-- 4438 211114 --
## + attr: name (v/c)
g.religious.yes.unique <- g.religious.yes %m% g.religious.no
# remove isolates
g.religious.yes.unique <- delete.vertices( g.religious.yes.unique,
degree(g.religious.yes.unique)==0 )
summary( g.religious.yes.unique )
## IGRAPH 587485f UN-- 1449 31804 --
## + attr: name (v/c)
plot( g.religious.yes.unique,
      layout = layout.kamada.kawai( g.religious.yes.unique ),
      edge.arrow.size = 0.4,
      vertex.size = 10,
      vertex.label.cex = 1.5,
      vertex.frame.color = "gray",
      vertex.color = "gray" )
title( main="Unique Word Ties Religious Nonprofits" )
These need to be filtered using a probabilistic method that identifies only the ties that are statistically meaningful within each group, i.e. ties that would not occur by chance.
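One crude sketch of such a filter keeps only ties that recur across several missions; the threshold k and the object names strong.ties and g.strong are illustrative stand-ins for a formal test of co-occurrence against chance:

# count how many missions contain each word pair, then keep the recurrent ones
k <- 5
strong.ties <- m1 %>%
  count( from, to, name = "n" ) %>%
  filter( n >= k )
g.strong <- graph.edgelist( as.matrix( strong.ties[ c("from","to") ] ), directed=FALSE )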
# sample 100 words so the interactive plot stays readable
g1 <- induced_subgraph( g.religious.yes.unique,
                        sample( 1:length( V(g.religious.yes.unique) ), 100 ) )
# remove isolates
g1 <- delete.vertices( g1, degree(g1)==0 )
simpleNetwork( as_data_frame( g1 ),
               linkDistance = 70, charge = -50,
               fontSize = 10, fontFamily = "serif",
               linkColour = "#D3D3D3",
               zoom = TRUE )
# ties unique to the non-religious group
g.religious.no.unique <- g.religious.no %m% g.religious.yes
# remove isolates
g.religious.no.unique <- delete.vertices( g.religious.no.unique,
                                          degree(g.religious.no.unique)==0 )
# sample 100 words so the interactive plot stays readable
g2 <- induced_subgraph( g.religious.no.unique,
                        sample( 1:length( V(g.religious.no.unique) ), 100 ) )
# remove isolates
g2 <- delete.vertices( g2, degree(g2)==0 )
g2 <- simplify( g2 )
simpleNetwork( as_data_frame( g2 ),
               linkDistance = 70, charge = -50,
               fontSize = 10, fontFamily = "serif",
               linkColour = "#D3D3D3",
               zoom = TRUE )