To demonstrate the method we will turn two small corpora into semantic networks and compare them. These are frivolous examples, but interesting in that the two letters contain an identical set of words with different punctuation. We will create network ties between all words that occur together within a sentence.
Version 1:

Dear John:
I want a man who knows what love is all about.
You are generous, kind, thoughtful.
People who are not like you admit to being useless and inferior.
You have ruined me for other men.
I yearn for you.
I have no feelings whatsoever when we’re apart.
I can be forever happy.
Will you let me be yours?
Gloria
Version 2:

Dear John:
I want a man who knows what love is.
All about you are generous, kind, thoughtful people, who are not like you.
Admit to being useless and inferior.
You have ruined me.
For other men, I yearn.
For you, I have no feelings whatsoever.
When we’re apart, I can be forever happy.
Will you let me be?
Yours,
Gloria
# load the packages used throughout this tutorial
library( igraph )      # network objects, set operations, and plotting
library( dplyr )       # the %>% pipe and bind_rows()
library( networkD3 )   # simpleNetwork() interactive plots
library( quanteda )    # corpus construction and tokenization

# the letter is also available locally:
# x <- readLines( "assets/dear_john_letter_1.txt", warn=FALSE )
x <- readLines( "https://raw.githubusercontent.com/Nonprofit-Open-Data-Collective/machine_learning_mission_codes/master/docs/tutorials/assets/dear_john_letter_1.txt", warn=FALSE )
x <- toupper( x )                   # standardize case
x <- gsub( ",", "", x )             # remove commas
x <- gsub( "WE’RE", "WE ARE", x )   # expand the contraction
x <- gsub( "\\:", "", x )           # remove colons
x <- gsub( "\\?", "", x )           # remove question marks
x
## [1] "DEAR JOHN"
## [2] ""
## [3] "I WANT A MAN WHO KNOWS WHAT LOVE IS ALL ABOUT. YOU ARE GENEROUS KIND THOUGHTFUL. PEOPLE WHO ARE NOT LIKE YOU ADMIT TO BEING USELESS AND INFERIOR. YOU HAVE RUINED ME FOR OTHER MEN. I YEARN FOR YOU. I HAVE NO FEELINGS WHATSOEVER WHEN WE ARE APART. I CAN BE FOREVER HAPPY."
## [4] ""
## [5] "WILL YOU LET ME BE YOURS"
## [6] ""
## [7] "GLORIA"
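Remove the blank lines:

x <- x[ x != "" ]   # drop empty strings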
## [1] "DEAR JOHN"
## [2] "I WANT A MAN WHO KNOWS WHAT LOVE IS ALL ABOUT. YOU ARE GENEROUS KIND THOUGHTFUL. PEOPLE WHO ARE NOT LIKE YOU ADMIT TO BEING USELESS AND INFERIOR. YOU HAVE RUINED ME FOR OTHER MEN. I YEARN FOR YOU. I HAVE NO FEELINGS WHATSOEVER WHEN WE ARE APART. I CAN BE FOREVER HAPPY."
## [3] "WILL YOU LET ME BE YOURS"
## [4] "GLORIA"
Break paragraphs into sentences:
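sentences <- strsplit( x, split="\\." )   # split each paragraph on periods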
## [[1]]
## [1] "DEAR JOHN"
##
## [[2]]
## [1] "I WANT A MAN WHO KNOWS WHAT LOVE IS ALL ABOUT"
## [2] " YOU ARE GENEROUS KIND THOUGHTFUL"
## [3] " PEOPLE WHO ARE NOT LIKE YOU ADMIT TO BEING USELESS AND INFERIOR"
## [4] " YOU HAVE RUINED ME FOR OTHER MEN"
## [5] " I YEARN FOR YOU"
## [6] " I HAVE NO FEELINGS WHATSOEVER WHEN WE ARE APART"
## [7] " I CAN BE FOREVER HAPPY"
##
## [[3]]
## [1] "WILL YOU LET ME BE YOURS"
##
## [[4]]
## [1] "GLORIA"
Now turn the sentences into networks. First, split each sentence into word tokens:
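Taking the second sentence of the body paragraph as an example (a minimal sketch; the object name tokens is illustrative):

tokens <- strsplit( sentences[[2]][2], split=" " )[[1]]
tokens <- tokens[ tokens != "" ]   # drop the empty string left by the leading space
tokens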
## [1] "YOU" "ARE" "GENEROUS" "KIND" "THOUGHTFUL"
Then turn tokens into word pairs:
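t( combn( tokens, 2 ) )   # sketch: every unique pair from the token vector above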
## [,1] [,2]
## [1,] "YOU" "ARE"
## [2,] "YOU" "GENEROUS"
## [3,] "YOU" "KIND"
## [4,] "YOU" "THOUGHTFUL"
## [5,] "ARE" "GENEROUS"
## [6,] "ARE" "KIND"
## [7,] "ARE" "THOUGHTFUL"
## [8,] "GENEROUS" "KIND"
## [9,] "GENEROUS" "THOUGHTFUL"
## [10,] "KIND" "THOUGHTFUL"
Now apply this to all sentences, one at a time:
textToNet <- function( x )
{
  x <- unlist( x )    # flatten the list of sentences
  word.pairs <- NULL
  for( i in 1:length(x) )
  {
    # split the sentence into tokens and drop empty strings
    x.split <- strsplit( x[i], split=" " )[[1]]
    x.split <- x.split[ x.split != "" ]
    # append every unique word pair from this sentence
    if( length(x.split) > 1 ) { word.pairs <- rbind( word.pairs, t( combn( x.split, 2 ) ) ) }
  }
  return( word.pairs )
}
textToNet( sentences ) %>% head()
##      [,1]   [,2]
## [1,] "DEAR" "JOHN"
## [2,] "I"    "WANT"
## [3,] "I"    "A"
## [4,] "I"    "MAN"
## [5,] "I"    "WHO"
## [6,] "I"    "KNOWS"
mat1 <- textToNet( sentences )
g1 <- graph.edgelist( mat1, directed=FALSE )   # undirected semantic network
par( mar=c(0,0,3,0) )
plot( g1,
      layout = layout.kamada.kawai( g1 ),
      edge.arrow.size = 0.4,
      vertex.size = 10,
      vertex.label.cex = 1.5,
      vertex.frame.color = "gray",
      vertex.color = "gray" )
title( main="Dear John Version 1" )
# repeat the same cleaning steps for the second letter
# x <- readLines( "assets/dear_john_letter_2.txt", warn=FALSE )
x <- readLines( "https://raw.githubusercontent.com/Nonprofit-Open-Data-Collective/machine_learning_mission_codes/master/docs/tutorials/assets/dear_john_letter_2.txt", warn=FALSE )
x <- toupper( x )
x <- gsub( ",", "", x )
x <- gsub( "WE’RE", "WE ARE", x )
x <- gsub( "\\:", "", x )
x <- gsub( "\\?", "", x )
x <- x[ x != "" ]                         # drop blank lines
sentences <- strsplit( x, split="\\." )   # split paragraphs on periods
mat2 <- textToNet( sentences )
g2 <- graph.edgelist( mat2, directed=FALSE )
par( mar=c(0,0,3,0) )
plot( g2,
      layout = layout.kamada.kawai( g2 ),
      edge.arrow.size = 0.4,
      vertex.size = 10,
      vertex.label.cex = 1.5,
      vertex.frame.color = "gray",
      vertex.color = "gray" )
title( main="Dear John Version 2" )
Which semantic ties do the two letters share? The igraph intersection operator %s% keeps only the edges that appear in both networks:

g.intersect <- g1 %s% g2
# remove isolates
g.intersect <- delete.vertices( g.intersect, degree(g.intersect)==0 )
# plot( g.intersect,
# layout = layout.kamada.kawai(g.intersect),
# edge.arrow.size =0.4,
# vertex.size = 10,
# vertex.label.cex = 1.5,
# vertex.frame.color = "gray",
# vertex.color = "gray"
# )
#
# title( main="Shared Semantic Ties" )
simpleNetwork( as_data_frame( g.intersect ),
               linkDistance = 70, charge = -50,
               fontSize = 10, fontFamily = "serif",
               linkColour = "#D3D3D3",
               zoom = TRUE )
The difference operator %m% returns the ties that appear in the first network but not the second:

g1.unique <- g1 %m% g2
# remove isolates
g1.unique <- delete.vertices( g1.unique, degree(g1.unique)==0 )
# plot( g1.unique,
# layout = layout.kamada.kawai(g1.unique),
# edge.arrow.size =0.4,
# vertex.size = 10,
# vertex.label.cex = 1.5,
# vertex.frame.color = "gray",
# vertex.color = "gray"
# )
#
# title( main="Unique Semantic Ties G1" )
simpleNetwork( as_data_frame( g1.unique ),
               linkDistance = 70, charge = -50,
               fontSize = 10, fontFamily = "serif",
               linkColour = "#D3D3D3",
               zoom = TRUE )
g2.unique <- g2 %m% g1   # ties unique to the second letter
# remove isolates
g2.unique <- delete.vertices( g2.unique, degree(g2.unique)==0 )
# plot( g2.unique,
# layout = layout.kamada.kawai(g2.unique),
# edge.arrow.size =0.4,
# vertex.size = 10,
# vertex.label.cex = 1.5,
# vertex.frame.color = "gray",
# vertex.color = "gray"
# )
#
# title( main="Unique Semantic Ties G2" )
simpleNetwork( as_data_frame( g2.unique ),
               linkDistance = 70, charge = -50,
               fontSize = 10, fontFamily = "serif",
               linkColour = "#D3D3D3",
               zoom = TRUE )
# load the nonprofit mission statement data
mission <- readRDS( gzcon( url( "https://github.com/Nonprofit-Open-Data-Collective/machine_learning_mission_codes/blob/master/DATA/MISSION.rds?raw=true" )))
head( mission[ c("NAME","F9_03_PZ_MISSION") ] )
# create a unique document ID and drop duplicate filings
mission$docid <- paste( mission$EIN, mission$TAXYR, sep="-" )
mission <- mission[ ! duplicated( mission$docid ) , ]
names(mission) <- toupper( names(mission ) )
mission <- mission[ c("EIN","TAXYR","NAME","F9_03_PZ_MISSION","NTEECODE","NTEE",
"ORGPURPOSECHARITABLE","ORGPURPOSERELIGIOUS",
"ORGPURPOSEEDUCATIONAL","ORGPURPOSESCIENTIFIC",
"ORGPURPOSELITERARY","ORGPURPOSEPUBLICSAFETY",
"ORGPURPOSEAMATEURSPORTS","ORGPURPOSECRUELTYPREVENTION") ]
mission.corp <- corpus( mission, text_field = "F9_03_PZ_MISSION")
# add document IDs
docid <- paste( mission$EIN, mission$TAXYR, sep="-" )
docnames( mission.corp ) <- docid
summary( mission.corp, 5)
mission.corp2 <- tolower( mission.corp )                                               # standardize case
mission.corp3 <- tokens( mission.corp2, remove_punct = TRUE )                          # tokenize, dropping punctuation
mission.corp4 <- tokens_remove( mission.corp3, stopwords("english"), padding = TRUE )  # drop stop words, keep placeholders
mission.corp5 <- tokens_wordstem( mission.corp4 )                                      # stem each word
head( mission.corp5, 3 )
## tokens from 3 documents.
## 10716217-2016 :
## [1] "" "corpor" "specif" "purpos" ""
## [6] "" "support" "afford" "hous" "communiti"
## [11] "develop" "" "econom" "develop" ""
## [16] "" "citi" "" "counti" ""
## [21] "san" "francisco" "econom" "disadvantag" "individu"
## [26] "" "communiti" "" "lend" ""
## [31] "invest" "" "" "direct" "acquir"
## [36] "" "afford" "hous" "" "relat"
## [41] "communiti" "develop" "real" "estat" "asset"
##
## 10842551-2015 :
## [1] "support" "hartland" "creeksid" "elementari" "school"
##
## 20792368-2011 :
## [1] "communiti" "sport" "program"
token.list <- as.list( mission.corp5 )
# drop the empty placeholder tokens left by padding
token.list <- lapply( token.list, function(x){ x[ ! grepl( "^$", x ) ] } )
token.list[[1]]
## [1] "corpor" "specif" "purpos" "support" "afford"
## [6] "hous" "communiti" "develop" "econom" "develop"
## [11] "citi" "counti" "san" "francisco" "econom"
## [16] "disadvantag" "individu" "communiti" "lend" "invest"
## [21] "direct" "acquir" "afford" "hous" "relat"
## [26] "communiti" "develop" "real" "estat" "asset"
listToNet <- function( x )
{
  # one element per document, so results stay aligned with the rows of the data
  word.pairs <- vector( "list", length(x) )
  for( i in 1:length(x) )
  {
    x.i <- x[[i]]
    if( length( x.i ) > 1 )
    {
      word.pairs[[i]] <- data.frame( t( combn( x.i, 2 ) ) )
      names( word.pairs[[i]] ) <- c("from","to")
    }
  }
  return( word.pairs )
}
g.list <- listToNet( token.list )
head( g.list[[1]] )
Sanity check: a document with n tokens should produce n * (n-1) / 2 word pairs (edges).
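The checks below stack every document's word pairs into one edge list, build the full network, and tabulate the religious-purpose flag used to split the sample (a sketch; the object names m and g are illustrative):

m <- bind_rows( g.list )   # combine all word-pair data frames
dim( m )
g <- graph.edgelist( as.matrix( m ), directed=FALSE )
summary( g )
table( mission$ORGPURPOSERELIGIOUS )   # group sizes for the comparison below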
## [1] 793658 2
## IGRAPH 57993a6 UN-- 4826 793658 --
## + attr: name (v/c)
##
## 0 1
## 2961 447
# word pairs from missions with a religious purpose
g.list.1 <- g.list[ mission$ORGPURPOSERELIGIOUS == 1 ]
m1 <- bind_rows( g.list.1 )
length( g.list.1 )
## [1] 447
# word pairs from all other missions
g.list.2 <- g.list[ mission$ORGPURPOSERELIGIOUS == 0 ]
m2 <- bind_rows( g.list.2 )
length( g.list.2 )
## [1] 2961
g.religious.yes <- graph.edgelist( as.matrix(m1), directed=FALSE )
g.religious.no <- graph.edgelist( as.matrix(m2), directed=FALSE )
summary( g.religious.yes )
## IGRAPH 5801fef UN-- 1452 104179 --
## + attr: name (v/c)
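summary( g.religious.no )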
## IGRAPH 582dea7 UN-- 4438 689479 --
## + attr: name (v/c)
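The same word pair can occur in many different missions, so these graphs contain duplicate edges. The lower edge counts in the next two summaries are consistent with collapsing duplicates (and any self-loops from repeated words), a step sketched here with simplify():

g.religious.yes <- simplify( g.religious.yes )
g.religious.no  <- simplify( g.religious.no )
summary( g.religious.yes )
summary( g.religious.no )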
## IGRAPH 584eb40 UN-- 1452 45910 --
## + attr: name (v/c)
## IGRAPH 58634a7 UN-- 4438 211114 --
## + attr: name (v/c)
g.religious.yes.unique <- g.religious.yes %m% g.religious.no
# remove isolates
g.religious.yes.unique <- delete.vertices( g.religious.yes.unique,
degree(g.religious.yes.unique)==0 )
summary( g.religious.yes.unique )
## IGRAPH 587485f UN-- 1449 31804 --
## + attr: name (v/c)
plot( g.religious.yes.unique,
      layout = layout.kamada.kawai( g.religious.yes.unique ),
      edge.arrow.size = 0.4,
      vertex.size = 10,
      vertex.label.cex = 1.5,
      vertex.frame.color = "gray",
      vertex.color = "gray" )
title( main="Unique Word Ties Religious Nonprofits" )
These need to be filtered using a probabilistic method that identifies only the ties that are statistically meaningful within each group, i.e. ties that would not occur by chance.
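One crude sketch of such a filter keeps only ties that recur across several missions; the threshold k and the object names strong.ties and g.strong are illustrative stand-ins for a formal test of co-occurrence against chance:

# count how many missions contain each word pair, then keep the recurrent ones
k <- 5
strong.ties <- m1 %>%
  count( from, to, name = "n" ) %>%
  filter( n >= k )
g.strong <- graph.edgelist( as.matrix( strong.ties[ c("from","to") ] ), directed=FALSE )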
# sample 100 words so the interactive plot stays readable
g1 <- induced_subgraph( g.religious.yes.unique,
                        sample( 1:length( V(g.religious.yes.unique) ), 100 ) )
# remove isolates
g1 <- delete.vertices( g1, degree(g1)==0 )
simpleNetwork( as_data_frame( g1 ),
               linkDistance = 70, charge = -50,
               fontSize = 10, fontFamily = "serif",
               linkColour = "#D3D3D3",
               zoom = TRUE )
# ties unique to the non-religious group
g.religious.no.unique <- g.religious.no %m% g.religious.yes
# remove isolates
g.religious.no.unique <- delete.vertices( g.religious.no.unique,
                                          degree(g.religious.no.unique)==0 )
# sample 100 words so the interactive plot stays readable
g2 <- induced_subgraph( g.religious.no.unique,
                        sample( 1:length( V(g.religious.no.unique) ), 100 ) )
# remove isolates
g2 <- delete.vertices( g2, degree(g2)==0 )
g2 <- simplify( g2 )
simpleNetwork( as_data_frame( g2 ),
               linkDistance = 70, charge = -50,
               fontSize = 10, fontFamily = "serif",
               linkColour = "#D3D3D3",
               zoom = TRUE )