In this first script we will compile all raw data and create three separate files:

NONPROFIT-ADDRESSES-2014-2019.rds: Data file with complete nonprofit data
NONPROFIT-2014-2019.rds: Nonprofit dataset with addressses and other essential information
PEOPLE-2014-2019.rds: Board member dataset with addresses and other essential information

Files 2 and 3 will then be enhanced with geolocation data.

Packages

library( dplyr )
library( tidyr )
library( gender )
library( pander )
library( readxl ) # package to read excel, install with: install.packages("readxl")

#update the path with your working directory:
wd <- "/Users/icps86/Dropbox/R Projects/Open_data_ignacio"
setwd(wd)

1. Downloading raw data and creating a comprenhensive dataset file

Data files have been previously manually downloaded from the following address under the heading “Files by Year”:

https://www.irs.gov/charities-non-profits/exempt-organizations-form-1023ez-approvals

Standard download format is .xlsx and the data is provided by year.

1.1. Transforming xlsx files into rds.

Loading 2014-2019 data and transforming it into rds files.

dd1 <- read_excel("Data/1_Raw/f1023ez_approvals_2014.xlsx")
dd1$EIN <- gsub( "-", "", dd1$EIN )
dd1$ID <- paste0( "ID-", 2014, "-", dd1$EIN )
saveRDS( dd1, "Data/1_Raw/f1023ez_approvals_2014.rds")

dd2 <- read_excel("Data/1_Raw/f1023ez_approvals_2015.xlsx")
dd2$EIN <- gsub( "-", "", dd2$EIN )
dd2$ID <- paste0( "ID-", 2015, "-", dd2$EIN )
saveRDS( dd2, "Data/1_Raw/f1023ez_approvals_2015.rds")

dd3 <- read_excel("Data/1_Raw/f1023ez_approvals_2016.xlsx")
dd3$EIN <- gsub( "-", "", dd3$EIN )
dd3$ID <- paste0( "ID-", 2016, "-", dd3$EIN )
saveRDS( dd3, "Data/1_Raw/f1023ez_approvals_2016.rds")

dd4 <- read_excel("Data/1_Raw/f1023ez_approvals_2017.xlsx")
dd4$EIN <- gsub( "-", "", dd4$EIN )
dd4$ID <- paste0( "ID-", 2017, "-", dd4$EIN )
saveRDS( dd4, "Data/1_Raw/f1023ez_approvals_2017.rds")

dd5 <- read_excel("Data/1_Raw/f1023ez_approvals_2018.xlsx")
dd5$EIN <- gsub( "-", "", dd5$EIN )
dd5$ID <- paste0( "ID-", 2018, "-", dd5$EIN )
saveRDS( dd5, "Data/1_Raw/f1023ez_approvals_2018.rds")

dd6 <- read_excel("Data/1_Raw/f1023ez_approvals_2019.xlsx")
colnames(dd6)[1] <- "EIN" # NOTE that this file has "Ein" -> fixing it to "EIN" 
dd6$EIN <- gsub( "-", "", dd6$EIN )
dd6$ID <- paste0( "ID-", 2019, "-", dd6$EIN )
saveRDS( dd6, "Data/1_Raw/f1023ez_approvals_2019.rds")

1.2. Binding all rds files into a a single comprenhensive rds

Before we bind, we need to adjust some variables. All variables remain consistent until 2018, where some additional variables are added.

# 2018 vs 2017, The elements of setdiff(x,y) are those elements in x but not in y
setdiff( names( dd5 ), names( dd4 ) )

## [1] "Gamingactyno...91"   "Gamingactyyes...92"  "Mission"            
## [4] "Gamingactyno...103"  "Gamingactyyes...104" "HospitalOrChurchNo" 
## [7] "HospitalOrChurchYes" "EZVersionNumber"

setdiff( names( dd4 ), names( dd5 ) )

## [1] "Gamingactyno"  "Gamingactyyes"

2018 datafile has duplicated “Gamingactyno” and “Gamingactyyes” columns, note that the excel inport process has added the column number to the name to avoid duplicate var names. In addition, we have four new variables:

“Mission”
“HospitalOrChurchNo”
“HospitalOrChurchYes”
“EZVersionNumber”

“Gamingactyno” and “Gamingactyyes” in 2018 datafile both have identical data.

identical(dd5$Gamingactyno...103, dd5$Gamingactyno...91)

## [1] TRUE

identical(dd5$Gamingactyyes...104, dd5$Gamingactyyes...92)

## [1] TRUE

2019 has the same additional variables as 2018

# 2019 vs 2017
setdiff( names( dd6 ), names( dd4 ) )

## [1] "Gamingactyno...91"   "Gamingactyyes...92"  "Mission"            
## [4] "Gamingactyno...103"  "Gamingactyyes...104" "HospitalOrChurchNo" 
## [7] "HospitalOrChurchYes" "EZVersionNumber"

setdiff( names( dd4 ), names( dd6 ) )

## [1] "Gamingactyno"  "Gamingactyyes"

However, in 2019 the duplicated variables hold different values.

identical(dd6$Gamingactyno...103, dd6$Gamingactyno...91)

## [1] FALSE

x <- dd6$Gamingactyno...103 == dd6$Gamingactyno...91

11559 discrepancies and 44587 identical values between the variales.
0 NAs in Gamingactyno…91 variable and 0 NAs in Gamingactyno…103 variable

identical(dd6$Gamingactyyes...104, dd6$Gamingactyyes...92)

## [1] FALSE

x <- dd6$Gamingactyyes...104 == dd6$Gamingactyyes...92

0 discrepancies and 44587 identical values between the variales
0 NAs in Gamingactyyes…92 variable and 11559 NAs in Gamingactyyes…104 variable

For binding with the data of previous years, the name of one of the duplicated columns will be set back to its original name (removing the column #). We will leave the duplicated variables present in 2018 and 2019 data and will rename them with a “.1” at the end.

colnames(dd5)[91] <- "Gamingactyno"
colnames(dd5)[92] <- "Gamingactyyes"
colnames(dd5)[103] <- "Gamingactyno.1"
colnames(dd5)[104] <- "Gamingactyyes.1"

colnames(dd6)[91] <- "Gamingactyno"
colnames(dd6)[92] <- "Gamingactyyes"
colnames(dd6)[103] <- "Gamingactyno.1"
colnames(dd6)[104] <- "Gamingactyyes.1"

Now, binding all years into one single database

dat <- bind_rows( dd1, dd2, dd3, dd4, dd5, dd6 )

1.3. Fixing some variables and saving 2016-2019 dataset rds file

Data set has no gaps in the id variables

EIN variable has 0 NAs
Case.Number variable has 0 NAs

Current dataset has two variables for ORGNAME and they seem to be a single name split into the two vars.

head(dat[,c("Orgname1", "Orgname2")],5)

Orgname1 has 0 NAs, while Orgname2 has 200780, NAs.

0 blanks in Orgname1" and 0 blanks in Orgname2

Looking further check to make sure the variables work when combined:

x <- dat[,c("Orgname1", "Orgname2")] #subsetting only orgnames to compare
x <- x[!is.na(x$Orgname2),] #removing NAs from Orgname2
x$Org2len <- nchar(as.character(x$Orgname2))
x <- x[order(x$Org2len, decreasing = T),]
head(x, 10)

Merging Orgname 1 and 2 to create variable ORGNAME

These are some orgname1 values before merging data in orgname2

dat$ORGNAME <- dat$Orgname1
x <- is.na(dat$Orgname2)
x <- dat[!x,c("Orgname1","Orgname2")]
x[1:10,]

After merging

x <- is.na(dat$Orgname2)
dat$ORGNAME[!x] <- paste0(dat$ORGNAME[!x], dat$Orgname2[!x])
x <- dat[!x,"ORGNAME"]
x[1:10,]

Some minor changes to variables and removing redundant information

colnames(dat)[2] <- "Case.Number"
dat <- unique(dat)
dat <- as_tibble(dat)
saveRDS( dat, "Data/2_InputData/NONPROFIT-ADDRESSES-2014-2019.rds")

The dataset has 263272 nonprofits, and 112 variables.

head(dat, 10)

2. Building a Nonprofit (NPO) Addresses dataset

Using the NONPROFIT-ADDRESSES-2014-2019.rds dataset to make a new file with only the NPO data necessary.

keep.these <- 
  c( "ORGNAME","ID","Mission",
     "EIN","Orgname1", "Orgname2",
     "Case.Number", "Formrevision", "Eligibilityworksheet", 
     "Address", "City", "State", "Zip", "Zippl4", 
     "Accountingperiodend", "Userfeesubmitted",
     "Orgurl",
     "Orgtypecorp", "Orgtypeunincorp", "Orgtypetrust", 
    "Necessaryorgdocs", "Incorporateddate", "Incorporatedstate", 
    "Containslimitation", "Doesnotexpresslyempower", "Containsdissolution", 
    "Nteecode", "Orgpurposecharitable", "Orgpurposereligious", 
    "Orgpurposeeducational", "Orgpurposescientific", 
    "Orgpurposeliterary", "Orgpurposepublicsafety", 
    "Orgpurposeamateursports", "Orgpurposecrueltyprevention", 
    "Qualifyforexemption","Leginflno", "Leginflyes", 
    "Compofcrdirtrustno", "Compofcrdirtrustyes", 
    "Donatefundsno", "Donatefundsyes", "Conductactyoutsideusno", 
    "Conductactyoutsideusyes", "Financialtransofcrsno", 
    "Financialtransofcrsyes", "Unrelgrossincm1000moreno", 
    "Unrelgrossincm1000moreyes", "Gamingactyno", 
    "Gamingactyyes", "Disasterreliefno", "Disasterreliefyes", 
    "Onethirdsupportpublic", "Onethirdsupportgifts", 
    "Benefitofcollege", "Privatefoundation508e", 
    "Seekingretroreinstatement", "Seekingsec7reinstatement", 
    "Gamingactyno.1", "Gamingactyyes.1", 
    "HospitalOrChurchNo", "HospitalOrChurchYes", 
    "Correctnessdeclaration", "Signaturename", 
    "Signaturetitle", "Signaturedate",    
    "EZVersionNumber" )

These will be dropped:

setdiff( names(dat), keep.these )

##  [1] "Primarycontactname"      "Primarycontactphone"    
##  [3] "Primarycontactphoneext"  "Primarycontactfax"      
##  [5] "Ofcrdirtrust1firstname"  "Ofcrdirtrust1lastname"  
##  [7] "Ofcrdirtrust1title"      "Ofcrdirtrust1streetaddr"
##  [9] "Ofcrdirtrust1city"       "Ofcrdirtrust1state"     
## [11] "Ofcrdirtrust1zip"        "Ofcrdirtrust1zippl4"    
## [13] "Ofcrdirtrust2firstname"  "Ofcrdirtrust2lastname"  
## [15] "Ofcrdirtrust2title"      "Ofcrdirtrust2streetaddr"
## [17] "Ofcrdirtrust2city"       "Ofcrdirtrust2state"     
## [19] "Ofcrdirtrust2zip"        "Ofcrdirtrust2zippl4"    
## [21] "Ofcrdirtrust3firstname"  "Ofcrdirtrust3lastname"  
## [23] "Ofcrdirtrust3title"      "Ofcrdirtrust3streetaddr"
## [25] "Ofcrdirtrust3city"       "Ofcrdirtrust3state"     
## [27] "Ofcrdirtrust3zip"        "Ofcrdirtrust3zippl4"    
## [29] "Ofcrdirtrust4firstname"  "Ofcrdirtrust4lastname"  
## [31] "Ofcrdirtrust4title"      "Ofcrdirtrust4streetaddr"
## [33] "Ofcrdirtrust4city"       "Ofcrdirtrust4state"     
## [35] "Ofcrdirtrust4zip"        "Ofcrdirtrust4zippl4"    
## [37] "Ofcrdirtrust5firstname"  "Ofcrdirtrust5lastname"  
## [39] "Ofcrdirtrust5title"      "Ofcrdirtrust5streetaddr"
## [41] "Ofcrdirtrust5city"       "Ofcrdirtrust5state"     
## [43] "Ofcrdirtrust5zip"        "Ofcrdirtrust5zippl4"    
## [45] "Orgemail"

npo <- select( dat, keep.these )

Adding a Year var, and removing redundant data

#adding YR var
x <- npo$ID
npo$YR <- substr(x, start = 4, stop = 7)

#ordering variables
npo <- npo[,c(2,1,4,68,3,5:67)]

#removing duplicates
npo <- unique(npo) #from 265,220 to 263,272
rownames(npo) <- NULL

We have 263272 cases with ID, of which 263138 are unique.

Exploring duplicates: how many? and how many repetitions?

id.count <- as.data.frame(table(npo$ID))
id.count <- id.count[order(id.count$Freq, decreasing = T),]
id.count$Var1 <- as.character(id.count$Var1)
rownames(id.count) <- NULL
names(id.count) <- c("ID", "IDdup")
head(id.count) #some IDs are repeated

#joining IDdup to the dataset
npo <- left_join(npo, id.count, by = "ID")

#subsetting those with duplicates
x <- npo$IDdup > 1
dups <- npo[x,] 
dups <- dups[order(dups$IDdup, decreasing = T),c(1,69,2:68)]

The dataset had ID-2019-123456789 repeated 6 times with different npo information The EIN looks made up (1-2-3-4…)

dups[1:6,c(1,3,7,8)]

ID-2019-833992638 is repeated 3 times, with different npo information.

dups[7:9,c(1,3,7,8)]

The rest are duplicates with what appears the same npo information but different Case.Number and Signaturedate.

Take, for example, cases 10 and 11, they differ in those variables:

#exploring where is the difference
Var <- names(dups)
Case1 <- as.character(dups[10,])
Case2 <- as.character(dups[11,])
IsitEqual <- Case1 == Case2 
tb <- data_frame(Var,Case1,Case2,IsitEqual)
tb[IsitEqual %in% FALSE,]

Or case 12 and 13:

Var <- names(dups)
Case1 <- as.character(dups[12,])
Case2 <- as.character(dups[13,])
IsitEqual <- Case1 == Case2
tb <- data_frame(Var,Case1,Case2,IsitEqual)
tb[IsitEqual %in% FALSE,]

Given ID is not unique, we will create a key variable that is unique

#Adding a key variable
npo <- npo[order(npo$ID),]
npo$key <- 1:nrow(npo)

#rearranging columns
npo <- npo[,c(1,70,2:69)]

Saving rds file

#saving
npo <- as_tibble(npo)
saveRDS( npo, "Data/2_InputData/NONPROFITS-2014-2019.rds")

The dataset has 263272 nonprofits, and 70 variables.

head(npo, 10)

3. Building a Board Members (PPL) Addresses dataset

Using the NONPROFIT-ADDRESSES-2014-2019.rds dataset to create a new dataset with key Board Members information.

Code chunk to upload the rds file - in case required

NONPROFIT-ADDRESSES-2014-2019.rds dataset is structured with a nonprofit for each row, holding informatino for five NPO members in mutiple columns. In the following code chunk we are reshaping that dataset so that each row is a board member with a unique ID.

Current ID variable for NPOs is ID-YEAR-EIN. We need to create a new ID variable which is unique for each board member in the same org. To make this ID variable we will add a count for each member, in the order reported on the 1023 forms.

Like this: ID-YEAR-EIN-##

This ID will be used when geo-coding as reference to re-join the geocoded data back to the original data.

#first member listed in the npos
d1 <- 
  dat %>%
  select( ID, ORGNAME, EIN, Signaturedate, Case.Number,  
          Ofcrdirtrust1firstname, Ofcrdirtrust1lastname, 
          Ofcrdirtrust1title, Ofcrdirtrust1streetaddr, 
          Ofcrdirtrust1city, Ofcrdirtrust1state, 
          Ofcrdirtrust1zip, Ofcrdirtrust1zippl4 ) 

#standardizing the variable names for binding
nmz <- names(d1)
nmz <- gsub( "Ofcrdirtrust[1-9]", "Ofcrdirtrust", nmz )
names( d1 ) <- nmz

#adding board member #
d1$ID <- paste0( d1$ID, "-01" )

#second member listed in the npos
d2 <- 
  dat %>% 
  select( ID, ORGNAME, EIN, Signaturedate, Case.Number,
          Ofcrdirtrust2firstname, Ofcrdirtrust2lastname, 
          Ofcrdirtrust2title, Ofcrdirtrust2streetaddr, 
          Ofcrdirtrust2city, Ofcrdirtrust2state, 
          Ofcrdirtrust2zip, Ofcrdirtrust2zippl4 )
nmz <- names(d2)
nmz <- gsub( "Ofcrdirtrust[1-9]", "Ofcrdirtrust", nmz )
names( d2 ) <- nmz
d2$ID <- paste0( d2$ID, "-02" )

#third member listed in the npos
d3 <- 
  dat %>% 
  select( ID, ORGNAME, EIN, Signaturedate, Case.Number,
          Ofcrdirtrust3firstname, Ofcrdirtrust3lastname, 
          Ofcrdirtrust3title, Ofcrdirtrust3streetaddr, 
          Ofcrdirtrust3city, Ofcrdirtrust3state,
          Ofcrdirtrust3zip, Ofcrdirtrust3zippl4 )
nmz <- names(d3)
nmz <- gsub( "Ofcrdirtrust[1-9]", "Ofcrdirtrust", nmz )
names( d3 ) <- nmz
d3$ID <- paste0( d3$ID, "-03" )

#fourth member listed in the npos
d4 <- 
  dat %>% 
  select( ID, ORGNAME, EIN, Signaturedate, Case.Number, 
          Ofcrdirtrust4firstname, Ofcrdirtrust4lastname, 
          Ofcrdirtrust4title, Ofcrdirtrust4streetaddr, 
          Ofcrdirtrust4city, Ofcrdirtrust4state, 
          Ofcrdirtrust4zip, Ofcrdirtrust4zippl4 )
nmz <- names(d4)
nmz <- gsub( "Ofcrdirtrust[1-9]", "Ofcrdirtrust", nmz )
names( d4 ) <- nmz
d4$ID <- paste0( d4$ID, "-04" )

#fifth member listed in the npos
d5 <- 
  dat %>% 
  select( ID, ORGNAME, EIN, Signaturedate, Case.Number, 
          Ofcrdirtrust5firstname, Ofcrdirtrust5lastname, 
          Ofcrdirtrust5title, Ofcrdirtrust5streetaddr, 
          Ofcrdirtrust5city, Ofcrdirtrust5state, 
          Ofcrdirtrust5zip, Ofcrdirtrust5zippl4 )
nmz <- names(d5)
nmz <- gsub( "Ofcrdirtrust[1-9]", "Ofcrdirtrust", nmz )
names( d5 ) <- nmz
d5$ID <- paste0( d5$ID, "-05" )

#binding all data
ppl <- bind_rows( d1, d2, d3, d4, d5 ) #all people data as individual cases.

Dropping empty cases and redundant data, adding a YR var, changing some var names and arranging the order

# identifying empty cases (NAs in all fields)
x <- is.na(ppl$Ofcrdirtrustfirstname) & 
           is.na(ppl$Ofcrdirtrustlastname) & 
           is.na(ppl$Ofcrdirtrusttitle) & 
           is.na(ppl$Ofcrdirtruststreetaddr) & 
           is.na(ppl$Ofcrdirtrustcity) & 
           is.na(ppl$Ofcrdirtruststate)

# removing empty cases
ppl <- ppl[!x, ] 

# removing duplicated data
ppl <- unique(ppl)
rownames(ppl) <- NULL

# adding YR var
x <- ppl$ID
ppl$YR <- substr(x, start = 4, stop = 7)
pander(table(ppl$YR))

2014	2015	2016	2017	2018	2019
56945	155945	172335	175196	192512	193160

# Renaming variables to more friendly names
names(ppl)

##  [1] "ID"                     "ORGNAME"                "EIN"                   
##  [4] "Signaturedate"          "Case.Number"            "Ofcrdirtrustfirstname" 
##  [7] "Ofcrdirtrustlastname"   "Ofcrdirtrusttitle"      "Ofcrdirtruststreetaddr"
## [10] "Ofcrdirtrustcity"       "Ofcrdirtruststate"      "Ofcrdirtrustzip"       
## [13] "Ofcrdirtrustzippl4"     "YR"

x <- c("Firstname", "Lastname", "Title", "Address", "City", "State", "Zip", "Zippl4")
names(ppl)[6:13] <- x
names(ppl)

##  [1] "ID"            "ORGNAME"       "EIN"           "Signaturedate"
##  [5] "Case.Number"   "Firstname"     "Lastname"      "Title"        
##  [9] "Address"       "City"          "State"         "Zip"          
## [13] "Zippl4"        "YR"

#arranging order
ppl <- arrange( ppl, ID ) #ordering by ID (all board members of the same org together)

Now we will add a Gender variable. We use the Gender package, which estimates the gender of names using historicl data.

library(gender)
#creating a unique list of first names
first.names <- unique( ppl$Firstname )
gender.codes <- gender(first.names)

#selecting relevant variables
gender.codes <- select( gender.codes, name, gender, proportion_male )
head(gender.codes)

#merging gender var to dataset
ppl <- left_join( ppl, gender.codes, by = c("Firstname" = "name"))

x <- round(prop.table(table( ppl$gender, useNA="ifany" )),2)
pander(x)

either	female	male	NA
0	0.49	0.42	0.09

Ordering variables

#ordering variables
nmz <- c("ID",
         "ORGNAME",
         "EIN",
         "YR",
         "Signaturedate",
         "Case.Number",
         "Firstname",
         "Lastname",
         "Title",
         "Address",
         "City",
         "State",
         "Zip",
         "Zippl4",
         "gender",
         "proportion_male") 

ppl <- ppl[,nmz]

Checking to see that ID variable is unique…

id.count <- as.data.frame(table(ppl$ID))
id.count <- id.count[order(id.count$Freq, decreasing = T),]
head(id.count) #some IDs are repeated

#making key variable =
names(id.count) <- c("ID", "IDdup")
id.count$ID <- as.character(id.count$ID)

#joining count
ppl <- left_join(ppl, id.count, by = "ID")

#subsetting those with duplicates
x <- ppl$IDdup > 1
dups <- ppl[x,] 
names(dups)

##  [1] "ID"              "ORGNAME"         "EIN"             "YR"             
##  [5] "Signaturedate"   "Case.Number"     "Firstname"       "Lastname"       
##  [9] "Title"           "Address"         "City"            "State"          
## [13] "Zip"             "Zippl4"          "gender"          "proportion_male"
## [17] "IDdup"

dups <- dups[order(dups$IDdup, decreasing = T),c(1,17,2:16)]