In this first script we will compile all raw data and create three separate files:
Files 2 and 3 will then be enhanced with geolocation data.
Data files have been previously manually downloaded from the following address under the heading “Files by Year”:
https://www.irs.gov/charities-non-profits/exempt-organizations-form-1023ez-approvals
Standard download format is .xlsx and the data is provided by year.
Loading 2014-2019 data and transforming it into rds files.
dd1 <- read_excel("Data/1_Raw/f1023ez_approvals_2014.xlsx")
dd1$EIN <- gsub( "-", "", dd1$EIN )
dd1$ID <- paste0( "ID-", 2014, "-", dd1$EIN )
saveRDS( dd1, "Data/1_Raw/f1023ez_approvals_2014.rds")
dd2 <- read_excel("Data/1_Raw/f1023ez_approvals_2015.xlsx")
dd2$EIN <- gsub( "-", "", dd2$EIN )
dd2$ID <- paste0( "ID-", 2015, "-", dd2$EIN )
saveRDS( dd2, "Data/1_Raw/f1023ez_approvals_2015.rds")
dd3 <- read_excel("Data/1_Raw/f1023ez_approvals_2016.xlsx")
dd3$EIN <- gsub( "-", "", dd3$EIN )
dd3$ID <- paste0( "ID-", 2016, "-", dd3$EIN )
saveRDS( dd3, "Data/1_Raw/f1023ez_approvals_2016.rds")
dd4 <- read_excel("Data/1_Raw/f1023ez_approvals_2017.xlsx")
dd4$EIN <- gsub( "-", "", dd4$EIN )
dd4$ID <- paste0( "ID-", 2017, "-", dd4$EIN )
saveRDS( dd4, "Data/1_Raw/f1023ez_approvals_2017.rds")
dd5 <- read_excel("Data/1_Raw/f1023ez_approvals_2018.xlsx")
dd5$EIN <- gsub( "-", "", dd5$EIN )
dd5$ID <- paste0( "ID-", 2018, "-", dd5$EIN )
saveRDS( dd5, "Data/1_Raw/f1023ez_approvals_2018.rds")
dd6 <- read_excel("Data/1_Raw/f1023ez_approvals_2019.xlsx")
colnames(dd6)[1] <- "EIN" # NOTE that this file has "Ein" -> fixing it to "EIN"
dd6$EIN <- gsub( "-", "", dd6$EIN )
dd6$ID <- paste0( "ID-", 2019, "-", dd6$EIN )
saveRDS( dd6, "Data/1_Raw/f1023ez_approvals_2019.rds")
Before we bind, we need to adjust some variables. All variables remain consistent until 2018, where some additional variables are added.
# 2018 vs 2017, The elements of setdiff(x,y) are those elements in x but not in y
setdiff( names( dd5 ), names( dd4 ) )
## [1] "Gamingactyno...91" "Gamingactyyes...92" "Mission"
## [4] "Gamingactyno...103" "Gamingactyyes...104" "HospitalOrChurchNo"
## [7] "HospitalOrChurchYes" "EZVersionNumber"
## [1] "Gamingactyno" "Gamingactyyes"
2018 datafile has duplicated “Gamingactyno” and “Gamingactyyes” columns, note that the excel inport process has added the column number to the name to avoid duplicate var names. In addition, we have four new variables:
“Gamingactyno” and “Gamingactyyes” in 2018 datafile both have identical data.
## [1] TRUE
## [1] TRUE
2019 has the same additional variables as 2018
## [1] "Gamingactyno...91" "Gamingactyyes...92" "Mission"
## [4] "Gamingactyno...103" "Gamingactyyes...104" "HospitalOrChurchNo"
## [7] "HospitalOrChurchYes" "EZVersionNumber"
## [1] "Gamingactyno" "Gamingactyyes"
However, in 2019 the duplicated variables hold different values.
## [1] FALSE
## [1] FALSE
For binding with the data of previous years, the name of one of the duplicated columns will be set back to its original name (removing the column #). We will leave the duplicated variables present in 2018 and 2019 data and will rename them with a “.1” at the end.
colnames(dd5)[91] <- "Gamingactyno"
colnames(dd5)[92] <- "Gamingactyyes"
colnames(dd5)[103] <- "Gamingactyno.1"
colnames(dd5)[104] <- "Gamingactyyes.1"
colnames(dd6)[91] <- "Gamingactyno"
colnames(dd6)[92] <- "Gamingactyyes"
colnames(dd6)[103] <- "Gamingactyno.1"
colnames(dd6)[104] <- "Gamingactyyes.1"
Now, binding all years into one single database
Data set has no gaps in the id variables
Current dataset has two variables for ORGNAME and they seem to be a single name split into the two vars.
Orgname1 has 0 NAs, while Orgname2 has 200780, NAs.
0 blanks in Orgname1" and 0 blanks in Orgname2
Looking further check to make sure the variables work when combined:
x <- dat[,c("Orgname1", "Orgname2")] #subsetting only orgnames to compare
x <- x[!is.na(x$Orgname2),] #removing NAs from Orgname2
x$Org2len <- nchar(as.character(x$Orgname2))
x <- x[order(x$Org2len, decreasing = T),]
head(x, 10)
Merging Orgname 1 and 2 to create variable ORGNAME
These are some orgname1 values before merging data in orgname2
After merging
x <- is.na(dat$Orgname2)
dat$ORGNAME[!x] <- paste0(dat$ORGNAME[!x], dat$Orgname2[!x])
x <- dat[!x,"ORGNAME"]
x[1:10,]
Some minor changes to variables and removing redundant information
colnames(dat)[2] <- "Case.Number"
dat <- unique(dat)
dat <- as_tibble(dat)
saveRDS( dat, "Data/2_InputData/NONPROFIT-ADDRESSES-2014-2019.rds")
The dataset has 263272 nonprofits, and 112 variables.
Using the NONPROFIT-ADDRESSES-2014-2019.rds dataset to make a new file with only the NPO data necessary.
keep.these <-
c( "ORGNAME","ID","Mission",
"EIN","Orgname1", "Orgname2",
"Case.Number", "Formrevision", "Eligibilityworksheet",
"Address", "City", "State", "Zip", "Zippl4",
"Accountingperiodend", "Userfeesubmitted",
"Orgurl",
"Orgtypecorp", "Orgtypeunincorp", "Orgtypetrust",
"Necessaryorgdocs", "Incorporateddate", "Incorporatedstate",
"Containslimitation", "Doesnotexpresslyempower", "Containsdissolution",
"Nteecode", "Orgpurposecharitable", "Orgpurposereligious",
"Orgpurposeeducational", "Orgpurposescientific",
"Orgpurposeliterary", "Orgpurposepublicsafety",
"Orgpurposeamateursports", "Orgpurposecrueltyprevention",
"Qualifyforexemption","Leginflno", "Leginflyes",
"Compofcrdirtrustno", "Compofcrdirtrustyes",
"Donatefundsno", "Donatefundsyes", "Conductactyoutsideusno",
"Conductactyoutsideusyes", "Financialtransofcrsno",
"Financialtransofcrsyes", "Unrelgrossincm1000moreno",
"Unrelgrossincm1000moreyes", "Gamingactyno",
"Gamingactyyes", "Disasterreliefno", "Disasterreliefyes",
"Onethirdsupportpublic", "Onethirdsupportgifts",
"Benefitofcollege", "Privatefoundation508e",
"Seekingretroreinstatement", "Seekingsec7reinstatement",
"Gamingactyno.1", "Gamingactyyes.1",
"HospitalOrChurchNo", "HospitalOrChurchYes",
"Correctnessdeclaration", "Signaturename",
"Signaturetitle", "Signaturedate",
"EZVersionNumber" )
These will be dropped:
## [1] "Primarycontactname" "Primarycontactphone"
## [3] "Primarycontactphoneext" "Primarycontactfax"
## [5] "Ofcrdirtrust1firstname" "Ofcrdirtrust1lastname"
## [7] "Ofcrdirtrust1title" "Ofcrdirtrust1streetaddr"
## [9] "Ofcrdirtrust1city" "Ofcrdirtrust1state"
## [11] "Ofcrdirtrust1zip" "Ofcrdirtrust1zippl4"
## [13] "Ofcrdirtrust2firstname" "Ofcrdirtrust2lastname"
## [15] "Ofcrdirtrust2title" "Ofcrdirtrust2streetaddr"
## [17] "Ofcrdirtrust2city" "Ofcrdirtrust2state"
## [19] "Ofcrdirtrust2zip" "Ofcrdirtrust2zippl4"
## [21] "Ofcrdirtrust3firstname" "Ofcrdirtrust3lastname"
## [23] "Ofcrdirtrust3title" "Ofcrdirtrust3streetaddr"
## [25] "Ofcrdirtrust3city" "Ofcrdirtrust3state"
## [27] "Ofcrdirtrust3zip" "Ofcrdirtrust3zippl4"
## [29] "Ofcrdirtrust4firstname" "Ofcrdirtrust4lastname"
## [31] "Ofcrdirtrust4title" "Ofcrdirtrust4streetaddr"
## [33] "Ofcrdirtrust4city" "Ofcrdirtrust4state"
## [35] "Ofcrdirtrust4zip" "Ofcrdirtrust4zippl4"
## [37] "Ofcrdirtrust5firstname" "Ofcrdirtrust5lastname"
## [39] "Ofcrdirtrust5title" "Ofcrdirtrust5streetaddr"
## [41] "Ofcrdirtrust5city" "Ofcrdirtrust5state"
## [43] "Ofcrdirtrust5zip" "Ofcrdirtrust5zippl4"
## [45] "Orgemail"
Adding a Year var, and removing redundant data
#adding YR var
x <- npo$ID
npo$YR <- substr(x, start = 4, stop = 7)
#ordering variables
npo <- npo[,c(2,1,4,68,3,5:67)]
#removing duplicates
npo <- unique(npo) #from 265,220 to 263,272
rownames(npo) <- NULL
We have 263272 cases with ID, of which 263138 are unique.
Exploring duplicates: how many? and how many repetitions?
id.count <- as.data.frame(table(npo$ID))
id.count <- id.count[order(id.count$Freq, decreasing = T),]
id.count$Var1 <- as.character(id.count$Var1)
rownames(id.count) <- NULL
names(id.count) <- c("ID", "IDdup")
head(id.count) #some IDs are repeated
#joining IDdup to the dataset
npo <- left_join(npo, id.count, by = "ID")
#subsetting those with duplicates
x <- npo$IDdup > 1
dups <- npo[x,]
dups <- dups[order(dups$IDdup, decreasing = T),c(1,69,2:68)]
The dataset had ID-2019-123456789 repeated 6 times with different npo information The EIN looks made up (1-2-3-4…)
ID-2019-833992638 is repeated 3 times, with different npo information.
The rest are duplicates with what appears the same npo information but different Case.Number and Signaturedate.
Take, for example, cases 10 and 11, they differ in those variables:
#exploring where is the difference
Var <- names(dups)
Case1 <- as.character(dups[10,])
Case2 <- as.character(dups[11,])
IsitEqual <- Case1 == Case2
tb <- data_frame(Var,Case1,Case2,IsitEqual)
tb[IsitEqual %in% FALSE,]
Or case 12 and 13:
Var <- names(dups)
Case1 <- as.character(dups[12,])
Case2 <- as.character(dups[13,])
IsitEqual <- Case1 == Case2
tb <- data_frame(Var,Case1,Case2,IsitEqual)
tb[IsitEqual %in% FALSE,]
Given ID is not unique, we will create a key variable that is unique
#Adding a key variable
npo <- npo[order(npo$ID),]
npo$key <- 1:nrow(npo)
#rearranging columns
npo <- npo[,c(1,70,2:69)]
Saving rds file
The dataset has 263272 nonprofits, and 70 variables.
Using the NONPROFIT-ADDRESSES-2014-2019.rds dataset to create a new dataset with key Board Members information.
Code chunk to upload the rds file - in case required
NONPROFIT-ADDRESSES-2014-2019.rds dataset is structured with a nonprofit for each row, holding informatino for five NPO members in mutiple columns. In the following code chunk we are reshaping that dataset so that each row is a board member with a unique ID.
Current ID variable for NPOs is ID-YEAR-EIN. We need to create a new ID variable which is unique for each board member in the same org. To make this ID variable we will add a count for each member, in the order reported on the 1023 forms.
Like this: ID-YEAR-EIN-##
This ID will be used when geo-coding as reference to re-join the geocoded data back to the original data.
#first member listed in the npos
d1 <-
dat %>%
select( ID, ORGNAME, EIN, Signaturedate, Case.Number,
Ofcrdirtrust1firstname, Ofcrdirtrust1lastname,
Ofcrdirtrust1title, Ofcrdirtrust1streetaddr,
Ofcrdirtrust1city, Ofcrdirtrust1state,
Ofcrdirtrust1zip, Ofcrdirtrust1zippl4 )
#standardizing the variable names for binding
nmz <- names(d1)
nmz <- gsub( "Ofcrdirtrust[1-9]", "Ofcrdirtrust", nmz )
names( d1 ) <- nmz
#adding board member #
d1$ID <- paste0( d1$ID, "-01" )
#second member listed in the npos
d2 <-
dat %>%
select( ID, ORGNAME, EIN, Signaturedate, Case.Number,
Ofcrdirtrust2firstname, Ofcrdirtrust2lastname,
Ofcrdirtrust2title, Ofcrdirtrust2streetaddr,
Ofcrdirtrust2city, Ofcrdirtrust2state,
Ofcrdirtrust2zip, Ofcrdirtrust2zippl4 )
nmz <- names(d2)
nmz <- gsub( "Ofcrdirtrust[1-9]", "Ofcrdirtrust", nmz )
names( d2 ) <- nmz
d2$ID <- paste0( d2$ID, "-02" )
#third member listed in the npos
d3 <-
dat %>%
select( ID, ORGNAME, EIN, Signaturedate, Case.Number,
Ofcrdirtrust3firstname, Ofcrdirtrust3lastname,
Ofcrdirtrust3title, Ofcrdirtrust3streetaddr,
Ofcrdirtrust3city, Ofcrdirtrust3state,
Ofcrdirtrust3zip, Ofcrdirtrust3zippl4 )
nmz <- names(d3)
nmz <- gsub( "Ofcrdirtrust[1-9]", "Ofcrdirtrust", nmz )
names( d3 ) <- nmz
d3$ID <- paste0( d3$ID, "-03" )
#fourth member listed in the npos
d4 <-
dat %>%
select( ID, ORGNAME, EIN, Signaturedate, Case.Number,
Ofcrdirtrust4firstname, Ofcrdirtrust4lastname,
Ofcrdirtrust4title, Ofcrdirtrust4streetaddr,
Ofcrdirtrust4city, Ofcrdirtrust4state,
Ofcrdirtrust4zip, Ofcrdirtrust4zippl4 )
nmz <- names(d4)
nmz <- gsub( "Ofcrdirtrust[1-9]", "Ofcrdirtrust", nmz )
names( d4 ) <- nmz
d4$ID <- paste0( d4$ID, "-04" )
#fifth member listed in the npos
d5 <-
dat %>%
select( ID, ORGNAME, EIN, Signaturedate, Case.Number,
Ofcrdirtrust5firstname, Ofcrdirtrust5lastname,
Ofcrdirtrust5title, Ofcrdirtrust5streetaddr,
Ofcrdirtrust5city, Ofcrdirtrust5state,
Ofcrdirtrust5zip, Ofcrdirtrust5zippl4 )
nmz <- names(d5)
nmz <- gsub( "Ofcrdirtrust[1-9]", "Ofcrdirtrust", nmz )
names( d5 ) <- nmz
d5$ID <- paste0( d5$ID, "-05" )
#binding all data
ppl <- bind_rows( d1, d2, d3, d4, d5 ) #all people data as individual cases.
Dropping empty cases and redundant data, adding a YR var, changing some var names and arranging the order
# identifying empty cases (NAs in all fields)
x <- is.na(ppl$Ofcrdirtrustfirstname) &
is.na(ppl$Ofcrdirtrustlastname) &
is.na(ppl$Ofcrdirtrusttitle) &
is.na(ppl$Ofcrdirtruststreetaddr) &
is.na(ppl$Ofcrdirtrustcity) &
is.na(ppl$Ofcrdirtruststate)
# removing empty cases
ppl <- ppl[!x, ]
# removing duplicated data
ppl <- unique(ppl)
rownames(ppl) <- NULL
# adding YR var
x <- ppl$ID
ppl$YR <- substr(x, start = 4, stop = 7)
pander(table(ppl$YR))
2014 | 2015 | 2016 | 2017 | 2018 | 2019 |
---|---|---|---|---|---|
56945 | 155945 | 172335 | 175196 | 192512 | 193160 |
## [1] "ID" "ORGNAME" "EIN"
## [4] "Signaturedate" "Case.Number" "Ofcrdirtrustfirstname"
## [7] "Ofcrdirtrustlastname" "Ofcrdirtrusttitle" "Ofcrdirtruststreetaddr"
## [10] "Ofcrdirtrustcity" "Ofcrdirtruststate" "Ofcrdirtrustzip"
## [13] "Ofcrdirtrustzippl4" "YR"
x <- c("Firstname", "Lastname", "Title", "Address", "City", "State", "Zip", "Zippl4")
names(ppl)[6:13] <- x
names(ppl)
## [1] "ID" "ORGNAME" "EIN" "Signaturedate"
## [5] "Case.Number" "Firstname" "Lastname" "Title"
## [9] "Address" "City" "State" "Zip"
## [13] "Zippl4" "YR"
#arranging order
ppl <- arrange( ppl, ID ) #ordering by ID (all board members of the same org together)
Now we will add a Gender variable. We use the Gender package, which estimates the gender of names using historicl data.
library(gender)
#creating a unique list of first names
first.names <- unique( ppl$Firstname )
gender.codes <- gender(first.names)
#selecting relevant variables
gender.codes <- select( gender.codes, name, gender, proportion_male )
head(gender.codes)
#merging gender var to dataset
ppl <- left_join( ppl, gender.codes, by = c("Firstname" = "name"))
x <- round(prop.table(table( ppl$gender, useNA="ifany" )),2)
pander(x)
either | female | male | NA |
---|---|---|---|
0 | 0.49 | 0.42 | 0.09 |
Ordering variables
#ordering variables
nmz <- c("ID",
"ORGNAME",
"EIN",
"YR",
"Signaturedate",
"Case.Number",
"Firstname",
"Lastname",
"Title",
"Address",
"City",
"State",
"Zip",
"Zippl4",
"gender",
"proportion_male")
ppl <- ppl[,nmz]
Checking to see that ID variable is unique…
id.count <- as.data.frame(table(ppl$ID))
id.count <- id.count[order(id.count$Freq, decreasing = T),]
head(id.count) #some IDs are repeated
#making key variable =
names(id.count) <- c("ID", "IDdup")
id.count$ID <- as.character(id.count$ID)
#joining count
ppl <- left_join(ppl, id.count, by = "ID")
#subsetting those with duplicates
x <- ppl$IDdup > 1
dups <- ppl[x,]
names(dups)
## [1] "ID" "ORGNAME" "EIN" "YR"
## [5] "Signaturedate" "Case.Number" "Firstname" "Lastname"
## [9] "Title" "Address" "City" "State"
## [13] "Zip" "Zippl4" "gender" "proportion_male"
## [17] "IDdup"
Given ID is not unique, we will create a key variable that is unique
## [1] "ID" "ORGNAME" "EIN" "YR"
## [5] "Signaturedate" "Case.Number" "Firstname" "Lastname"
## [9] "Title" "Address" "City" "State"
## [13] "Zip" "Zippl4" "gender" "proportion_male"
## [17] "IDdup" "key"
Saving rds file
The dataset has 946093 nonprofits, and 18 variables.