PENDING
In this script we will geocode POBs and failed addresses using zip and city centroids. The new dataset will be saved as a new version of the main files:
STEPS
PACKAGES
INTERNAL NOTE: this zipcodes were provided by Prof Lecy.
The zips are from this webpage: https://public.opendatasoft.com/explore/dataset/us-zip-code-latitude-and-longitude/export/
# this chunk generates a local file
zips2 <- read.csv("https://public.opendatasoft.com/explore/dataset/us-zip-code-latitude-and-longitude/download/?format=csv&timezone=America/New_York&use_labels_for_header=true", sep=';', stringsAsFactors = F, colClasses= c("character", "character", "character", "numeric", "numeric", "integer", "integer", "character"))
# removing some columns we dont need
zips2 <- zips2[,c(1:5)]
# saving it as an RDS
saveRDS(zips2, "Data/5_ZipandCity/zips2.rds")
Loading file
Adding zips centroids to the files:
The dataset with new geocodes will be saved as:
Loading NPO main file
Adding Zip1 and Zip2 geocode data and adding a geocode_type value
#zip1
npo.main <- left_join(npo.main, zips1, by = "Zip")
x <- which(is.na(npo.main$geocode_type)==TRUE & is.na(npo.main$lat_zip1)==FALSE)
npo.main$geocode_type[x] <- "zip1"
#zip2
npo.main <- left_join(npo.main, zips2, by= "Zip")
x <- which(is.na(npo.main$geocode_type)==TRUE & is.na(npo.main$lat_zip2)==FALSE)
npo.main$geocode_type[x] <- "zip2"
table(npo.main$geocode_type, useNA = "ifany") %>% pander()
census | zip1 | zip2 | NA | |
---|---|---|---|---|
182042 | 37794 | 33638 | 9557 | 241 |
Only 241 addresses have no geocode.
Saving file
Loading PPL main file
Adding Zip1 and Zip2 geocode data and geocode_type value
#zip1
ppl.main <- left_join(ppl.main, zips1, by = "Zip")
x <- which(is.na(ppl.main$geocode_type)==TRUE & is.na(ppl.main$lat_zip1)==FALSE)
ppl.main$geocode_type[x] <- "zip1"
#zip2
ppl.main <- left_join(ppl.main, zips2, by= "Zip")
x <- which(is.na(ppl.main$geocode_type)==TRUE & is.na(ppl.main$lat_zip2)==FALSE)
ppl.main$geocode_type[x] <- "zip2"
table(ppl.main$geocode_type, useNA = "ifany") %>% pander()
census | zip1 | zip2 | NA | |
---|---|---|---|---|
689983 | 149739 | 87597 | 15509 | 3265 |
Only 3265 addresses have no geocode.
Saving file
Getting City Centroids from: https://public.opendatasoft.com/explore/dataset/1000-largest-us-cities-by-population-with-geographic-coordinates/table/?sort=-rank
Note: City names may be repeated across the US, so we need to look at the state to be certain we are matching the precise one.
# getting city data from source
city <- read.csv("https://public.opendatasoft.com/explore/dataset/1000-largest-us-cities-by-population-with-geographic-coordinates/download/?format=csv&timezone=America/New_York&use_labels_for_header=true", sep=';', stringsAsFactors = F)
x <- city$Coordinates
x <- strsplit(x, ",")
x <- data.frame(matrix(unlist(x), nrow=1000, byrow=T),stringsAsFactors=FALSE)
city <- cbind(city,x)
colnames(city)[7:8] <- c("Lat", "Lon")
city$Lat <- as.numeric(city$Lat)
city$Lon <- as.numeric(city$Lon)
city$State <- state.abb[match(city$State,state.name)]
city <- city[,c(1,3,7,8)]
city$City <- toupper(city$City)
names(city) <- c("City", "State_cty", "lat_cty", "lon_cty")
# Cities repeat unless we include the state
city$city_st <- paste(city$City, city$State_cty, sep= ", ")
# saving
saveRDS(city, "Data/5_ZipandCity/cities-geo.rds")
Loading the city file
Cities repeat unless we use the city+state:
Preparing the file for merge, removing unwanted vars
Creating a city_st variable in the NPO dataset
Now merging with main npo data
Adding a geocode_type value
x <- which(is.na(npo.main$geocode_type)==TRUE & is.na(npo.main$lat_cty)==FALSE)
npo.main$geocode_type[x] <- "city"
table(npo.main$geocode_type, useNA = "ifany") %>% pander()
census | city | zip1 | zip2 | NA | |
---|---|---|---|---|---|
182042 | 114 | 37794 | 33638 | 9557 | 127 |
Only 127 addresses have no geocode.
Saving file
Creating a city_st variable in the PPL dataset
Now merging with main npo data
Adding a geocode_type value
x <- which(is.na(ppl.main$geocode_type)==TRUE & is.na(ppl.main$lat_cty)==FALSE)
ppl.main$geocode_type[x] <- "city"
table(ppl.main$geocode_type, useNA = "ifany") %>% pander()
census | city | zip1 | zip2 | NA | |
---|---|---|---|---|---|
689983 | 617 | 149739 | 87597 | 15509 | 2648 |
Only 2648 addresses have no geocode.
Saving file
NOTE: this priotization list should be backed by a comparison. why is Google better? why is Zip1 better than Zip2? How much is the difference between Google and Census? etc.
Geocode information has comes from different sources. As the database evolves over time, geocodes might be updated. Which geocode source we use when available can be summarized in the geocode prioritization list below:
Note: for more detail on this priority list see the Research Note.
Following this list, we will generate Lat and Lon variables in each data set.
Loading file
Creating the new variables
Now adding the prioritized lat/lon data by overwritting the values in the priority order.
npo.main$geocode_type <- NA
# city
x <- which(is.na(npo.main$lat_cty) == FALSE)
npo.main$lat[x] <- npo.main$lat_cty[x]
npo.main$lon[x] <- npo.main$lon_cty[x]
npo.main$geocode_type[x] <- "city"
# zip2
x <- which(is.na(npo.main$lat_zip2) == FALSE)
npo.main$lat[x] <- npo.main$lat_zip2[x]
npo.main$lon[x] <- npo.main$lon_zip2[x]
npo.main$geocode_type[x] <- "zip2"
# zip1
x <- which(is.na(npo.main$lat_zip1) == FALSE)
npo.main$lat[x] <- npo.main$lat_zip1[x]
npo.main$lon[x] <- npo.main$lon_zip1[x]
npo.main$geocode_type[x] <- "zip1"
# census
x <- which(is.na(npo.main$lat_cen) == FALSE)
npo.main$lat[x] <- npo.main$lat_cen[x]
npo.main$lon[x] <- npo.main$lon_cen[x]
npo.main$geocode_type[x] <- "census"
# google
x <- which(is.na(npo.main$lat_ggl) == FALSE)
npo.main$lat[x] <- npo.main$lat_ggl[x]
npo.main$lon[x] <- npo.main$lon_ggl[x]
npo.main$geocode_type[x] <- "google"
Checking to see the information is consistent
# checking the number of NAs
x <- is.na(npo.main$lat_cty) &
is.na(npo.main$lat_zip2) &
is.na(npo.main$lat_zip1) &
is.na(npo.main$lat_cen) &
is.na(npo.main$lat_ggl)
x %>% table()
## .
## FALSE TRUE
## 263145 127
##
## FALSE TRUE
## 263145 127
##
## census city google zip1 zip2 <NA>
## 182042 114 37794 33638 9557 127
# google
x <- which(npo.main$geocode_type == "google")
x <- npo.main$lat[x] == npo.main$lat_ggl[x]
table(x, useNA = "ifany")
## x
## TRUE
## 37794
# census
x <- which(npo.main$geocode_type == "census")
x <- npo.main$lat[x] == npo.main$lat_cen[x]
table(x, useNA = "ifany")
## x
## TRUE
## 182042
# zip1
x <- which(npo.main$geocode_type == "zip1")
x <- npo.main$lat[x] == npo.main$lat_zip1[x]
table(x, useNA = "ifany")
## x
## TRUE
## 33638
# zip2
x <- which(npo.main$geocode_type == "zip2")
x <- npo.main$lat[x] == npo.main$lat_zip2[x]
table(x, useNA = "ifany")
## x
## TRUE
## 9557
# cty
x <- which(npo.main$geocode_type == "city")
x <- npo.main$lat[x] == npo.main$lat_cty[x]
table(x, useNA = "ifany")
## x
## TRUE
## 114
Saving the file
Loading file
Creating the new variables
Now adding the prioritized lat/lon data by overwritting the values in the priority order.
ppl.main$geocode_type <- NA
# city
x <- which(is.na(ppl.main$lat_cty) == FALSE)
ppl.main$lat[x] <- ppl.main$lat_cty[x]
ppl.main$lon[x] <- ppl.main$lon_cty[x]
ppl.main$geocode_type[x] <- "city"
# zip2
x <- which(is.na(ppl.main$lat_zip2) == FALSE)
ppl.main$lat[x] <- ppl.main$lat_zip2[x]
ppl.main$lon[x] <- ppl.main$lon_zip2[x]
ppl.main$geocode_type[x] <- "zip2"
# zip1
x <- which(is.na(ppl.main$lat_zip1) == FALSE)
ppl.main$lat[x] <- ppl.main$lat_zip1[x]
ppl.main$lon[x] <- ppl.main$lon_zip1[x]
ppl.main$geocode_type[x] <- "zip1"
# census
x <- which(is.na(ppl.main$lat_cen) == FALSE)
ppl.main$lat[x] <- ppl.main$lat_cen[x]
ppl.main$lon[x] <- ppl.main$lon_cen[x]
ppl.main$geocode_type[x] <- "census"
# google
x <- which(is.na(ppl.main$lat_ggl) == FALSE)
ppl.main$lat[x] <- ppl.main$lat_ggl[x]
ppl.main$lon[x] <- ppl.main$lon_ggl[x]
ppl.main$geocode_type[x] <- "google"
Checking to see the information is consistent
# checking the number of NAs
x <- is.na(ppl.main$lat_cty) &
is.na(ppl.main$lat_zip2) &
is.na(ppl.main$lat_zip1) &
is.na(ppl.main$lat_cen) &
is.na(ppl.main$lat_ggl)
x %>% table()
## .
## FALSE TRUE
## 943445 2648
##
## FALSE TRUE
## 943445 2648
##
## census city google zip1 zip2 <NA>
## 689983 617 149739 87597 15509 2648
# google
x <- which(ppl.main$geocode_type == "google")
x <- ppl.main$lat[x] == ppl.main$lat_ggl[x]
table(x, useNA = "ifany")
## x
## TRUE
## 149739
# census
x <- which(ppl.main$geocode_type == "census")
x <- ppl.main$lat[x] == ppl.main$lat_cen[x]
table(x, useNA = "ifany")
## x
## TRUE
## 689983
# zip1
x <- which(ppl.main$geocode_type == "zip1")
x <- ppl.main$lat[x] == ppl.main$lat_zip1[x]
table(x, useNA = "ifany")
## x
## TRUE
## 87597
# zip2
x <- which(ppl.main$geocode_type == "zip2")
x <- ppl.main$lat[x] == ppl.main$lat_zip2[x]
table(x, useNA = "ifany")
## x
## TRUE
## 15509
# cty
x <- which(ppl.main$geocode_type == "city")
x <- ppl.main$lat[x] == ppl.main$lat_cty[x]
table(x, useNA = "ifany")
## x
## TRUE
## 617
Saving the file
npo.main <- readRDS("Data/5_ZipandCity/NONPROFITS-2014-2019v5.rds")
ppl.main <- readRDS("Data/5_ZipandCity/PEOPLE-2014-2019v5.rds")
For the NPO dataset
x <- table(npo.main$geocode_type, useNA = "ifany")
y <- prop.table(x)
summary <- as.data.frame(t(rbind(x,y)))
colnames(summary) <- c("frequency", "percent")
summary[nrow(summary)+1,] <- c(sum(summary$frequency), 1)
summary$percent <- paste0(round(summary$percent*100,2)," %")
rownames(summary)[nrow(summary)] <- "TOTAL"
summary <- summary[c(3,1,4,5,2,6,7),]
pander(summary)
frequency | percent | |
---|---|---|
37794 | 14.36 % | |
census | 182042 | 69.15 % |
zip1 | 33638 | 12.78 % |
zip2 | 9557 | 3.63 % |
city | 114 | 0.04 % |
NA. | 127 | 0.05 % |
TOTAL | 263272 | 100 % |
For the PPL dataset
x <- table(ppl.main$geocode_type, useNA = "ifany")
y <- prop.table(x)
summary <- as.data.frame(t(rbind(x,y)))
colnames(summary) <- c("frequency", "percent")
summary[nrow(summary)+1,] <- c(sum(summary$frequency), 1)
summary$percent <- paste0(round(summary$percent*100,2)," %")
rownames(summary)[nrow(summary)] <- "TOTAL"
summary <- summary[c(3,1,4,5,2,6,7),]
pander(summary)
frequency | percent | |
---|---|---|
149739 | 15.83 % | |
census | 689983 | 72.93 % |
zip1 | 87597 | 9.26 % |
zip2 | 15509 | 1.64 % |
city | 617 | 0.07 % |
NA. | 2648 | 0.28 % |
TOTAL | 946093 | 100 % |
Zips2 matches more than Zips.
x <- c(sum(!is.na(npo.main$lat_zip1)), sum(!is.na(npo.main$lat_zip2)))
dat <- data.frame(zips = c("zip1", "zip2"), matches = x, row.names = NULL)
dat$percent <- paste(round((dat$matches / nrow(npo.main))*100, 1), "%")
dat %>% pander()
zips | matches | percent |
---|---|---|
zip1 | 251466 | 95.5 % |
zip2 | 259894 | 98.7 % |
Which cases did not match in Zips1?
Which cases did not match in zips2?
Are the lat lon the same between zips1 and zips2?
FALSE | TRUE | NA |
---|---|---|
248827 | 46 | 14399 |
Is the city the same between original data and the city inducted from the zip file (zips2)?
x <- toupper(npo.main$City)
y <- toupper(npo.main$City_zip2)
table( x == y, useNA = "ifany") %>% pander()
FALSE | TRUE | NA |
---|---|---|
23427 | 236455 | 3390 |
FALSE | TRUE | NA |
---|---|---|
8.9 | 89.8 | 1.3 |
Is the State the same between original and the state inducted from the zip file (zips2)?
x <- toupper(npo.main$State)
y <- toupper(npo.main$State_zip2)
table( x == y, useNA = "ifany") %>% pander()
FALSE | TRUE | NA |
---|---|---|
1504 | 258336 | 3432 |
FALSE | TRUE | NA |
---|---|---|
0.6 | 98.1 | 1.3 |