Introduction

PENDING

show analysis of po box length
geocode po boxes with addresses.

In this script we will geocode POBs and failed addresses using zip and city centroids. The new dataset will be saved as a new version of the main files:

NONPROFITS-2014-2019v5.rds
PEOPLE-2014-2019v5.rds

STEPS

Geocoding with zip code centroids: we will add zip code centroids to the NPO and PPL files.
Geocoding with city centroids: city centroids will be added.
Generating a final lat lon variable.
Exploring the results

PACKAGES

#setting up the environment
library( dplyr )
library( tidyr )
library( pander )

# update the path with your working directory:
wd <- "/Users/icps86/Dropbox/R Projects/Open_data_ignacio"
setwd(wd)

1. Adding Zipcode Centroids

1.1 Loading zipcodes

Loading the zip-code-centroid data to use in throughout the script

INTERNAL NOTE: this zipcodes were provided by Prof Lecy.

zips1 <- readRDS("Data/5_ZipandCity/zips1.rds")
names(zips1) <- c("Zip", "lat_zip1", "lon_zip1")

Loading a secondary zip-code-centroid data to use in throughout the script

The zips are from this webpage: https://public.opendatasoft.com/explore/dataset/us-zip-code-latitude-and-longitude/export/

# this chunk generates a local file
zips2 <- read.csv("https://public.opendatasoft.com/explore/dataset/us-zip-code-latitude-and-longitude/download/?format=csv&timezone=America/New_York&use_labels_for_header=true", sep=';', stringsAsFactors = F, colClasses= c("character", "character", "character", "numeric", "numeric", "integer", "integer", "character"))

# removing some columns we dont need
zips2 <- zips2[,c(1:5)] 

# saving it as an RDS 
saveRDS(zips2, "Data/5_ZipandCity/zips2.rds")

Loading file

zips2 <- readRDS("Data/5_ZipandCity/zips2.rds")
names(zips2) <- c("Zip", "City_zip2", "State_zip2", "lat_zip2", "lon_zip2")

1.2 NPO dataset

Adding zips centroids to the files:

NONPROFITS-2014-2019v4.rds
PEOPLE-2014-2019v4.rds

The dataset with new geocodes will be saved as:

NONPROFITS-2014-2019v5.rds
PEOPLE-2014-2019v5.rds

Loading NPO main file

setwd(wd)
npo.main <- readRDS("Data/4_GeoGoogle/NONPROFITS-2014-2019v4.rds")

Adding Zip1 and Zip2 geocode data and adding a geocode_type value

#zip1
npo.main <- left_join(npo.main, zips1, by = "Zip")
x <- which(is.na(npo.main$geocode_type)==TRUE & is.na(npo.main$lat_zip1)==FALSE)
npo.main$geocode_type[x] <- "zip1"

#zip2
npo.main <- left_join(npo.main, zips2, by= "Zip")
x <- which(is.na(npo.main$geocode_type)==TRUE & is.na(npo.main$lat_zip2)==FALSE)
npo.main$geocode_type[x] <- "zip2"

table(npo.main$geocode_type, useNA = "ifany") %>% pander()

census	google	zip1	zip2	NA
182042	37794	33638	9557	241

Only 241 addresses have no geocode.

Saving file

saveRDS(npo.main, "Data/5_ZipandCity/NONPROFITS-2014-2019v5.rds")

1.3 PPL dataset

Loading PPL main file

setwd(wd)
ppl.main <- readRDS("Data/4_GeoGoogle/PEOPLE-2014-2019v4.rds")

Adding Zip1 and Zip2 geocode data and geocode_type value

#zip1
ppl.main <- left_join(ppl.main, zips1, by = "Zip")
x <- which(is.na(ppl.main$geocode_type)==TRUE & is.na(ppl.main$lat_zip1)==FALSE)
ppl.main$geocode_type[x] <- "zip1"

#zip2
ppl.main <- left_join(ppl.main, zips2, by= "Zip")
x <- which(is.na(ppl.main$geocode_type)==TRUE & is.na(ppl.main$lat_zip2)==FALSE)
ppl.main$geocode_type[x] <- "zip2"

table(ppl.main$geocode_type, useNA = "ifany") %>% pander()

census	google	zip1	zip2	NA
689983	149739	87597	15509	3265

Only 3265 addresses have no geocode.

Saving file

saveRDS(ppl.main, "Data/5_ZipandCity/PEOPLE-2014-2019v5.rds")

2. Adding City centroids to the remaining addresses

Getting City Centroids from: https://public.opendatasoft.com/explore/dataset/1000-largest-us-cities-by-population-with-geographic-coordinates/table/?sort=-rank

Note: City names may be repeated across the US, so we need to look at the state to be certain we are matching the precise one.

# getting city data from source
city <- read.csv("https://public.opendatasoft.com/explore/dataset/1000-largest-us-cities-by-population-with-geographic-coordinates/download/?format=csv&timezone=America/New_York&use_labels_for_header=true", sep=';', stringsAsFactors = F)

x <- city$Coordinates
x <- strsplit(x, ",")
x <- data.frame(matrix(unlist(x), nrow=1000, byrow=T),stringsAsFactors=FALSE)
city <- cbind(city,x)
colnames(city)[7:8] <- c("Lat", "Lon")
city$Lat <- as.numeric(city$Lat)
city$Lon <- as.numeric(city$Lon)
city$State <- state.abb[match(city$State,state.name)]
city <- city[,c(1,3,7,8)]
city$City <- toupper(city$City)

names(city) <- c("City", "State_cty", "lat_cty", "lon_cty")

# Cities repeat unless we include the state
city$city_st <- paste(city$City, city$State_cty, sep= ", ")

# saving 
saveRDS(city, "Data/5_ZipandCity/cities-geo.rds")

Loading the city file

city <- readRDS("Data/5_ZipandCity/cities-geo.rds")

Cities repeat unless we use the city+state:

75 duplicated cities.
0 duplicated city_st

Preparing the file for merge, removing unwanted vars

city <- city[,c(5,3,4)]

2.1 NPO

Creating a city_st variable in the NPO dataset

npo.main$city_st <- paste(npo.main$City, npo.main$State, sep= ", ")

Now merging with main npo data

npo.main <- left_join(npo.main, city, by= "city_st")

Adding a geocode_type value

x <- which(is.na(npo.main$geocode_type)==TRUE & is.na(npo.main$lat_cty)==FALSE)
npo.main$geocode_type[x] <- "city"

table(npo.main$geocode_type, useNA = "ifany") %>% pander()

census	city	google	zip1	zip2	NA
182042	114	37794	33638	9557	127

Only 127 addresses have no geocode.

Saving file

saveRDS(npo.main, "Data/5_ZipandCity/NONPROFITS-2014-2019v5.rds")

2.2 PPL

Creating a city_st variable in the PPL dataset

ppl.main$city_st <- paste(ppl.main$City, ppl.main$State, sep= ", ")

Now merging with main npo data

ppl.main <- left_join(ppl.main, city, by= "city_st")

Adding a geocode_type value

x <- which(is.na(ppl.main$geocode_type)==TRUE & is.na(ppl.main$lat_cty)==FALSE)
ppl.main$geocode_type[x] <- "city"
table(ppl.main$geocode_type, useNA = "ifany") %>% pander()

census	city	google	zip1	zip2	NA
689983	617	149739	87597	15509	2648

Only 2648 addresses have no geocode.

Saving file

saveRDS(ppl.main, "Data/5_ZipandCity/PEOPLE-2014-2019v5.rds")

3. Generating a Final lat and lon value

NOTE: this priotization list should be backed by a comparison. why is Google better? why is Zip1 better than Zip2? How much is the difference between Google and Census? etc.

Geocode information has comes from different sources. As the database evolves over time, geocodes might be updated. Which geocode source we use when available can be summarized in the geocode prioritization list below:

Google
Census
Zip (Zip1 > Zip2)
City

Note: for more detail on this priority list see the Research Note.

Following this list, we will generate Lat and Lon variables in each data set.

3.1 NPOs

Loading file

npo.main <- readRDS("Data/5_ZipandCity/NONPROFITS-2014-2019v5.rds")

Creating the new variables

npo.main$lat <- NA
npo.main$lon <- NA

Now adding the prioritized lat/lon data by overwritting the values in the priority order.

npo.main$geocode_type <- NA

# city
x <- which(is.na(npo.main$lat_cty) == FALSE)
npo.main$lat[x] <- npo.main$lat_cty[x]
npo.main$lon[x] <- npo.main$lon_cty[x]
npo.main$geocode_type[x] <- "city"

# zip2
x <- which(is.na(npo.main$lat_zip2) == FALSE)
npo.main$lat[x] <- npo.main$lat_zip2[x]
npo.main$lon[x] <- npo.main$lon_zip2[x]
npo.main$geocode_type[x] <- "zip2"

# zip1
x <- which(is.na(npo.main$lat_zip1) == FALSE)
npo.main$lat[x] <- npo.main$lat_zip1[x]
npo.main$lon[x] <- npo.main$lon_zip1[x]
npo.main$geocode_type[x] <- "zip1"

# census
x <- which(is.na(npo.main$lat_cen) == FALSE)
npo.main$lat[x] <- npo.main$lat_cen[x]
npo.main$lon[x] <- npo.main$lon_cen[x]
npo.main$geocode_type[x] <- "census"

# google
x <- which(is.na(npo.main$lat_ggl) == FALSE)
npo.main$lat[x] <- npo.main$lat_ggl[x]
npo.main$lon[x] <- npo.main$lon_ggl[x]
npo.main$geocode_type[x] <- "google"

Checking to see the information is consistent

# checking the number of NAs
x <- is.na(npo.main$lat_cty) &
  is.na(npo.main$lat_zip2) &
  is.na(npo.main$lat_zip1) &
  is.na(npo.main$lat_cen) &
  is.na(npo.main$lat_ggl)
x %>% table()

## .
##  FALSE   TRUE 
## 263145    127

table(is.na(npo.main$lat))

## 
##  FALSE   TRUE 
## 263145    127

# checking using geocode_type
table(npo.main$geocode_type, useNA = "ifany")

## 
## census   city google   zip1   zip2   <NA> 
## 182042    114  37794  33638   9557    127

# google
x <- which(npo.main$geocode_type == "google")
x <- npo.main$lat[x] == npo.main$lat_ggl[x]
table(x, useNA = "ifany")

## x
##  TRUE 
## 37794

# census
x <- which(npo.main$geocode_type == "census")
x <- npo.main$lat[x] == npo.main$lat_cen[x]
table(x, useNA = "ifany")

## x
##   TRUE 
## 182042

# zip1
x <- which(npo.main$geocode_type == "zip1")
x <- npo.main$lat[x] == npo.main$lat_zip1[x]
table(x, useNA = "ifany")

## x
##  TRUE 
## 33638

# zip2
x <- which(npo.main$geocode_type == "zip2")
x <- npo.main$lat[x] == npo.main$lat_zip2[x]
table(x, useNA = "ifany")

## x
## TRUE 
## 9557

# cty
x <- which(npo.main$geocode_type == "city")
x <- npo.main$lat[x] == npo.main$lat_cty[x]
table(x, useNA = "ifany")

## x
## TRUE 
##  114

Saving the file

saveRDS(npo.main, "Data/5_ZipandCity/NONPROFITS-2014-2019v5.rds")

3.2 PPL

Loading file

ppl.main <- readRDS("Data/5_ZipandCity/PEOPLE-2014-2019v5.rds")

Creating the new variables

ppl.main$lat <- NA
ppl.main$lon <- NA

Now adding the prioritized lat/lon data by overwritting the values in the priority order.

ppl.main$geocode_type <- NA

# city
x <- which(is.na(ppl.main$lat_cty) == FALSE)
ppl.main$lat[x] <- ppl.main$lat_cty[x]
ppl.main$lon[x] <- ppl.main$lon_cty[x]
ppl.main$geocode_type[x] <- "city"

# zip2
x <- which(is.na(ppl.main$lat_zip2) == FALSE)
ppl.main$lat[x] <- ppl.main$lat_zip2[x]
ppl.main$lon[x] <- ppl.main$lon_zip2[x]
ppl.main$geocode_type[x] <- "zip2"

# zip1
x <- which(is.na(ppl.main$lat_zip1) == FALSE)
ppl.main$lat[x] <- ppl.main$lat_zip1[x]
ppl.main$lon[x] <- ppl.main$lon_zip1[x]
ppl.main$geocode_type[x] <- "zip1"

# census
x <- which(is.na(ppl.main$lat_cen) == FALSE)
ppl.main$lat[x] <- ppl.main$lat_cen[x]
ppl.main$lon[x] <- ppl.main$lon_cen[x]
ppl.main$geocode_type[x] <- "census"

# google
x <- which(is.na(ppl.main$lat_ggl) == FALSE)
ppl.main$lat[x] <- ppl.main$lat_ggl[x]
ppl.main$lon[x] <- ppl.main$lon_ggl[x]
ppl.main$geocode_type[x] <- "google"

Checking to see the information is consistent

# checking the number of NAs
x <- is.na(ppl.main$lat_cty) &
  is.na(ppl.main$lat_zip2) &
  is.na(ppl.main$lat_zip1) &
  is.na(ppl.main$lat_cen) &
  is.na(ppl.main$lat_ggl)
x %>% table()

## .
##  FALSE   TRUE 
## 943445   2648

table(is.na(ppl.main$lat))

## 
##  FALSE   TRUE 
## 943445   2648

# checking using geocode_type
table(ppl.main$geocode_type, useNA = "ifany")

## 
## census   city google   zip1   zip2   <NA> 
## 689983    617 149739  87597  15509   2648

# google
x <- which(ppl.main$geocode_type == "google")
x <- ppl.main$lat[x] == ppl.main$lat_ggl[x]
table(x, useNA = "ifany")

## x
##   TRUE 
## 149739

# census
x <- which(ppl.main$geocode_type == "census")
x <- ppl.main$lat[x] == ppl.main$lat_cen[x]
table(x, useNA = "ifany")

## x
##   TRUE 
## 689983

# zip1
x <- which(ppl.main$geocode_type == "zip1")
x <- ppl.main$lat[x] == ppl.main$lat_zip1[x]
table(x, useNA = "ifany")

## x
##  TRUE 
## 87597

# zip2
x <- which(ppl.main$geocode_type == "zip2")
x <- ppl.main$lat[x] == ppl.main$lat_zip2[x]
table(x, useNA = "ifany")

## x
##  TRUE 
## 15509

# cty
x <- which(ppl.main$geocode_type == "city")
x <- ppl.main$lat[x] == ppl.main$lat_cty[x]
table(x, useNA = "ifany")

## x
## TRUE 
##  617

Saving the file

saveRDS(ppl.main, "Data/5_ZipandCity/PEOPLE-2014-2019v5.rds")

4. Exploring data

npo.main <- readRDS("Data/5_ZipandCity/NONPROFITS-2014-2019v5.rds")
ppl.main <- readRDS("Data/5_ZipandCity/PEOPLE-2014-2019v5.rds")

4.1 Summary of geocoding types

For the NPO dataset

x <- table(npo.main$geocode_type, useNA = "ifany")
y <- prop.table(x)
summary <- as.data.frame(t(rbind(x,y)))
colnames(summary) <- c("frequency", "percent")
summary[nrow(summary)+1,] <- c(sum(summary$frequency), 1)
summary$percent <- paste0(round(summary$percent*100,2)," %")
rownames(summary)[nrow(summary)] <- "TOTAL"
summary <- summary[c(3,1,4,5,2,6,7),]
pander(summary)

	frequency	percent
google	37794	14.36 %
census	182042	69.15 %
zip1	33638	12.78 %
zip2	9557	3.63 %
city	114	0.04 %
NA.	127	0.05 %
TOTAL	263272	100 %

Google and Census geocodes add up to 83.5014738%
While zips and city add to 16.4502872%

For the PPL dataset

x <- table(ppl.main$geocode_type, useNA = "ifany")
y <- prop.table(x)
summary <- as.data.frame(t(rbind(x,y)))
colnames(summary) <- c("frequency", "percent")
summary[nrow(summary)+1,] <- c(sum(summary$frequency), 1)
summary$percent <- paste0(round(summary$percent*100,2)," %")
rownames(summary)[nrow(summary)] <- "TOTAL"
summary <- summary[c(3,1,4,5,2,6,7),]
pander(summary)

	frequency	percent
google	149739	15.83 %
census	689983	72.93 %
zip1	87597	9.26 %
zip2	15509	1.64 %
city	617	0.07 %
NA.	2648	0.28 %
TOTAL	946093	100 %

Google and Census geocodes add up to 88.7568135%
While zips and city add to 10.9632985%

Zips2 matches more than Zips.

x <- c(sum(!is.na(npo.main$lat_zip1)), sum(!is.na(npo.main$lat_zip2)))
dat <- data.frame(zips = c("zip1", "zip2"), matches = x, row.names = NULL)
dat$percent <- paste(round((dat$matches / nrow(npo.main))*100, 1), "%")
dat %>% pander()

zips	matches	percent
zip1	251466	95.5 %
zip2	259894	98.7 %

Which cases did not match in Zips1?

Which cases did not match in zips2?

Are the lat lon the same between zips1 and zips2?

table( npo.main$lat_zip1 == npo.main$lat_zip2, useNA = "ifany") %>% pander()

FALSE	TRUE	NA
248827	46	14399

Is the city the same between original data and the city inducted from the zip file (zips2)?

x <- toupper(npo.main$City)
y <- toupper(npo.main$City_zip2)

table( x == y, useNA = "ifany") %>% pander()

FALSE	TRUE	NA
23427	236455	3390

round(prop.table(table( x == y, useNA = "ifany"))*100,1)  %>% pander()

FALSE	TRUE	NA
8.9	89.8	1.3

Is the State the same between original and the state inducted from the zip file (zips2)?

x <- toupper(npo.main$State)
y <- toupper(npo.main$State_zip2)

table( x == y, useNA = "ifany") %>% pander()

FALSE	TRUE	NA
1504	258336	3432

round(prop.table(table( x == y, useNA = "ifany"))*100,1)  %>% pander()

FALSE	TRUE	NA
0.6	98.1	1.3

Step 5: Geocoding Zips and POBs

9/5/2019

Introduction

1. Adding Zipcode Centroids

1.1 Loading zipcodes

1.2 NPO dataset

1.3 PPL dataset

2. Adding City centroids to the remaining addresses

2.1 NPO

2.2 PPL

3. Generating a Final lat and lon value

3.1 NPOs

3.2 PPL

4. Exploring data

4.1 Summary of geocoding types