In this script we will augment the dataset with selected census variables using the geolocations.
We will use multiple methods to include census data…
We are using the IPUMS GeoMarker tool, which attaches contextual data to your point data by determining the census geographic unit in which each point lies and attaching characteristics of that unit to the point record. The initial release of GeoMarker attaches data from the 2017 American Community Survey 5-year estimates at the census tract level.
Steven Manson, Jonathan Schroeder, David Van Riper, and Steven Ruggles. IPUMS National Historical Geographic Information System: Version 14.0 [Database]. Minneapolis, MN: IPUMS. 2019. http://doi.org/10.18128/D050.V14.0
NOTE: Using these data requires a request; see https://nhgis.org/research/citation
STEPS
Input files:
Output files:
NOTES
PACKAGES
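The packages below are inferred from the code in this script; a sketch, since the original list is not shown here and may include others:
library( dplyr )    # select(), rename(), and the %>% pipe
library( pander )   # formatted tables
library( ggmap )    # Google geocoding service (also loaded again where it is used)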
Loading file
Preparing input file for the IPUMS GeoMarker
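The preparation code for the NPO file is not shown; a minimal sketch mirroring the PPL preparation further below, assuming the main NPO data frame is `npo` and that the output file name follows the same pattern as the PPL files:
npo.ipums <- select( npo, key, lat, lon )
npo.ipums <- na.omit( npo.ipums )
write.csv( npo.ipums, "Data/6_CensusData/NPO-ipums.csv", row.names = F ) # assumed file name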
Loading NPO census data results
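A sketch of the loading step, assuming the GeoMarker output file follows the same naming pattern as the PPL results below:
npo.ipumsGEO <- read.csv( "Data/6_CensusData/NPO-ipumsGEO.csv", stringsAsFactors = F, row.names = NULL ) # assumed file name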
The output of the census merge has a duplicated data point.
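The four counts below compare rows and unique keys in the submitted file versus the GeoMarker output; a sketch of the checks, assuming `npo.ipums` is the file we submitted and `npo.ipumsGEO` is the returned file:
length( npo.ipums$key )              # rows submitted
length( unique( npo.ipums$key ) )    # unique keys submitted
length( npo.ipumsGEO$key )           # rows returned
length( unique( npo.ipumsGEO$key ) ) # unique keys returned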
## [1] 263145
## [1] 263145
## [1] 263146
## [1] 263145
# identifying the duplicates
x <- npo.ipumsGEO$key
npo.dup <- npo.ipumsGEO$key[which(duplicated(x))]
# subsetting the duplicated values to take a look
x <- which(npo.ipumsGEO$key %in% npo.dup)
npo.dat <- npo.ipumsGEO[x,]
Measures for the duplicated values seem very different:
 | key | lat | lon | GISJOIN | STATE | STATEA |
---|---|---|---|---|---|---|
41575 | 41591 | 37.12 | -120.3 | G0600390000300 | California | 6 |
41576 | 41591 | 37.12 | -120.3 | G0600390000202 | California | 6 |
 | COUNTY | COUNTYA | TRACTA | GM001_2017 | GM002_2017 |
---|---|---|---|---|---|
41575 | Madera County | 39 | 300 | 0.04142 | 0.3172 |
41576 | Madera County | 39 | 202 | 0.1036 | 0.194 |
 | GM003_2017 | GM004_2017 | GM005_2017 | GM006_2017 | GM007_2017 |
---|---|---|---|---|---|
41575 | 34841 | 0.432 | 0.2861 | 0.417 | 0.02469 |
41576 | 58568 | 0.4436 | 0.1307 | 0.6042 | 0.0103 |
 | GM008_2017 | GM009_2017 | GM010_2017 |
---|---|---|---|
41575 | 0.6271 | 887.1 | 287.5 |
41576 | 0.8058 | 20.76 | 7.676 |
We test different ways to resolve the duplicated case.
# re-submitting the duplicated point's lat/lon to the GeoMarker
x <- which(npo.ipums$key %in% npo.dup)
dup <- npo.ipums[x,]
write.csv(dup, "Data/6_CensusData/dups/NPOdup-again.csv", row.names = F)
Loading results
dupGEO <- read.csv("Data/6_CensusData/dups/NPOdup-againGEO.csv", row.names = NULL, stringsAsFactors = F)
dupGEO %>% pander()
key | lat | lon | GISJOIN | STATE | STATEA | COUNTY |
---|---|---|---|---|---|---|
41591 | 37.12 | -120.3 | G0600390000202 | California | 6 | Madera County |
41591 | 37.12 | -120.3 | G0600390000300 | California | 6 | Madera County |
COUNTYA | TRACTA | GM001_2017 | GM002_2017 | GM003_2017 | GM004_2017 |
---|---|---|---|---|---|
39 | 202 | 0.1036 | 0.194 | 58568 | 0.4436 |
39 | 300 | 0.04142 | 0.3172 | 34841 | 0.432 |
GM005_2017 | GM006_2017 | GM007_2017 | GM008_2017 | GM009_2017 | GM010_2017 |
---|---|---|---|---|---|
0.1307 | 0.6042 | 0.0103 | 0.8058 | 20.76 | 7.676 |
0.2861 | 0.417 | 0.02469 | 0.6271 | 887.1 | 287.5 |
We get the same issue: double results.
# subsetting by ADDRESS
dup <- select( npo, key, Address, City, State, Zip)
x <- which(dup$key %in% npo.dup)
dup <- dup[x,]
write.csv(dup, "Data/6_CensusData/dups/NPOdup-add.csv", row.names = F)
The address-based query fails to produce a match.
# getting the actual address in the npo main.
x <- which(npo$key == npo.dup)
npo$input_address[x] # 240 N 1ST STREET, CHOWCHILLA, CA, 93610
## [1] "240 N 1ST STREET, CHOWCHILLA, CA, 93610"
# we use google maps to get the lat/lon
x <- c(37.125284, -120.260293)
dup <- data.frame(key=1, lat=x[1], lon = x[2], row.names = NULL)
write.csv(dup, "Data/6_CensusData/dups/NPOdup-ggl-latlon.csv", row.names = F)
Loading results
dupGEO <- read.csv("Data/6_CensusData/dups/NPOdup-ggl-latlonGEO.csv", row.names = NULL, stringsAsFactors = F)
dupGEO %>% pander()
key | lat | lon | GISJOIN | STATE | STATEA | COUNTY |
---|---|---|---|---|---|---|
1 | 37.13 | -120.3 | G0600390000202 | California | 6 | Madera County |
1 | 37.13 | -120.3 | G0600390000300 | California | 6 | Madera County |
COUNTYA | TRACTA | GM001_2017 | GM002_2017 | GM003_2017 | GM004_2017 |
---|---|---|---|---|---|
39 | 202 | 0.1036 | 0.194 | 58568 | 0.4436 |
39 | 300 | 0.04142 | 0.3172 | 34841 | 0.432 |
GM005_2017 | GM006_2017 | GM007_2017 | GM008_2017 | GM009_2017 | GM010_2017 |
---|---|---|---|---|---|
0.1307 | 0.6042 | 0.0103 | 0.8058 | 20.76 | 7.676 |
0.2861 | 0.417 | 0.02469 | 0.6271 | 887.1 | 287.5 |
Despite using slightly different lat/lon values, we get the same issue: double results.
Using a census tract lookup website and Google Maps, we were able to determine that the location is within tract 300.
Updating the IPUMS results to exclude the duplicate case that is not in tract 300
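A minimal sketch of this step, assuming `npo.ipumsGEO` still contains both rows for the duplicated key and that the tract code column is still named TRACTA at this point:
# drop the duplicated record that falls in the wrong tract
x <- which( npo.ipumsGEO$key %in% npo.dup & npo.ipumsGEO$TRACTA != 300 )
npo.ipumsGEO <- npo.ipumsGEO[ -x, ]
# check: one row per key
length( npo.ipumsGEO$key ) == length( unique( npo.ipumsGEO$key ) )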
Preparing the results for the merge
# removing lat lons
npo.ipumsGEO <- npo.ipumsGEO[,-c(2,3)]
# Changing the names of the variables using the Data Dictionary - from the GM codebook
# Geographic Unit Identifiers:
# GISJOIN: GIS Join Match Code
# STATE: State Name
# STATEA: State Code
# COUNTY: County Name
# COUNTYA: County Code
# TRACTA: Census Tract Code
# Contextual variables: (Your file will include only those variables you requested)
# GM001_2017: Proportion unemployed
# GM002_2017: Proportion population in poverty
# GM003_2017: Median household income
# GM004_2017: Income inequality
# GM005_2017: Proportion family households headed by single woman
# GM006_2017: Proportion occupied housing units that are owner occupied
# GM007_2017: Proportion African American
# GM008_2017: Proportion of adults who completed high school
# GM009_2017: Persons per square kilometer
# GM010_2017: Housing units per square kilometer
npo.ipumsGEO <- rename( npo.ipumsGEO,
STATEFIPS = STATEA,
COUNTYFIPS = COUNTYA,
TRACTFIPS = TRACTA,
unemp = GM001_2017,
poverty = GM002_2017,
medinc = GM003_2017,
inequality = GM004_2017,
single = GM005_2017,
ownerocc = GM006_2017,
black = GM007_2017,
hs = GM008_2017,
p.density = GM009_2017,
h.density = GM010_2017 )
Merging and saving
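The merge code is not shown; a minimal sketch, assuming the main NPO data frame is `npo`, that both objects share the `key` identifier, and a hypothetical output file name:
npo.cen <- merge( npo, npo.ipumsGEO, by = "key", all.x = TRUE )
saveRDS( npo.cen, "Data/6_CensusData/NPO-census.rds" ) # hypothetical file name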
Cases with census data
x <- is.na(npo.cen$poverty) %>% table()
x <- as.data.frame(x)
x <- cbind(x, paste0(round(prop.table(x$Freq) * 100,1),"%"))
names(x) <- c("No Census data", "Freq", "%")
pander(x)
No Census data | Freq | % |
---|---|---|
FALSE | 262811 | 99.8% |
TRUE | 461 | 0.2% |
Cases by geocode_type
x <- table(npo.cen$geocode_type, useNA = "ifany")
x <- as.data.frame(x)
x <- cbind(x, paste0(round(prop.table(x$Freq) * 100,1),"%"))
names(x) <- c("geocode_type", "Freq", "%")
x <- x[c(3,1,4,5,2,6),]
row.names(x) <- NULL
pander(x)
geocode_type | Freq | % |
---|---|---|
 | 37794 | 14.4% |
census | 182042 | 69.1% |
zip1 | 33638 | 12.8% |
zip2 | 9557 | 3.6% |
city | 114 | 0% |
NA | 127 | 0% |
Cases with final latitude/longitude
x <- table(is.na(npo.cen$lat), useNA = "ifany")
x <- as.data.frame(x)
x <- cbind(x, paste0(round(prop.table(x$Freq) * 100,1),"%"))
names(x) <- c("No lat/lon", "Freq", "%")
pander(x)
No lat/lon | Freq | % |
---|---|---|
FALSE | 263145 | 100% |
TRUE | 127 | 0% |
Loading file
Preparing the input file for the IPUMS GeoMarker. In this case, we need to divide the points into chunks
ppl.ipums <- select( ppl, key, lat, lon )
ppl.ipums <- na.omit( ppl.ipums )
write.csv( ppl.ipums, "Data/6_CensusData/PPL-ipums.csv", row.names=F )
nrow(ppl.ipums)
## [1] 943445
ppl.ipums1 <- ppl.ipums[1:235000,]
ppl.ipums2 <- ppl.ipums[235001:470000,]
ppl.ipums3 <- ppl.ipums[470001:705000,]
ppl.ipums4 <- ppl.ipums[705001:nrow(ppl.ipums),]
write.csv( ppl.ipums1, "Data/6_CensusData/PPL-ipums1.csv", row.names=F )
write.csv( ppl.ipums2, "Data/6_CensusData/PPL-ipums2.csv", row.names=F )
write.csv( ppl.ipums3, "Data/6_CensusData/PPL-ipums3.csv", row.names=F )
write.csv( ppl.ipums4, "Data/6_CensusData/PPL-ipums4.csv", row.names=F )
After manually submitting each chunk to the IPUMS GeoMarker and downloading the census data, we load the results
ppl.ipums1GEO <- read.csv("Data/6_CensusData/PPL-ipums1GEO.csv", stringsAsFactors = F, row.names = NULL)
ppl.ipums2GEO <- read.csv("Data/6_CensusData/PPL-ipums2GEO.csv", stringsAsFactors = F, row.names = NULL)
ppl.ipums3GEO <- read.csv("Data/6_CensusData/PPL-ipums3GEO.csv", stringsAsFactors = F, row.names = NULL)
ppl.ipums4GEO <- read.csv("Data/6_CensusData/PPL-ipums4GEO.csv", stringsAsFactors = F, row.names = NULL)
# binding all
ppl.ipumsGEO <- rbind(ppl.ipums1GEO, ppl.ipums2GEO, ppl.ipums3GEO, ppl.ipums4GEO)
# writing the compiled results to a csv
write.csv(ppl.ipumsGEO, "Data/6_CensusData/PPL-ipumsGEO.csv", row.names = FALSE)
The output of the census merge has duplicates
# ppl.ipumsGEO <- read.csv("Data/6_CensusData/PPL-ipumsGEO.csv", row.names = NULL, stringsAsFactors = F)
# input file
x <- ppl.ipums$key
length(x)
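# (sketch) the four counts below are presumably: rows submitted, unique keys submitted,
# rows returned by the GeoMarker, and unique keys returned; length(x) above gives the
# first, and these give the rest:
length( unique(x) )
length( ppl.ipumsGEO$key )
length( unique( ppl.ipumsGEO$key ) )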
## [1] 943445
## [1] 943445
## [1] 943452
## [1] 943445
# identifying the duplicate IDs
x <- ppl.ipumsGEO$key
ppl.dup <- ppl.ipumsGEO$key[which(duplicated(x))]
# subsetting the duplicated values to take a look
x <- which(ppl.ipumsGEO$key %in% ppl.dup)
ppl.dat <- ppl.ipumsGEO[x,]
The IPUMS GeoMarker results show all the repeated cases as being in Madera County, CA, which is not accurate for all of them.
 | key | lat | lon | GISJOIN | STATE | STATEA |
---|---|---|---|---|---|---|
55975 | 56103 | 37.12 | -120.3 | G0600390000202 | California | 6 |
55976 | 56103 | 37.12 | -120.3 | G0600390000300 | California | 6 |
55979 | 56106 | 37.12 | -120.3 | G0600390000202 | California | 6 |
55980 | 56106 | 37.12 | -120.3 | G0600390000300 | California | 6 |
154659 | 155140 | 27.52 | -82.65 | G0600390000300 | California | 6 |
154660 | 155140 | 27.52 | -82.65 | G0600390000202 | California | 6 |
154662 | 155142 | 37.34 | -121.9 | G0600390000202 | California | 6 |
154663 | 155142 | 37.34 | -121.9 | G0600390000300 | California | 6 |
349310 | 350685 | 40.13 | -75.01 | G0600390000202 | California | 6 |
349311 | 350685 | 40.13 | -75.01 | G0600390000300 | California | 6 |
349312 | 350686 | 40.12 | -75.51 | G0600390000300 | California | 6 |
349313 | 350686 | 40.12 | -75.51 | G0600390000202 | California | 6 |
794100 | 796630 | 31.33 | -94.71 | G0600390000202 | California | 6 |
794101 | 796630 | 31.33 | -94.71 | G0600390000300 | California | 6 |
 | COUNTY | COUNTYA | TRACTA | GM001_2017 | GM002_2017 |
---|---|---|---|---|---|
55975 | Madera County | 39 | 202 | 0.1036 | 0.194 |
55976 | Madera County | 39 | 300 | 0.04142 | 0.3172 |
55979 | Madera County | 39 | 202 | 0.1036 | 0.194 |
55980 | Madera County | 39 | 300 | 0.04142 | 0.3172 |
154659 | Madera County | 39 | 300 | 0.04142 | 0.3172 |
154660 | Madera County | 39 | 202 | 0.1036 | 0.194 |
154662 | Madera County | 39 | 202 | 0.1036 | 0.194 |
154663 | Madera County | 39 | 300 | 0.04142 | 0.3172 |
349310 | Madera County | 39 | 202 | 0.1036 | 0.194 |
349311 | Madera County | 39 | 300 | 0.04142 | 0.3172 |
349312 | Madera County | 39 | 300 | 0.04142 | 0.3172 |
349313 | Madera County | 39 | 202 | 0.1036 | 0.194 |
794100 | Madera County | 39 | 202 | 0.1036 | 0.194 |
794101 | Madera County | 39 | 300 | 0.04142 | 0.3172 |
 | GM003_2017 | GM004_2017 | GM005_2017 | GM006_2017 | GM007_2017 |
---|---|---|---|---|---|
55975 | 58568 | 0.4436 | 0.1307 | 0.6042 | 0.0103 |
55976 | 34841 | 0.432 | 0.2861 | 0.417 | 0.02469 |
55979 | 58568 | 0.4436 | 0.1307 | 0.6042 | 0.0103 |
55980 | 34841 | 0.432 | 0.2861 | 0.417 | 0.02469 |
154659 | 34841 | 0.432 | 0.2861 | 0.417 | 0.02469 |
154660 | 58568 | 0.4436 | 0.1307 | 0.6042 | 0.0103 |
154662 | 58568 | 0.4436 | 0.1307 | 0.6042 | 0.0103 |
154663 | 34841 | 0.432 | 0.2861 | 0.417 | 0.02469 |
349310 | 58568 | 0.4436 | 0.1307 | 0.6042 | 0.0103 |
349311 | 34841 | 0.432 | 0.2861 | 0.417 | 0.02469 |
349312 | 34841 | 0.432 | 0.2861 | 0.417 | 0.02469 |
349313 | 58568 | 0.4436 | 0.1307 | 0.6042 | 0.0103 |
794100 | 58568 | 0.4436 | 0.1307 | 0.6042 | 0.0103 |
794101 | 34841 | 0.432 | 0.2861 | 0.417 | 0.02469 |
 | GM008_2017 | GM009_2017 | GM010_2017 |
---|---|---|---|
55975 | 0.8058 | 20.76 | 7.676 |
55976 | 0.6271 | 887.1 | 287.5 |
55979 | 0.8058 | 20.76 | 7.676 |
55980 | 0.6271 | 887.1 | 287.5 |
154659 | 0.6271 | 887.1 | 287.5 |
154660 | 0.8058 | 20.76 | 7.676 |
154662 | 0.8058 | 20.76 | 7.676 |
154663 | 0.6271 | 887.1 | 287.5 |
349310 | 0.8058 | 20.76 | 7.676 |
349311 | 0.6271 | 887.1 | 287.5 |
349312 | 0.6271 | 887.1 | 287.5 |
349313 | 0.8058 | 20.76 | 7.676 |
794100 | 0.8058 | 20.76 | 7.676 |
794101 | 0.6271 | 887.1 | 287.5 |
We test different ways to resolve the duplicated cases.
# re-submitting the duplicated points' lat/lons to the GeoMarker
x <- which(ppl.ipums$key %in% ppl.dup)
dup <- ppl.ipums[x,]
write.csv(dup, "Data/6_CensusData/dups/PPLdup-again.csv", row.names = F)
Loading results
dupGEO <- read.csv("Data/6_CensusData/dups/PPLdup-againGEO.csv", row.names = NULL, stringsAsFactors = F)
dupGEO %>% pander()
key | lat | lon | GISJOIN | STATE | STATEA | COUNTY | COUNTYA | TRACTA |
---|---|---|---|---|---|---|---|---|
56103 | 37.12 | -120.3 | NA | NA | NA | NA | NA | NA |
56106 | 37.12 | -120.3 | NA | NA | NA | NA | NA | NA |
155140 | 27.52 | -82.65 | NA | NA | NA | NA | NA | NA |
155142 | 37.34 | -121.9 | NA | NA | NA | NA | NA | NA |
350685 | 40.13 | -75.01 | NA | NA | NA | NA | NA | NA |
350686 | 40.12 | -75.51 | NA | NA | NA | NA | NA | NA |
796630 | 31.33 | -94.71 | NA | NA | NA | NA | NA | NA |
GM001_2017 | GM002_2017 | GM003_2017 | GM004_2017 | GM005_2017 | GM006_2017 |
---|---|---|---|---|---|
NA | NA | NA | NA | NA | NA |
NA | NA | NA | NA | NA | NA |
NA | NA | NA | NA | NA | NA |
NA | NA | NA | NA | NA | NA |
NA | NA | NA | NA | NA | NA |
NA | NA | NA | NA | NA | NA |
NA | NA | NA | NA | NA | NA |
GM007_2017 | GM008_2017 | GM009_2017 | GM010_2017 |
---|---|---|---|
NA | NA | NA | NA |
NA | NA | NA | NA |
NA | NA | NA | NA |
NA | NA | NA | NA |
NA | NA | NA | NA |
NA | NA | NA | NA |
NA | NA | NA | NA |
We get NAs for all cases.
# subsetting by ADDRESS
dup <- select( ppl, key, Address, City, State, Zip)
# subsetting the duplicated values to take a look
x <- which(dup$key %in% ppl.dup)
dup <- dup[x,]
write.csv(dup, "Data/6_CensusData/dups/PPLdup-add.csv", row.names = F)
The address-based query fails to produce a match.
# subsetting the duplicate addresses
x <- which(ppl$key %in% ppl.dup)
dup <- ppl[x,]
dup <- select(dup, key, input_address)
# using the Google geocoding service to get the lat/lons
library( ggmap )
api <- readLines("../google1.api") # reading my personal API key from a local file
register_google(key = api) # register_google() stores the API key for the session
getOption("ggmap") # summarises the Google credentials to check how you are connected
dup <- mutate_geocode(dup, input_address, output = "latlona", source = "google", messaging = T) # binds the geocode results (lon, lat, address) to the original data
saveRDS(dup, "Data/6_CensusData/dups/GoogleResults1.rds")
# formatting the new lat/lons (key, lon, lat) for submitting to the GeoMarker
dup <- dup[,c(1,3,4)]
write.csv(dup, "Data/6_CensusData/dups/PPLdup-ggl-latlon.csv", row.names = F)
Loading results
dupGEO <- read.csv("Data/6_CensusData/dups/PPLdup-ggl-latlonGEO.csv", row.names = NULL, stringsAsFactors = F)
dupGEO %>% pander()
key | lat | lon | GISJOIN | STATE | STATEA |
---|---|---|---|---|---|
56103 | 37.12 | -120.3 | G0600390000300 | California | 6 |
56103 | 37.12 | -120.3 | G0600390000202 | California | 6 |
56106 | 37.11 | -120.3 | G0600390000202 | California | 6 |
56106 | 37.11 | -120.3 | G0600390000300 | California | 6 |
155140 | 27.52 | -82.65 | G1200810001204 | Florida | 12 |
155142 | 37.34 | -121.9 | G0600850500200 | California | 6 |
350685 | 40.13 | -75.01 | G4201010036501 | Pennsylvania | 42 |
350686 | 40.12 | -75.51 | G4200290300502 | Pennsylvania | 42 |
796630 | 31.33 | -94.72 | G4800050000800 | Texas | 48 |
COUNTY | COUNTYA | TRACTA | GM001_2017 | GM002_2017 | GM003_2017 |
---|---|---|---|---|---|
Madera County | 39 | 300 | 0.04142 | 0.3172 | 34841 |
Madera County | 39 | 202 | 0.1036 | 0.194 | 58568 |
Madera County | 39 | 202 | 0.1036 | 0.194 | 58568 |
Madera County | 39 | 300 | 0.04142 | 0.3172 | 34841 |
Manatee County | 81 | 1204 | 0.03951 | 0.04004 | 74955 |
Santa Clara County | 85 | 500200 | 0.05171 | 0.1639 | 99942 |
Philadelphia County | 101 | 36501 | 0.05577 | 0.1258 | 45610 |
Chester County | 29 | 300502 | 0.03798 | 0.05477 | 89353 |
Angelina County | 5 | 800 | 0.03821 | 0.1128 | 51775 |
GM004_2017 | GM005_2017 | GM006_2017 | GM007_2017 | GM008_2017 | GM009_2017 |
---|---|---|---|---|---|
0.432 | 0.2861 | 0.417 | 0.02469 | 0.6271 | 887.1 |
0.4436 | 0.1307 | 0.6042 | 0.0103 | 0.8058 | 20.76 |
0.4436 | 0.1307 | 0.6042 | 0.0103 | 0.8058 | 20.76 |
0.432 | 0.2861 | 0.417 | 0.02469 | 0.6271 | 887.1 |
0.4246 | 0.07564 | 0.8201 | 0.03455 | 0.9836 | 449.4 |
0.3925 | 0.2143 | 0.2833 | 0.05557 | 0.8229 | 3727 |
0.4486 | 0.1857 | 0.4692 | 0.09502 | 0.9118 | 2897 |
0.4693 | 0.1572 | 0.8906 | 0.03036 | 0.9565 | 482.8 |
0.3936 | 0.2931 | 0.751 | 0.1521 | 0.8824 | 317.5 |
GM010_2017 |
---|
287.5 |
7.676 |
7.676 |
287.5 |
275.4 |
1331 |
1300 |
189.9 |
135.6 |
We resolved the duplication in all cases except two, both in Madera County, CA.
Using a census tract lookup website and Google Maps, we were able to determine that both locations are within tract 300.
Note: When inputting the address manually in Google Maps, the lat/lons we get are slightly different from the ones returned by the Google geocoding service, though they are very close.
x <- rbind(c(56103, "gmaps", 37.123258, -120.267232),
c(56103, "gglgeo", 37.12309, -120.26754),
c(56106, "gmaps", 37.114607, -120.262027),
c(56106, "gglgeo", 37.11442, -120.26255))
x <- as.data.frame(x)
names(x) <- c("key", "source", "lat", "lon")
pander(x)
key | source | lat | lon |
---|---|---|---|
56103 | gmaps | 37.123258 | -120.267232 |
56103 | gglgeo | 37.12309 | -120.26754 |
56106 | gmaps | 37.114607 | -120.262027 |
56106 | gglgeo | 37.11442 | -120.26255 |
Selecting the data
Combining the original results with the new duplicates
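The combination code is not shown; a sketch under the assumptions that `dupGEO` (loaded above) has the same columns as the GeoMarker output and that, for the two Madera keys, only the tract-300 record is kept, per the manual check:
# drop the old duplicated rows from the GeoMarker results
x <- which( ppl.ipumsGEO$key %in% ppl.dup )
ppl.ipumsGEO <- ppl.ipumsGEO[ -x, ]
# keep one row per key in the corrected records: unique keys as-is, duplicated Madera keys only in tract 300
keep <- !( duplicated( dupGEO$key ) | duplicated( dupGEO$key, fromLast = TRUE ) ) | dupGEO$TRACTA == 300
ppl.ipumsGEO <- rbind( ppl.ipumsGEO, dupGEO[ keep, ] )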
Preparing the results for the merge
## [1] "key" "lat" "lon" "GISJOIN" "STATE"
## [6] "STATEA" "COUNTY" "COUNTYA" "TRACTA" "GM001_2017"
## [11] "GM002_2017" "GM003_2017" "GM004_2017" "GM005_2017" "GM006_2017"
## [16] "GM007_2017" "GM008_2017" "GM009_2017" "GM010_2017"
# removing lat lons
ppl.ipumsGEO <- ppl.ipumsGEO[,-c(2,3)]
# Changing the names of the variables using the Data Dictionary - from the GM codebook
# Geographic Unit Identifiers:
# GISJOIN: GIS Join Match Code
# STATE: State Name
# STATEA: State Code
# COUNTY: County Name
# COUNTYA: County Code
# TRACTA: Census Tract Code
# Contextual variables: (Your file will include only those variables you requested)
# GM001_2017: Proportion unemployed
# GM002_2017: Proportion population in poverty
# GM003_2017: Median household income
# GM004_2017: Income inequality
# GM005_2017: Proportion family households headed by single woman
# GM006_2017: Proportion occupied housing units that are owner occupied
# GM007_2017: Proportion African American
# GM008_2017: Proportion of adults who completed high school
# GM009_2017: Persons per square kilometer
# GM010_2017: Housing units per square kilometer
ppl.ipumsGEO <- rename( ppl.ipumsGEO,
STATEFIPS = STATEA,
COUNTYFIPS = COUNTYA,
TRACTFIPS = TRACTA,
unemp = GM001_2017,
poverty = GM002_2017,
medinc = GM003_2017,
inequality = GM004_2017,
single = GM005_2017,
ownerocc = GM006_2017,
black = GM007_2017,
hs = GM008_2017,
p.density = GM009_2017,
h.density = GM010_2017 )
Merging
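A minimal sketch of the merge, assuming the main PPL data frame is `ppl` and that both objects share the `key` identifier:
ppl.cen <- merge( ppl, ppl.ipumsGEO, by = "key", all.x = TRUE )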
Finally, we add the Google geocode information we got for the 7 duplicated cases and update their geocode_type from census to google.
## [1] "key" "input_address" "lon" "lat"
## [5] "address"
## [1] "ID" "key" "ORGNAME" "EIN"
## [5] "YR" "Signaturedate" "Case.Number" "Firstname"
## [9] "Lastname" "Title" "Address" "City"
## [13] "State" "Zip" "Zippl4" "gender"
## [17] "proportion_male" "IDdup" "pob" "add.len"
## [21] "input_address" "match" "match_type" "out_address"
## [25] "lat_lon_cen" "tiger_line_id" "tiger_line_side" "state_fips"
## [29] "county_fips" "tract_fips" "block_fips" "lon_cen"
## [33] "lat_cen" "geocode_type" "lon_ggl" "lat_ggl"
## [37] "address_ggl" "lat_zip1" "lon_zip1" "City_zip2"
## [41] "State_zip2" "lat_zip2" "lon_zip2" "city_st"
## [45] "lat_cty" "lon_cty" "lat" "lon"
## [49] "GISJOIN" "STATE" "STATEFIPS" "COUNTY"
## [53] "COUNTYFIPS" "TRACTFIPS" "unemp" "poverty"
## [57] "medinc" "inequality" "single" "ownerocc"
## [61] "black" "hs" "p.density" "h.density"
# identifying the cases to edit
x <- which(ppl.cen$key %in% ggl.ppl$key)
# test to check the reference is OK
ppl.cen$key[x] == ggl.ppl$key
## [1] TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [13] FALSE TRUE
key | lon_ggl | lat_ggl | address_ggl |
---|---|---|---|
56103 | NA | NA | NA |
56103 | NA | NA | NA |
56106 | NA | NA | NA |
56106 | NA | NA | NA |
155140 | NA | NA | NA |
155140 | NA | NA | NA |
155142 | NA | NA | NA |
155142 | NA | NA | NA |
350685 | NA | NA | NA |
350685 | NA | NA | NA |
350686 | NA | NA | NA |
350686 | NA | NA | NA |
796630 | NA | NA | NA |
796630 | NA | NA | NA |
# replacing "lon_ggl", "lat_ggl", "address_ggl"
ppl.cen[x,c(35,36,37)] <- ggl.ppl[,-c(1,2)]
# setting geocode_type to "google" and updating the final lat/lons with the google geocodes.
ppl.cen$geocode_type[x] <- "google"
ppl.cen$lat[x] <- ppl.cen$lat_ggl[x]
ppl.cen$lon[x] <- ppl.cen$lon_ggl[x]
Saving new main dataset
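A sketch of the save step with a hypothetical file name:
saveRDS( ppl.cen, "Data/6_CensusData/PEOPLE-census.rds" ) # hypothetical file name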
Cases with census data
x <- is.na(ppl.cen$poverty) %>% table()
x <- as.data.frame(x)
x <- cbind(x, paste0(round(prop.table(x$Freq) * 100,1),"%"))
names(x) <- c("No Census data", "Freq", "%")
pander(x)
No Census data | Freq | % |
---|---|---|
FALSE | 942133 | 99.6% |
TRUE | 3967 | 0.4% |
Cases by geocode_type
x <- table(ppl.cen$geocode_type, useNA = "ifany")
x <- as.data.frame(x)
x <- cbind(x, paste0(round(prop.table(x$Freq) * 100,1),"%"))
names(x) <- c("geocode_type", "Freq", "%")
x <- x[c(3,1,4,5,2,6),]
row.names(x) <- NULL
pander(x)
geocode_type | Freq | % |
---|---|---|
 | 149753 | 15.8% |
census | 689976 | 72.9% |
zip1 | 87597 | 9.3% |
zip2 | 15509 | 1.6% |
city | 617 | 0.1% |
NA | 2648 | 0.3% |
Cases with final latitude/longitude
x <- table(is.na(ppl.cen$lat), useNA = "ifany")
x <- as.data.frame(x)
x <- cbind(x, paste0(round(prop.table(x$Freq) * 100,1),"%"))
names(x) <- c("No lat/lon", "Freq", "%")
pander(x)
No lat/lon | Freq | % |
---|---|---|
FALSE | 943452 | 99.7% |
TRUE | 2648 | 0.3% |