Graduate Urban Economics Lecture 11, May 10, 2018
Gentzkow and Shapiro wrote a very helpful document called "Code and Data for the Social Sciences: A Practitioner's Guide"
My Stata trick:
Write a test procedure
Know your data: I have caught countless errors by realizing data changed unexpectedly
library(rgdal) #manipulating shape files (.shp) library(ggplot2) #very advanced plotting/graphics library library(sp) #spatial data utility library(maptools) #mapping utility library(rgeos) #additional geographic utility
## Warning: package 'rgeos' was built under R version 3.3.2
library(raster) #raster data
## Warning: package 'raster' was built under R version 3.3.2
library(spatstat) #spatial statistics #library(ggmap) #interfacing with Google Maps (VPN required) library(OpenStreetMap)
## Warning: package 'OpenStreetMap' was built under R version 3.3.2
#chinaMap<-getData('GADM',country="CHN",level=1) #download data from GADM; getData is part of package raster
ChinaRD0<-readRDS("data/CHN_adm0.rds") #Country outline; data folder is local
ChinaRD1<-readRDS("data/CHN_adm1.rds") #provinces
ChinaRD2<-readRDS("data/CHN_adm2.rds") #prefectures
ChinaRD3<-readRDS("data/CHN_adm3.rds") #counties
plot(ChinaRD0,main="Mainland China country outline") #Plot Mainland China
ChinaRD1@data[1:5,1:7] #shows first 5 rows 7 cols of ChinaRD1; could also use: slot(ChinaRD1,"data")[1:10,]
## OBJECTID ID_0 ISO NAME_0 ID_1 NAME_1 HASC_1 ## 1 1 49 CHN China 1 Anhui CN.AH ## 2 2 49 CHN China 2 Beijing CN.BJ ## 3 3 49 CHN China 3 Chongqing CN.CQ ## 4 4 49 CHN China 4 Fujian CN.FJ ## 5 5 49 CHN China 5 Gansu CN.GS
names(ChinaRD3) #names of columns in data
## [1] "OBJECTID" "ID_0" "ISO" "NAME_0" "ID_1" ## [6] "NAME_1" "ID_2" "NAME_2" "ID_3" "NAME_3" ## [11] "CCN_3" "CCA_3" "TYPE_3" "ENGTYPE_3" "NL_NAME_3" ## [16] "VARNAME_3"
shanghaiCities<-ChinaRD3[ChinaRD3$NAME_2=="Shanghai",] #Selects only cities in Shanghai municipality
plot(shanghaiCities)
plot(shanghaiCities[shanghaiCities$NAME_3=="Shanghai",],border="blue",add=TRUE) #add=T draws on current plot
text(coordinates(shanghaiCities),
labels=shanghaiCities$NAME_3,col="red",cex=0.7) #Adds names
popData<-read.table("data/china_pop_9293.csv",header=T,sep=",",stringsAsFactors=FALSE) #examine: str(popData)
ChinaData2 <- slot(ChinaRD2, "data")
names(ChinaData2)[c(6,8)]<-c("province_name","city_name")
table(ChinaData2$province_name) #Show frequency table of prefectures by province
prefectureData<-merge(ChinaData2, popData, by=c("province_name", "city_name"),all.x=TRUE) #merging by two columns
prefectureData$city_name[is.na(prefectureData$pop92)] #not a great way to match
names(ChinaRD2)[c(6,8)]<-c("province_name","city_name") #SP package allows merging onto SpatialPolygonsDataFrame
popDataSP<-merge(ChinaRD2, popData, by=c("province_name", "city_name"),all.x=TRUE)
#rm(ChinaRD2) #Removes old object--not really necessary
summary(popDataSP) #shows summary statistics on new object
gg_map_obj<-fortify(ChinaRD2, region= "OBJECTID")
china_map<-ggplot() + geom_map(data = slot(popDataSP, "data"),
aes(map_id = OBJECTID, fill = pop92), map = gg_map_obj) +
expand_limits(x = gg_map_obj$long, y = gg_map_obj$lat) + ggtitle("Population in 1992, Selected Prefectures")
china_map
#ggsave("china_pop92.pdf", china_map, width = 10, height = 6)
provinceCentroids<-coordinates(ChinaRD1) #gets coordinates plot(ChinaRD1) points(provinceCentroids,col="red") #adds points to current plot
#plot(ChinaRD3,border="blue",add=T) #draws borders of prefectures
spdf_pts <- SpatialPointsDataFrame(coords=provinceCentroids, data=data.frame(ChinaRD1$NAME_1))
proj4string(spdf_pts) <-proj4string(popDataSP) #Make sure projection is the same (should be since comes from same data source)
#proj4string(spdf_pts)<-CRS(" +proj=longlat +datum=WGS84 +ellps=WGS84 +towgs84=0,0,0") #could use this code to adjust
pref_pts<-over(spdf_pts,popDataSP)
pref_pts$x<-slot(spdf_pts,"coords")[,1]
pref_pts$y<-slot(spdf_pts,"coords")[,2]
#write.table(pref_pts, file = "centroids_in_cities.CSV", sep = ",", col.names=names(pref_pts),row.names=FALSE,na="") #write CSV
plot(ChinaRD1) points(provinceCentroids,col="red") #adds points to current plot text(pref_pts$x,pref_pts$y, labels=pref_pts$city_name,col="blue",cex=0.7) #Add names
sufe_names="SUFE"
sufe_pts<-SpatialPointsDataFrame(coords=cbind(121.5153883,31.2948375), data=data.frame(sufe_names))
sufe_pts$lon<-coordinates(sufe_pts)[1,1]
sufe_pts$lat<-coordinates(sufe_pts)[1,2]
sufe_map <- invisible(openmap(c(sufe_pts$lat+0.03,sufe_pts$lon-0.03), c(sufe_pts$lat-0.03,sufe_pts$lon+0.03),
minNumTiles=10,type="osm")) #invisible suppresses warning messages
plot(sufe_map)
Geocoding is the process of converting street addresses to latitude and longitude
This can be done with a geodatabase or by calling a service (ex: Google, Baidu)
In R, there are multiple packages to do this, including geoChina and ggmap
You will need an API and the output will often come back as a JSON; Google gives 2500 free requests per day, Baidu seems to allow even more
Google will also allow network analysis (distance between two points by travel mode)
Note that Chinese maps use a shifted coordinate system (GCJ02) that may require additional work to use
We use a projection (NAD1983 Long Island, feet) to do simple distance calcuations
rest_data<-read.table("data/menupages_Wb126.csv", header=T, sep=",",stringsAsFactors =FALSE)
nyc_tracts<-readOGR("data","manhattan_tracts") #Read shapefiles with readOGR
ggplot() + geom_polygon(data = nyc_tracts, aes(x=long, y = lat, group = group), fill = NA, color = "black") +
geom_point(data=rest_data,aes(x=x_feet, y=y_feet), size=1, color = "blue") + ggtitle("Manhattan Restaurants")
## Regions defined for each Polygons
ggplot() + geom_polygon(data = nyc_tracts, aes(x=long, y = lat, group = group), fill = NA, color = "black") +
geom_point(data=rest_data[rest_data$cuisName1=="Chinese",],aes(x=x_feet, y=y_feet), size=1, color = "red") + ggtitle("Chinese Restaurants")
## Regions defined for each Polygons
min_x<-min(rest_data$x_feet) max_x<-max(rest_data$x_feet) min_y<-min(rest_data$y_feet) max_y<-max(rest_data$y_feet) W <- owin(xrange=c(min_x,max_x),yrange=c(min_y,max_y)) #a window defined by limits of data allrests.ppp<-ppp(rest_data$x_feet,rest_data$y_feet,window=W)
## Warning in ppp(rest_data$x_feet, rest_data$y_feet, window = W): data ## contain duplicated points
italian.ppp<-allrests.ppp[rest_data$cuisName1=="Italian",] chinese.ppp<-allrests.ppp[rest_data$cuisName1=="Chinese",] pizza.ppp<-allrests.ppp[rest_data$cuisName1=="Pizza",]
neighDist<-pairdist(chinese.ppp)
radii_set<-seq(200,3600,200)
chinese_mat<-array(,c(length(radii_set),2))
for (i in 1:length(radii_set)) {
chinese_mat[i,1]<-radii_set[i]
chinese_mat[i,2]<-mean(rowSums((neighDist<radii_set[i]), na.rm=TRUE))-1 #remove own distance
}
plot(chinese_mat,xlab="radius",ylab="Count",main="Count Chinese Restaurants within Radius")
numSims<-1000
sim_mat<-array(,c(length(chinese_mat[,1]),numSims))
for (i in 1:numSims) {
sim.ppp<-allrests.ppp[sample(1:length(allrests.ppp$x),length(chinese.ppp$x),replace=F)]
neighDist_sim<-pairdist(sim.ppp)
for (j in 1:length(radii_set)) {
sim_mat[j,i]<-mean(rowSums(neighDist_sim<radii_set[j], na.rm=TRUE))-1
}
}
conf_mat<-array(,c(length(chinese_mat[,1]),2))
for (j in 1:length(radii_set)) {
conf_mat[j,1]<-quantile(sim_mat[j,],probs=.025,na.rm=TRUE)
conf_mat[j,2]<-quantile(sim_mat[j,],probs=.975,na.rm=TRUE)
}
chinese_mat<-cbind(chinese_mat,conf_mat)