"Contact Hypothesis"
in social psychology (Allport, 1954), which states that “intergroup contact under appropriate conditions can effectively reduce prejudice between majority and minority group members” (source: Wikipedia). Source: Statistics Bureau, Ministry of Internal Affairs and Communications, “2015 National Census Results” - https://www.e-stat.go.jp/stat-search/file-download?statInfId=000031594311&fileKind=0
## Fetch the census table from e-Stat into ./jinkou.csv, then parse it with
## readr, skipping the first 10 lines of the file.
## NOTE(review): read_csv() assumes UTF-8 by default -- confirm the encoding of
## the downloaded file.
library(readr)
download.file(
  url = "https://www.e-stat.go.jp/stat-search/file-download?statInfId=000031594311&fileKind=0",
  destfile = "./jinkou.csv"
)
jinkou <- read_csv(file = "./jinkou.csv", skip = 10)
Source: Ministry of Justice, “Statistics on Foreign Residents, June 2018” - https://www.e-stat.go.jp/stat-search/file-download?statInfId=000031770336&fileKind=0
## Download the foreign-resident workbook from e-Stat and read it with readxl,
## skipping the first 3 rows of the sheet.
## mode = "wb" writes the .xlsx in binary mode. The URL does not end in a
## recognisable binary suffix, so on Windows download.file() would otherwise
## fall back to text mode and can corrupt the workbook.
download.file(
  "https://www.e-stat.go.jp/stat-search/file-download?statInfId=000031770336&fileKind=0",
  destfile = "./18-06-07.xlsx",
  mode = "wb"
)
library(readxl)
zairyu <- read_excel("./18-06-07.xlsx", skip = 3)
Source: Amano Spatial Technologies Institute, “Location Data of Local Governments in Japan”, ver. 1.0.15 - http://www.amano-tec.com/data/localgovernments.html
## Read the tab-delimited table of local-government office locations.
## The address below is an HTML landing page, not the data file itself; the
## previous code saved that page under an .xlsx name and then read a different
## .csv file that was never downloaded. The data file therefore has to be
## placed in the working directory by hand, and we fail early with a clear
## message when it is missing.
## NOTE(review): the direct download URL of the data file is not known here.
address_path <- "./h3010puboffice_utf8.csv"
if (!file.exists(address_path)) {
  stop(
    "Location data file not found: ", address_path,
    ". Obtain it from http://www.amano-tec.com/data/localgovernments.html",
    " and place it in the working directory.",
    call. = FALSE
  )
}
address <- read_delim(address_path, "\t", escape_double = FALSE, trim_ws = TRUE)
## Build the Japan outline polygons once and cache them on disk.
## mapdata provides the "japan" map database, but map_data() itself is a
## ggplot2 function. ggplot2 is only attached further down with the plotting
## code, so the call is namespace-qualified to work at this point.
library(mapdata)
japan_map <- ggplot2::map_data("japan")
save(japan_map, file = "./japan_map.rdata")
## Relabel the columns "1", "2", ... so they can be addressed by position.
names(jinkou) <- as.character(seq_len(ncol(jinkou)))
jinkou[["2"]] <- as.numeric(jinkou[["2"]])
jinkou[["6"]] <- as.numeric(jinkou[["6"]])
## Keep cities with a population above 100000: rows whose column 9 exceeds
## 100000 and whose column 6 is 0, 2 or 3. Only columns 2, 8 and 9 are retained.
## NOTE(review): column 9 is compared without an explicit numeric conversion --
## confirm read_csv() parses it as a number.
jinkou <- jinkou[
  jinkou[["9"]] > 100000 & jinkou[["6"]] %in% c(0, 2, 3),
  c("2", "8", "9")
]
## fill blank of data
## Rows whose column ...3 contains "区" (ward) get a label in column ...2 built
## from the first space-separated token of the previous row's label plus the
## ward name. The loop must run in order: for consecutive ward rows the previous
## label was itself written by the preceding iteration.
rows <- grep("区", zairyu$...3)
for (i in seq_along(rows)) {  # seq_along() skips the loop when there are no ward rows
  parent_label <- strsplit(zairyu$...2[rows[i] - 1], " ")[[1]][1]
  zairyu$...2[rows[i]] <- paste(parent_label, zairyu$...3[rows[i]])
}
## Rows whose label contains "特別区" take the value of column ...3 instead.
special_ward_rows <- grep("特別区", zairyu$...2)
zairyu$...2[special_ward_rows] <- zairyu$...3[special_ward_rows]
## Columns 4 to 13 are converted to numeric.
zairyu[, 4:13] <- lapply(zairyu[, 4:13], FUN = as.numeric)
## Manual label corrections for two specific rows.
## NOTE(review): these row indices are hard-coded and presumably tied to the
## exact June 2018 workbook and the current census filter -- re-verify them if
## either input changes.
zairyu$...2[1285] <- "名古屋市 中村区"
zairyu$...2[1559] <- jinkou$'8'[319]
## merge data
## Start from the filtered census rows, attach coordinates by matching census
## column 2 to jiscode, then attach the foreign-resident counts by matching
## census column 8 to the label column of zairyu.
library(dplyr)
munic <- left_join(
  left_join(
    jinkou,
    address[, c("jiscode", "lat", "lon")],
    by = c("2" = "jiscode")
  ),
  zairyu[, c(2, 4:13)],
  by = c("8" = "...2")
)
## Give the 15 merged columns readable names.
names(munic) <- c(
  "jiscode", "name", "total", "latitude", "longitude", "total_f",
  "China", "Korea", "Vietnam", "Phillipines", "Brazil", "Nepal",
  "Chinese Taipei", "U.S.", "Others"
)
save(munic, file = "./munic.rdata")
## load prepared data
load("./munic.rdata")
load("./japan_map.rdata")
library(dplyr)
library(ggplot2)
## Base map shared by all maps below: outline polygons restricted to longitude
## 125-150 and latitude 25-47, drawn on a white panel without grid lines.
g <- ggplot(
  filter(japan_map, between(long, 125, 150), between(lat, 25, 47))
) +
  geom_polygon(
    aes(x = long, y = lat, group = group),
    fill = "white", color = "grey60"
  ) +
  theme(
    panel.background = element_rect(fill = "white"),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  )
## Map(1): Total foreigner population in each municipality
## One point per municipality; point size follows total_f and colour follows
## log10(total_f). The outer parentheses print the plot as well as storing it.
(p1 <- g +
  xlim(125, 150) + ylim(25, 47) +
  geom_point(
    data = munic,
    mapping = aes(x = longitude, y = latitude,
                  size = total_f, colour = log10(total_f)),
    alpha = .5
  ) +
  scale_colour_gradientn(colours = terrain.colors(4)) +
  ## "none" hides the size legend; F is a reassignable alias of FALSE and a
  ## logical value is a deprecated way to switch a guide off.
  guides(size = "none",
         color = guide_legend(title = "Sum of Foreign Population (log)"))
)
## Map(2): The rate of foreigner population in each municipality
## Point size is the share of foreign residents in percent (total_f / total *
## 100); colour is log10 of that percentage. The outer parentheses print the
## plot as well as storing it.
(p2 <- g +
  xlim(125, 150) + ylim(25, 47) +
  geom_point(
    data = munic,
    mapping = aes(x = longitude, y = latitude,
                  size = total_f / total * 100,
                  colour = log10(total_f / total * 100)),
    alpha = .5
  ) +
  scale_colour_gradientn(colours = heat.colors(10)) +
  ## "none" hides the size legend; F is a reassignable alias of FALSE and a
  ## logical value is a deprecated way to switch a guide off.
  guides(size = "none",
         color = guide_legend(title = "Rate of Foreign Population (log10 of %)")))
## Map(3): Nationality of the foreigners who live the most in each municipality
## Columns 7:14 hold the per-nationality counts (the "Others" column 15 is
## excluded). For every row pick the column with the largest count.
## which.max() returns integer(0) when a row is entirely NA (a municipality
## without a match in the foreign-resident table); that case is mapped to NA so
## the result always has exactly one entry per row of munic.
nationality_cols <- colnames(munic)[7:14]
nationality_counts <- as.matrix(munic[, 7:14])
top_nationality <- vapply(
  seq_len(nrow(nationality_counts)),
  function(i) {
    idx <- which.max(nationality_counts[i, ])
    if (length(idx) == 0L) NA_integer_ else unname(idx)
  },
  integer(1)
)
munic$max <- nationality_cols[top_nationality]
(p3 <- g +
  xlim(125, 150) + ylim(25, 47) +
  geom_point(
    data = munic,
    mapping = aes(x = longitude, y = latitude, size = total_f, colour = max),
    alpha = .5
  ) +
  ## "none" hides the size legend; F is a reassignable alias of FALSE and a
  ## logical value is a deprecated way to switch a guide off.
  guides(size = "none",
         color = guide_legend(title = "Most Frequent Nationality")))
load("./all_tweets.rdata")
## Create dfm (document-feature matrix)
library(quanteda)
dfm_local <- local({
  ## Tokenise the tweet text without punctuation and numbers, with
  ## remove_twitter = TRUE.
  toks <- tokens(
    corpus(all_tweets$text),
    remove_punct = TRUE, remove_numbers = TRUE, remove_twitter = TRUE
  )
  ## Drop a fixed list of Japanese terms and URL / retweet fragments, then keep
  ## only tokens of at least two characters.
  toks <- tokens_remove(
    toks,
    pattern = c("移民", "外国", "外人", "人",
                "t.co", "https", "rt", "amp", "http", "t.c", "can")
  )
  toks <- tokens_select(toks, min_nchar = 2)
  ## Keep features consisting solely of digits, katakana and kanji, and discard
  ## features that occur fewer than two times overall.
  feats <- dfm_select(dfm(toks), '^[0-9ァ-ヶー一-龠]+$', valuetype = 'regex')
  dfm_trim(feats, min_termfreq = 2)
})
## plot(1): frequent word
## Take the 50 most frequent features of the dfm.
features_dfm_local <- textstat_frequency(dfm_local, n = 50)
## Reorder the feature factor by descending frequency so the x axis runs from
## the most to the least frequent word.
features_dfm_local$feature <- reorder(
  features_dfm_local$feature,
  -features_dfm_local$frequency
)
## Dot plot of frequency per word; labels are rotated and drawn in a font with
## Japanese glyphs.
ggplot(data = features_dfm_local, mapping = aes(x = feature, y = frequency)) +
  geom_point() +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1),
    text = element_text(family = "HiraKakuProN-W3")
  )
## Download Japanese sentiment dictionary
## The file is colon-separated and Shift_JIS encoded; the columns are named
## term, kana, pos and value on read.
sowdic <- read.table(
  "http://www.lr.pi.titech.ac.jp/~takamura/pubs/pn_ja.dic",
  sep = ":",
  col.names = c("term", "kana", "pos", "value"),
  colClasses = c("character", "character", "factor", "numeric"),
  fileEncoding = "Shift_JIS"
)
## Set positive and negative words as below
## Positive: value above 0. Negative: value below -0.5.
## NOTE(review): the thresholds are asymmetric -- presumably deliberate; confirm.
positive <- sowdic$term[sowdic$value > 0]
negative <- sowdic$term[sowdic$value < -0.5]
## Create dictionary
sowdic2 <- dictionary(list(positive = positive, negative = negative))
## Calculate sentiment scores
## Score per tweet = (positive hits - negative hits) / number of tokens.
senti_dfm <- dfm_lookup(dfm_local, dictionary = sowdic2)
n_toks <- ntoken(dfm_local)
## The dictionary columns are selected by key rather than by position, and the
## trailing index (which flattens the one-column result to a plain vector) uses
## seq_len() so that an empty tweet table does not yield the indices c(1, 0).
all_tweets$senti_score <- (
  (senti_dfm[, "positive"] - senti_dfm[, "negative"]) / n_toks
)[seq_len(nrow(all_tweets))]
## Calculate average score in each municipality
all_tweets$jiscode <- as.numeric(all_tweets$jiscode)
## Drop tweets lacking a municipality code or a score, compute the mean score
## and the tweet count per jiscode, then merge in the municipality attributes.
senti_jiscode <- left_join(
  summarise(
    group_by(na.omit(all_tweets[, c("jiscode", "senti_score")]), jiscode),
    average = mean(senti_score),
    count = n()
  ),
  munic,
  by = "jiscode"
)
## Plot(1): Boxplot by Most Frequent Nationality
## Distribution of the per-municipality average sentiment score, grouped by the
## municipality's most frequent nationality.
p <- ggplot(senti_jiscode)
p + geom_boxplot(aes(x = max, y = average, color = max)) +
  labs(x = "Most Frequent Nationality", y = "Average sentiment score") +
  theme(legend.position = "none")
## Plot(2): Scatter plot by rate of foreign population
## x: average sentiment score; y: share of foreign residents on a log10 axis;
## point size and colour follow the number of tweets; a linear fit is overlaid.
p + geom_point(aes(y = total_f / total, x = average,
                   size = count, color = count)) +
  scale_y_log10() +
  geom_smooth(aes(y = total_f / total, x = average), method = "lm") +
  ## "none" hides the size legend; F is a reassignable alias of FALSE and a
  ## logical value is a deprecated way to switch a guide off.
  guides(size = "none",
         color = guide_legend(title = "Count of tweet in dataset")) +
  ylab("Rate of foreign population")
## Map(4): Scatter plot on the map
## One point per municipality with tweets: size follows log10 of the tweet
## count, colour follows the average sentiment score. The outer parentheses
## print the plot as well as storing it.
(p4 <- g +
  xlim(125, 150) + ylim(25, 47) +
  geom_point(
    data = senti_jiscode,
    mapping = aes(x = longitude, y = latitude,
                  size = log10(count), colour = average),
    alpha = .5
  ) +
  scale_colour_gradientn(colours = heat.colors(10)) +
  ## "none" hides the size legend; F is a reassignable alias of FALSE and a
  ## logical value is a deprecated way to switch a guide off.
  guides(size = "none", color = guide_legend(title = "Sentiment score")))