This is a short analysis project on anti-foreigner sentiment in Japan.

Step1: Plot the rate of foreign population on the map

1) data preparation

1. Municipality data

Source: Statistics Bureau, Ministry of Internal Affairs and Communications “2015 national census results” - https://www.e-stat.go.jp/stat-search/file-download?statInfId=000031594311&fileKind=0

# Download the 2015 census population table from e-Stat and read it in.
# mode = "wb" writes the response byte-for-byte, so the saved file is not
# altered by text-mode newline translation on Windows.
download.file(
  "https://www.e-stat.go.jp/stat-search/file-download?statInfId=000031594311&fileKind=0",
  destfile = "./jinkou.csv",
  mode = "wb"
)
library(readr)
# The first 10 lines of the file are skipped before parsing.
# NOTE(review): read_csv() assumes UTF-8 unless told otherwise; confirm the
# encoding of the e-Stat file and pass locale = locale(encoding = ...) if the
# municipality names come out garbled.
jinkou <- read_csv("./jinkou.csv", skip = 10)
2. Migrants data

Source: Ministry of Justice “Statistics of Residential Foreigners 2018 June” - https://www.e-stat.go.jp/stat-search/file-download?statInfId=000031770336&fileKind=0

# Download the June 2018 statistics of foreign residents and read the sheet.
# An .xlsx file is binary, so it must be written with mode = "wb"; the default
# text mode corrupts it on Windows and read_excel() then fails to open it.
download.file(
  "https://www.e-stat.go.jp/stat-search/file-download?statInfId=000031770336&fileKind=0",
  destfile = "./18-06-07.xlsx",
  mode = "wb"
)
library(readxl)
# The first 3 rows of the sheet are skipped before parsing.
zairyu <- read_excel("./18-06-07.xlsx", skip = 3)
3. Municipality location data

Source: Amano Spatial Technologies Institute. Location Data of Local Governments in Japan ver.1.0.15 - http://www.amano-tec.com/data/localgovernments.html

# Read the municipality location table (tab-separated text).
# The file is not fetched automatically: the only URL available here,
#   http://www.amano-tec.com/data/localgovernments.html
# is an .html page rather than the data file, so passing it to download.file()
# would only save a web page that nothing in this script reads. Download the
# data from that page and place it at the path below before running.
address_path <- "./h3010puboffice_utf8.csv"
if (!file.exists(address_path)) {
  stop(
    "Missing ", address_path, ": download it manually from ",
    "http://www.amano-tec.com/data/localgovernments.html",
    call. = FALSE
  )
}
address <- read_delim(address_path, "\t", escape_double = FALSE, trim_ws = TRUE)
4. Map of Japan
# Build the outline of Japan as a data frame of polygon vertices.
# mapdata provides the "japan" map database. map_data() itself is a ggplot2
# function, and ggplot2 is not attached until the plotting section, so it is
# called with an explicit namespace here.
library(mapdata)
japan_map <- ggplot2::map_data("japan")
# Cache the outline so the plotting section can load() it.
save(japan_map, file = "./japan_map.rdata")

2) data cleaning

## rename columns
# Replace the header with positional names "1", "2", ... so columns can be
# addressed by position-as-name below.
colnames(jinkou) <- seq_len(ncol(jinkou))
jinkou[["2"]] <- as.numeric(jinkou[["2"]])
jinkou[["6"]] <- as.numeric(jinkou[["6"]])
## choose cities where population is more than 100000
# Column 6 is matched against the codes 0, 2 and 3 (presumably a municipality
# type code -- confirm against the census file layout); column 9 is the value
# compared with 100000. Only columns 2, 8 and 9 are kept.
has_wanted_code <- jinkou[["6"]] %in% c(0, 2, 3)
is_over_100k <- jinkou[["9"]] > 100000
jinkou <- jinkou[has_wanted_code & is_over_100k, c("2", "8", "9")]

## fill blank of data
# Rows whose third column contains "区" get their second column rebuilt as
# "<first word of the row above's column 2> <column 3>". This has to remain a
# sequential loop: consecutive matching rows read the value that the previous
# iteration has just written.
rows <- grep("区", zairyu$...3)
# Iterating over the indices themselves is a no-op when nothing matches,
# whereas 1:length(rows) would iterate over c(1, 0).
for (r in rows) {
  parent_name <- strsplit(zairyu$...2[r - 1], " ")[[1]][1]
  zairyu$...2[r] <- paste(parent_name, zairyu$...3[r])
}
# Rows whose column 2 contains "特別区" take their name from column 3.
special_rows <- grep("特別区", zairyu$...2)
zairyu$...2[special_rows] <- zairyu$...3[special_rows]
# Columns 4 to 13 are converted to numeric.
zairyu[, 4:13] <- lapply(zairyu[, 4:13], FUN = as.numeric)
# Manual name corrections by row position; the second one copies the name used
# in `jinkou` so the later join on names can match.
# NOTE(review): these row indices are hard-coded and presumably tied to the
# exact releases of both files -- verify them if either source is refreshed.
zairyu$...2[1285] <- "名古屋市 中村区"
zairyu$...2[1559] <- jinkou$'8'[319]

## merge data
library(dplyr)
# Pieces to attach to the population table: coordinates keyed by jiscode, and
# the name column plus columns 4-13 of the foreign-resident table.
address_coords <- address[, c("jiscode", "lat", "lon")]
zairyu_counts <- zairyu[, c(2, 4:13)]
# Join coordinates on the code column ("2"), then counts on the name column
# ("8").
munic <- left_join(jinkou, address_coords, by = c("2" = "jiscode"))
munic <- left_join(munic, zairyu_counts, by = c("8" = "...2"))
# 15 columns: 3 from jinkou, 2 coordinates, 10 from zairyu.
colnames(munic) <- c(
  "jiscode", "name", "total", "latitude", "longitude", "total_f",
  "China", "Korea", "Vietnam", "Phillipines", "Brazil", "Nepal",
  "Chinese Taipei", "U.S.", "Others"
)
# Cache the merged table for the plotting section.
save(munic, file = "./munic.rdata")

3) plot data

## load prepared data
load("./munic.rdata")
load("./japan_map.rdata")

library(dplyr)
library(ggplot2)

# Base layer shared by all maps: the outline of Japan restricted to
# longitude 125-150 and latitude 25-47, drawn as white polygons with grey
# borders on a plain white panel without grid lines.
japan_in_view <- filter(
  japan_map,
  between(long, 125, 150),
  between(lat, 25, 47)
)
g <- ggplot(japan_in_view) +
  geom_polygon(aes(x = long, y = lat, group = group),
               fill = 'white', color = 'grey60') +
  theme(
    panel.background = element_rect(fill = "white"),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  )

## Map(1): Total foreigner population in each municipality
# One point per municipality: size encodes total_f, colour encodes
# log10(total_f). The outer parentheses print the plot after assigning it.
# The size legend is switched off with "none" rather than `F`: `F` is an
# ordinary variable that can be reassigned, and logical values in guides()
# are deprecated in current ggplot2.
(p1 <- g +
    xlim(low = 125, high = 150) + ylim(low = 25, high = 47) +
    geom_point(
      munic,
      mapping = aes(x = longitude, y = latitude,
                    size = total_f, colour = log10(total_f)),
      alpha = .5
    ) +
    scale_colour_gradientn(colours = terrain.colors(4)) +
    guides(size = "none",
           color = guide_legend(title = "Sum of Foreign Population (log)"))
)

## Map(2): The rate of foreigner population in each municipality
# One point per municipality: size encodes the foreign share in percent
# (total_f / total * 100), colour encodes log10 of that percentage.
# The size legend is switched off with "none" rather than `F`: `F` is an
# ordinary variable that can be reassigned, and logical values in guides()
# are deprecated in current ggplot2.
(p2 <- g +
    xlim(low = 125, high = 150) + ylim(low = 25, high = 47) +
    geom_point(
      munic,
      mapping = aes(x = longitude, y = latitude,
                    size = total_f / total * 100,
                    colour = log10(total_f / total * 100)),
      alpha = .5
    ) +
    scale_colour_gradientn(colours = heat.colors(10)) +
    guides(size = "none",
           color = guide_legend(title = "Rate of Foreign Population (log10 of %)")))

## Map(3): Nationality of the foreigners who live the most in each municipality
# For every municipality, pick the name of the column (among columns 7-14)
# holding the largest count.
# which.max() returns integer(0) for a row with no non-missing value (e.g. a
# municipality left unmatched by the join). Mapping that case to NA keeps the
# result exactly one entry per row; collapsing the per-row results with
# unlist() would silently drop those rows and misalign or break the assignment.
nationality_cols <- colnames(munic)[7:14]
nationality_counts <- as.matrix(munic[, 7:14])
top_nationality <- vapply(
  seq_len(nrow(nationality_counts)),
  function(i) {
    k <- which.max(nationality_counts[i, ])
    if (length(k) == 0L) NA_integer_ else unname(k)
  },
  integer(1)
)
munic$max <- nationality_cols[top_nationality]

# One point per municipality: size encodes total_f, colour encodes the `max`
# column of munic. The outer parentheses print the plot after assigning it.
# The size legend is switched off with "none" rather than `F`: `F` is an
# ordinary variable that can be reassigned, and logical values in guides()
# are deprecated in current ggplot2.
(p3 <- g +
    xlim(low = 125, high = 150) + ylim(low = 25, high = 47) +
    geom_point(
      munic,
      mapping = aes(x = longitude, y = latitude,
                    size = total_f, colour = max),
      alpha = .5
    ) +
    guides(size = "none",
           color = guide_legend(title = "Most Frequent Nationality")))

Findings in Step1:

Step2: Collect Tweets

  • Through Twitter API, package “rtweet”
  • Collected from Dec 18, 2018 to Feb 18, 2019
  • by each municipality (designated by latitude and longitude)

Step3: Text Analysis

# Load the collected tweets (object `all_tweets`).
load("./all_tweets.rdata")

## Create dfm (document-feature matrix)
library(quanteda)
# Terms removed before counting.
# NOTE(review): the first four look like the search keywords and the rest like
# URL / retweet residue -- confirm.
drop_terms <- c("移民", "外国", "外人", "人",
                "t.co", "https", "rt", "amp", "http", "t.c", "can")
# NOTE(review): `remove_twitter` appears to have been dropped from tokens() in
# newer quanteda releases -- confirm against the installed version.
tweet_tokens <- tokens(corpus(all_tweets$text),
                       remove_punct = TRUE,
                       remove_numbers = TRUE,
                       remove_twitter = TRUE)
tweet_tokens <- tokens_remove(tweet_tokens, pattern = drop_terms)
# Drop tokens shorter than two characters.
tweet_tokens <- tokens_select(tweet_tokens, min_nchar = 2)
dfm_local <- dfm(tweet_tokens)
# Keep only features made up entirely of digits, katakana (including the long
# vowel mark) and kanji; hiragana-only and Latin-script features are excluded
# by this pattern.
dfm_local <- dfm_select(dfm_local, '^[0-9ァ-ヶー一-龠]+$', valuetype = 'regex')
# Keep features that occur at least twice overall.
dfm_local <- dfm_trim(dfm_local, min_termfreq = 2)

## plot(1): frequent word
# Frequency table limited to n = 50 features.
features_dfm_local <- textstat_frequency(dfm_local, n = 50)
# Order the factor levels by descending frequency so the x axis is sorted.
features_dfm_local$feature <- reorder(
  features_dfm_local$feature,
  -features_dfm_local$frequency
)
# Dot plot of frequency per feature, with vertical x labels and a font family
# set for the text.
ggplot(features_dfm_local, aes(x = feature, y = frequency)) +
  geom_point() +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1),
    text = element_text(family = "HiraKakuProN-W3")
  )

Findings in Step3:

Step4: Sentiment Analysis

## Download Japanese sentiment dictionary
# Colon-separated file read straight from the URL, decoded from Shift_JIS,
# with four fields: term, kana, pos, value.
sowdic <- read.table(
  "http://www.lr.pi.titech.ac.jp/~takamura/pubs/pn_ja.dic",
  sep = ":",
  col.names = c("term", "kana", "pos", "value"),
  colClasses = c("character", "character", "factor", "numeric"),
  fileEncoding = "Shift_JIS"
)

## Set positive and negative words as below
# The thresholds are asymmetric: any value above 0 counts as positive, but
# only values below -0.5 count as negative.
positive <- with(sowdic, term[value > 0])
negative <- with(sowdic, term[value < -0.5])
## Create dictionary
sowdic2 <- dictionary(list(positive = positive, negative = negative))

## Calculate sentiment scores
# Count dictionary hits per tweet. Columns 1 and 2 of the lookup result are
# used as the positive and negative counts (presumably in the order of the
# keys of sowdic2 -- confirm).
senti_dfm <- dfm_lookup(dfm_local, dictionary = sowdic2)
# Number of tokens per tweet in the trimmed dfm.
n_toks <- ntoken(dfm_local)
# Score = (positive hits - negative hits) / tokens. A tweet with no tokens
# left gives 0 / 0, i.e. NaN.
net_hits_per_token <- (senti_dfm[, 1] - senti_dfm[, 2]) / n_toks
# seq_len() yields an empty index for an empty table, whereas
# 1:nrow(all_tweets) would yield c(1, 0).
all_tweets$senti_score <- net_hits_per_token[seq_len(nrow(all_tweets))]

## Calculate average score in each municipality
all_tweets$jiscode <- as.numeric(all_tweets$jiscode)
# Rows with a missing code or score are dropped first; each remaining code is
# then reduced to its mean score and its number of tweets.
tweet_scores <- na.omit(all_tweets[, c("jiscode", "senti_score")])
senti_jiscode <- summarise(
  group_by(tweet_scores, jiscode),
  average = mean(senti_score),
  count = n()
)

## Merge data
# Attach the municipality attributes from munic by code.
senti_jiscode <- left_join(senti_jiscode, munic, by = "jiscode")

## Plot(1): Boxplot by Most Frequent Nationality
# `p` is the bare plot object on senti_jiscode; the scatter plot that follows
# builds on it as well.
p <- ggplot(senti_jiscode)
# Distribution of the municipality averages for each value of `max`; the
# colour legend is hidden.
p +
  geom_boxplot(aes(x = max, y = average, color = max)) +
  theme(legend.position = "none") +
  xlab("Most Frequent Nationality") +
  ylab("Average sentiment score")

## Plot(2): Scatter plot by rate of foreign population
# x = average sentiment score, y = total_f / total on a log10 axis; point size
# and colour both encode the number of tweets. A linear fit is overlaid.
# The size legend is switched off with "none" rather than `F`: `F` is an
# ordinary variable that can be reassigned, and logical values in guides()
# are deprecated in current ggplot2.
p + geom_point(aes(y = total_f / total, x = average,
                   size = count, color = count)) +
  scale_y_log10() +
  geom_smooth(aes(y = total_f / total, x = average), method = "lm") +
  guides(size = "none",
         color = guide_legend(title = "Count of tweet in dataset")) +
  ylab("Rate of foreign population")

## Map(3): Scatter plot on the map
# One point per municipality: size encodes log10 of the tweet count, colour
# encodes the average sentiment score. The outer parentheses print the plot
# after assigning it.
# The size legend is switched off with "none" rather than `F`: `F` is an
# ordinary variable that can be reassigned, and logical values in guides()
# are deprecated in current ggplot2.
(p4 <- g +
    xlim(low = 125, high = 150) + ylim(low = 25, high = 47) +
    geom_point(
      senti_jiscode,
      mapping = aes(x = longitude, y = latitude, size = log10(count),
                    colour = average),
      alpha = .5
    ) +
    scale_colour_gradientn(colours = heat.colors(10)) +
    guides(size = "none", color = guide_legend(title = "Sentiment score")))

Findings in Step4:

Conclusion: