Social Media Analysis 1: Sentiment Analysis in Japanese local area

This project is a short analysis of anti-foreign sentiment in Japan.

The motivation for the analysis is to test the "Contact Hypothesis" in social psychology (Allport, 1954), which states that “intergroup contact under appropriate conditions can effectively reduce prejudice between majority and minority group members” (source: Wikipedia).
In Japan, only 2% of the total population are foreigners, including temporary residents such as technical interns and students. However, the rate is higher in some areas, such as around Tokyo and Osaka.
This research looked for anti-foreign sentiment on Twitter in each municipality to test the hypothesis. If the hypothesis is true, then the higher the rate of foreign population in a municipality, the weaker its citizens' anti-foreign sentiment should be.
I collected tweets through the Twitter API for each municipality with a population of more than 100,000 (approximately 390 municipalities).

Step1: Plot the rate of foreign population on the map

1) data preparation

1. Municipality data

Source: Statistics Bureau, Ministry of Internal Affairs and Communications “2015 national census results” - https://www.e-stat.go.jp/stat-search/file-download?statInfId=000031594311&fileKind=0

## Attach readr, fetch the 2015 census table from e-Stat, then parse it.
library(readr)
download.file(
  url = "https://www.e-stat.go.jp/stat-search/file-download?statInfId=000031594311&fileKind=0",
  destfile = "./jinkou.csv"
)
## The first 10 lines of the downloaded file are skipped before parsing.
jinkou <- read_csv(file = "./jinkou.csv", skip = 10)

2. Migrants data

Source: Ministry of Justice “Statistics of Residential Foreigners 2018 June” - https://www.e-stat.go.jp/stat-search/file-download?statInfId=000031770336&fileKind=0

## Fetch the Ministry of Justice workbook of foreign residents (June 2018).
## mode = "wb" forces a binary transfer: an .xlsx file is a binary archive, and
## download.file()'s default text mode can corrupt it on Windows.
download.file(
  "https://www.e-stat.go.jp/stat-search/file-download?statInfId=000031770336&fileKind=0",
  destfile = "./18-06-07.xlsx",
  mode = "wb"
)
library(readxl)
## The first 3 rows of the sheet are skipped before reading.
zairyu <- read_excel("./18-06-07.xlsx", skip = 3)

3. Municipality location data

Amano Spatial Technologies Institute. Location Data of Local Governments in Japan ver.1.0.15 - http://www.amano-tec.com/data/localgovernments.html

## Municipality office locations (jiscode, lat, lon), read from a tab-separated
## file.
## NOTE(review): the previous download.file() call fetched
## http://www.amano-tec.com/data/localgovernments.html (an .html page, not the
## data file) into "./h3010puboffice_utf8.xlsx", a file that nothing reads, so
## it has been dropped. The direct URL of the data file is not known here; the
## file has to be obtained from that page manually. Fail early with a clear
## message instead of an opaque read error when it is missing.
if (!file.exists("./h3010puboffice_utf8.csv")) {
  stop(
    "Missing ./h3010puboffice_utf8.csv: download the municipality location ",
    "data from http://www.amano-tec.com/data/localgovernments.html and save ",
    "the tab-separated file under that name.",
    call. = FALSE
  )
}
address <- read_delim(
  "./h3010puboffice_utf8.csv",
  delim = "\t",
  escape_double = FALSE,
  trim_ws = TRUE
)

4. Map of Japan

## map_data() is a ggplot2 function, but ggplot2 was only attached further down
## in the plotting section, so this call failed when the script was run from
## the top. Attach it here before use.
library(ggplot2)
library(mapdata)
## Convert the "japan" map into a data frame of polygon vertices and cache it.
japan_map <- map_data("japan")
save(japan_map, file = "./japan_map.rdata")

2) data cleaning

## Replace the header with positional names ("1", "2", ...), so that columns
## are addressed by position from here on.
colnames(jinkou) <- seq_len(ncol(jinkou))
## Columns 2 and 6 are coerced to numeric before they are used below.
jinkou[c("2", "6")] <- lapply(jinkou[c("2", "6")], as.numeric)
## Keep rows whose column 6 is 0, 2 or 3 and whose column 9 (population)
## exceeds 100000; retain only columns 2, 8 and 9.
jinkou <- jinkou[
  with(jinkou, `6` %in% c(0, 2, 3) & `9` > 100000),
  c("2", "8", "9")
]

## fill blank of data
## Rows whose third column contains "区" (ward) get a second-column label of
## the form "<city> <ward>". The city is the first space-separated token of the
## previous row's label, so consecutive ward rows carry the city name forward;
## the rows must therefore be processed in order, not vectorised.
rows <- grep("区", zairyu$...3)
## Loop over the matched indices directly: `1:length(rows)` yields c(1, 0) when
## nothing matches and then fails on an out-of-bounds subscript.
for (r in rows) {
  zairyu$...2[r] <-
    paste(strsplit(zairyu$...2[r - 1], " ")[[1]][1], zairyu$...3[r])
}
## Rows labelled "特別区" (special wards) take their label from the third
## column instead; the pattern is now matched once rather than twice.
zairyu$...2 <- ifelse(grepl("特別区", zairyu$...2), zairyu$...3, zairyu$...2)
## Columns 4 to 13 are coerced to numeric.
zairyu[, 4:13] <- lapply(zairyu[, 4:13], as.numeric)
## Manual corrections of two labels so they match the census names.
## NOTE(review): the row numbers 1285, 1559 and 319 are hard-coded and
## presumably tied to these exact releases of the source files; re-check them
## if either input changes.
zairyu$...2[1285] <- "名古屋市 中村区"
zairyu$...2[1559] <- jinkou$`8`[319]

## merge data
library(dplyr)
## Start from the census rows, attach office coordinates by municipality code,
## then attach the foreign-resident counts by municipality name. Both joins are
## left joins, so every census row is kept.
munic <- left_join(
  jinkou,
  address[, c("jiscode", "lat", "lon")],
  by = c("2" = "jiscode")
)
munic <- left_join(
  munic,
  zairyu[, c(2, 4:13)],
  by = c("8" = "...2")
)
## Give the 15 resulting columns readable names, then cache the table.
colnames(munic) <- c(
  "jiscode", "name", "total", "latitude", "longitude", "total_f",
  "China", "Korea", "Vietnam", "Phillipines", "Brazil", "Nepal",
  "Chinese Taipei", "U.S.", "Others"
)
save(munic, file = "./munic.rdata")

3) plot data

## load prepared data
load("./munic.rdata")
load("./japan_map.rdata")

library(dplyr)
library(ggplot2)
## Base map: outline polygons restricted to longitude 125-150 and latitude
## 25-47, drawn in grey on a white background without grid lines.
g <- ggplot(
  filter(
    japan_map,
    between(long, 125, 150),
    between(lat, 25, 47)
  )
) +
  geom_polygon(
    aes(x = long, y = lat, group = group),
    fill = 'white', color = 'grey60'
  ) +
  theme(
    panel.background = element_rect(fill = "white"),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  )

## Map(1): Total foreigner population in each municipality
## Point size follows total_f and colour follows log10(total_f).
## The size legend is hidden with "none": the original passed `F`, an ordinary
## reassignable variable, and logical values for guides() are deprecated in
## current ggplot2. The outer parentheses print the plot after assigning it.
(p1 <- g +
    xlim(125, 150) + ylim(25, 47) +
    geom_point(
      data = munic,
      mapping = aes(x = longitude, y = latitude,
                    size = total_f, colour = log10(total_f)),
      alpha = .5
    ) +
    scale_colour_gradientn(colours = terrain.colors(4)) +
    guides(size = "none",
           colour = guide_legend(title = "Sum of Foreign Population (log)"))
)

## Map(2): The rate of foreigner population in each municipality
## Point size follows the percentage total_f / total * 100 and colour follows
## its log10.
## The size legend is hidden with "none": the original passed `F`, an ordinary
## reassignable variable, and logical values for guides() are deprecated in
## current ggplot2. The outer parentheses print the plot after assigning it.
(p2 <- g +
    xlim(125, 150) + ylim(25, 47) +
    geom_point(
      data = munic,
      mapping = aes(x = longitude, y = latitude,
                    size = total_f / total * 100,
                    colour = log10(total_f / total * 100)),
      alpha = .5
    ) +
    scale_colour_gradientn(colours = heat.colors(10)) +
    guides(size = "none",
           colour = guide_legend(title = "Rate of Foreign Population (log10 of %)")))

## Map(3): Nationality of the foreigners who live the most in each municipality
## Columns 7:14 hold the per-nationality counts ("China" ... "U.S."; "Others"
## in column 15 is excluded). For each row, record the name of the column with
## the largest count.
## which.max() returns integer(0) for a row that is entirely NA (possible after
## the left joins). The original apply() then returned a list and unlist()
## silently dropped those rows, leaving a vector shorter than nrow(munic) and
## misaligned with it. Such rows now get NA instead.
munic$max <- local({
  counts <- munic[, 7:14]
  top <- apply(counts, MARGIN = 1, FUN = function(row) {
    idx <- which.max(row)
    if (length(idx) == 0) NA_integer_ else idx
  })
  colnames(counts)[as.integer(top)]
})

## Point size follows total_f; colour is the most frequent nationality stored
## in munic$max.
## The size legend is hidden with "none": the original passed `F`, an ordinary
## reassignable variable, and logical values for guides() are deprecated in
## current ggplot2. The outer parentheses print the plot after assigning it.
(p3 <- g +
    xlim(125, 150) + ylim(25, 47) +
    geom_point(
      data = munic,
      mapping = aes(x = longitude, y = latitude,
                    size = total_f, colour = max),
      alpha = .5
    ) +
    guides(size = "none",
           colour = guide_legend(title = "Most Frequent Nationality")))

Findings in Step1:

Foreign residents are concentrated in specific areas.
In those areas of concentration, the rate of foreigners is also higher than elsewhere.
Nationality: Korean - Kansai region, Brazilian - Tokai region, Vietnamese - Western Japan, Chinese - Kanto area and others

Step2: Collect Tweets

Collected through the Twitter API using the “rtweet” package
Collected from Dec 18, 2018 to Feb 18, 2019
for each municipality (specified by latitude and longitude)

Step3: Text Analysis

load("./all_tweets.rdata")

## Create dfm (document-feature matrix)
library(quanteda)
dfm_local <- local({
  ## Tokenise the tweet text, dropping punctuation, numbers and Twitter markup.
  toks <- tokens(
    corpus(all_tweets$text),
    remove_punct = TRUE, remove_numbers = TRUE, remove_twitter = TRUE
  )
  ## Drop the listed terms and URL/retweet debris.
  toks <- tokens_remove(
    toks,
    pattern = c("移民","外国","外人","人",
                "t.co", "https", "rt", "amp", "http", "t.c", "can")
  )
  ## Keep tokens of at least two characters.
  toks <- tokens_select(toks, min_nchar = 2)
  ## Build the matrix, keep only features matching the Japanese-character
  ## regex, and drop features occurring fewer than twice.
  feats <- dfm(toks)
  feats <- dfm_select(feats, '^[０-９ァ-ヶー一-龠]+$', valuetype = 'regex')
  dfm_trim(feats, min_termfreq = 2)
})

## plot(1): frequent word
## Frequency table limited to n = 50 features of dfm_local.
features_dfm_local <- textstat_frequency(dfm_local, n = 50)
## Reorder the factor levels by descending frequency so the x axis is sorted.
features_dfm_local$feature <- reorder(
  features_dfm_local$feature,
  -features_dfm_local$frequency
)
## Dot plot with vertical axis labels, using a font family chosen for
## Japanese text.
ggplot(features_dfm_local, aes(x = feature, y = frequency)) +
  geom_point() +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1),
    text = element_text(family = "HiraKakuProN-W3")
  )

Findings in Step3:

Frequent terms are about sightseeing (“観光”) and working (“労働”),
but some words (welfare “生活保護”, anti-Japan “反日”, etc.) are related to racism.
Korea (“韓国”) is the most frequently tweeted word, probably because there have been political conflicts.

Step4: Sentiment Analysis

## Download Japanese sentiment dictionary
## The file is colon-separated and Shift_JIS encoded, with four fields per
## line: term, kana reading, part of speech and a numeric polarity value.
sowdic <- read.table(
  "http://www.lr.pi.titech.ac.jp/~takamura/pubs/pn_ja.dic",
  sep = ":",
  col.names = c("term", "kana", "pos", "value"),
  colClasses = c("character", "character", "factor", "numeric"),
  fileEncoding = "Shift_JIS"
)

## Set positive and negative words as below
## Positive: value above 0. Negative: value below -0.5 (the thresholds are
## deliberately asymmetric in the original).
positive <- with(sowdic, term[value > 0])
negative <- with(sowdic, term[value < -0.5])
## Create dictionary
sowdic2 <- dictionary(list(positive = positive, negative = negative))

## Calculate sentiment scores
## Count, per document, how many features of dfm_local fall under each key of
## sowdic2.
senti_dfm <- dfm_lookup(dfm_local, dictionary = sowdic2)
## Number of tokens per document remaining in dfm_local after the earlier
## filtering; used as the denominator below.
n_toks <- ntoken(dfm_local)
## Score per tweet = (column 1 - column 2 of the lookup result) / token count.
## NOTE(review): columns 1 and 2 are presumably "positive" and "negative",
## following the key order given when sowdic2 was built -- confirm.
## NOTE(review): a document with zero remaining tokens divides by zero; the
## resulting score is presumably removed by the later na.omit() -- confirm.
## The trailing [1:nrow(all_tweets)] indexes the result so that one value is
## assigned per row of all_tweets.
all_tweets$senti_score <- ((senti_dfm[,1] - senti_dfm[,2])/n_toks)[1:nrow(all_tweets)]

## Calculate average score in each municipality, then merge data
## The municipality code is coerced to numeric so it can be joined to
## munic$jiscode.
all_tweets[["jiscode"]] <- as.numeric(all_tweets[["jiscode"]])
## Drop tweets with a missing code or score, average the score and count the
## tweets per municipality, then attach the municipality attributes (left join,
## so every municipality that has tweets is kept).
senti_jiscode <- all_tweets[, c("jiscode", "senti_score")] %>%
  na.omit() %>%
  group_by(jiscode) %>%
  summarise(average = mean(senti_score), count = n()) %>%
  left_join(munic, by = "jiscode")

## Plot(1): Boxplot by Most Frequent Nationality
## `p` is the shared base plot over senti_jiscode; it is reused by the scatter
## plot that follows.
p <- ggplot(data = senti_jiscode)
## One box per most-frequent nationality; the legend is hidden because the
## x axis already carries the same labels.
p +
  geom_boxplot(mapping = aes(x = max, y = average, colour = max)) +
  xlab("Most Frequent Nationality") +
  ylab("Average sentiment score") +
  theme(legend.position = "none")

## Plot(2): Scatter plot by rate of foreign population
## x is the average sentiment score, y is total_f / total on a log10 axis, and
## point size and colour follow the tweet count. geom_smooth() adds a linear
## fit of y on x.
## The size legend is hidden with "none": the original passed `F`, an ordinary
## reassignable variable, and logical values for guides() are deprecated in
## current ggplot2.
p + geom_point(aes(y = total_f / total, x = average,
                   size = count, colour = count)) +
  scale_y_log10() +
  geom_smooth(aes(y = total_f / total, x = average), method = "lm") +
  guides(size = "none",
         colour = guide_legend(title = "Count of tweet in dataset")) +
  ylab("Rate of foreign population")

## Map(3): Scatter plot on the map
## Point size follows log10(count) of tweets; colour follows the average
## sentiment score of the municipality.
## The size legend is hidden with "none": the original passed `F`, an ordinary
## reassignable variable, and logical values for guides() are deprecated in
## current ggplot2. The outer parentheses print the plot after assigning it.
(p4 <- g +
    xlim(125, 150) + ylim(25, 47) +
    geom_point(
      data = senti_jiscode,
      mapping = aes(x = longitude, y = latitude, size = log10(count),
                    colour = average),
      alpha = .5
    ) +
    scale_colour_gradientn(colours = heat.colors(10)) +
    guides(size = "none",
           colour = guide_legend(title = "Sentiment score")))

Findings in Step4:

there is no difference in the sentiment score by the nationality
there is no relation between the average sentiment score and the rate of foreign population

Conclusion:

We found no relationship between the sentiment score and either the rate or the total number of foreign residents in each municipality.
In other words, we cannot see the effect of “contact hypothesis” in this analysis.