However, on 26 March 2019 the government responded that "this Government will not revoke Article 50". This post examines how the "#RevokeArticle50" hashtag spread on Twitter.
library(tweetscores)
## designate the data store folder and the folder that contains the API credentials
data.folder <- "./data"
oauth.folder <- "./credentials"
## search tweets by the hashtag, and store them into the data folder
searchTweets("#RevokeArticle50", paste0(data.folder, "/50revoke.json"), n=100000, oauth = oauth.folder)
library(streamR)
library(dplyr)
## convert the JSON file into a data frame, removing duplicate tweets
tw <- parseTweets("./data/50revoke.json") %>% distinct()
## convert the created_at string to POSIXct
tw$created_at <- as.POSIXct(
  paste(substr(tw$created_at, 5, 19), substr(tw$created_at, 27, 30)),
  format = "%b %d %H:%M:%OS %Y")
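Twitter returns created_at as a fixed-width string such as "Tue Mar 26 09:15:32 +0000 2019", so the two substr() calls splice out the month-to-seconds part and the year while skipping the "+0000" offset. A minimal check on a made-up timestamp (the string is illustrative, and %b parsing assumes an English locale):
## check the conversion on one illustrative timestamp
s <- "Tue Mar 26 09:15:32 +0000 2019"
as.POSIXct(paste(substr(s, 5, 19), substr(s, 27, 30)),
           format = "%b %d %H:%M:%OS %Y")
## "2019-03-26 09:15:32" in the session's timezone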
library(DBI)
SQL.path <- "./50revoke.sqlite"
db <- dbConnect(RSQLite::SQLite(), SQL.path)
dbWriteTable(db, "50revoke", tw, append=T)
dbDisconnect(db)
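To confirm the write succeeded, the row count can be read straight back (a quick sanity check reusing the same connection pattern):
## sanity check: count the rows stored in the table
db <- dbConnect(RSQLite::SQLite(), SQL.path)
dbGetQuery(db, "SELECT COUNT(*) AS n FROM '50revoke'")
dbDisconnect(db)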
download.file("https://petition.parliament.uk/petitions/241584.json", "./241584.json")
## connect to database
library(DBI)
SQL.path <- "./50revoke.sqlite"
db <- dbConnect(RSQLite::SQLite(), SQL.path)
## get data from database
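## note: RSQLite stores POSIXct columns as seconds since the Unix epoch,
## which is why date(created_at, 'unixepoch') recovers the calendar date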
tweets_daily <- dbGetQuery(db,"
SELECT date(created_at, 'unixepoch') AS date, COUNT(*) AS count
FROM '50revoke'
GROUP BY date")
tweets_daily$date <- as.Date(tweets_daily$date) ## convert to date type
## petition data
library(rjson)
peti <- fromJSON(file="https://petition.parliament.uk/petitions/241584.json")
## important dates
library(dplyr)
dates <- c(peti$data$attributes$response_threshold_reached_at,
peti$data$attributes$debate_threshold_reached_at,
peti$data$attributes$government_response_at,
peti$data$attributes$debate_outcome_at) %>% as.Date()
library(ggplot2)
p <- ggplot(na.omit(tweets_daily))
p + geom_line(aes(date,count), color="red") +
geom_vline(xintercept=dates, linetype="dotted") +
annotate("text", x = dates[1]-0.5, y = 60000, label = "reach 10,000 (response threshold)",
angle = 90) +
annotate("text", x = dates[2]-0.5, y = 60000, label = "reach 100,000 (debate threshold)",
angle = 90) +
annotate("text", x = dates[3]-0.5, y = 60000, label = "Government response",
angle = 90) +
annotate("text", x = dates[4]-0.5, y = 60000, label = "Parliament debate",
angle = 90)
library(DBI)
tweets <- dbGetQuery(db,"SELECT * FROM '50revoke'")
dbDisconnect(db)
## extract retweeted screen names
rows <- grep("^RT @[0-9_A-Za-z]+", tweets$text) ## rows of retweets
library(stringr)
rt_screen_name <- str_extract(tweets$text[rows], '^RT @[0-9_A-Za-z]+') %>%
gsub("^RT @", "", .)
## create frequency table
rt_freq <- table(rt_screen_name)
rt_freq <- rt_freq[order(rt_freq, decreasing = T)] %>% data.frame(stringsAsFactors = F)
rt_freq$rt_screen_name <- as.character(rt_freq$rt_screen_name)
## group the users ranked below the top 50 into "Others"
others <- c("Others", sum(rt_freq[51:nrow(rt_freq),"Freq"]))
rt_freq50 <- rbind(head(rt_freq, 50), others)
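## note: rbind() with a character vector coerces Freq to character here;
## the CSV round-trip below reads it back in as numeric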
rt_freq50$category <- rep(NA, nrow(rt_freq50)) ## add category column
## save data
write.csv(rt_freq50, "./rt_freq50.csv", row.names = F)
rt_freq50 <- read.csv("./rt_freq50.csv") ## load data
## use tweetbotornot package
library(tweetbotornot)
## create an access token in the environment
library(rtweet)
token <- create_token(
app = "XXXXXXXXXX",
consumer_key = "XXXXXXXXXXXXXXXXXXXX",
consumer_secret = "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
access_token = "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
access_secret = "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"
)
## estimate the bot probability of the top 50 retweeted accounts
pbot <- tweetbotornot(as.character(rt_freq50$rt_screen_name[1:50]))
## treat accounts with an estimated bot probability above 0.8 as bots
bots <- pbot$screen_name[pbot$prob_bot > 0.8]
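tweetbotornot() returns a data frame with screen_name and prob_bot columns, so the flags can be folded back into the ranking before the CSV is re-saved for hand categorisation. A sketch (labelling flagged accounts as "bot" in the category column is an assumption about the workflow, not something stated above):
## mark probable bots in the category column (hypothetical labelling)
rt_freq50$category[rt_freq50$rt_screen_name %in% bots] <- "bot"
write.csv(rt_freq50, "./rt_freq50.csv", row.names = F)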
rt_freq50 <- read.csv("./rt_freq50.csv") ## reload the hand-categorised data
## plot treemap
library(treemap)
library(RColorBrewer)
color <- brewer.pal(unique(rt_freq50$category) %>% length(),"Set3")
treemap(rt_freq50, index="rt_screen_name", vSize="Freq", vColor="category",
type="categorical", title="Treemap of Retweeted users",
palette=color)
## proportion of categories for top 50 retweeted users
rt_freq50$category <- as.character(rt_freq50$category) ## convert to character
tab_ctg <- table(rt_freq50$category[1:50])
color <- brewer.pal(nrow(tab_ctg),"Set3")
pie(tab_ctg, clockwise=TRUE,
col=color, main="Categories of Top 50 retweeted users")
## proportion of retweets for top 50 retweeted users by category
tab_rt_ctg <- rt_freq50[1:50,c("Freq","category")] %>% group_by(category) %>%
summarise(count=sum(Freq))
color <- brewer.pal(nrow(tab_rt_ctg),"Set3")
pie(tab_rt_ctg$count, clockwise=TRUE, labels=tab_rt_ctg$category,
col=color, main="Retweet counts of Top 50 retweeted users by category")
library(DBI)
## reopen the connection (it was closed above)
db <- dbConnect(RSQLite::SQLite(), SQL.path)
tweets <- dbGetQuery(db,"SELECT * FROM '50revoke'")
dbDisconnect(db)
## create document frequency matrix
library(quanteda)
## Package version: 1.4.3
## Parallel computing: 2 of 12 threads used.
## See https://quanteda.io for tutorials and examples.
##
## Attaching package: 'quanteda'
## The following object is masked from 'package:utils':
##
## View
toks_tweets <- tweets$text %>%
corpus() %>%
tokens(remove_punct = TRUE)
dfmat_tweets <- dfm(toks_tweets, select = "#*") %>%
dfm_remove("#revokearticle50")
## wordcloud
textplot_wordcloud(dfmat_tweets, max_words = 100)
## Top15 hashtags
library(ggplot2)
dfmat_tweets %>%
textstat_frequency(n = 15) %>%
ggplot(aes(x = reorder(feature, frequency), y = frequency)) +
geom_point() +
coord_flip() +
labs(x = NULL, y = "Frequency") +
theme_minimal()
## convert the stored epoch seconds back to POSIXct
tweets$created_at <- as.POSIXct(tweets$created_at, origin="1970-01-01 00:00")
## Top5 retweeted posts
rows <- grep("^RT @[0-9_A-Za-z]+", tweets$text) ## rows of retweets
tab_rt <- tweets$text[rows] %>% table(dnn="text")
tab_rt <- tab_rt[order(tab_rt, decreasing = T)][1:5] %>% data.frame(stringsAsFactors = F)
## subset top 5 retweets
tweets_top5 <- tweets[tweets$text %in% tab_rt$text,c("text","created_at")]
save(tweets_top5, file="./tweets_top5.rdata")
load("./tweets_top5.rdata")
## plot
library(ggplot2)
p <- ggplot(tweets_top5 %>%
group_by(text) %>%
arrange(created_at) %>%
mutate(rn=row_number()))
p + geom_step(aes(x=created_at, y=rn, color=text)) +
  scale_color_discrete(labels = function(x) paste0(substr(x, 1, 60), "...")) +
  labs(title="Top 5 retweeted posts", x="Date", y="cumulative retweets") +
  theme(legend.position="bottom", legend.direction = "vertical")
## create frequency table
tab_loc <- table(tweets$location, dnn="location")
## subset the top 100 (the first entry is the blank location, so skip it)
tab_loc100 <- tab_loc[order(tab_loc, decreasing = T)][2:101] %>% data.frame(stringsAsFactors = F)
## add location category
tab_loc100$category <- rep(NA, nrow(tab_loc100))
write.csv(tab_loc100, file="./tab_loc100.csv", row.names = F)
tab_loc100 <- read.csv("./tab_loc100.csv")
library(treemap)
color <- brewer.pal(unique(tab_loc100$category) %>% length(),"Set3")
treemap(tab_loc100, index="location", vSize="Freq", vColor="category",
type="categorical", title="Treemap of locations",
palette=color)
## proportion of categories for top 100 location
tab_loc_ctg <- table(tab_loc100$category, dnn="category") %>%
data.frame(stringsAsFactors = F)
pie(tab_loc_ctg$Freq, clockwise=TRUE, labels=tab_loc_ctg$category,
col=color, main="Categories of Top 100 locations")
## compare with each region's share of population
ctg_pop <- data.frame(
region = c("London", "England", "Scotland", "Wales", "N Ireland"),
population = c(8673713, 46112614, 5373000, 3099086, 1851621), ## 2015 population
count = c(tab_loc_ctg$Freq[tab_loc_ctg$category=="London"],
tab_loc_ctg$Freq[tab_loc_ctg$category=="England"],
tab_loc_ctg$Freq[tab_loc_ctg$category=="Scotland"],
tab_loc_ctg$Freq[tab_loc_ctg$category=="Wales"],
tab_loc_ctg$Freq[tab_loc_ctg$category=="N Ireland"])
)
library(tidyr)
ctg_pop <- ctg_pop %>%
gather(key="x_axis", value="count", -region) ## reshape to long format for plotting
p <- ggplot(ctg_pop)
p + geom_bar(aes(x=x_axis, y=count, fill=region),
position="fill", stat="identity") +
labs(title="The proportion of regions", y="proportion", x="")