This is my original short-term analysis project on “#RevokeArticle50” hashtag activism in March 2019.

1. Background

  • Article 50 of the Treaty on European Union (TEU) sets out the procedure for a member state’s withdrawal from the EU.
  • On 29 March 2017, the United Kingdom (UK) invoked the article, and the UK was scheduled to leave the EU at the end of 29 March 2019.
  • In December 2018, the European Court of Justice ruled that the UK may legally revoke its Article 50 notification.
  • Following this ruling, a petition titled “Revoke Article 50 and remain in the EU” was started on 20 February 2019 on the UK Parliament petitions website. (https://petition.parliament.uk/petitions/241584)
  • The petition spread via social media (e.g. Twitter) and reached one million signatures on 21 March 2019.
  • However, the government responded on 26 March 2019 that “this Government will not revoke Article 50”.

  • This research sheds light on the dissemination of the “#RevokeArticle50” hashtag on Twitter.

2. Data Collection

(1) Search tweets by the hashtag and store them in the data folder in JSON format

library(tweetscores)

## designate the data folder and the folder which contains the API credentials
data.folder <- "./data"
oauth.folder <- "./credentials"

## search tweets by the hashtag, and store them into the data folder
searchTweets("#RevokeArticle50", paste0(data.folder, "/50revoke.json"), n=100000, oauth = oauth.folder)

(2) Data cleaning

library(streamR)
library(dplyr)

## parse the JSON file into a data frame, removing duplicates
tw <- parseTweets("./data/50revoke.json") %>% distinct()

## convert Twitter's datetime string (e.g. "Wed Mar 20 12:34:56 +0000 2019") to POSIXct
tw$created_at <- as.POSIXct(
  paste(substr(tw$created_at, 5, 19), substr(tw$created_at, 27, 30)),
  format = "%b %d %H:%M:%OS %Y")

(3) Store data into an SQLite database

library(DBI)
SQL.path <- "./50revoke.sqlite"
db <- dbConnect(RSQLite::SQLite(), SQL.path)
dbWriteTable(db, "50revoke", tw, append = TRUE)
dbDisconnect(db)

(4) Petition data from the UK Parliament petitions website

  • The petition data is also openly available in JSON format on the UK Parliament petitions website.
download.file("https://petition.parliament.uk/petitions/241584.json", "./241584.json")
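
To check what the downloaded file contains before analysing it, we can list its top-level attribute names (a minimal sketch; the fromJSON call is the same one used in the analysis section below):

library(rjson)
peti <- fromJSON(file = "./241584.json")
names(peti$data$attributes)  ## fields available, e.g. the threshold dates used later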

3. Data Analysis

## connect to database
library(DBI)
SQL.path <- "./50revoke.sqlite"
db <- dbConnect(RSQLite::SQLite(), SQL.path)

(1) Daily trend of tweets and petition status

## get data from database
tweets_daily <- dbGetQuery(db,"
 SELECT date(created_at, 'unixepoch') AS date, COUNT(*) AS count
 FROM '50revoke'
 GROUP BY date")
tweets_daily$date <- as.Date(tweets_daily$date)  ## convert to date type

## petition data
library(rjson)
peti <- fromJSON(file="https://petition.parliament.uk/petitions/241584.json")
## important dates
library(dplyr)
dates <- c(peti$data$attributes$response_threshold_reached_at,
           peti$data$attributes$debate_threshold_reached_at,
           peti$data$attributes$government_response_at,
           peti$data$attributes$debate_outcome_at) %>% as.Date()

library(ggplot2)
p <- ggplot(na.omit(tweets_daily))
p + geom_line(aes(date,count), color="red") +
  geom_vline(xintercept=dates, linetype="dotted") +
  annotate("text", x = dates[1]-0.5, y = 60000, label = "reach 10,000 (response threshold)",
           angle = 90) +
  annotate("text", x = dates[2]-0.5, y = 60000, label = "reach 100,000 (debate threshold)",
           angle = 90) +
  annotate("text", x = dates[3]-0.5, y = 60000, label = "Government response",
           angle = 90) +
  annotate("text", x = dates[4]-0.5, y = 60000, label = "Parliament debate",
           angle = 90)

(1) Result: The number of tweets spiked from March 21st to 23rd, then gradually decreased toward April.
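
As a quick check on this spike, the peak day can be read straight from the daily counts built above (a minimal sketch):

tweets_daily[which.max(tweets_daily$count), ]  ## the date with the largest tweet count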

(2) Frequently retweeted accounts

library(DBI)
tweets <- dbGetQuery(db,"SELECT * FROM '50revoke'")
dbDisconnect(db)

## extract retweeted screen names
rows <- grep("^RT @[0-9_A-Za-z]+", tweets$text) ## rows of retweets
library(stringr)
rt_screen_name <- str_extract(tweets$text[rows], '^RT @[0-9_A-Za-z]+') %>%
  gsub("^RT @", "", .)

## create frequency table
rt_freq <- table(rt_screen_name)
rt_freq <- rt_freq[order(rt_freq, decreasing = T)] %>% data.frame(stringsAsFactors = F)
rt_freq$rt_screen_name <- as.character(rt_freq$rt_screen_name)

## group the users ranked below the top 50 into "Others"
others <- data.frame(rt_screen_name = "Others",
                     Freq = sum(rt_freq[51:nrow(rt_freq), "Freq"]),
                     stringsAsFactors = FALSE)
rt_freq50 <- rbind(head(rt_freq, 50), others)
rt_freq50$category <- rep(NA, nrow(rt_freq50))  ## add category column

## save data
write.csv(rt_freq50, "./rt_freq50.csv", row.names = F)
  • Detect bot probability
rt_freq50 <- read.csv("./rt_freq50.csv", stringsAsFactors = FALSE)  ## load data

## use tweetbotornot package
library(tweetbotornot)

## create an OAuth token in the current environment
library(rtweet)
token <- create_token(
  app = "XXXXXXXXXX",
  consumer_key = "XXXXXXXXXXXXXXXXXXXX",
  consumer_secret = "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
  access_token = "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
  access_secret = "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"
)

## detect bot probability
pbot <- tweetbotornot(rt_freq50$rt_screen_name[1:50])

## treat accounts whose bot probability exceeds 80% as bots
bots <- pbot$screen_name[pbot$prob_bot > 0.8]
  • Manipulate the data manually (anonymise accounts, add categories); a sketch of this step is shown below
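
This step was done by hand; a minimal, hypothetical sketch of what it could look like in code (the account name "SomePolitician" and the exact anonymisation rule are assumptions for illustration):

rt_freq50 <- read.csv("./rt_freq50.csv", stringsAsFactors = FALSE)

## categorise: fill the category column after looking up each account
rt_freq50$category[rt_freq50$rt_screen_name == "SomePolitician"] <- "Politician"  ## hypothetical account name

## anonymise: replace individual (non-public) accounts with placeholders
idx <- which(rt_freq50$category == "Individual")
rt_freq50$rt_screen_name[idx] <- paste0("individual_", seq_along(idx))

write.csv(rt_freq50, "./rt_freq50.csv", row.names = FALSE)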
  • Plot
rt_freq50 <- read.csv("./rt_freq50.csv")  ## load manipulated data

## plot treemap
library(treemap)
library(RColorBrewer)
color <- brewer.pal(unique(rt_freq50$category) %>% length(),"Set3")
treemap(rt_freq50, index="rt_screen_name", vSize="Freq", vColor="category", 
        type="categorical", title="Treemap of Retweeted users",
        palette=color)

## proportion of categories for top 50 retweeted users
rt_freq50$category <- as.character(rt_freq50$category) ## convert to character
tab_ctg <- table(rt_freq50$category[1:50])
color <- brewer.pal(nrow(tab_ctg),"Set3")
pie(tab_ctg, clockwise=TRUE,
    col=color, main="Categories of Top 50 retweeted users")

## proportion of retweets for top 50 retweeted users by category
tab_rt_ctg <- rt_freq50[1:50,c("Freq","category")] %>% group_by(category) %>%
  summarise(count=sum(Freq))
color <- brewer.pal(nrow(tab_rt_ctg),"Set3")
pie(tab_rt_ctg$count, clockwise=TRUE, labels=tab_rt_ctg$category,
    col=color, main="Retweeted numbers of Top 50 retweeted users by categories")

(2) Result: The top 50 most frequently retweeted users account for more than 60% of all retweets. Most of the influential accounts are individuals, specialists, and politicians. Comparing the category shares by number of users with the shares by number of retweets, politicians and individuals have more impact in terms of the number of retweets.
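
The 60% figure can be verified directly from the frequency table built above (assuming rt_freq is still in memory):

## share of all retweets going to the top 50 retweeted accounts
sum(head(rt_freq, 50)$Freq) / sum(rt_freq$Freq)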

(3) Hashtag analysis

library(DBI)
db <- dbConnect(RSQLite::SQLite(), SQL.path)  ## reconnect (the connection was closed above)
tweets <- dbGetQuery(db,"SELECT * FROM '50revoke'")
dbDisconnect(db)

## create document frequency matrix
library(quanteda)
toks_tweets <- tweets$text %>%
  corpus() %>%
  tokens(remove_punct = TRUE) 
dfmat_tweets <- dfm(toks_tweets, select = "#*") %>%
  dfm_remove("#revokearticle50")

## wordcloud
textplot_wordcloud(dfmat_tweets, max_words = 100)

## Top15 hashtags
library(ggplot2)
dfmat_tweets %>% 
  textstat_frequency(n = 15) %>% 
  ggplot(aes(x = reorder(feature, frequency), y = frequency)) +
  geom_point() +
  coord_flip() +
  labs(x = NULL, y = "Frequency") +
  theme_minimal()

(3) Result: The hashtags used most often alongside “#RevokeArticle50” as it spread were “#brexit”, “#peoplesvote”, and “#peoplesvotemarch”.

(4) Key tweets

## convert the stored unix-epoch seconds back to POSIXct
tweets$created_at <- as.POSIXct(tweets$created_at, origin="1970-01-01 00:00")

## Top5 retweeted posts
rows <- grep("^RT @[0-9_A-Za-z]+", tweets$text) ## rows of retweets
tab_rt <- tweets$text[rows] %>% table(dnn="text")
tab_rt <- tab_rt[order(tab_rt, decreasing = T)][1:5] %>% data.frame(stringsAsFactors = F)

## subset top 5 retweets
tweets_top5 <- tweets[tweets$text %in% tab_rt$text,c("text","created_at")]
save(tweets_top5, file="./tweets_top5.rdata")
  • Anonymise individual accounts (a hypothetical sketch is shown below)
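
The anonymisation itself was done by hand; a minimal, hypothetical sketch of an automated variant (note that in the actual data only individual accounts were anonymised, while public figures were kept):

load("./tweets_top5.rdata")
## mask the retweeted screen name at the head of each tweet
tweets_top5$text <- gsub("^RT @[0-9_A-Za-z]+", "RT @anonymised", tweets_top5$text)
save(tweets_top5, file="./tweets_top5.rdata")
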
load("./tweets_top5.rdata")

## plot
library(ggplot2)
p <- ggplot(tweets_top5 %>%
              group_by(text) %>%
              arrange(created_at) %>%
              mutate(rn=row_number()))
p + geom_step(aes(x=created_at,y=rn,color=text)) +
  scale_color_discrete(labels=paste(substr(sort(unique(as.character(tweets_top5$text))),1,60),"...")) +
  labs(title="Top5 retweeted posts", x="Date", y="retweeted number") +
  theme(legend.position="bottom", legend.direction = "vertical")

(4) Result: The top 5 retweeted tweets were posted by two politicians, a writer, a musician, and an individual. All of them were posted after March 19, and three of them were posted during the spike of tweets (March 21-23).

(5) Location of users

## create frequency table
tab_loc <- table(tweets$location, dnn="location")

## subset the top 100 locations (the most frequent entry, a blank location, is omitted)
tab_loc100 <- tab_loc[order(tab_loc, decreasing = T)][2:101] %>% data.frame(stringsAsFactors = F)

## add location category
tab_loc100$category <- rep(NA, nrow(tab_loc100))
write.csv(tab_loc100, file="./tab_loc100.csv", row.names = FALSE)
  • Add categories (“London”, “England”, “Scotland”, “Wales”, “N Ireland”, and “Others” within the UK; users outside the UK are marked “Abroad”, and those who emphasise the EU are marked “EU”); a sketch of a first automated pass is shown below
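
The categories were assigned by hand; a minimal, hypothetical sketch of a first automated pass using keyword matching (the patterns are illustrative, not the rule set actually used; remaining rows would still be filled in manually):

tab_loc100 <- read.csv("./tab_loc100.csv", stringsAsFactors = FALSE)

## first-pass categorisation by keyword
tab_loc100$category[grepl("London", tab_loc100$location, ignore.case = TRUE)] <- "London"
tab_loc100$category[grepl("Scotland|Edinburgh|Glasgow", tab_loc100$location, ignore.case = TRUE)] <- "Scotland"
tab_loc100$category[grepl("Wales|Cardiff", tab_loc100$location, ignore.case = TRUE)] <- "Wales"

write.csv(tab_loc100, file="./tab_loc100.csv", row.names = FALSE)
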
tab_loc100 <- read.csv("./tab_loc100.csv")

library(treemap)
library(RColorBrewer)
color <- brewer.pal(unique(tab_loc100$category) %>% length(),"Set3")
treemap(tab_loc100, index="location", vSize="Freq", vColor="category", 
        type="categorical", title="Treemap of Retweeted users",
        palette=color)

## proportion of categories for top 100 location
tab_loc_ctg <- table(tab_loc100$category, dnn="category") %>%
  data.frame(stringsAsFactors = F)
pie(tab_loc_ctg$Freq, clockwise=TRUE, labels=tab_loc_ctg$category,
    col=color, main="Categories of Top 100 locations")

## compare with the rate of population
ctg_pop <- data.frame(
  region = c("London", "England", "Scotland", "Wales", "N Ireland"),
  population = c(8673713, 46112614, 5373000, 3099086, 1851621), ## 2015 population (the England figure excludes London)
  count = c(tab_loc_ctg$Freq[tab_loc_ctg$category=="London"],
            tab_loc_ctg$Freq[tab_loc_ctg$category=="England"],
            tab_loc_ctg$Freq[tab_loc_ctg$category=="Scotland"],
            tab_loc_ctg$Freq[tab_loc_ctg$category=="Wales"],
            tab_loc_ctg$Freq[tab_loc_ctg$category=="N Ireland"])
)

library(tidyr)
ctg_pop <- ctg_pop %>% 
  gather(key="x_axis", value="count", -region) ## reshape to long format for plotting

p <- ggplot(ctg_pop)
p + geom_bar(aes(x=x_axis, y=count, fill=region),
             position="fill", stat="identity") +
  labs(title="The proportion of regions", y="rate", x="")

(5) Result: The shares of Scotland and Wales among the top 100 locations are slightly higher than their shares of the population.
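
The over- or under-representation can be made explicit as the ratio of each region's share of users to its share of population (a minimal sketch, re-widening the ctg_pop data frame reshaped above):

library(tidyr)
wide <- ctg_pop %>% spread(key = x_axis, value = count)
wide$ratio <- (wide$count / sum(wide$count)) / (wide$population / sum(wide$population))
wide[, c("region", "ratio")]  ## ratio > 1 means over-represented relative to population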