This is a data project that uses the Twitter Developer API (through the rtweet package) to collect tweets containing a given hashtag (here, #Biden). We then build a linguistic corpus and clean it by converting the text to lowercase, removing numbers, stopwords, and punctuation, and stripping extra whitespace. Finally, we plot a bar graph of the most frequently used words and draw a word cloud as a data visualization.
install.packages('rtweet', repos = "http://cran.us.r-project.org")
## Installing package into 'C:/Users/Steve/AppData/Local/R/win-library/4.2'
## (as 'lib' is unspecified)
## package 'rtweet' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\Steve\AppData\Local\Temp\RtmpU3dsQ1\downloaded_packages
install.packages('tidyverse', repos = "http://cran.us.r-project.org")
## Installing package into 'C:/Users/Steve/AppData/Local/R/win-library/4.2'
## (as 'lib' is unspecified)
## package 'tidyverse' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\Steve\AppData\Local\Temp\RtmpU3dsQ1\downloaded_packages
install.packages('ggplot2', repos = "http://cran.us.r-project.org")
## Installing package into 'C:/Users/Steve/AppData/Local/R/win-library/4.2'
## (as 'lib' is unspecified)
## package 'ggplot2' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\Steve\AppData\Local\Temp\RtmpU3dsQ1\downloaded_packages
install.packages('dplyr', repos = "http://cran.us.r-project.org")
## Installing package into 'C:/Users/Steve/AppData/Local/R/win-library/4.2'
## (as 'lib' is unspecified)
## package 'dplyr' successfully unpacked and MD5 sums checked
## Warning: cannot remove prior installation of package 'dplyr'
## Warning in file.copy(savedcopy, lib, recursive = TRUE): problem copying C:
## \Users\Steve\AppData\Local\R\win-library\4.2\00LOCK\dplyr\libs\x64\dplyr.dll
## to C:\Users\Steve\AppData\Local\R\win-library\4.2\dplyr\libs\x64\dplyr.dll:
## Permission denied
## Warning: restored 'dplyr'
##
## The downloaded binary packages are in
## C:\Users\Steve\AppData\Local\Temp\RtmpU3dsQ1\downloaded_packages
install.packages('tm', repos = "http://cran.us.r-project.org")
## Installing package into 'C:/Users/Steve/AppData/Local/R/win-library/4.2'
## (as 'lib' is unspecified)
## package 'tm' successfully unpacked and MD5 sums checked
## Warning: cannot remove prior installation of package 'tm'
## Warning in file.copy(savedcopy, lib, recursive = TRUE): problem copying C:
## \Users\Steve\AppData\Local\R\win-library\4.2\00LOCK\tm\libs\x64\tm.dll to C:
## \Users\Steve\AppData\Local\R\win-library\4.2\tm\libs\x64\tm.dll: Permission
## denied
## Warning: restored 'tm'
##
## The downloaded binary packages are in
## C:\Users\Steve\AppData\Local\Temp\RtmpU3dsQ1\downloaded_packages
install.packages('wordcloud', repos = "http://cran.us.r-project.org")
## Installing package into 'C:/Users/Steve/AppData/Local/R/win-library/4.2'
## (as 'lib' is unspecified)
## package 'wordcloud' successfully unpacked and MD5 sums checked
## Warning: cannot remove prior installation of package 'wordcloud'
## Warning in file.copy(savedcopy, lib, recursive = TRUE):
## problem copying C:\Users\Steve\AppData\Local\R\win-
## library\4.2\00LOCK\wordcloud\libs\x64\wordcloud.dll to C:
## \Users\Steve\AppData\Local\R\win-library\4.2\wordcloud\libs\x64\wordcloud.dll:
## Permission denied
## Warning: restored 'wordcloud'
##
## The downloaded binary packages are in
## C:\Users\Steve\AppData\Local\Temp\RtmpU3dsQ1\downloaded_packages
library(rtweet)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0 ✔ purrr 0.3.5
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.1 ✔ stringr 1.4.1
## ✔ readr 2.1.3 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ purrr::flatten() masks rtweet::flatten()
## ✖ dplyr::lag() masks stats::lag()
library(ggplot2)
library(dplyr)
library(tm)
## Loading required package: NLP
##
## Attaching package: 'NLP'
##
## The following object is masked from 'package:ggplot2':
##
## annotate
library(wordcloud)
## Loading required package: RColorBrewer
auth_setup_default()
## Using default authentication available.
## Reading auth from 'C:\Users\Steve\AppData\Roaming/R/config/R/rtweet/default.rds'
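auth_setup_default() relies on a previously saved default token and will prompt for one interactively if none exists. A bearer-token workflow is a possible alternative; the sketch below is an assumption based on the rtweet 1.x auth helpers, and "my_app" is a hypothetical name for the saved token.
# Not run here: a minimal bearer-token setup, assuming a Twitter developer app
# already exists ("my_app" is a hypothetical name for the saved token).
# auth <- rtweet_app()        # prompts for the app's bearer token
# auth_save(auth, "my_app")   # save the token for reuse across sessions
# auth_as("my_app")           # switch to it instead of the default token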
rstats_tweets <- search_tweets(q = "#Biden", n = 1000)
head(rstats_tweets)
## # A tibble: 6 × 43
## created_at id id_str full_…¹ trunc…² displ…³ entities
## <dttm> <dbl> <chr> <chr> <lgl> <dbl> <list>
## 1 2022-12-01 12:14:25 1.60e18 159839510912… "House… FALSE 151 <named list>
## 2 2022-12-01 13:33:00 1.60e18 159841488483… "Pdt q… FALSE 279 <named list>
## 3 2022-11-30 17:13:56 1.60e18 159810809895… "Tonig… FALSE 249 <named list>
## 4 2022-12-01 21:22:10 1.60e18 159853295322… "An el… FALSE 193 <named list>
## 5 2022-12-01 21:21:54 1.60e18 159853288949… "RT @j… FALSE 144 <named list>
## 6 2022-12-01 21:21:38 1.60e18 159853282010… "#POTU… FALSE 20 <named list>
## # … with 36 more variables: metadata <list>, source <chr>,
## # in_reply_to_status_id <dbl>, in_reply_to_status_id_str <chr>,
## # in_reply_to_user_id <dbl>, in_reply_to_user_id_str <chr>,
## # in_reply_to_screen_name <chr>, geo <list>, coordinates <list>,
## # place <list>, contributors <lgl>, is_quote_status <lgl>,
## # retweet_count <int>, favorite_count <int>, favorited <lgl>,
## # retweeted <lgl>, possibly_sensitive <lgl>, lang <chr>, …
head(rstats_tweets$full_text)
## [1] "House Democrat new leader Hakeem Jeffries called for a probe into the sexual assault claim against #Biden #BidenCrimeFamilly\n\nhttps://t.co/SnYA1nqowU"
## [2] "Pdt que Macron pleurniche devant #Biden, sont annoncées des coupures d'électricité pour 60% des Français. À noter, les Corses seront préservés. Pourquoi ? Car l'électricité corse est... italienne. Faut choisir ses amis ! À quand une visite d'Etat à #Meloni ? #CoupuresElectricite https://t.co/H64Z1JTWjk"
## [3] "Tonight on Real America!--- Forgiato Blow talks about his new song with Bryson Gray about the recent Balenciaga scandal. @DanNewsManBall @ForgiatoBlow47 @RealBrysonGray #RealAmerica #OAN #OneAmericaNews #TalkShow #Trump #Biden #Conservative #DanBall https://t.co/TPkYWzuhkd"
## [4] "An eloquent exposition of the current legal action against #Biden and the National Archives in light of #Trump’s failure to further transparency in the matter of the assassination in 2017. #JFK https://t.co/LPmp6Zmw2E"
## [5] "RT @jacquiejae: Fentanyl, Human Trafficking, #Gottaways, the Cartel & the staggering high death rate on the Southern Border is forever @Joe…"
## [6] "#POTUS #Biden #Trump https://t.co/3IfWyDbZuB"
# extract the tweet text (rtweet 1.x stores it in the full_text column)
textdata <- rstats_tweets$full_text
dirty_corpus <- Corpus(VectorSource(textdata))
Tweet_document1 <- tm_map(dirty_corpus, content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(dirty_corpus, content_transformer(tolower)):
## transformation drops documents
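Because the raw texts contain t.co links and @mentions (visible in the full_text preview above), it can also help to drop those while the ":", "/" and "@" characters are still present, i.e. before the punctuation-removal step. A sketch using content_transformer() around base gsub(); the remove_urls and remove_mentions names are illustrative.
# Optional: strip URLs and @mentions while their punctuation is still intact,
# so they do not survive as tokens like "httpstco..." later on.
remove_urls     <- content_transformer(function(x) gsub("https?://\\S+", "", x))
remove_mentions <- content_transformer(function(x) gsub("@\\w+", "", x))
Tweet_document1 <- tm_map(Tweet_document1, remove_urls)
Tweet_document1 <- tm_map(Tweet_document1, remove_mentions)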
Tweet_document2 <- tm_map(Tweet_document1, removeNumbers)
## Warning in tm_map.SimpleCorpus(Tweet_document1, removeNumbers): transformation
## drops documents
Tweet_document3 <- tm_map(Tweet_document2, removeWords, stopwords("english"))
## Warning in tm_map.SimpleCorpus(Tweet_document2, removeWords,
## stopwords("english")): transformation drops documents
Tweet_document4 <- tm_map(Tweet_document3, removePunctuation)
## Warning in tm_map.SimpleCorpus(Tweet_document3, removePunctuation):
## transformation drops documents
Tweet_document5 <- tm_map(Tweet_document4, stripWhitespace)
## Warning in tm_map.SimpleCorpus(Tweet_document4, stripWhitespace): transformation
## drops documents
dtm <- TermDocumentMatrix(Tweet_document5)
m <- as.matrix(dtm)
v <- sort(rowSums(m), decreasing = TRUE)
d <- data.frame(word = names(v), frequency = v)
head(d,10)
## word frequency
## biden biden 688
## president president 102
## macron macron 97
## joe joe 73
## trump trump 71
## amp amp 69
## les les 65
## new new 61
## vehicles vehicles 54
## … … 50
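The "…" entry (a Unicode ellipsis left over from truncated retweet text) survives because removePunctuation() only strips ASCII punctuation by default. If your version of tm supports the ucp argument, switching to Unicode character classes should catch it; a sketch:
# Re-run the punctuation step with Unicode-aware matching to drop "…" and
# similar characters, then rebuild the term-document matrix as above.
Tweet_document4 <- tm_map(Tweet_document3, removePunctuation, ucp = TRUE)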
barplot(d[1:10,]$frequency, las=2, names.arg = d[1:10,]$word,
col = "lightgreen", main="Top Ten Most Frequent Words",
ylab = "Word Frequencies")
set.seed(12342)
wordcloud(words = d$word, freq = d$frequency, min.freq = 5,
max.words=50, random.order = FALSE, rot.per = 0.40,
colors=brewer.pal(8,"Dark2"))
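To keep a copy of the word cloud outside of the knitted document, the plot can be written to a PNG with base grDevices; the filename below is illustrative.
# Save the word cloud to an image file.
png("biden_wordcloud.png", width = 800, height = 800)
set.seed(12342)
wordcloud(words = d$word, freq = d$frequency, min.freq = 5,
          max.words = 50, random.order = FALSE, rot.per = 0.40,
          colors = brewer.pal(8, "Dark2"))
dev.off()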