This is a data project that uses the Twitter Developer API (through the rtweet package) to collect tweets containing a given hashtag (here, #Biden). We then build a linguistic corpus and clean it by converting the text to lowercase, removing numbers, stopwords, and punctuation, and stripping extra whitespace. Finally, we plot a bar graph of the most frequently used words and draw a word cloud as a data visualization.
install.packages('rtweet', repos = "http://cran.us.r-project.org")
## Installing package into 'C:/Users/Steve/AppData/Local/R/win-library/4.2'
## (as 'lib' is unspecified)
## package 'rtweet' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\Steve\AppData\Local\Temp\RtmpU3dsQ1\downloaded_packages
install.packages('tidyverse', repos = "http://cran.us.r-project.org")
## Installing package into 'C:/Users/Steve/AppData/Local/R/win-library/4.2'
## (as 'lib' is unspecified)
## package 'tidyverse' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\Steve\AppData\Local\Temp\RtmpU3dsQ1\downloaded_packages
install.packages('ggplot2', repos = "http://cran.us.r-project.org")
## Installing package into 'C:/Users/Steve/AppData/Local/R/win-library/4.2'
## (as 'lib' is unspecified)
## package 'ggplot2' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\Steve\AppData\Local\Temp\RtmpU3dsQ1\downloaded_packages
install.packages('dplyr', repos = "http://cran.us.r-project.org")
## Installing package into 'C:/Users/Steve/AppData/Local/R/win-library/4.2'
## (as 'lib' is unspecified)
## package 'dplyr' successfully unpacked and MD5 sums checked
## Warning: cannot remove prior installation of package 'dplyr'
## Warning in file.copy(savedcopy, lib, recursive = TRUE): problem copying C:
## \Users\Steve\AppData\Local\R\win-library\4.2\00LOCK\dplyr\libs\x64\dplyr.dll
## to C:\Users\Steve\AppData\Local\R\win-library\4.2\dplyr\libs\x64\dplyr.dll:
## Permission denied
## Warning: restored 'dplyr'
##
## The downloaded binary packages are in
## C:\Users\Steve\AppData\Local\Temp\RtmpU3dsQ1\downloaded_packages
install.packages('tm', repos = "http://cran.us.r-project.org")
## Installing package into 'C:/Users/Steve/AppData/Local/R/win-library/4.2'
## (as 'lib' is unspecified)
## package 'tm' successfully unpacked and MD5 sums checked
## Warning: cannot remove prior installation of package 'tm'
## Warning in file.copy(savedcopy, lib, recursive = TRUE): problem copying C:
## \Users\Steve\AppData\Local\R\win-library\4.2\00LOCK\tm\libs\x64\tm.dll to C:
## \Users\Steve\AppData\Local\R\win-library\4.2\tm\libs\x64\tm.dll: Permission
## denied
## Warning: restored 'tm'
##
## The downloaded binary packages are in
## C:\Users\Steve\AppData\Local\Temp\RtmpU3dsQ1\downloaded_packages
install.packages('wordcloud', repos = "http://cran.us.r-project.org")
## Installing package into 'C:/Users/Steve/AppData/Local/R/win-library/4.2'
## (as 'lib' is unspecified)
## package 'wordcloud' successfully unpacked and MD5 sums checked
## Warning: cannot remove prior installation of package 'wordcloud'
## Warning in file.copy(savedcopy, lib, recursive = TRUE):
## problem copying C:\Users\Steve\AppData\Local\R\win-
## library\4.2\00LOCK\wordcloud\libs\x64\wordcloud.dll to C:
## \Users\Steve\AppData\Local\R\win-library\4.2\wordcloud\libs\x64\wordcloud.dll:
## Permission denied
## Warning: restored 'wordcloud'
##
## The downloaded binary packages are in
## C:\Users\Steve\AppData\Local\Temp\RtmpU3dsQ1\downloaded_packages
library(rtweet)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0 ✔ purrr 0.3.5
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.1 ✔ stringr 1.4.1
## ✔ readr 2.1.3 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ purrr::flatten() masks rtweet::flatten()
## ✖ dplyr::lag() masks stats::lag()
library(ggplot2)
library(dplyr)
library(tm)
## Loading required package: NLP
##
## Attaching package: 'NLP'
##
## The following object is masked from 'package:ggplot2':
##
## annotate
library(wordcloud)
## Loading required package: RColorBrewer
auth_setup_default()
## Using default authentication available.
## Reading auth from 'C:\Users\Steve\AppData\Roaming/R/config/R/rtweet/default.rds'
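auth_setup_default() relies on a previously saved default token and will prompt for one interactively if none exists. A bearer-token workflow is a possible alternative; the sketch below is an assumption based on the rtweet 1.x auth helpers, and "my_app" is a hypothetical name for the saved token.
# Not run here: a minimal bearer-token setup, assuming a Twitter developer app
# already exists ("my_app" is a hypothetical name for the saved token).
# auth <- rtweet_app()        # prompts for the app's bearer token
# auth_save(auth, "my_app")   # save the token for reuse across sessions
# auth_as("my_app")           # switch to it instead of the default token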
rstats_tweets <- search_tweets(q = "#Biden", n = 1000)
head(rstats_tweets)
## # A tibble: 6 × 43
## created_at id id_str full_…¹ trunc…² displ…³ entities
## <dttm> <dbl> <chr> <chr> <lgl> <dbl> <list>
## 1 2022-12-01 12:14:25 1.60e18 159839510912… "House… FALSE 151 <named list>
## 2 2022-12-01 13:33:00 1.60e18 159841488483… "Pdt q… FALSE 279 <named list>
## 3 2022-11-30 17:13:56 1.60e18 159810809895… "Tonig… FALSE 249 <named list>
## 4 2022-12-01 21:22:10 1.60e18 159853295322… "An el… FALSE 193 <named list>
## 5 2022-12-01 21:21:54 1.60e18 159853288949… "RT @j… FALSE 144 <named list>
## 6 2022-12-01 21:21:38 1.60e18 159853282010… "#POTU… FALSE 20 <named list>
## # … with 36 more variables: metadata <list>, source <chr>,
## # in_reply_to_status_id <dbl>, in_reply_to_status_id_str <chr>,
## # in_reply_to_user_id <dbl>, in_reply_to_user_id_str <chr>,
## # in_reply_to_screen_name <chr>, geo <list>, coordinates <list>,
## # place <list>, contributors <lgl>, is_quote_status <lgl>,
## # retweet_count <int>, favorite_count <int>, favorited <lgl>,
## # retweeted <lgl>, possibly_sensitive <lgl>, lang <chr>, …
head(rstats_tweets$full_text)
## [1] "House Democrat new leader Hakeem Jeffries called for a probe into the sexual assault claim against #Biden #BidenCrimeFamilly\n\nhttps://t.co/SnYA1nqowU"
## [2] "Pdt que Macron pleurniche devant #Biden, sont annoncées des coupures d'électricité pour 60% des Français. À noter, les Corses seront préservés. Pourquoi ? Car l'électricité corse est... italienne. Faut choisir ses amis ! À quand une visite d'Etat à #Meloni ? #CoupuresElectricite https://t.co/H64Z1JTWjk"
## [3] "Tonight on Real America!--- Forgiato Blow talks about his new song with Bryson Gray about the recent Balenciaga scandal. @DanNewsManBall @ForgiatoBlow47 @RealBrysonGray #RealAmerica #OAN #OneAmericaNews #TalkShow #Trump #Biden #Conservative #DanBall https://t.co/TPkYWzuhkd"
## [4] "An eloquent exposition of the current legal action against #Biden and the National Archives in light of #Trump’s failure to further transparency in the matter of the assassination in 2017. #JFK https://t.co/LPmp6Zmw2E"
## [5] "RT @jacquiejae: Fentanyl, Human Trafficking, #Gottaways, the Cartel & the staggering high death rate on the Southern Border is forever @Joe…"
## [6] "#POTUS #Biden #Trump https://t.co/3IfWyDbZuB"
# extract the tweet text (rtweet 1.x stores it in the full_text column)
textdata <- rstats_tweets$full_text
dirty_corpus <- Corpus(VectorSource(textdata))
Tweet_document1 <- tm_map(dirty_corpus, content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(dirty_corpus, content_transformer(tolower)):
## transformation drops documents
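Because the raw texts contain t.co links and @mentions (visible in the full_text preview above), it can also help to drop those while the ":", "/" and "@" characters are still present, i.e. before the punctuation-removal step. A sketch using content_transformer() around base gsub(); the remove_urls and remove_mentions names are illustrative.
# Optional: strip URLs and @mentions while their punctuation is still intact,
# so they do not survive as tokens like "httpstco..." later on.
remove_urls     <- content_transformer(function(x) gsub("https?://\\S+", "", x))
remove_mentions <- content_transformer(function(x) gsub("@\\w+", "", x))
Tweet_document1 <- tm_map(Tweet_document1, remove_urls)
Tweet_document1 <- tm_map(Tweet_document1, remove_mentions)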
Tweet_document2 <- tm_map(Tweet_document1, removeNumbers)
## Warning in tm_map.SimpleCorpus(Tweet_document1, removeNumbers): transformation
## drops documents
Tweet_document3 <- tm_map(Tweet_document2, removeWords, stopwords("english"))
## Warning in tm_map.SimpleCorpus(Tweet_document2, removeWords,
## stopwords("english")): transformation drops documents
Tweet_document4 <- tm_map(Tweet_document3, removePunctuation)
## Warning in tm_map.SimpleCorpus(Tweet_document3, removePunctuation):
## transformation drops documents
Tweet_document5 <- tm_map(Tweet_document4, stripWhitespace)
## Warning in tm_map.SimpleCorpus(Tweet_document4, stripWhitespace): transformation
## drops documents
dtm <- TermDocumentMatrix(Tweet_document5)
m <- as.matrix(dtm)
v <- sort(rowSums(m), decreasing = TRUE)
d <- data.frame(word = names(v), frequency = v)
head(d,10)
## word frequency
## biden biden 688
## president president 102
## macron macron 97
## joe joe 73
## trump trump 71
## amp amp 69
## les les 65
## new new 61
## vehicles vehicles 54
## … … 50
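The "…" entry (a Unicode ellipsis left over from truncated retweet text) survives because removePunctuation() only strips ASCII punctuation by default. If your version of tm supports the ucp argument, switching to Unicode character classes should catch it; a sketch:
# Re-run the punctuation step with Unicode-aware matching to drop "…" and
# similar characters, then rebuild the term-document matrix as above.
Tweet_document4 <- tm_map(Tweet_document3, removePunctuation, ucp = TRUE)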
barplot(d[1:10,]$frequency, las=2, names.arg = d[1:10,]$word,
col = "lightgreen", main="Top Ten Most Frequent Words",
ylab = "Word Frequencies")
set.seed(12342)
wordcloud(words = d$word, freq = d$frequency, min.freq = 5,
max.words=50, random.order = FALSE, rot.per = 0.40,
colors=brewer.pal(8,"Dark2"))
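To keep a copy of the word cloud outside of the knitted document, the plot can be written to a PNG with base grDevices; the filename below is illustrative.
# Save the word cloud to an image file.
png("biden_wordcloud.png", width = 800, height = 800)
set.seed(12342)
wordcloud(words = d$word, freq = d$frequency, min.freq = 5,
          max.words = 50, random.order = FALSE, rot.per = 0.40,
          colors = brewer.pal(8, "Dark2"))
dev.off()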