This is a data project that aims to analyze Video Game Sales data scraped from vgchartz.com with sales greater than 100,000 copies with Rstudio. Fields include:
Rank - Ranking of overall sales
Name - The games name
Platform - Platform of the games release (i.e. PC,PS4, etc.)
Year - Year of the game’s release
Genre - Genre of the game
Publisher - Publisher of the game
NA_Sales - Sales in North America (in millions)
EU_Sales - Sales in Europe (in millions)
JP_Sales - Sales in Japan (in millions)
Other_Sales - Sales in the rest of the world (in millions)
Global_Sales - Total worldwide sales.
The original dataset can be found at https://www.kaggle.com/datasets/gregorut/videogamesales.
install.packages("tidyverse")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.2'
## (as 'lib' is unspecified)
install.packages("treemapify")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.2'
## (as 'lib' is unspecified)
install.packages("ggplot2")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.2'
## (as 'lib' is unspecified)
library(tidyverse)
## ── Attaching packages
## ───────────────────────────────────────
## tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.5
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.1 ✔ stringr 1.4.1
## ✔ readr 2.1.3 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(treemapify)
library(ggplot2)
vg_sales <-read_csv('vgsales.csv')
## Rows: 16598 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): Name, Platform, Year, Genre, Publisher
## dbl (6): Rank, NA_Sales, EU_Sales, JP_Sales, Other_Sales, Global_Sales
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
glimpse(vg_sales)
## Rows: 16,598
## Columns: 11
## $ Rank <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17…
## $ Name <chr> "Wii Sports", "Super Mario Bros.", "Mario Kart Wii", "Wii…
## $ Platform <chr> "Wii", "NES", "Wii", "Wii", "GB", "GB", "DS", "Wii", "Wii…
## $ Year <chr> "2006", "1985", "2008", "2009", "1996", "1989", "2006", "…
## $ Genre <chr> "Sports", "Platform", "Racing", "Sports", "Role-Playing",…
## $ Publisher <chr> "Nintendo", "Nintendo", "Nintendo", "Nintendo", "Nintendo…
## $ NA_Sales <dbl> 41.49, 29.08, 15.85, 15.75, 11.27, 23.20, 11.38, 14.03, 1…
## $ EU_Sales <dbl> 29.02, 3.58, 12.88, 11.01, 8.89, 2.26, 9.23, 9.20, 7.06, …
## $ JP_Sales <dbl> 3.77, 6.81, 3.79, 3.28, 10.22, 4.22, 6.50, 2.93, 4.70, 0.…
## $ Other_Sales <dbl> 8.46, 0.77, 3.31, 2.96, 1.00, 0.58, 2.90, 2.85, 2.26, 0.4…
## $ Global_Sales <dbl> 82.74, 40.24, 35.82, 33.00, 31.37, 30.26, 30.01, 29.02, 2…
colnames(vg_sales)
## [1] "Rank" "Name" "Platform" "Year" "Genre"
## [6] "Publisher" "NA_Sales" "EU_Sales" "JP_Sales" "Other_Sales"
## [11] "Global_Sales"
str(vg_sales)
## spc_tbl_ [16,598 × 11] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ Rank : num [1:16598] 1 2 3 4 5 6 7 8 9 10 ...
## $ Name : chr [1:16598] "Wii Sports" "Super Mario Bros." "Mario Kart Wii" "Wii Sports Resort" ...
## $ Platform : chr [1:16598] "Wii" "NES" "Wii" "Wii" ...
## $ Year : chr [1:16598] "2006" "1985" "2008" "2009" ...
## $ Genre : chr [1:16598] "Sports" "Platform" "Racing" "Sports" ...
## $ Publisher : chr [1:16598] "Nintendo" "Nintendo" "Nintendo" "Nintendo" ...
## $ NA_Sales : num [1:16598] 41.5 29.1 15.8 15.8 11.3 ...
## $ EU_Sales : num [1:16598] 29.02 3.58 12.88 11.01 8.89 ...
## $ JP_Sales : num [1:16598] 3.77 6.81 3.79 3.28 10.22 ...
## $ Other_Sales : num [1:16598] 8.46 0.77 3.31 2.96 1 0.58 2.9 2.85 2.26 0.47 ...
## $ Global_Sales: num [1:16598] 82.7 40.2 35.8 33 31.4 ...
## - attr(*, "spec")=
## .. cols(
## .. Rank = col_double(),
## .. Name = col_character(),
## .. Platform = col_character(),
## .. Year = col_character(),
## .. Genre = col_character(),
## .. Publisher = col_character(),
## .. NA_Sales = col_double(),
## .. EU_Sales = col_double(),
## .. JP_Sales = col_double(),
## .. Other_Sales = col_double(),
## .. Global_Sales = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
summary(vg_sales)
## Rank Name Platform Year
## Min. : 1 Length:16598 Length:16598 Length:16598
## 1st Qu.: 4151 Class :character Class :character Class :character
## Median : 8300 Mode :character Mode :character Mode :character
## Mean : 8301
## 3rd Qu.:12450
## Max. :16600
## Genre Publisher NA_Sales EU_Sales
## Length:16598 Length:16598 Min. : 0.0000 Min. : 0.0000
## Class :character Class :character 1st Qu.: 0.0000 1st Qu.: 0.0000
## Mode :character Mode :character Median : 0.0800 Median : 0.0200
## Mean : 0.2647 Mean : 0.1467
## 3rd Qu.: 0.2400 3rd Qu.: 0.1100
## Max. :41.4900 Max. :29.0200
## JP_Sales Other_Sales Global_Sales
## Min. : 0.00000 Min. : 0.00000 Min. : 0.0100
## 1st Qu.: 0.00000 1st Qu.: 0.00000 1st Qu.: 0.0600
## Median : 0.00000 Median : 0.01000 Median : 0.1700
## Mean : 0.07778 Mean : 0.04806 Mean : 0.5374
## 3rd Qu.: 0.04000 3rd Qu.: 0.04000 3rd Qu.: 0.4700
## Max. :10.22000 Max. :10.57000 Max. :82.7400
any(is.na(vg_sales))
## [1] FALSE
Test_2 <- vg_sales %>% select(everything()) %>% duplicated()
sum(Test_2, na.rm = TRUE)
## [1] 0
Test_3 <- vg_sales %>% select(Name) %>% duplicated()
sum(Test_3, na.rm = TRUE)
## [1] 5105
Test_4 <- vg_sales %>% select(Rank) %>% duplicated()
sum(Test_4, na.rm = TRUE)
## [1] 0
vg_sales_clean <- mutate(vg_sales, Year_new = as.numeric(Year))
## Warning in mask$eval_all_mutate(quo): NAs introduced by coercion
Test_5 <- vg_sales_clean %>% select(Year_new)
Test_6 <- ifelse(nchar(Test_5!=4),1,0)
sum(Test_6)
## [1] NA
Test_7 <- vg_sales %>% select(Genre)
Test_8 <- ifelse(Test_7!='Action' & Test_7!='Adventure' & Test_7!='Fighting' & Test_7!='Misc' & Test_7!='Platform' & Test_7!='Puzzle' & Test_7!='Racing' & Test_7!='Role-Playing' & Test_7!='Shooter' & Test_7!='Simulation' & Test_7!='Sports' & Test_7!='Strategy',1,0)
sum(Test_8)
## [1] 0
High_genre <- vg_sales %>% group_by(Genre) %>% summarize(Total_sales=sum(Global_Sales)) %>% arrange(-Total_sales)
glimpse(High_genre)
## Rows: 12
## Columns: 2
## $ Genre <chr> "Action", "Sports", "Shooter", "Role-Playing", "Platform",…
## $ Total_sales <dbl> 1751.18, 1330.93, 1037.37, 927.37, 831.37, 809.96, 732.04,…
High_game <- vg_sales %>% group_by(Name,Publisher) %>% summarize(Total_sales=sum(Global_Sales)) %>% arrange(-Total_sales)
## `summarise()` has grouped output by 'Name'. You can override using the
## `.groups` argument.
glimpse(High_game)
## Rows: 11,917
## Columns: 3
## Groups: Name [11,493]
## $ Name <chr> "Wii Sports", "Grand Theft Auto V", "Super Mario Bros.", "…
## $ Publisher <chr> "Nintendo", "Take-Two Interactive", "Nintendo", "Nintendo"…
## $ Total_sales <dbl> 82.74, 55.92, 45.31, 35.84, 35.82, 33.00, 31.37, 31.03, 30…
High_publisher <- vg_sales %>% group_by(Publisher) %>% summarize(Total_sales=sum(Global_Sales)) %>% arrange(-Total_sales)
glimpse(High_publisher)
## Rows: 579
## Columns: 2
## $ Publisher <chr> "Nintendo", "Electronic Arts", "Activision", "Sony Compute…
## $ Total_sales <dbl> 1786.56, 1110.32, 727.46, 607.50, 474.72, 399.54, 340.77, …
High_platform <- vg_sales %>% group_by(Platform) %>% summarize(Total_sales=sum(Global_Sales)) %>% arrange(-Total_sales)
glimpse(High_platform)
## Rows: 31
## Columns: 2
## $ Platform <chr> "PS2", "X360", "PS3", "Wii", "DS", "PS", "GBA", "PSP", "PS…
## $ Total_sales <dbl> 1255.64, 979.96, 957.84, 926.71, 822.49, 730.66, 318.50, 2…
High_year <- vg_sales %>% group_by(Year) %>% summarize(Total_sales=sum(Global_Sales)) %>% arrange(-Total_sales)
glimpse(High_year)
## Rows: 40
## Columns: 2
## $ Year <chr> "2008", "2009", "2007", "2010", "2006", "2011", "2005", "2…
## $ Total_sales <dbl> 678.90, 667.30, 611.13, 600.45, 521.04, 515.99, 459.94, 41…
global_sales_over_time <- vg_sales %>% group_by(Year) %>% summarize(Total_sales_over_time=sum(Global_Sales)) %>% arrange(-Total_sales_over_time)
ggplot(data=global_sales_over_time) + geom_point(mapping=aes(x=Year,y=Total_sales_over_time)) + theme(axis.text.x=element_text(angle=90,hjust=1)) + theme(axis.text.x=element_text(angle=90,hjust=1)) + ggtitle("Global Sales Over Time") + theme(plot.title = element_text(hjust = 0.5)) +
labs(x="Year",y="Sales (Million USD)")
genre_sales_over_time <- vg_sales %>% group_by(Genre,Year) %>% summarize(genre_sales=sum(Global_Sales)) %>% arrange(-genre_sales)
## `summarise()` has grouped output by 'Genre'. You can override using the
## `.groups` argument.
ggplot(data=genre_sales_over_time) + geom_point(mapping=aes(x=Year,y=genre_sales)) + facet_wrap(~Genre) + theme(axis.text.x=element_text(angle=90,hjust=1)) + ggtitle("Genre Sales Over Time") + theme(plot.title = element_text(hjust = 0.5)) +
labs(x="Year",y="Sales (Million USD)")
platform_sales_over_time <- vg_sales_clean %>% group_by(Platform,Year_new) %>% summarize(platform_sales=sum(Global_Sales)) %>% arrange(-platform_sales)
## `summarise()` has grouped output by 'Platform'. You can override using the
## `.groups` argument.
ggplot(data=platform_sales_over_time) + geom_bar(mapping=aes(x=Year_new,y=platform_sales,fill=Platform),stat="identity") + ggtitle("Platform Sales Over Time") + theme(plot.title = element_text(hjust = 0.5)) +
labs(x="Year",y="Sales (Million USD)")
## Warning: Removed 16 rows containing missing values (position_stack).
top_10_sales1 <- vg_sales %>% filter(Publisher=='Nintendo' | Publisher=='Activision' | Publisher=='Sony Computer Entertainment' | Publisher=='Ubisoft' | Publisher=='Take-Two Interactive' | Publisher=='THQ' | Publisher=='Konami Digital Entertainment' | Publisher=='Sega' | Publisher=='Namco Bandai Games')
top_10_sales2 <- mutate(top_10_sales1, Year_new = as.numeric(Year))
## Warning in mask$eval_all_mutate(quo): NAs introduced by coercion
top_10_sales3 <- top_10_sales2 %>% group_by(Publisher,Year_new) %>% summarize(Total_Sales=sum(Global_Sales))
## `summarise()` has grouped output by 'Publisher'. You can override using the
## `.groups` argument.
ggplot(data=top_10_sales3) + geom_smooth(mapping=aes(x=Year_new,y=Total_Sales)) + facet_wrap(~Publisher) + ggtitle("Top Publisher Sales Over Time") + theme(plot.title = element_text(hjust = 0.5)) +
labs(x="Year",y="Sales (Million USD)")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 9 rows containing non-finite values (stat_smooth).
top_10_sales1 <- vg_sales %>% filter(Publisher=='Nintendo' | Publisher=='Activision' | Publisher=='Sony Computer Entertainment' | Publisher=='Ubisoft' | Publisher=='Take-Two Interactive' | Publisher=='THQ' | Publisher=='Konami Digital Entertainment' | Publisher=='Sega' | Publisher=='Namco Bandai Games')
top_10_sales2 <- mutate(top_10_sales1, Year_new = as.numeric(Year))
## Warning in mask$eval_all_mutate(quo): NAs introduced by coercion
top_10_sales3 <- top_10_sales2 %>% group_by(Publisher,Year_new) %>% summarize(Total_Sales=sum(Global_Sales))
## `summarise()` has grouped output by 'Publisher'. You can override using the
## `.groups` argument.
ggplot(data=top_10_sales3) + geom_tile(mapping=aes(x=Year_new,y=Publisher,fill=Total_Sales)) + scale_fill_gradient(low = "white", high = "black") + theme(axis.text.x=element_text(angle=90,hjust=1)) + coord_fixed() + ggtitle("Top Publisher Sales Over Time Heatmap") + theme(plot.title = element_text(hjust = 0.5)) +
labs(x="Year",y="Publisher",fill="Total Sales (Millions USD)")
## Warning: Removed 9 rows containing missing values (geom_tile).
ggplot(data=vg_sales_clean) + geom_bar(mapping=aes(x=Year_new,fill=Platform)) + ggtitle("Total Video Game Releases By Platform Over Time") + theme(plot.title = element_text(hjust = 0.5)) +
labs(x="Year",y="Video Game Releases")
## Warning: Removed 271 rows containing non-finite values (stat_count).
percent_sales0 <- filter(vg_sales_clean, Year_new<2017)
percent_sales1 <- percent_sales0 %>% group_by(Year_new) %>% summarize(Total_NA=sum(NA_Sales),Total_EU=sum(EU_Sales),Total_JP=sum(JP_Sales),Total_others=sum(Other_Sales),Total_global=sum(Global_Sales))
percent_sales2 <- mutate(percent_sales1, NA_percent=(Total_NA/Total_global)*100, EU_percent=(Total_EU/Total_global)*100, JP_percent=(Total_JP/Total_global)*100, other_percent=(Total_others/Total_global)*100)
ggplot(data=percent_sales2) + geom_line(mapping=aes(x=Year_new,y=NA_percent),color="red") + geom_line(mapping=aes(x=Year_new,y=EU_percent),color="Blue") + geom_line(mapping=aes(x=Year_new,y=JP_percent),color="Orange") + geom_line(mapping=aes(x=Year_new,y=other_percent),color="Purple") +
annotate("text", x = 2008, y = 55, label = "NA Sales",color="Red") + annotate("text", x = 2008, y = 35, label = "EU Sales",color="Blue") + annotate("text", x = 2008, y = 15, label = "JP Sales",color="Orange") + annotate("text", x = 2008, y = 5, label = "Other Sales",color="Purple") +
labs(x="Year",y="Market Share (%)") + ggtitle("Region Market Share Over Time") + theme(plot.title = element_text(hjust = 0.5))
tree1 <- vg_sales_clean %>% group_by(Platform,Publisher) %>% summarize(platform_count=n_distinct(Name)) %>% arrange(-platform_count)
## `summarise()` has grouped output by 'Platform'. You can override using the
## `.groups` argument.
tree2 <- filter(tree1,platform_count >= 50)
ggplot(tree2, aes(area = platform_count, fill = platform_count,
label = Publisher, subgroup = Platform)) +
geom_treemap() +
geom_treemap_subgroup_border(colour = "white", size = 5) +
geom_treemap_subgroup_text(place = "centre", grow = TRUE,
alpha = 0.25, colour = "black",
fontface = "italic") +
geom_treemap_text(colour = "white", place = "centre",
size = 15, grow = TRUE) +
ggtitle("Top Publishers On Each Platform") + theme(plot.title = element_text(hjust = 0.5)) + labs(fill="Platform Releases")