Introduction

This is a data project that aims to analyze Video Game Sales data scraped from vgchartz.com with sales greater than 100,000 copies with Rstudio. Fields include:

Rank - Ranking of overall sales

Name - The games name

Platform - Platform of the games release (i.e. PC,PS4, etc.)

Year - Year of the game’s release

Genre - Genre of the game

Publisher - Publisher of the game

NA_Sales - Sales in North America (in millions)

EU_Sales - Sales in Europe (in millions)

JP_Sales - Sales in Japan (in millions)

Other_Sales - Sales in the rest of the world (in millions)

Global_Sales - Total worldwide sales.

The original dataset can be found at https://www.kaggle.com/datasets/gregorut/videogamesales.

Load libraries

install.packages("tidyverse")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.2'
## (as 'lib' is unspecified)
install.packages("treemapify")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.2'
## (as 'lib' is unspecified)
install.packages("ggplot2")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.2'
## (as 'lib' is unspecified)
library(tidyverse)
## ── Attaching packages
## ───────────────────────────────────────
## tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6      ✔ purrr   0.3.5 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.1      ✔ stringr 1.4.1 
## ✔ readr   2.1.3      ✔ forcats 0.5.2 
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(treemapify)
library(ggplot2)

Read csv file

vg_sales <-read_csv('vgsales.csv')
## Rows: 16598 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): Name, Platform, Year, Genre, Publisher
## dbl (6): Rank, NA_Sales, EU_Sales, JP_Sales, Other_Sales, Global_Sales
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Check Structure of Data

glimpse(vg_sales)
## Rows: 16,598
## Columns: 11
## $ Rank         <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17…
## $ Name         <chr> "Wii Sports", "Super Mario Bros.", "Mario Kart Wii", "Wii…
## $ Platform     <chr> "Wii", "NES", "Wii", "Wii", "GB", "GB", "DS", "Wii", "Wii…
## $ Year         <chr> "2006", "1985", "2008", "2009", "1996", "1989", "2006", "…
## $ Genre        <chr> "Sports", "Platform", "Racing", "Sports", "Role-Playing",…
## $ Publisher    <chr> "Nintendo", "Nintendo", "Nintendo", "Nintendo", "Nintendo…
## $ NA_Sales     <dbl> 41.49, 29.08, 15.85, 15.75, 11.27, 23.20, 11.38, 14.03, 1…
## $ EU_Sales     <dbl> 29.02, 3.58, 12.88, 11.01, 8.89, 2.26, 9.23, 9.20, 7.06, …
## $ JP_Sales     <dbl> 3.77, 6.81, 3.79, 3.28, 10.22, 4.22, 6.50, 2.93, 4.70, 0.…
## $ Other_Sales  <dbl> 8.46, 0.77, 3.31, 2.96, 1.00, 0.58, 2.90, 2.85, 2.26, 0.4…
## $ Global_Sales <dbl> 82.74, 40.24, 35.82, 33.00, 31.37, 30.26, 30.01, 29.02, 2…
colnames(vg_sales)
##  [1] "Rank"         "Name"         "Platform"     "Year"         "Genre"       
##  [6] "Publisher"    "NA_Sales"     "EU_Sales"     "JP_Sales"     "Other_Sales" 
## [11] "Global_Sales"
str(vg_sales)
## spc_tbl_ [16,598 × 11] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ Rank        : num [1:16598] 1 2 3 4 5 6 7 8 9 10 ...
##  $ Name        : chr [1:16598] "Wii Sports" "Super Mario Bros." "Mario Kart Wii" "Wii Sports Resort" ...
##  $ Platform    : chr [1:16598] "Wii" "NES" "Wii" "Wii" ...
##  $ Year        : chr [1:16598] "2006" "1985" "2008" "2009" ...
##  $ Genre       : chr [1:16598] "Sports" "Platform" "Racing" "Sports" ...
##  $ Publisher   : chr [1:16598] "Nintendo" "Nintendo" "Nintendo" "Nintendo" ...
##  $ NA_Sales    : num [1:16598] 41.5 29.1 15.8 15.8 11.3 ...
##  $ EU_Sales    : num [1:16598] 29.02 3.58 12.88 11.01 8.89 ...
##  $ JP_Sales    : num [1:16598] 3.77 6.81 3.79 3.28 10.22 ...
##  $ Other_Sales : num [1:16598] 8.46 0.77 3.31 2.96 1 0.58 2.9 2.85 2.26 0.47 ...
##  $ Global_Sales: num [1:16598] 82.7 40.2 35.8 33 31.4 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   Rank = col_double(),
##   ..   Name = col_character(),
##   ..   Platform = col_character(),
##   ..   Year = col_character(),
##   ..   Genre = col_character(),
##   ..   Publisher = col_character(),
##   ..   NA_Sales = col_double(),
##   ..   EU_Sales = col_double(),
##   ..   JP_Sales = col_double(),
##   ..   Other_Sales = col_double(),
##   ..   Global_Sales = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>
summary(vg_sales)
##       Rank           Name             Platform             Year          
##  Min.   :    1   Length:16598       Length:16598       Length:16598      
##  1st Qu.: 4151   Class :character   Class :character   Class :character  
##  Median : 8300   Mode  :character   Mode  :character   Mode  :character  
##  Mean   : 8301                                                           
##  3rd Qu.:12450                                                           
##  Max.   :16600                                                           
##     Genre            Publisher            NA_Sales          EU_Sales      
##  Length:16598       Length:16598       Min.   : 0.0000   Min.   : 0.0000  
##  Class :character   Class :character   1st Qu.: 0.0000   1st Qu.: 0.0000  
##  Mode  :character   Mode  :character   Median : 0.0800   Median : 0.0200  
##                                        Mean   : 0.2647   Mean   : 0.1467  
##                                        3rd Qu.: 0.2400   3rd Qu.: 0.1100  
##                                        Max.   :41.4900   Max.   :29.0200  
##     JP_Sales         Other_Sales        Global_Sales    
##  Min.   : 0.00000   Min.   : 0.00000   Min.   : 0.0100  
##  1st Qu.: 0.00000   1st Qu.: 0.00000   1st Qu.: 0.0600  
##  Median : 0.00000   Median : 0.01000   Median : 0.1700  
##  Mean   : 0.07778   Mean   : 0.04806   Mean   : 0.5374  
##  3rd Qu.: 0.04000   3rd Qu.: 0.04000   3rd Qu.: 0.4700  
##  Max.   :10.22000   Max.   :10.57000   Max.   :82.7400

Check for nulls

any(is.na(vg_sales))
## [1] FALSE

Check for duplicates overall

Test_2 <- vg_sales %>% select(everything()) %>% duplicated()
sum(Test_2, na.rm = TRUE)
## [1] 0

Check for duplicates in name

Test_3 <- vg_sales %>% select(Name) %>% duplicated()
sum(Test_3, na.rm = TRUE)
## [1] 5105

Check for duplicates in Rank

Test_4 <- vg_sales %>% select(Rank) %>% duplicated()
sum(Test_4, na.rm = TRUE)
## [1] 0

Change Year to numeric from string

vg_sales_clean <- mutate(vg_sales, Year_new = as.numeric(Year))
## Warning in mask$eval_all_mutate(quo): NAs introduced by coercion

Check for Year format

Test_5 <- vg_sales_clean %>% select(Year_new)
Test_6 <- ifelse(nchar(Test_5!=4),1,0)
sum(Test_6)
## [1] NA

Check Genre for format

Test_7 <- vg_sales %>% select(Genre)
Test_8 <- ifelse(Test_7!='Action' & Test_7!='Adventure' & Test_7!='Fighting' & Test_7!='Misc' & Test_7!='Platform' & Test_7!='Puzzle' & Test_7!='Racing' & Test_7!='Role-Playing' & Test_7!='Shooter' & Test_7!='Simulation' & Test_7!='Sports' & Test_7!='Strategy',1,0)
sum(Test_8)
## [1] 0

Figure out Highest Selling Genres

High_genre <- vg_sales %>% group_by(Genre) %>% summarize(Total_sales=sum(Global_Sales)) %>% arrange(-Total_sales)
glimpse(High_genre)
## Rows: 12
## Columns: 2
## $ Genre       <chr> "Action", "Sports", "Shooter", "Role-Playing", "Platform",…
## $ Total_sales <dbl> 1751.18, 1330.93, 1037.37, 927.37, 831.37, 809.96, 732.04,…

Figure out Highest Selling Games

High_game <- vg_sales %>% group_by(Name,Publisher) %>% summarize(Total_sales=sum(Global_Sales)) %>% arrange(-Total_sales)
## `summarise()` has grouped output by 'Name'. You can override using the
## `.groups` argument.
glimpse(High_game)
## Rows: 11,917
## Columns: 3
## Groups: Name [11,493]
## $ Name        <chr> "Wii Sports", "Grand Theft Auto V", "Super Mario Bros.", "…
## $ Publisher   <chr> "Nintendo", "Take-Two Interactive", "Nintendo", "Nintendo"…
## $ Total_sales <dbl> 82.74, 55.92, 45.31, 35.84, 35.82, 33.00, 31.37, 31.03, 30…

Figure out Highest Selling Publisher

High_publisher <- vg_sales %>% group_by(Publisher) %>% summarize(Total_sales=sum(Global_Sales)) %>% arrange(-Total_sales)
glimpse(High_publisher)
## Rows: 579
## Columns: 2
## $ Publisher   <chr> "Nintendo", "Electronic Arts", "Activision", "Sony Compute…
## $ Total_sales <dbl> 1786.56, 1110.32, 727.46, 607.50, 474.72, 399.54, 340.77, …

Figure out Highest Selling Platforms

High_platform <- vg_sales %>% group_by(Platform) %>% summarize(Total_sales=sum(Global_Sales)) %>% arrange(-Total_sales)
glimpse(High_platform)
## Rows: 31
## Columns: 2
## $ Platform    <chr> "PS2", "X360", "PS3", "Wii", "DS", "PS", "GBA", "PSP", "PS…
## $ Total_sales <dbl> 1255.64, 979.96, 957.84, 926.71, 822.49, 730.66, 318.50, 2…

Figure out Highest Selling Years

High_year <- vg_sales %>% group_by(Year) %>% summarize(Total_sales=sum(Global_Sales)) %>% arrange(-Total_sales)
glimpse(High_year)
## Rows: 40
## Columns: 2
## $ Year        <chr> "2008", "2009", "2007", "2010", "2006", "2011", "2005", "2…
## $ Total_sales <dbl> 678.90, 667.30, 611.13, 600.45, 521.04, 515.99, 459.94, 41…

Figure Out global sales trend over time

Sales Peaked globally in 2008

global_sales_over_time <- vg_sales %>% group_by(Year) %>% summarize(Total_sales_over_time=sum(Global_Sales))  %>% arrange(-Total_sales_over_time)
ggplot(data=global_sales_over_time) + geom_point(mapping=aes(x=Year,y=Total_sales_over_time)) + theme(axis.text.x=element_text(angle=90,hjust=1)) + theme(axis.text.x=element_text(angle=90,hjust=1)) + ggtitle("Global Sales Over Time") + theme(plot.title = element_text(hjust = 0.5)) +
  labs(x="Year",y="Sales (Million USD)")

Figure out genre treads over time

We can see Action, Misc, Racing, Role-Playing, Shooter,Sports doing good in recent years

genre_sales_over_time <- vg_sales %>% group_by(Genre,Year) %>% summarize(genre_sales=sum(Global_Sales))  %>% arrange(-genre_sales)
## `summarise()` has grouped output by 'Genre'. You can override using the
## `.groups` argument.
ggplot(data=genre_sales_over_time) + geom_point(mapping=aes(x=Year,y=genre_sales)) + facet_wrap(~Genre) + theme(axis.text.x=element_text(angle=90,hjust=1)) + ggtitle("Genre Sales Over Time") + theme(plot.title = element_text(hjust = 0.5)) +
  labs(x="Year",y="Sales (Million USD)")

We can see different platform sales peaking during their lifecycle

platform_sales_over_time <- vg_sales_clean %>% group_by(Platform,Year_new) %>% summarize(platform_sales=sum(Global_Sales))  %>% arrange(-platform_sales)
## `summarise()` has grouped output by 'Platform'. You can override using the
## `.groups` argument.
ggplot(data=platform_sales_over_time) + geom_bar(mapping=aes(x=Year_new,y=platform_sales,fill=Platform),stat="identity") + ggtitle("Platform Sales Over Time") + theme(plot.title = element_text(hjust = 0.5)) +
  labs(x="Year",y="Sales (Million USD)")
## Warning: Removed 16 rows containing missing values (position_stack).

Create line graph of Top Publisher sales over time

top_10_sales1 <- vg_sales %>%  filter(Publisher=='Nintendo' | Publisher=='Activision' | Publisher=='Sony Computer Entertainment' | Publisher=='Ubisoft' | Publisher=='Take-Two Interactive' | Publisher=='THQ' | Publisher=='Konami Digital Entertainment' | Publisher=='Sega' | Publisher=='Namco Bandai Games')
top_10_sales2 <- mutate(top_10_sales1, Year_new = as.numeric(Year))
## Warning in mask$eval_all_mutate(quo): NAs introduced by coercion
top_10_sales3 <- top_10_sales2 %>% group_by(Publisher,Year_new) %>% summarize(Total_Sales=sum(Global_Sales))
## `summarise()` has grouped output by 'Publisher'. You can override using the
## `.groups` argument.
ggplot(data=top_10_sales3) + geom_smooth(mapping=aes(x=Year_new,y=Total_Sales)) + facet_wrap(~Publisher) + ggtitle("Top Publisher Sales Over Time") + theme(plot.title = element_text(hjust = 0.5)) +
  labs(x="Year",y="Sales (Million USD)")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 9 rows containing non-finite values (stat_smooth).

Create Heat map of the top 10 publishers

top_10_sales1 <- vg_sales %>%  filter(Publisher=='Nintendo' | Publisher=='Activision' | Publisher=='Sony Computer Entertainment' | Publisher=='Ubisoft' | Publisher=='Take-Two Interactive' | Publisher=='THQ' | Publisher=='Konami Digital Entertainment' | Publisher=='Sega' | Publisher=='Namco Bandai Games')
top_10_sales2 <- mutate(top_10_sales1, Year_new = as.numeric(Year))
## Warning in mask$eval_all_mutate(quo): NAs introduced by coercion
top_10_sales3 <- top_10_sales2 %>% group_by(Publisher,Year_new) %>% summarize(Total_Sales=sum(Global_Sales))
## `summarise()` has grouped output by 'Publisher'. You can override using the
## `.groups` argument.
ggplot(data=top_10_sales3) + geom_tile(mapping=aes(x=Year_new,y=Publisher,fill=Total_Sales)) + scale_fill_gradient(low = "white", high = "black") + theme(axis.text.x=element_text(angle=90,hjust=1)) + coord_fixed() + ggtitle("Top Publisher Sales Over Time Heatmap") + theme(plot.title = element_text(hjust = 0.5)) +
  labs(x="Year",y="Publisher",fill="Total Sales (Millions USD)")
## Warning: Removed 9 rows containing missing values (geom_tile).

Visualize number of games published over time

Videogame releases peaked in 2008-2010 period at around 1500 games a year

ggplot(data=vg_sales_clean) + geom_bar(mapping=aes(x=Year_new,fill=Platform)) + ggtitle("Total Video Game Releases By Platform Over Time") + theme(plot.title = element_text(hjust = 0.5)) +
  labs(x="Year",y="Video Game Releases")
## Warning: Removed 271 rows containing non-finite values (stat_count).

Regional Sales Over time

Filtered out data after 2016 due to having too few data points in that period

percent_sales0 <- filter(vg_sales_clean, Year_new<2017)
percent_sales1 <- percent_sales0 %>%  group_by(Year_new) %>% summarize(Total_NA=sum(NA_Sales),Total_EU=sum(EU_Sales),Total_JP=sum(JP_Sales),Total_others=sum(Other_Sales),Total_global=sum(Global_Sales))
percent_sales2 <- mutate(percent_sales1, NA_percent=(Total_NA/Total_global)*100, EU_percent=(Total_EU/Total_global)*100,  JP_percent=(Total_JP/Total_global)*100, other_percent=(Total_others/Total_global)*100)
ggplot(data=percent_sales2) + geom_line(mapping=aes(x=Year_new,y=NA_percent),color="red") + geom_line(mapping=aes(x=Year_new,y=EU_percent),color="Blue") + geom_line(mapping=aes(x=Year_new,y=JP_percent),color="Orange") + geom_line(mapping=aes(x=Year_new,y=other_percent),color="Purple") +
  annotate("text", x = 2008, y = 55, label = "NA Sales",color="Red") +   annotate("text", x = 2008, y = 35, label = "EU Sales",color="Blue") + annotate("text", x = 2008, y = 15, label = "JP Sales",color="Orange") + annotate("text", x = 2008, y = 5, label = "Other Sales",color="Purple") +
  labs(x="Year",y="Market Share (%)") + ggtitle("Region Market Share Over Time") + theme(plot.title = element_text(hjust = 0.5))

Creating a Tree map of platforms with their publishers

Filtered to only platforms/publsihers with 50 or more releases

tree1 <- vg_sales_clean %>% group_by(Platform,Publisher) %>% summarize(platform_count=n_distinct(Name))  %>% arrange(-platform_count)
## `summarise()` has grouped output by 'Platform'. You can override using the
## `.groups` argument.
tree2 <- filter(tree1,platform_count >= 50)  
ggplot(tree2, aes(area = platform_count, fill = platform_count,
               label = Publisher, subgroup = Platform)) +
  geom_treemap() +
  geom_treemap_subgroup_border(colour = "white", size = 5) +
  geom_treemap_subgroup_text(place = "centre", grow = TRUE,
                             alpha = 0.25, colour = "black",
                             fontface = "italic") +
  geom_treemap_text(colour = "white", place = "centre",
                    size = 15, grow = TRUE) +
  ggtitle("Top Publishers On Each Platform") + theme(plot.title = element_text(hjust = 0.5)) + labs(fill="Platform Releases")