Part 2 A graphical look at the data
2.1 First ggplot of the data
First, I want to find out visually which studio is the most prolific.
# Scatter plot of total productions per studio. Studios on the y axis are
# ordered by production; point size and colour both encode the rating.
studio1959_2020 <- ggplot(
  data = studiotop50,
  mapping = aes(x = production, y = reorder(studios, production))
) +
  # `check_overlap` is a geom_text() parameter, not a point aesthetic, so it
  # is not mapped here (ggplot2 would ignore it with a warning).
  geom_point(aes(size = rating, colour = rating)) +
  # Rating label, formatted to two decimal places and drawn in bold.
  # `hjust` is a constant, so it is set outside aes().
  geom_text(
    aes(label = sprintf("%.2f", rating)),
    hjust = -0.5,
    fontface = "bold"
  )

# Set the x-axis title, the theme, and the tag position.
# Only one x scale can be active at a time: a scale_x_discrete() added before
# scale_x_continuous() is replaced by it, so only the continuous scale is set.
# To dodge the tick labels, pass `guide = guide_axis(n.dodge = k)` to
# scale_x_continuous() instead.
plot <- studio1959_2020 +
  scale_x_continuous("Number of Total Productions") +
  theme(
    axis.text.y = element_text(size = 12, face = "bold"),
    axis.title.x = element_text(size = 20, face = "bold"),
    axis.text.x = element_text(size = 20, face = "bold"),
    plot.tag.position = c(0.2, 0.01)
  )

print(
  plot +
    labs(
      y = "Studio Name",
      tag = "Figure1: 1958-2020 No.studio's production and overallrating"
    )
)
2.2 Second ggplot of the data
With the second graph, I can see which studios have the highest average rating and how many productions each of them has.
# Scatter plot of average rating per studio. Studios on the y axis are ordered
# by rating; point size encodes the number of productions, colour the rating.
studio1959_2020c <- ggplot(
  data = studiotop50,
  mapping = aes(x = rating, y = reorder(studios, rating))
) +
  # `check_overlap` is a geom_text() parameter, not a point aesthetic, so it
  # is not mapped here (ggplot2 would ignore it with a warning).
  geom_point(aes(size = production, colour = rating)) +
  # Production count to the right of each point.
  geom_text(aes(label = production), hjust = -0.5) +
  # Rating (two decimal places) to the left of each point.
  geom_text(aes(label = sprintf("%.2f", rating)), hjust = 1.5)

# Set the x-axis title, the theme, and the tag position.
# Only one x scale can be active at a time: a scale_x_discrete() added before
# scale_x_continuous() is replaced by it, so only the continuous scale is set.
# To dodge the tick labels, pass `guide = guide_axis(n.dodge = k)` to
# scale_x_continuous() instead.
plot1 <- studio1959_2020c +
  scale_x_continuous("Average rating") +
  theme(
    axis.text.y = element_text(size = 12, face = "bold"),
    axis.title.x = element_text(size = 12, face = "bold"),
    axis.text.x = element_text(size = 12, face = "bold"),
    plot.tag.position = c(0.2, 0.01)
  )

print(
  plot1 +
    labs(
      y = "Studio Name",
      tag = "Figure2: 1958-2020 ranked studio's overallrating"
    )
)
2.3 Further Interpretation
After getting a general view of the data, I want to find the best studios: those whose total number of productions is above the lower quartile and whose rating is above the median. This reflects a studio’s consistency in producing quality anime.
# Find the mean production without it being affected by outliers (number of
# productions above 300): keep the 45 studios with the lowest production.
# top_n() keeps ties, so more than 45 rows are possible.
studiotop50less <- studiotop50 %>% top_n(-45, production)
summary(studiotop50less$production)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 42.00 49.00 62.00 92.93 116.00 263.00
# Find out how many studios are at or above the median rating. The median is
# computed rather than copied (rounded) from the printed summary.
summary(studiotop50$rating)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.6368 2.7511 2.9774 2.9305 3.2745 3.6612
rating_median <- median(studiotop50$rating)
sum(studiotop50$rating >= rating_median)
## [1] 25
# First, keep the studios with the highest ratings.
# NOTE(review): 26 rows are requested although only 25 studios are at or above
# the median (see the count above), so one studio below the median is
# included. Confirm whether 25 was intended.
beststudio <- studiotop50 %>% top_n(26, rating)
# Then, count the studios whose production is at or above the lower quartile
# of the outlier-free set (49, from the summary above).
production_q1 <- quantile(studiotop50less$production, 0.25, names = FALSE)
sum(beststudio$production >= production_q1)
## [1] 21
# In the end, create the list of the best studios. Filtering on the threshold
# selects the same rows as taking the top 21 by production, without
# hard-coding a count copied from the output above.
beststudio1 <- beststudio %>% filter(production >= production_q1)
Visualise the best studios
# Scatter plot of the shortlisted best studios. Studios on the y axis are
# ordered by rating; point size encodes the number of productions, colour the
# rating.
best1959_2020 <- ggplot(
  data = beststudio1,
  mapping = aes(x = rating, y = reorder(studios, rating))
) +
  # `check_overlap` is a geom_text() parameter, not a point aesthetic, so it
  # is not mapped here (ggplot2 would ignore it with a warning).
  geom_point(aes(size = production, colour = rating)) +
  # Production count to the right of each point.
  geom_text(aes(label = production), hjust = -0.5) +
  # Rating (two decimal places) to the left of each point.
  geom_text(aes(label = sprintf("%.2f", rating)), hjust = 1.5)

# Set the x-axis title, the theme, and the tag position.
# Only one x scale can be active at a time: a scale_x_discrete() added before
# scale_x_continuous() is replaced by it, so only the continuous scale is set.
plot2 <- best1959_2020 +
  scale_x_continuous("Average rating") +
  theme(
    axis.text.y = element_text(size = 12, face = "bold"),
    axis.title.x = element_text(size = 12, face = "bold"),
    axis.text.x = element_text(size = 12, face = "bold"),
    plot.tag.position = c(0.2, 0.01)
  )

print(
  plot2 +
    labs(
      y = "Studio Name",
      tag = "Figure3: 1958-2020 Best studio's overallrating"
    )
)