R Notebook by Miguel Miranda Enriquez
Data source: Kaggle
suppressWarnings(suppressMessages(library(dplyr)))
suppressWarnings(suppressMessages(library(tidyverse)))
# Load the Spotify track-features CSV into a data frame.
# NOTE(review): this is an absolute path on the author's machine; adjust it
# before running the notebook anywhere else.
df <- read.csv(file = "C:\\Users\\mmira\\Downloads\\SpotifyFeatures.csv")
# List the columns that were read; the knitted output is kept below.
names(df)
## [1] "genre" "artist_name" "track_name" "track_id"
## [5] "popularity" "acousticness" "danceability" "duration_ms"
## [9] "energy" "instrumentalness" "key" "liveness"
## [13] "loudness" "mode" "speechiness" "tempo"
## [17] "time_signature" "valence"
# Merge the two spellings of the children's music genre: rows labelled with the
# straight apostrophe are relabelled with the curly-apostrophe spelling so both
# variants share a single value.
df$genre[df$genre == "Children's Music"] <- "Children’s Music"
# Remove music genres: A Capella, Anime, Children’s Music,
# Classical, Comedy, Movie, Opera, Soundtrack
excluded_genres <- c(
  "A Capella", "Anime", "Children’s Music", "Classical",
  "Comedy", "Movie", "Opera", "Soundtrack"
)
# %in% never returns NA, so a row with a missing genre is kept as-is instead of
# being turned into an all-NA phantom row, which is what indexing with a chain
# of `!=` comparisons does when the comparison itself evaluates to NA.
df <- df[!(df$genre %in% excluded_genres), ]
head(df)
# Remove duplicates and create a list of multiple genre values in one cell:
# one row per track_id, holding the distinct genres that track appears under.
# Columns are selected by name rather than by position so the step keeps
# working if the column order of the CSV ever changes.
new_df <- aggregate(df["genre"], by = df["track_id"], FUN = unique)
head(new_df)
# Sort each track's genres using str_sort (stringr, loaded with tidyverse) so
# the same set of genres always produces the same combined label. lapply()
# always returns a list, whereas sapply() changes its return type with the
# shape of its input.
new_df$genre <- lapply(new_df$genre, str_sort)
# Change each list element to a comma-separated string; vapply() enforces
# exactly one string per track.
new_df$genre <- vapply(new_df$genre, paste, character(1), collapse = ",")
head(new_df)
# Re-attach the combined genre string to the remaining track columns with an
# inner join (dplyr) on track_id.
# First drop the old single-genre column from the filtered data.
dropped_df <- df[, names(df) != "genre"]
# Join the new genre column onto the track columns, then discard rows that are
# exact duplicates of each other.
new_df <- new_df %>%
  inner_join(dropped_df, by = "track_id") %>%
  distinct()
head(new_df)
new_df
# Use mutate() to create one 0/1 dummy variable per genre, set to 1 when that
# genre is one of the comma-separated entries of the `genre` column.
# To avoid using special characters in column names, the column for R&B is RnB
# and the column for Hip-Hop is Hip_Hop.

# Return 1 where `label` is one of the comma-separated genres in `genre`, and 0
# otherwise. Both strings are wrapped in commas so only whole entries match: a
# plain substring search for "Reggae" would also flag every "Reggaeton" track.
has_genre <- function(genre, label) {
  as.numeric(
    grepl(paste0(",", label, ","), paste0(",", genre, ","), fixed = TRUE)
  )
}

spotify_data <- new_df %>% mutate(
  Pop = has_genre(genre, "Pop"),
  Rap = has_genre(genre, "Rap"),
  Rock = has_genre(genre, "Rock"),
  Hip_Hop = has_genre(genre, "Hip-Hop"),
  Dance = has_genre(genre, "Dance"),
  Indie = has_genre(genre, "Indie"),
  RnB = has_genre(genre, "R&B"),
  Alternative = has_genre(genre, "Alternative"),
  Folk = has_genre(genre, "Folk"),
  Soul = has_genre(genre, "Soul"),
  Country = has_genre(genre, "Country"),
  Jazz = has_genre(genre, "Jazz"),
  Electronic = has_genre(genre, "Electronic"),
  Reggaeton = has_genre(genre, "Reggaeton"),
  Reggae = has_genre(genre, "Reggae"),
  World = has_genre(genre, "World"),
  Blues = has_genre(genre, "Blues"),
  Ska = has_genre(genre, "Ska")
)
spotify_data
# Column names after adding the dummy variables (knitted output kept below).
colnames(spotify_data)
## [1] "track_id" "genre" "artist_name" "track_name"
## [5] "popularity" "acousticness" "danceability" "duration_ms"
## [9] "energy" "instrumentalness" "key" "liveness"
## [13] "loudness" "mode" "speechiness" "tempo"
## [17] "time_signature" "valence" "Pop" "Rap"
## [21] "Rock" "Hip_Hop" "Dance" "Indie"
## [25] "RnB" "Alternative" "Folk" "Soul"
## [29] "Country" "Jazz" "Electronic" "Reggaeton"
## [33] "Reggae" "World" "Blues" "Ska"
nrow(spotify_data)
## [1] 128794
# Keep the finished table under the name used by the rest of the notebook.
spotify_data_final <- spotify_data
colnames(spotify_data_final)
## [1] "track_id" "genre" "artist_name" "track_name"
## [5] "popularity" "acousticness" "danceability" "duration_ms"
## [9] "energy" "instrumentalness" "key" "liveness"
## [13] "loudness" "mode" "speechiness" "tempo"
## [17] "time_signature" "valence" "Pop" "Rap"
## [21] "Rock" "Hip_Hop" "Dance" "Indie"
## [25] "RnB" "Alternative" "Folk" "Soul"
## [29] "Country" "Jazz" "Electronic" "Reggaeton"
## [33] "Reggae" "World" "Blues" "Ska"
# Count tracks from each genre combination: one row per distinct `genre`
# string, with the number of tracks carrying it in `tracks_count`.
genres_df <- spotify_data_final %>%
  group_by(genre) %>%
  summarise(tracks_count = n())
genres_df
Calculate mean, standard deviation, and coefficient of variation for each audio feature corresponding to each music genre combination (489 total)
# Summarise one numeric column of `data` per genre combination.
#
# data:    data frame with a `genre` column and the column named by `feature`.
# feature: name of the column to summarise, as a single string.
# prefix:  prefix for the output column names; defaults to `feature`.
#
# Returns one row per genre with <prefix>_mean, <prefix>_stdev and <prefix>_cv
# (stdev divided by mean), sorted by descending mean. sd() returns NA for a
# genre that has a single track, so its stdev and cv are NA.
summarise_feature_by_genre <- function(data, feature, prefix = feature) {
  stopifnot(
    is.character(feature), length(feature) == 1L,
    feature %in% names(data)
  )
  mean_col <- paste0(prefix, "_mean")
  stdev_col <- paste0(prefix, "_stdev")
  cv_col <- paste0(prefix, "_cv")
  data %>%
    group_by(genre) %>%
    summarise(
      !!mean_col := mean(.data[[feature]]),
      !!stdev_col := sd(.data[[feature]])
    ) %>%
    mutate(!!cv_col := .data[[stdev_col]] / .data[[mean_col]]) %>%
    arrange(desc(.data[[mean_col]]))
}

# Popularity keeps its shorter "pop" prefix (pop_mean, pop_stdev, pop_cv);
# every other feature uses its own name as the prefix.
genre_popularity_summary <- summarise_feature_by_genre(
  spotify_data_final, "popularity",
  prefix = "pop"
)
genre_popularity_summary
genre_acousticness_summary <- summarise_feature_by_genre(
  spotify_data_final, "acousticness"
)
genre_acousticness_summary
genre_danceability_summary <- summarise_feature_by_genre(
  spotify_data_final, "danceability"
)
genre_danceability_summary
genre_duration_ms_summary <- summarise_feature_by_genre(
  spotify_data_final, "duration_ms"
)
genre_duration_ms_summary
genre_energy_summary <- summarise_feature_by_genre(
  spotify_data_final, "energy"
)
genre_energy_summary
genre_liveness_summary <- summarise_feature_by_genre(
  spotify_data_final, "liveness"
)
genre_liveness_summary
genre_loudness_summary <- summarise_feature_by_genre(
  spotify_data_final, "loudness"
)
genre_loudness_summary
genre_speechiness_summary <- summarise_feature_by_genre(
  spotify_data_final, "speechiness"
)
genre_speechiness_summary
genre_tempo_summary <- summarise_feature_by_genre(
  spotify_data_final, "tempo"
)
genre_tempo_summary
genre_valence_summary <- summarise_feature_by_genre(
  spotify_data_final, "valence"
)
genre_valence_summary
Merge audio features of genre into one dataframe
# Collect the track counts and every per-feature summary in one list so they
# can be merged pairwise on `genre`. The order here fixes the column order of
# the merged table.
genres_data_list <- list(
  genres_df,
  genre_popularity_summary,
  genre_acousticness_summary,
  genre_danceability_summary,
  genre_duration_ms_summary,
  genre_energy_summary,
  genre_liveness_summary,
  genre_loudness_summary,
  genre_speechiness_summary,
  genre_tempo_summary,
  genre_valence_summary
)
# Merging multiple dataframes
# Two-argument helper for Reduce(): joins `df1` and `df2` on their shared
# "genre" column with base merge() and returns the combined data frame.
my_merge <- function(df1, df2) {
  merged <- merge(x = df1, y = df2, by = "genre")
  merged
}
# Fold the list of per-genre tables into one wide table, merging on `genre`
# two tables at a time.
genres_df <- Reduce(f = my_merge, x = genres_data_list)
genres_df
colnames(genres_df)
## [1] "genre" "tracks_count" "pop_mean"
## [4] "pop_stdev" "pop_cv" "acousticness_mean"
## [7] "acousticness_stdev" "acousticness_cv" "danceability_mean"
## [10] "danceability_stdev" "danceability_cv" "duration_ms_mean"
## [13] "duration_ms_stdev" "duration_ms_cv" "energy_mean"
## [16] "energy_stdev" "energy_cv" "liveness_mean"
## [19] "liveness_stdev" "liveness_cv" "loudness_mean"
## [22] "loudness_stdev" "loudness_cv" "speechiness_mean"
## [25] "speechiness_stdev" "speechiness_cv" "tempo_mean"
## [28] "tempo_stdev" "tempo_cv" "valence_mean"
## [31] "valence_stdev" "valence_cv"
# The sort order was lost on merge, so re-sort: highest mean popularity first,
# ties broken by ascending track count.
popularity_order <- order(-genres_df$pop_mean, genres_df$tracks_count)
genres_df <- genres_df[popularity_order, ]
# Reset the row names so they run 1..n in the new order.
rownames(genres_df) <- NULL
genres_df
Detect how many music genre combinations have enough tracks to be statistically meaningful (minimum of 100 tracks)
# Keep only the genre combinations backed by at least 100 tracks.
has_enough_tracks <- genres_df$tracks_count >= 100
genres_df[has_enough_tracks, ]
Irrelevant single genres removed: Movie, A Capella, Anime, Children's Music, Soundtrack,
Comedy, Opera, and Classical.
Total count of single genre tracks after filtering irrelevant genres: 121,411.
Total count of combined genres tracks after filtering irrelevant genres: 73,489
Most combined genres tend to outperform tracks associated with only a single genre. In other words,
tracks associated with more than one music genre tend to be more popular.
The chart below uses the same dataset as the previous graph. A limited number of single genres is displayed to preserve chart interpretability.
Let's examine how popular some audio features are and which single genres tend to be associated more with a particular audio feature (e.g., energy, danceability, duration, etc.).
Spotify uses the word “valence” to measure whether a song is likely to make someone feel happy (higher valence) or sad (lower valence).
Songs with elements of Dance, Pop, and Rock genres are the most likely to be profitable.
High correlation between danceability and popularity except
with electronic music.
Beats-Per-Minute (BPM) range of 120-125 is recommended. This range is easy to dance to.
Song duration of ~3.5 minutes is recommended for most genres but for rap music ~4.5 minutes is advised.
Whether a song makes you feel happy or sad doesn't play a significant role except for reggaeton.
Reggaeton songs that are too happy are not as popular.