# %%capture
# #!pip install squarify geopandas plotly Ipython ipywidget dash
Spotify World Insights Viz
Data Viz
Data Viz Project From Uni
Installing Libraries
Importing Libraries
import random
import squarify
import pandas as pd
import numpy as np
import seaborn as sns
# Global seaborn look: larger "talk" context fonts on a plain white style.
sns.set_context("talk", font_scale=1.2)
sns.set_style('white')
import geopandas as gpd
import matplotlib.pyplot as plt
%matplotlib inline
import plotly.graph_objects as go
from IPython.display import HTML
from ipywidgets import interact, Dropdown
# from googlesearch import search
Importing the Dataset and Processing a Bit
# Load the raw Spotify dataset; the first CSV column is used as the index.
data = pd.read_csv("~/side_projects/Projects/DataVizProject/spotify_with_locations.csv", index_col=0)
data.shape
(225456, 28)
# Print the column dtypes, non-null counts and memory usage of the raw frame.
data.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 225456 entries, 0 to 225455
Data columns (total 28 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 acousticness 225456 non-null float64
1 artists 225444 non-null object
2 danceability 225456 non-null float64
3 duration_ms 225456 non-null int64
4 energy 225456 non-null float64
5 explicit 225456 non-null int64
6 id 225456 non-null object
7 instrumentalness 225456 non-null float64
8 key 225456 non-null int64
9 liveness 225456 non-null float64
10 loudness 225456 non-null float64
11 mode 225456 non-null int64
12 name 225456 non-null object
13 popularity 225456 non-null int64
14 release_date 225456 non-null object
15 speechiness 225456 non-null float64
16 tempo 225456 non-null float64
17 valence 225456 non-null float64
18 year 225456 non-null int64
19 mbid 181952 non-null object
20 artist_lastfm 177791 non-null object
21 country_mb 171605 non-null object
22 country_lastfm 132978 non-null object
23 tags_mb 142502 non-null object
24 tags_lastfm 160841 non-null object
25 listeners_lastfm 177791 non-null float64
26 scrobbles_lastfm 177791 non-null float64
27 ambiguous_artist 181952 non-null object
dtypes: float64(11), int64(6), object(11)
memory usage: 49.9+ MB
# How much of the data is null (count of missing values per column)
data.isnull().sum()
acousticness 0
artists 12
danceability 0
duration_ms 0
energy 0
explicit 0
id 0
instrumentalness 0
key 0
liveness 0
loudness 0
mode 0
name 0
popularity 0
release_date 0
speechiness 0
tempo 0
valence 0
year 0
mbid 43504
artist_lastfm 47665
country_mb 53851
country_lastfm 92478
tags_mb 82954
tags_lastfm 64615
listeners_lastfm 47665
scrobbles_lastfm 47665
ambiguous_artist 43504
dtype: int64
# Removing columns which are not useful
data = data.drop(['mbid','country_lastfm', 'tags_mb', 'listeners_lastfm','scrobbles_lastfm','id', 'ambiguous_artist','artist_lastfm'], axis = 1)
# Give the remaining location/tag columns the names used in the rest of the notebook.
data = data.rename(columns = {'country_mb':'artist_origin','tags_lastfm':'genres'})
# Now checking the null values and dropping the rows with null values in any of the column
df_missing = data.isnull().sum(axis = 0).reset_index()
df_missing.columns = ['column_name', 'missing_count']
# Share of rows that are missing, per column.
df_missing['missing_ratio'] = df_missing['missing_count']/data.shape[0]
# Show only the columns that actually have missing values, worst first.
df_missing.query('missing_ratio>0').sort_values(by = 'missing_ratio', ascending = False)
column_name | missing_count | missing_ratio | |
---|---|---|---|
19 | genres | 64615 | 0.286597 |
18 | artist_origin | 53851 | 0.238854 |
1 | artists | 12 | 0.000053 |
# Keep only rows where the origin, artist and genre information are all present.
df_cleaned = data.dropna(subset = ['artist_origin','artists','genres'],axis = 0)
df_cleaned.isna().sum(axis = 0)
acousticness 0
artists 0
danceability 0
duration_ms 0
energy 0
explicit 0
instrumentalness 0
key 0
liveness 0
loudness 0
mode 0
name 0
popularity 0
release_date 0
speechiness 0
tempo 0
valence 0
year 0
artist_origin 0
genres 0
dtype: int64
# Creating a copy of data to work on
df = df_cleaned.copy()
df.shape
(156512, 20)
# Doing some processing, like choosing year as discussed
min_year = 1979
# Keep only tracks released after min_year; copy so later edits do not touch df_cleaned.
df = df.query("year > @min_year").copy()
# Convert duration_ms to duration in seconds
df['duration'] = df['duration_ms'] / 1000

# Drop the duration_ms column
df.drop('duration_ms', axis=1, inplace=True)
df.shape
(73887, 20)
# Summary statistics (count, mean, std, quartiles, min/max) for df.
df.describe()
acousticness | danceability | energy | explicit | instrumentalness | key | liveness | loudness | mode | popularity | speechiness | tempo | valence | year | duration | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 73887.000000 | 73887.000000 | 73887.000000 | 73887.000000 | 73887.000000 | 73887.000000 | 73887.000000 | 73887.000000 | 73887.000000 | 73887.000000 | 73887.000000 | 73887.000000 | 73887.000000 | 73887.000000 | 73887.000000 |
mean | 0.270194 | 0.572947 | 0.623204 | 0.162004 | 0.094045 | 5.261115 | 0.200284 | -8.982739 | 0.683733 | 47.005820 | 0.088800 | 120.083159 | 0.520840 | 1999.056816 | 247.873261 |
std | 0.304551 | 0.178310 | 0.244244 | 0.368457 | 0.241324 | 3.562242 | 0.175176 | 5.129790 | 0.465022 | 12.571747 | 0.101303 | 30.003903 | 0.254652 | 11.332604 | 88.925993 |
min | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | -60.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1980.000000 | 30.080000 |
25% | 0.020500 | 0.456000 | 0.462500 | 0.000000 | 0.000000 | 2.000000 | 0.093100 | -11.216000 | 0.000000 | 38.000000 | 0.034500 | 96.013500 | 0.318000 | 1989.000000 | 200.095500 |
50% | 0.132000 | 0.587000 | 0.661000 | 0.000000 | 0.000024 | 5.000000 | 0.130000 | -7.688000 | 1.000000 | 46.000000 | 0.046600 | 117.956000 | 0.526000 | 1999.000000 | 236.253000 |
75% | 0.458000 | 0.705000 | 0.822000 | 0.000000 | 0.006290 | 8.000000 | 0.258000 | -5.440000 | 1.000000 | 55.000000 | 0.090500 | 139.557000 | 0.729000 | 2009.000000 | 279.933000 |
max | 0.996000 | 0.986000 | 1.000000 | 1.000000 | 0.999000 | 11.000000 | 0.998000 | 1.342000 | 1.000000 | 100.000000 | 0.960000 | 244.091000 | 0.998000 | 2020.000000 | 3816.373000 |
# Target dtype for every column that is kept.
dtypes = {
    'acousticness': 'float32',
    'danceability': 'float32',
    'energy': 'float32',
    'explicit': 'bool',
    'instrumentalness': 'float32',
    'key': 'uint8',
    'liveness': 'float32',
    'loudness': 'float32',
    'mode': 'bool',
    'popularity': 'float32',
    'speechiness': 'float32',
    'tempo': 'float32',
    'valence': 'float32',
    'year': 'uint16',
    'duration': 'float32',
    'artists': 'string',
    'name': 'string',
    'release_date': 'string',
    'genres': 'string',
    'artist_origin': 'string'
}
df = df.astype(dtypes) #Changing the dtypes for efficient processing.
df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 73887 entries, 5920 to 225454
Data columns (total 20 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 acousticness 73887 non-null float32
1 artists 73887 non-null string
2 danceability 73887 non-null float32
3 energy 73887 non-null float32
4 explicit 73887 non-null bool
5 instrumentalness 73887 non-null float32
6 key 73887 non-null uint8
7 liveness 73887 non-null float32
8 loudness 73887 non-null float32
9 mode 73887 non-null bool
10 name 73887 non-null string
11 popularity 73887 non-null float32
12 release_date 73887 non-null string
13 speechiness 73887 non-null float32
14 tempo 73887 non-null float32
15 valence 73887 non-null float32
16 year 73887 non-null uint16
17 artist_origin 73887 non-null string
18 genres 73887 non-null string
19 duration 73887 non-null float32
dtypes: bool(2), float32(11), string(5), uint16(1), uint8(1)
memory usage: 6.8 MB
# df.to_csv("PowerBI_Spotify_Cleaned_data",index = 0)
# grouped_df = df.groupby("name", sort=False).apply(lambda x: x.reset_index(drop=True))
# grouped_df[grouped_df['artists'] == 'Sade']
- Inspecting the data shows that the same song can appear under several release years, possibly from different producers or on different albums.
- We therefore group by song and take the mean value of each feature before plotting.
Plot 1: Distribution of Artist Origins across years
Context
- Which countries do most artists come from, in a given year or across all years?
Caution
- For any year, all countries that each account for less than 1% of the total origins are grouped together into an "Other" category
import plotly.graph_objects as go
import seaborn as sns
import numpy as np
def treemap(year):
    """Display a treemap of artist origins for one year or for all years.

    Countries contributing fewer than 1% of the selected rows are merged
    into a single 'Other' sector.

    Parameters
    ----------
    year : int or 'All'
        A value of ``df['year']`` to restrict to, or ``'All'`` for the
        whole data set.

    Reads the module-level ``df``; shows the figure and returns ``None``.
    """
    if year != 'All':
        subset = df.query('year == @year')
        title = f'Artist Origins Distribution for {year} - Treemap (Thresholded)'
    else:
        subset = df
        min_year = np.min(df['year'])
        max_year = np.max(df['year'])
        title = f'Artist Origins Distribution from {min_year} to {max_year} - Treemap (Thresholded)'

    # Countries below 1% of the selected rows are folded into 'Other'.
    threshold = 0.01 * len(subset)
    artist_counts = subset['artist_origin'].value_counts()
    small = artist_counts[artist_counts < threshold]
    filtered_counts = artist_counts.drop(small.index)
    other_count = small.sum()
    if other_count > 0:
        # Only add the bucket when something was actually folded into it.
        filtered_counts['Other'] = other_count

    labels = filtered_counts.index.tolist()
    values = filtered_counts.tolist()

    # Plotly needs colour strings, not seaborn's RGB float tuples; cycle the
    # palette so every sector gets a colour even if there are more sectors
    # than palette entries.
    palette = sns.color_palette("muted").as_hex()
    sector_colors = [palette[i % len(palette)] for i in range(len(labels))]

    fig = go.Figure(go.Treemap(
        labels=labels,
        parents=[""] * len(labels),  # flat treemap: every sector hangs off the root
        values=values,
        textinfo="label+value+percent parent",
        marker_colors=sector_colors,
        hovertemplate="<b>%{label}</b><br>Count: %{value}<br>Percentage: %{percentParent:.1%}<extra></extra>",
        textposition='middle center',  # Center the label text in the treemap cells
        textfont=dict(
            size=30,  # Set the font size for the text inside the treemap
            color='black',
            family='Times New Roman'
        )
    ))
    fig.update_layout(
        title={
            'text': title,
            'font': {
                'size': 24  # Set the font size for the title
            }
        },
        margin=dict(l=0, r=0, t=40, b=0),
        width=1800,
        height=900,
    )
    #fig.write_image(f'{title}.png')
    fig.show()
# Dropdown choices: 'All' plus every year present in the data.
years = ['All'] + df['year'].unique().tolist()

# Create the interactive plot
interact(treemap, year=years)
<function __main__.treemap(year)>
# def treemap(year):
# if year != 'All':
# data = df.query('year == @year')
# title = f'Artist Origins Distribution for {year} - Treemap (Thresholded)'
# else:
# data = df
# min_year = np.min(df['year'])
# max_year = np.max(df['year'])
# title = f'Artist Origins Distribution from {min_year} to {max_year} - Treemap (Thresholded)'
# # Set the threshold count
# threshold = 0.01 * data.shape[0]
# # Get the counts of artist origins
# artist_counts = data['artist_origin'].value_counts()
# # Identify countries with counts less than the threshold
# other_countries = artist_counts[artist_counts < threshold]
# # Sum the counts of other countries
# other_count = other_countries.sum()
# # Exclude other countries from the main data
# filtered_counts = artist_counts.drop(other_countries.index)
# # Add the other count as a separate entry
# filtered_counts['Other'] = other_count
# # Calculate the total size
# total_size = filtered_counts.sum()
# # Create the treemap
# plt.figure(figsize=(24,12))
# squarify.plot(sizes=filtered_counts, label=filtered_counts.index, alpha=0.8, color=sns.color_palette("muted"))
# plt.axis('off')
# # Add percentage labels
# for i, patch in enumerate(plt.gca().patches):
# x, y, dx, dy = patch.get_bbox().bounds
# label = filtered_counts.index[i]
# size = filtered_counts[label]
# percentage = (size / total_size) * 100
# plt.gca().text(x + dx / 2, y + dy / 2, f'\n\n{percentage:.1f}%', ha='center', va='center')
# plt.title(title)
# plt.savefig(title,bbox_inches = 'tight' ,dpi=150)
# plt.show()
# years = ['All'] + df['year'].unique().tolist()
# # Create the interactive plot
# interact(treemap, year=years)
# plt.show()
Plot 1 Alternative
# # Read the data into a pandas DataFrame
# # data = df # Replace 'your_data_file.csv' with your actual data file path
# # Group the data by artist origin and count the number of artists from each region
# artist_counts = df['artist_origin'].value_counts()
# # Normalize the counts by dividing by the maximum count value
# # normalized_counts = artist_counts / artist_counts.max()
# # Create the choropleth map using plotly.graph_objects
# fig = go.Figure(data=go.Choropleth(
# locations=artist_counts.index,
# z=artist_counts.values,
# locationmode='country names',
# colorscale='ylorrd',
# reversescale=False, # Reverse the colorscale to have darker blue for higher counts
# # zmin=0, # Set the minimum value for the color scale
# # zmax=1, # Set the maximum value for the color scale
# ))
# # Set the layout properties
# fig.update_layout(
# title_text='Artist Origins Count by Country from 1980 to 2020',
# height = 1200,
# width = 1500,
# geo=dict(
# showframe=False,
# showcoastlines=False,
# projection_type='equirectangular'
# ),
# )
# # Show the figure
# fig.show()
# plt.show()
Plot 2: Most Popular Artists for a Given Year and Place of Origin
Context of Plot
- For a given year what are the origins of most popular artists
- Who are the most popular artists in a particular year and place of origin
Caution Points
- For some countries there is no data available in a particular year; e.g. Taiwan’s data seems to be available for only a few years.
- In some years a particular country has only a few artists, so we may get only a top 2 or top 3, and in the worst case a single artist or no artist at all from that country
- Also, because of digitization, I think newer artists come out as the most popular: the new generation can share their likes online, and they don’t listen to older artists that much.
import plotly.graph_objects as go
import seaborn as sns
import numpy as np
def create_plot(year, country):
    """Display a bar chart of the (up to) five most popular artists.

    Popularity is the mean ``popularity`` of an artist's rows in the
    selection. Each bar is labelled with the artist's country of origin.

    Parameters
    ----------
    year : int or 'All'
        Restrict to one release year, or ``'All'`` for every year.
    country : str or 'All'
        Restrict to one ``artist_origin``, or ``'All'`` for every country.

    Reads the module-level ``df``; shows the figure and returns ``None``.
    """
    # Filter the data for the selected year and country
    if country != 'All':
        if year != 'All':
            filtered_data = df[(df['year'] == year) & (df['artist_origin'] == country)]
            title = f'Top {country} Artists in Year {year}'
        else:
            filtered_data = df[df['artist_origin'] == country]
            min_year = np.min(filtered_data.year)
            max_year = np.max(filtered_data.year)
            title = f'Top Artists from {country} across {min_year} - {max_year}'
    else:
        if year != 'All':
            filtered_data = df[df['year'] == year]
            title = f'Top Artists from all countries in Year {year}'
        else:
            filtered_data = df
            min_year = np.min(filtered_data.year)
            max_year = np.max(filtered_data.year)
            title = f'Top Artists from all countries across {min_year} - {max_year}'

    # Mean popularity per artist, highest first, top five only.
    top_artists = (
        filtered_data.groupby('artists')['popularity']
        .mean()
        .sort_values(ascending=False)
        .head(5)
    )
    # First recorded origin of each artist within the selection.
    origins = filtered_data.groupby('artists')['artist_origin'].first()

    # Retrieve the muted color palette from Seaborn
    color_palette = sns.color_palette("muted").as_hex()

    # Create the bar plot using Plotly, one trace per artist
    fig = go.Figure()
    for i, (artist, popularity) in enumerate(top_artists.items()):
        fig.add_trace(go.Bar(
            x=[artist],
            y=[popularity],
            name=artist,
            marker=dict(
                color=color_palette[i % len(color_palette)],  # Use a color from the palette
            ),
            text=[f'{origins[artist]}'],  # Display the artist's country inside the bar
            textposition='inside',
            hovertemplate="<b>%{x}</b><br>Popularity: %{y}<extra></extra>",
            textfont=dict(
                size=30,  # Set the font size for the text inside the bars
                color='white',
                family='Times New Roman'
            ),
        ))

    if top_artists.empty:
        # Some year/country combinations have no rows; say so instead of
        # showing a blank chart.
        fig.add_annotation(
            text='No data available for this selection',
            xref='paper', yref='paper', x=0.5, y=0.5,
            showarrow=False, font=dict(size=24),
        )

    fig.update_layout(
        title={
            'text': title,
            'font': {'size': 24}
        },
        xaxis=dict(
            title='Artists',
            tickangle=45,
            automargin=True,
            showgrid=False,
            tickfont={'size': 18},
            title_font=dict(size=20),
        ),
        yaxis=dict(
            title='Popularity',
            tickmode='linear',
            tick0=0,
            dtick=5,
            showgrid=False,
            tickfont={'size': 18},
            title_font=dict(size=20),
        ),
        margin=dict(l=0, r=0, t=100, b=0),
        showlegend=False,
        width=1800,
        height=900,
    )
    #fig.write_image(f'{title}.png')
    fig.show()
# Get the unique years and countries in the data
years = ['All'] + df['year'].unique().tolist()
countries = sorted(['All'] + df['artist_origin'].unique().tolist())

# Create the interactive plot
interact(create_plot, year=years, country=countries)
plt.show()
# def create_plot(year, country):
# # Filter the data for the selected year and country
# if country != 'All':
# if year !='All':
# filtered_data = df[(df['year'] == year) & (df['artist_origin'] == country)]
# num = 5 if filtered_data.shape[0] > 5 else filtered_data.shape[0] #For some years we have even less than 5 artists
# # print(filtered_data.shape[0])
# title = f'Top {country} Artists in Year {year}'
# else:
# filtered_data = df[df['artist_origin'] == country]
# num = 5 if filtered_data.shape[0] > 5 else filtered_data.shape[0] #For some years we have even less than 5 artists
# min_year = np.min(filtered_data.year)
# max_year = np.max(filtered_data.year)
# # print(filtered_data.shape[0])
# title = f'Top Artists from {country} across {min_year} - {max_year}'
# else:
# if year!='All':
# filtered_data = df[df['year'] == year]
# num = 5 if filtered_data.shape[0] > 5 else filtered_data.shape[0] #For some years we have even less than 5 artists
# # print(filtered_data.shape[0])
# title = f'Top Artists from all countries in Year {year}'
# else:
# filtered_data = df
# num = 5 if filtered_data.shape[0] > 5 else filtered_data.shape[0] #For some years we have even less than 5 artists
# min_year = np.min(filtered_data.year)
# max_year = np.max(filtered_data.year)
# # print(filtered_data.shape[0])
# title = f'Top Artists from all countries across {min_year} - {max_year}'
# # Group the data by artists and calculate the mean popularity per artist
# grouped = filtered_data.groupby('artists')['popularity'].mean().reset_index()
# # Sort the data by popularity in descending order
# sorted_data = grouped.sort_values(by='popularity', ascending=False).head(5)
# # Create the bar plot using Seaborn
# plt.figure(figsize=(24, 12))
# sns.barplot(data=sorted_data, x='artists', y='popularity', palette='muted')
# plt.title(title)
# plt.xlabel('Artists')
# plt.ylabel('Popularity')
# plt.xticks(rotation=45)
# plt.yticks(range(0, int(sorted_data['popularity'].max()) + 6, 5)) # Set y-axis ticks with a fixed increment of 5
# # Add artist origin labels when 'All' is selected
# if country == 'All':
# for i, bar in enumerate(plt.gca().patches):
# artist_name = sorted_data.iloc[i, 0]
# artist_origin = df.loc[df['artists'] == artist_name, 'artist_origin'].iloc[0]
# plt.gca().text(bar.get_x() + bar.get_width() / 2, bar.get_height() / 2, artist_origin, ha='center', va='center', rotation=90)
# plt.tight_layout()
# plt.savefig(title,bbox_inches = 'tight' ,dpi=150)
# plt.show()
# # Get the unique years and countries in the data
# years = ['All'] + df['year'].unique().tolist()
# countries = sorted(['All'] + df['artist_origin'].unique().tolist())
# # Create the interactive plot
# interact(create_plot, year=years, country=countries)
# plt.show()
Plot 3: Distribution of High-Feature-Value Songs across Years and Origins, Classified by Genre
Context
- Identifying the top 5 songs with the highest value of a particular feature across countries and time frames
- We can also identify which genres have the highest feature values across years and time frames
Caution:
- Feature values are obtained by aggregation, since songs are published multiple times, possibly in different playlists
- For a few countries we don’t get a full top 5 because not enough data is available
import plotly.express as px
import seaborn as sns
import numpy as np
from ipywidgets import interact
def create_plot(year, country, feature, top_n=5):
    """Display the ``top_n`` songs with the highest mean value of ``feature``.

    Bars are coloured by the song's first two genres and labelled with the
    artist's country of origin and the feature value.

    Parameters
    ----------
    year : int or 'All'
        Restrict to one release year, or ``'All'`` for every year.
    country : str or 'All'
        Restrict to one ``artist_origin``, or ``'All'`` for every country.
    feature : str
        Name of the numeric/boolean column of ``df`` to rank songs by.
    top_n : int, default 5
        Maximum number of songs to show.

    Reads the module-level ``df``; shows the figure and returns ``None``.
    """
    # Only the columns needed for this plot (selecting a list of columns
    # already yields a new frame, so no explicit copy of df is required).
    data_dummy = df.loc[:, ['name', feature, 'artist_origin', 'year', 'genres']]

    # Filter the data for the selected countries and years
    if country != 'All':
        if year != 'All':
            filtered_data = data_dummy[(data_dummy['year'] == year) & (data_dummy['artist_origin'] == country)]
            title = f'Top {feature.capitalize()} {country} Songs in Year {year}'
        else:
            filtered_data = data_dummy[data_dummy['artist_origin'] == country]
            min_year = np.min(filtered_data.year)
            max_year = np.max(filtered_data.year)
            title = f'Top {feature.capitalize()} Songs from {country} across {min_year} - {max_year}'
    else:
        if year != 'All':
            filtered_data = data_dummy[data_dummy['year'] == year]
            title = f'Top {feature.capitalize()} Songs from all countries in Year {year}'
        else:
            filtered_data = data_dummy
            min_year = np.min(filtered_data.year)
            max_year = np.max(filtered_data.year)
            title = f'Top {feature.capitalize()} Songs from all countries across {min_year} - {max_year}'

    # Mean of the feature per (song, genres) pair
    grouped = filtered_data.groupby(['name', 'genres'])[feature].mean().reset_index()

    # Only taking unique song names (the first genre group of each name is kept)
    grouped = grouped.drop_duplicates(subset='name')

    # Rank songs by the feature; copy so the edits below act on an
    # independent frame rather than a slice of `grouped`.
    sorted_data = grouped.sort_values(by=feature, ascending=False).head(top_n).copy()

    # Only using first 2 genres for songs
    sorted_data['genres'] = sorted_data['genres'].apply(lambda x: ';'.join(x.split(';')[0:2]))

    # Retrieve the muted color palette from Seaborn
    color_palette = sns.color_palette("muted").as_hex()

    # Label each bar with the artist origin. Look it up in the *filtered*
    # rows so that a same-named song from another country or year cannot
    # supply the label.
    origin_by_song = (
        filtered_data.drop_duplicates(subset='name')
        .set_index('name')['artist_origin']
    )
    sorted_data['text'] = sorted_data['name'].map(origin_by_song)

    # Create the bar plot using Plotly Express
    fig = px.bar(
        sorted_data,
        x='name',
        y=feature,
        color='genres',
        color_discrete_sequence=color_palette,
        text='text',
        title=title,
        template='plotly_white',
        # Colouring by genre splits the bars into traces; pin the x order so
        # the bars stay ranked from highest to lowest.
        category_orders={'name': sorted_data['name'].tolist()},
    )

    fig.update_traces(texttemplate='%{text}<br>%{y:.2f}', textposition='inside',textfont=dict(color = 'white',family = 'Times New Roman',size=30),textangle=0)

    fig.update_layout(
        title={
            'text': title,
            'font': {
                'size': 24  # Set the font size for the title
            }
        },
        xaxis=dict(
            tickfont=dict(size=14),  # Set the font size for x-axis labels and ticks
            title='Name of Songs',
            title_font=dict(size=16),  # Set the font size for x-axis title
        ),
        yaxis=dict(
            tickfont=dict(size=14),  # Set the font size for y-axis labels and ticks
            title=f'{feature.capitalize()}',
            title_font=dict(size=16),  # Set the font size for y-axis title
        ),
        height=600,
        margin=dict(l=50, r=50, t=80, b=50),
    )
    #fig.write_image(f'{title}.png',scale = 6)
    fig.show()
# Get the unique years and countries in the data
years = ['All'] + df['year'].unique().tolist()
countries = sorted(['All'] + df['artist_origin'].unique().tolist())
features = ['danceability', 'energy', 'acousticness', 'instrumentalness', 'key', 'liveness', 'loudness', 'mode',
            'popularity', 'speechiness', 'tempo', 'valence', 'duration']
tops = [5, 10, 15]

# Create the interactive plot
interact(create_plot, year=years, country=countries, feature=features, top_n=tops)
plt.show()
# def create_plot(year, country, feature, top_n=5):
# # Filter the data for the selected feature then for countries and years
# data_dummy = df.copy()
# data_dummy = data_dummy.loc[:,['name',feature,'artist_origin','year','genres']]
# if country != 'All':
# if year !='All':
# filtered_data = data_dummy[(data_dummy['year'] == year) & (data_dummy['artist_origin'] == country)]
# num = top_n if filtered_data.shape[0] > top_n else filtered_data.shape[0] #For some years we have even less than 5 artists
# title = f'Top {feature.capitalize()} {country} Songs in Year {year}'
# else:
# filtered_data = data_dummy[data_dummy['artist_origin'] == country]
# num = top_n if filtered_data.shape[0] > top_n else filtered_data.shape[0] #For some years we have even less than 5 artists
# min_year = np.min(filtered_data.year)
# max_year = np.max(filtered_data.year)
# title = f'Top {feature.capitalize()} Songs from {country} across {min_year} - {max_year}'
# else:
# if year!='All':
# filtered_data = data_dummy[data_dummy['year'] == year]
# num = top_n if filtered_data.shape[0] > top_n else filtered_data.shape[0] #For some years we have even less than 5 artists
# title = f'Top {feature.capitalize()} Songs from all countries in Year {year}'
# else:
# filtered_data = data_dummy
# num = top_n if filtered_data.shape[0] > top_n else filtered_data.shape[0] #For some years we have even less than 5 artists
# min_year = np.min(filtered_data.year)
# max_year = np.max(filtered_data.year)
# title = f'Top {feature.capitalize()} Songs from all countries across {min_year} - {max_year}'
# # Group the data by artists and calculate the mean popularity per artist
# grouped = filtered_data.groupby(['name','genres'])[feature].mean().reset_index()
# # Only taking unique song names
# grouped = grouped.drop_duplicates(subset='name')
# # Group the data by name of songs and calculate the mean popularity per song
# sorted_data = grouped.sort_values(by=feature, ascending=False).head(top_n)
# # Only using first 2 genres for songs
# sorted_data.genres = sorted_data.genres.apply(lambda x: ';'.join(x.split(';')[0:2]))
# # ... existing code ...
# # Create the bar plot using Seaborn
# plt.figure(figsize=(24, 12))
# # Calculate the bar width
# bar_width = 3/sorted_data.shape[0]
# sns.barplot(
# data=sorted_data,
# x='name',
# y=feature,
# palette='muted',
# hue='genres',
# width=bar_width # Set the bar width
# )
# for i, bar in enumerate(plt.gca().patches):
# if i < top_n:
# mean_value = sorted_data.iloc[i, 2]
# song_name = sorted_data.iloc[i, 0]
# origin = df.loc[df['name'] == song_name, 'artist_origin'].iloc[0]
# if country == 'All':
# plt.gca().text(
# bar.get_x() + bar.get_width() / 2,
# bar.get_height() / 2,
# f'{mean_value:.2f} in {origin}',
# ha='center',
# va='center',
# rotation=90
# )
# else:
# plt.gca().text(
# bar.get_x() + bar.get_width() / 2,
# bar.get_height() / 2,
# f'{mean_value:.2f}',
# ha='center',
# va='center'
# )
# # ... existing code ...
# # Customize the x-axis ticks
# xticks = np.arange(len(sorted_data)) #+ (bar_width/2.7) # Add half of the bar width
# plt.xticks(xticks, sorted_data['name'], rotation=36)
# plt.gca().xaxis.set_tick_params(width=1) # Set the tick width
# # ... existing code ...
# plt.xlabel('Name of Songs')
# plt.ylabel(f'{feature.capitalize()}')
# plt.title(title)
# plt.tight_layout()
# plt.savefig(title, bbox_inches='tight', dpi=150)
# plt.show()
# # Get the unique years and countries in the data
# years = ['All'] + df['year'].unique().tolist()
# countries = sorted(['All'] + df['artist_origin'].unique().tolist())
# features = ['danceability', 'energy', 'acousticness','instrumentalness', 'key', 'liveness', 'loudness', 'mode', 'popularity','speechiness', 'tempo', 'valence', 'duration']
# tops = [5,10,15]
# # Removed Explicit: Whether or not the track has explicit lyrics (true = yes it does; false = no it does not OR unknown); Not that important
# # Removed Year as it's not a feature for a song
# # Create the interactive plot
# interact(create_plot, year=years,country=countries, feature = features, top_n=tops)
# plt.show()
Plot 4: Distribution of Features across countries
- Able to answer questions such as which countries have the most danceable songs, the loudest songs, and so on in a given time frame.
def update_plot(feature):
    """Display and save a world choropleth of the mean of ``feature`` per country.

    Parameters
    ----------
    feature : str
        Column of ``df`` whose per-country mean is mapped.

    Returns
    -------
    str
        The plot title, which is also the stem of the PNG file written to
        the current working directory.

    Reads the module-level ``df``.
    """
    # Group the data by artist origin and calculate the mean of the selected feature per country
    feature_means = df.groupby('artist_origin')[feature].mean()

    # NOTE(review): `gpd.datasets` is reported as deprecated/removed in newer
    # geopandas releases — confirm the installed version still provides it.
    world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))

    # Merge the data with the world shapefile based on the country names.
    # This is an inner join: origins whose name does not exactly match a
    # shapefile country name are dropped from the map.
    merged_data = world.merge(feature_means, left_on='name', right_index=True)

    # Create a choropleth map using plotly
    fig = go.Figure(data=go.Choropleth(
        locations=merged_data['iso_a3'],
        z=merged_data[feature],
        text=merged_data['name'],
        colorscale='Viridis',
        autocolorscale=False,
        marker_line_color='white',
        marker_line_width=0.5,
        colorbar_title=f'{feature.capitalize()}'
    ))
    title = f'{feature.capitalize()} Distribution by Country'

    # Customize the layout
    fig.update_layout(
        title_text=title,
        height=900,
        width=1800,
        geo=dict(
            showframe=False,
            showcoastlines=False,
            projection_type='equirectangular'
        )
    )
    # Save the plot
    fig.write_image(f'{title}.png')  # Save as PNG image

    fig.show()
    return title
features = ['danceability', 'energy', 'acousticness','instrumentalness', 'key', 'liveness', 'loudness', 'mode', 'popularity','speechiness', 'tempo', 'valence', 'duration']
# Use interact to create the interactive dropdown widget
interact(update_plot, feature=Dropdown(options=features))
plt.show()
Rest Good Plots
import plotly.graph_objects as go
# Function to create the interactive histogram plot with KDE
def plot_histogram(feature):
    """Display per-year histograms of ``feature`` with a KDE curve for each year.

    Parameters
    ----------
    feature : str
        Numeric or boolean column of ``df`` to plot.

    Reads the module-level ``df``; shows the figure and returns ``None``.
    """
    fig = go.Figure()

    for year in df['year'].unique():
        # Cast to float so boolean/integer features are handled uniformly.
        year_data = df.loc[df['year'] == year, feature].astype('float64').to_numpy()

        # 'probability density' normalises the histogram area to 1, putting
        # it on the same scale as the KDE curve.
        fig.add_trace(go.Histogram(x=year_data, nbinsx=30, histnorm='probability density',
                                   name=f'Year {year}', opacity=0.7))

        curve = _gaussian_kde_curve(year_data)
        if curve is not None:
            grid, density = curve
            fig.add_trace(go.Scatter(x=grid, y=density, mode='lines', name=f'KDE - Year {year}'))

    title = f'{feature.capitalize()} Distribution by Year'

    fig.update_layout(
        title={
            'text': title,
            'font': {'size': 24}
        },
        xaxis=dict(
            title=feature.capitalize(),  # the x-axis holds raw feature values
            tickfont={'size': 18},
            title_font=dict(size=20),
        ),
        yaxis=dict(
            title='Density',
            tickfont={'size': 18},
            title_font=dict(size=20),
        ),
        margin=dict(l=0, r=0, t=100, b=0),
        showlegend=True,
        barmode='overlay',  # draw the semi-transparent histograms on top of each other
        # width=1500,
        height=900,
    )
    fig.show()


def _gaussian_kde_curve(values, points=200):
    """Return ``(grid, density)`` of a Gaussian KDE of ``values``, or ``None``.

    Uses the rule-of-thumb bandwidth ``1.06 * std * n ** (-1/5)``. ``None``
    is returned when there are fewer than two finite values or they are all
    equal, because no bandwidth can be derived in those cases.
    """
    values = values[np.isfinite(values)]
    n = values.size
    if n < 2:
        return None
    spread = values.std(ddof=1)
    if spread == 0:
        return None
    bandwidth = 1.06 * spread * n ** (-1 / 5)
    grid = np.linspace(values.min(), values.max(), points)
    # (points x n) matrix of standardised distances between grid and samples.
    z = (grid[:, None] - values[None, :]) / bandwidth
    density = np.exp(-0.5 * z ** 2).sum(axis=1) / (n * bandwidth * np.sqrt(2 * np.pi))
    return grid, density
# Create the interactive dropdown widget using interact
interact(plot_histogram, feature=features)
plt.show()
# Distribution of the features over the years (aggregated); the plot below shows this in more detail
# Define the list of features
features = ['acousticness', 'danceability', 'duration', 'energy', 'explicit',
            'instrumentalness', 'liveness', 'loudness', 'popularity', 'speechiness',
            'tempo', 'valence']
# Function to create the interactive histogram plot
def plot_histogram(feature):
    """Display and save a stacked seaborn histogram of ``feature``, split by year.

    Parameters
    ----------
    feature : str
        Column of ``df`` to plot.

    Reads the module-level ``df``. The figure is saved under the plot title
    in the current working directory, then shown. Returns ``None``.
    """
    # Create the histogram plot using seaborn
    plt.figure(figsize=(20, 12))
    ax = sns.histplot(data=df, x=feature, bins=30, kde=True, hue='year', multiple='stack', palette='viridis')
    title = f'{feature.capitalize()} Distribution by Year'

    plt.title(title)
    plt.xlabel(feature.capitalize())  # the x-axis holds raw feature values
    plt.ylabel('Frequency')

    # Retitle the legend seaborn created for `hue`. Calling plt.legend()
    # here would replace it with an empty legend, because the histogram
    # artists carry no labels.
    legend = ax.get_legend()
    if legend is not None:
        legend.set_title('Year')

    plt.savefig(title, bbox_inches='tight', dpi=150)
    plt.show()
# Create the interactive dropdown widget using interact
interact(plot_histogram, feature=Dropdown(options=features, description='Select Feature'))
plt.show()
import plotly.graph_objects as go
# Function to create the interactive histogram plot
def plot_histogram(feature):
    """Display a bar chart of the mean of ``feature`` for each release year.

    Parameters
    ----------
    feature : str
        Numeric or boolean column of ``df`` to average.

    Reads the module-level ``df``; shows the figure and returns ``None``.
    """
    # Group the data by year and calculate the mean of the selected feature per year
    feature_means = df.groupby('year')[feature].mean()

    # Create the bar trace using Plotly
    bar_trace = go.Bar(x=feature_means.index, y=feature_means, marker_color='purple')

    # Set the plot title and labels
    title = f'Mean {feature.capitalize()} by Year'
    x_label = 'Year'
    y_label = 'Mean Value'

    # One layout for the whole figure. The axis titles use x_label/y_label:
    # a second update_layout previously overwrote them with 'Mean Value' and
    # 'Density' and switched the legend back on for this single-trace chart.
    layout = go.Layout(
        title={
            'text': title,
            'font': {'size': 24}
        },
        xaxis=dict(
            title=x_label,
            tickangle=45,
            automargin=True,
            tickfont={'size': 18},
            title_font=dict(size=20),
        ),
        yaxis=dict(
            title=y_label,
            tickfont={'size': 18},
            title_font=dict(size=20),
        ),
        margin=dict(l=0, r=0, t=100, b=0),
        showlegend=False,
        # width=1500,
        height=900,
    )

    # Create the figure and add the bar trace
    fig = go.Figure(data=[bar_trace], layout=layout)

    # Show the figure
    fig.show()
# Create the interactive dropdown widget using interact
interact(plot_histogram, feature=features)
plt.show()
# # Read the data into a pandas DataFrame
# # data = df # Replace 'your_data_file.csv' with your actual data file path
# # Define the list of features
# features = ['acousticness', 'danceability', 'duration', 'energy', 'explicit',
# 'instrumentalness', 'liveness', 'loudness', 'popularity', 'speechiness',
# 'tempo', 'valence']
# # Define the number of light colors to select
# num_colors = len(features)
# desired_colors = ['tab:purple', 'tab:orange', 'tab:green', 'tab:blue']
# # Function to create the interactive histogram plot
# def plot_histogram(feature):
# # Select random light colors from 'tab10' palette
# # palette = random.sample(sns.color_palette('tab10', 10)[:5], 1)
# # Group the data by year and calculate the mean of the selected feature per year
# feature_means = df.groupby('year')[feature].mean()
# # Select a random light color from the palette
# color = random.choice(desired_colors)
# fig, ax = plt.subplots(1,1, figsize = (20,12),dpi = 120)
# # Create the bar plot for the means using seaborn
# ax = sns.barplot(x=feature_means.index, y=feature_means, color=color)
# # Set the plot title and labels
# title = f'Mean {feature.capitalize()} by Year'
# ax.set_title(title)
# ax.set_xlabel('Year')
# ax.set_ylabel('Mean Value')
# # Rotate and evenly space the year labels on the x-axis
# ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
# ax.xaxis.set_tick_params(pad=10)
# # Show the plot
# plt.tight_layout()
# plt.savefig(title, bbox_inches='tight', dpi=150)
# plt.show()
# # Create the interactive dropdown widget using interact
# interact(plot_histogram, feature=Dropdown(options=features, description='Select Feature'))
# plt.show()
Rough Plots
# ac = df.copy()
# # ac[['genre_1', 'genre_2', 'genre_3','rest_genres']] = ac.genres.str.split(';', n=3, expand=True)
# ac.isnull().sum()
# import geopandas as gpd
# import pandas as pd
# import plotly.graph_objects as go
# from ipywidgets import interact, Dropdown
# # Read the world shapefile
# world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))
# # Read the data containing country and genre information
# data = ac
# # Define the function to update the plot based on the selected year
# def update_plot(year):
# # Filter the data for the selected year
# filtered_data = data[data['year'] == year]
# # Calculate the genre distribution by country
# genre_distribution = filtered_data.groupby('artist_origin')['genres'].value_counts().unstack().reset_index().fillna(0)
# # Merge the data with the world shapefile based on the country names
# merged_data = world.merge(genre_distribution, left_on='name', right_on='artist_origin')
# # Create a stacked bar plot using plotly
# fig = go.Figure()
# for genre in genre_distribution.columns[1:]:
# fig.add_trace(go.Choropleth(
# locations=merged_data['iso_a3'],
# z=merged_data[genre],
# text=merged_data['name'],
# colorscale='Viridis',
# autocolorscale=False,
# marker_line_color='white',
# marker_line_width=0.5,
# colorbar_title='Count',
# name=genre
# ))
# # Customize the layout
# fig.update_layout(
# title_text=f'Genre Distribution by Country in {year}',
# height=1200,
# width=1500,
# barmode='stack',
# geo=dict(
# showframe=False,
# showcoastlines=False,
# projection_type='equirectangular'
# )
# )
# # Show the plot
# fig.show()
# # Get the unique years from the data
# years = data['year'].unique().tolist()
# # Use interact to create the interactive dropdown widget
# interact(update_plot, year=Dropdown(options=years))
# plt.show()