Spotify World Insights Viz

Data Viz
A data visualization project from a university course.

Installing Libraries

# %%capture
# #!pip install squarify geopandas plotly ipython ipywidgets dash

Importing Libraries

import random
import squarify
import pandas as pd
import numpy as np
import seaborn as sns
sns.set_context("talk", font_scale=1.2)
sns.set_style('white')
import geopandas as gpd
import matplotlib.pyplot as plt
%matplotlib inline
import plotly.graph_objects as go
from IPython.display import HTML
from ipywidgets import interact, Dropdown
# from googlesearch import search

Importing the Dataset and Initial Processing

data = pd.read_csv("~/side_projects/Projects/DataVizProject/spotify_with_locations.csv", index_col=0)
data.shape
(225456, 28)
data.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 225456 entries, 0 to 225455
Data columns (total 28 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   acousticness      225456 non-null  float64
 1   artists           225444 non-null  object 
 2   danceability      225456 non-null  float64
 3   duration_ms       225456 non-null  int64  
 4   energy            225456 non-null  float64
 5   explicit          225456 non-null  int64  
 6   id                225456 non-null  object 
 7   instrumentalness  225456 non-null  float64
 8   key               225456 non-null  int64  
 9   liveness          225456 non-null  float64
 10  loudness          225456 non-null  float64
 11  mode              225456 non-null  int64  
 12  name              225456 non-null  object 
 13  popularity        225456 non-null  int64  
 14  release_date      225456 non-null  object 
 15  speechiness       225456 non-null  float64
 16  tempo             225456 non-null  float64
 17  valence           225456 non-null  float64
 18  year              225456 non-null  int64  
 19  mbid              181952 non-null  object 
 20  artist_lastfm     177791 non-null  object 
 21  country_mb        171605 non-null  object 
 22  country_lastfm    132978 non-null  object 
 23  tags_mb           142502 non-null  object 
 24  tags_lastfm       160841 non-null  object 
 25  listeners_lastfm  177791 non-null  float64
 26  scrobbles_lastfm  177791 non-null  float64
 27  ambiguous_artist  181952 non-null  object 
dtypes: float64(11), int64(6), object(11)
memory usage: 49.9+ MB
# How much of the data is null
data.isnull().sum()
acousticness            0
artists                12
danceability            0
duration_ms             0
energy                  0
explicit                0
id                      0
instrumentalness        0
key                     0
liveness                0
loudness                0
mode                    0
name                    0
popularity              0
release_date            0
speechiness             0
tempo                   0
valence                 0
year                    0
mbid                43504
artist_lastfm       47665
country_mb          53851
country_lastfm      92478
tags_mb             82954
tags_lastfm         64615
listeners_lastfm    47665
scrobbles_lastfm    47665
ambiguous_artist    43504
dtype: int64
# Removing columns which are not useful
data = data.drop(['mbid','country_lastfm', 'tags_mb', 'listeners_lastfm','scrobbles_lastfm','id', 'ambiguous_artist','artist_lastfm'], axis = 1)
data = data.rename(columns = {'country_mb':'artist_origin','tags_lastfm':'genres'})
# Now checking the null counts and dropping rows with null values in any of the remaining columns
df_missing = data.isnull().sum(axis = 0).reset_index()
df_missing.columns = ['column_name', 'missing_count']
df_missing['missing_ratio'] = df_missing['missing_count']/data.shape[0]
df_missing.query('missing_ratio>0').sort_values(by = 'missing_ratio', ascending = False)
column_name missing_count missing_ratio
19 genres 64615 0.286597
18 artist_origin 53851 0.238854
1 artists 12 0.000053
df_cleaned = data.dropna(subset = ['artist_origin','artists','genres'],axis = 0)
df_cleaned.isna().sum(axis = 0)
acousticness        0
artists             0
danceability        0
duration_ms         0
energy              0
explicit            0
instrumentalness    0
key                 0
liveness            0
loudness            0
mode                0
name                0
popularity          0
release_date        0
speechiness         0
tempo               0
valence             0
year                0
artist_origin       0
genres              0
dtype: int64
# Creating a copy of data to work on
df = df_cleaned.copy()
df.shape
(156512, 20)
# Some processing: keeping only songs released after 1979 (i.e. 1980 onward), as discussed
min_year = 1979
df = df.query("year > @min_year").copy()
# Convert duration_ms to duration in seconds
df['duration'] = df['duration_ms'] / 1000

# Drop the duration_ms column
df.drop('duration_ms', axis=1, inplace=True)
df.shape
(73887, 20)
df.describe()
acousticness danceability energy explicit instrumentalness key liveness loudness mode popularity speechiness tempo valence year duration
count 73887.000000 73887.000000 73887.000000 73887.000000 73887.000000 73887.000000 73887.000000 73887.000000 73887.000000 73887.000000 73887.000000 73887.000000 73887.000000 73887.000000 73887.000000
mean 0.270194 0.572947 0.623204 0.162004 0.094045 5.261115 0.200284 -8.982739 0.683733 47.005820 0.088800 120.083159 0.520840 1999.056816 247.873261
std 0.304551 0.178310 0.244244 0.368457 0.241324 3.562242 0.175176 5.129790 0.465022 12.571747 0.101303 30.003903 0.254652 11.332604 88.925993
min 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 -60.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1980.000000 30.080000
25% 0.020500 0.456000 0.462500 0.000000 0.000000 2.000000 0.093100 -11.216000 0.000000 38.000000 0.034500 96.013500 0.318000 1989.000000 200.095500
50% 0.132000 0.587000 0.661000 0.000000 0.000024 5.000000 0.130000 -7.688000 1.000000 46.000000 0.046600 117.956000 0.526000 1999.000000 236.253000
75% 0.458000 0.705000 0.822000 0.000000 0.006290 8.000000 0.258000 -5.440000 1.000000 55.000000 0.090500 139.557000 0.729000 2009.000000 279.933000
max 0.996000 0.986000 1.000000 1.000000 0.999000 11.000000 0.998000 1.342000 1.000000 100.000000 0.960000 244.091000 0.998000 2020.000000 3816.373000
dtypes = {
    'acousticness': 'float32',
    'danceability': 'float32',
    'energy': 'float32',
    'explicit': 'bool',
    'instrumentalness': 'float32',
    'key': 'uint8',
    'liveness': 'float32',
    'loudness': 'float32',
    'mode': 'bool',
    'popularity': 'float32',
    'speechiness': 'float32',
    'tempo': 'float32',
    'valence': 'float32',
    'year': 'uint16',
    'duration': 'float32',
    'artists': 'string',
    'name': 'string',
    'release_date': 'string',
    'genres': 'string',
    'artist_origin': 'string'
}
df = df.astype(dtypes) #Changing the dtypes for efficient processing.
df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 73887 entries, 5920 to 225454
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   acousticness      73887 non-null  float32
 1   artists           73887 non-null  string 
 2   danceability      73887 non-null  float32
 3   energy            73887 non-null  float32
 4   explicit          73887 non-null  bool   
 5   instrumentalness  73887 non-null  float32
 6   key               73887 non-null  uint8  
 7   liveness          73887 non-null  float32
 8   loudness          73887 non-null  float32
 9   mode              73887 non-null  bool   
 10  name              73887 non-null  string 
 11  popularity        73887 non-null  float32
 12  release_date      73887 non-null  string 
 13  speechiness       73887 non-null  float32
 14  tempo             73887 non-null  float32
 15  valence           73887 non-null  float32
 16  year              73887 non-null  uint16 
 17  artist_origin     73887 non-null  string 
 18  genres            73887 non-null  string 
 19  duration          73887 non-null  float32
dtypes: bool(2), float32(11), string(5), uint16(1), uint8(1)
memory usage: 6.8 MB
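As an optional sanity check (not part of the original pipeline), the per-column memory footprint after the dtype conversion can be inspected directly:

# Optional check: memory per column in MB (deep=True also counts string contents)
df.memory_usage(deep=True) / 1024**2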
# df.to_csv("PowerBI_Spotify_Cleaned_data",index = 0)
# grouped_df = df.groupby("name", sort=False).apply(lambda x: x.reset_index(drop=True))
# grouped_df[grouped_df['artists'] == 'Sade']
  • Observing this, we can see that the same song is released in multiple years, possibly by different producers or on different albums.
  • So, before plotting, we group each feature by song and take its mean value (a minimal sketch of this aggregation follows below).
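For example, the per-song aggregation idea looks roughly like this (a minimal sketch; 'popularity' is only an illustrative feature and this is not the exact call used in the plots below):

# Illustrative only: average a feature over duplicate releases of the same song
song_means = df.groupby('name', as_index=False)['popularity'].mean()
song_means.head()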

Plot 1: Distribution of Artist Origins across years

Context

  • Which country do most artists come from, for a given year or across all years?

Caution

  • For any year, all countries that account for less than 1% of total origins are grouped into an 'Other' category
import plotly.graph_objects as go
import seaborn as sns
import numpy as np

def treemap(year):
    if year != 'All':
        data = df.query('year == @year')
        title = f'Artist Origins Distribution for {year} - Treemap (Thresholded)'
    else:
        data = df
        min_year = np.min(df['year'])
        max_year = np.max(df['year'])
        title = f'Artist Origins Distribution from {min_year} to {max_year} - Treemap (Thresholded)'

    # Set the threshold count
    threshold = 0.01 * data.shape[0]

    # Get the counts of artist origins
    artist_counts = data['artist_origin'].value_counts()

    # Identify countries with counts less than the threshold
    other_countries = artist_counts[artist_counts < threshold]

    # Sum the counts of other countries
    other_count = other_countries.sum()

    # Exclude other countries from the main data
    filtered_counts = artist_counts.drop(other_countries.index)

    # Add the other count as a separate entry
    filtered_counts['Other'] = other_count

    # Calculate the total size
    total_size = filtered_counts.sum()

    # Create the treemap
    labels = filtered_counts.index.tolist()
    values = filtered_counts.tolist()

    # Retrieve the muted Seaborn palette as hex strings (Plotly expects CSS-style colors)
    color_palette = sns.color_palette("muted", n_colors=len(labels)).as_hex()

    fig = go.Figure(go.Treemap(
        labels=labels,
        parents=[""] * len(labels),
        values=values,
        textinfo="label+value+percent parent",
        marker_colors=color_palette[:len(labels)],
        hovertemplate="<b>%{label}</b><br>Count: %{value}<br>Percentage: %{percentParent:.1%}<extra></extra>",
        textposition='middle center',  # Center the label text in the treemap cells
        textfont=dict(
            size=30,  # Set the font size for the text inside the treemap
            color='black',
            family='Times New Roman'
        )
    ))

    fig.update_layout(
          title={
            'text': title,
            'font': {
                'size': 24 # Set the font size for the title
            }
        },
        margin=dict(l=0, r=0, t=40, b=0),
        width=1800,
        height=900,
    )
    #fig.write_image(f'{title}.png')
    fig.show()

years = ['All'] + df['year'].unique().tolist()

# Create the interactive plot
interact(treemap, year=years)
<function __main__.treemap(year)>
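Note: the commented-out fig.write_image(...) call above exports a static PNG through Plotly's image engine, which requires the kaleido package; a minimal sketch, assuming kaleido is available:

# !pip install kaleido                        # one-time setup for Plotly static export
# fig.write_image(f'{title}.png', scale=2)    # uncomment inside treemap() to save each view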
# def treemap(year):
#     if year != 'All':
#         data = df.query('year == @year')
#         title = f'Artist Origins Distribution for {year} - Treemap (Thresholded)'
#     else:
#         data = df
#         min_year = np.min(df['year'])
#         max_year = np.max(df['year'])
#         title = f'Artist Origins Distribution from {min_year} to {max_year} - Treemap (Thresholded)'

#     # Set the threshold count
#     threshold = 0.01 * data.shape[0]

#     # Get the counts of artist origins
#     artist_counts = data['artist_origin'].value_counts()

#     # Identify countries with counts less than the threshold
#     other_countries = artist_counts[artist_counts < threshold]

#     # Sum the counts of other countries
#     other_count = other_countries.sum()

#     # Exclude other countries from the main data
#     filtered_counts = artist_counts.drop(other_countries.index)

#     # Add the other count as a separate entry
#     filtered_counts['Other'] = other_count

#     # Calculate the total size
#     total_size = filtered_counts.sum()

#     # Create the treemap
#     plt.figure(figsize=(24,12))
#     squarify.plot(sizes=filtered_counts, label=filtered_counts.index, alpha=0.8, color=sns.color_palette("muted"))
#     plt.axis('off')

#     # Add percentage labels
#     for i, patch in enumerate(plt.gca().patches):
#         x, y, dx, dy = patch.get_bbox().bounds
#         label = filtered_counts.index[i]
#         size = filtered_counts[label]
#         percentage = (size / total_size) * 100
#         plt.gca().text(x + dx / 2, y + dy / 2, f'\n\n{percentage:.1f}%', ha='center', va='center')
    
#     plt.title(title)
#     plt.savefig(title,bbox_inches = 'tight' ,dpi=150)
#     plt.show()

# years = ['All'] + df['year'].unique().tolist()

# # Create the interactive plot
# interact(treemap, year=years)
# plt.show()

Plot 1 Alternative

# # Read the data into a pandas DataFrame
# # data = df  # Replace 'your_data_file.csv' with your actual data file path

# # Group the data by artist origin and count the number of artists from each region
# artist_counts = df['artist_origin'].value_counts()

# # Normalize the counts by dividing by the maximum count value
# # normalized_counts = artist_counts / artist_counts.max()

# # Create the choropleth map using plotly.graph_objects
# fig = go.Figure(data=go.Choropleth(
#     locations=artist_counts.index,
#     z=artist_counts.values,
#     locationmode='country names',
#     colorscale='ylorrd',
#     reversescale=False,  # Reverse the colorscale to have darker blue for higher counts
# #     zmin=0,  # Set the minimum value for the color scale
# #     zmax=1,  # Set the maximum value for the color scale
# ))

# # Set the layout properties
# fig.update_layout(
#     title_text='Artist Origins Count by Country from 1980 to 2020',
#     height = 1200,
#     width = 1500,
#     geo=dict(
#         showframe=False,
#         showcoastlines=False,
#         projection_type='equirectangular'
#     ),
# )

# # Show the figure
# fig.show()
# plt.show()

Plot 3: Distribution of High Feature-Value Songs across Years and Origins, Classified by Genre

Context

  • Identify the top 5 songs with the highest value of a chosen feature for a given country and time frame
  • We can also see which genres produce the most high-feature-value songs across years and countries

Caution:

  • Feature values are obtained by aggregating (averaging), since the same song is published multiple times, possibly on different playlists or albums
  • For some countries we don't get a full top 5 because not enough data is available
import plotly.express as px
import seaborn as sns
import numpy as np
from ipywidgets import interact

def create_plot(year, country, feature, top_n=5):
    # Filter the data for the selected feature, countries, and years
    data_dummy = df.copy()
    data_dummy = data_dummy.loc[:,['name',feature,'artist_origin','year','genres']]
    
    if country != 'All':
        if year !='All':
            filtered_data = data_dummy[(data_dummy['year'] == year) & (data_dummy['artist_origin'] == country)]
            num = top_n if filtered_data.shape[0] > top_n else filtered_data.shape[0] #For some years we have even less than 5 artists
            title = f'Top {feature.capitalize()} {country} Songs in Year {year}'
        else:
            filtered_data = data_dummy[data_dummy['artist_origin'] == country]
            num = top_n if filtered_data.shape[0] > top_n else filtered_data.shape[0] #For some years we have even less than 5 artists
            min_year = np.min(filtered_data.year)
            max_year = np.max(filtered_data.year)
            title = f'Top {feature.capitalize()} Songs from {country} across {min_year} - {max_year}'
    else:
        if year!='All':
            filtered_data = data_dummy[data_dummy['year'] == year]
            num = top_n if filtered_data.shape[0] > top_n else filtered_data.shape[0] #For some years we have even less than 5 artists
            title = f'Top {feature.capitalize()} Songs from all countries in Year {year}'
        else:
            filtered_data = data_dummy
            num = top_n if filtered_data.shape[0] > top_n else filtered_data.shape[0] #For some years we have even less than 5 artists
            min_year = np.min(filtered_data.year)
            max_year = np.max(filtered_data.year)
            title = f'Top {feature.capitalize()} Songs from all countries across {min_year} - {max_year}'
    
    # Group by song name and genres and take the mean of the selected feature
    # (the same song can appear multiple times in the data)
    grouped = filtered_data.groupby(['name', 'genres'])[feature].mean().reset_index()

    # Only keep unique song names
    grouped = grouped.drop_duplicates(subset='name')

    # Sort by the selected feature and keep the top entries (num accounts for countries/years with fewer songs)
    sorted_data = grouped.sort_values(by=feature, ascending=False).head(num)
    
    # Only using first 2 genres for songs
    sorted_data.genres = sorted_data.genres.apply(lambda x: ';'.join(x.split(';')[0:2]))

    # Retrieve the muted color palette from Seaborn
    color_palette = sns.color_palette("muted").as_hex()

    # Merge the 'artist_origin' column from data_dummy based on the 'name' column
    sorted_data = sorted_data.merge(data_dummy[['name', 'artist_origin']], on='name', how='left')

    sorted_data = sorted_data.drop_duplicates(subset='name')
    
    # Concatenate 'name' and 'artist_origin' columns into a single column
    sorted_data['text'] = sorted_data['artist_origin']

    # Create the bar plot using Plotly Express
    fig = px.bar(
        sorted_data,
        x='name',
        y=feature,
        color='genres',
        color_discrete_sequence=color_palette,
        text='text',
        title=title,
        template='plotly_white'
    )

    fig.update_traces(texttemplate='%{text}<br>%{y:.2f}', textposition='inside',textfont=dict(color = 'white',family = 'Times New Roman',size=30),textangle=0)

    fig.update_layout(
         title={
            'text': title,
            'font': {
                'size': 24 # Set the font size for the title
            }
        },
        xaxis=dict(
            tickfont=dict(size=14),  # Set the font size for x-axis labels and ticks
            title='Name of Songs',
            title_font=dict(size=16),  # Set the font size for x-axis title
        ),
        yaxis=dict(
            tickfont=dict(size=14),  # Set the font size for y-axis labels and ticks
            title=f'{feature.capitalize()}',
            title_font=dict(size=16),  # Set the font size for y-axis title
        ),
        height=600,
        margin=dict(l=50, r=50, t=80, b=50),
    )
    #fig.write_image(f'{title}.png',scale = 6)
    fig.show()

# Get the unique years and countries in the data
years = ['All'] + df['year'].unique().tolist()
countries = sorted(['All'] + df['artist_origin'].unique().tolist())
features = ['danceability', 'energy', 'acousticness', 'instrumentalness', 'key', 'liveness', 'loudness', 'mode',
            'popularity', 'speechiness', 'tempo', 'valence', 'duration']
tops = [5, 10, 15]

# Create the interactive plot
interact(create_plot, year=years, country=countries, feature=features, top_n=tops)
plt.show()
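create_plot can also be called directly, without the widget; a usage example with illustrative arguments (any values from the dropdown lists above work):

# Example direct call (arguments are illustrative)
# create_plot(year='All', country='All', feature='danceability', top_n=10)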
# def create_plot(year, country, feature, top_n=5):
#         # Filter the data for the selected feature then for countries and years
#     data_dummy = df.copy()
#     data_dummy = data_dummy.loc[:,['name',feature,'artist_origin','year','genres']]
    
#     if country != 'All':
#         if year !='All':
#             filtered_data = data_dummy[(data_dummy['year'] == year) & (data_dummy['artist_origin'] == country)]
#             num = top_n if filtered_data.shape[0] > top_n else filtered_data.shape[0] #For some years we have even less than 5 artists
#             title = f'Top {feature.capitalize()} {country} Songs in Year {year}'
#         else:
#             filtered_data = data_dummy[data_dummy['artist_origin'] == country]
#             num = top_n if filtered_data.shape[0] > top_n else filtered_data.shape[0] #For some years we have even less than 5 artists
#             min_year = np.min(filtered_data.year)
#             max_year = np.max(filtered_data.year)
#             title = f'Top {feature.capitalize()} Songs from {country} across {min_year} - {max_year}'
#     else:
#         if year!='All':
#             filtered_data = data_dummy[data_dummy['year'] == year]
#             num = top_n if filtered_data.shape[0] > top_n else filtered_data.shape[0] #For some years we have even less than 5 artists
#             title = f'Top {feature.capitalize()} Songs from all countries in Year {year}'
#         else:
#             filtered_data = data_dummy
#             num = top_n if filtered_data.shape[0] > top_n else filtered_data.shape[0] #For some years we have even less than 5 artists
#             min_year = np.min(filtered_data.year)
#             max_year = np.max(filtered_data.year)
#             title = f'Top {feature.capitalize()} Songs from all countries across {min_year} - {max_year}'
    
#     # Group the data by artists and calculate the mean popularity per artist
#     grouped = filtered_data.groupby(['name','genres'])[feature].mean().reset_index()
    
#     # Only taking unique song names 
#     grouped = grouped.drop_duplicates(subset='name')
    
#     # Group the data by name of songs and calculate the mean popularity per song
#     sorted_data = grouped.sort_values(by=feature, ascending=False).head(top_n)
    
#     # Only using first 2 genres for songs
#     sorted_data.genres = sorted_data.genres.apply(lambda x: ';'.join(x.split(';')[0:2]))
#     # ... existing code ...

#     # Create the bar plot using Seaborn
#     plt.figure(figsize=(24, 12))

#     # Calculate the bar width
#     bar_width = 3/sorted_data.shape[0]

#     sns.barplot(
#         data=sorted_data,
#         x='name',
#         y=feature,
#         palette='muted',
#         hue='genres',
#         width=bar_width  # Set the bar width
#     )
    
    
#     for i, bar in enumerate(plt.gca().patches):
#         if i < top_n:
#             mean_value = sorted_data.iloc[i, 2]
#             song_name = sorted_data.iloc[i, 0]
#             origin = df.loc[df['name'] == song_name, 'artist_origin'].iloc[0]
#         if country == 'All':
#             plt.gca().text(
#                 bar.get_x() + bar.get_width() / 2,
#                 bar.get_height() / 2,
#                 f'{mean_value:.2f} in {origin}',
#                 ha='center',
#                 va='center',
#                 rotation=90
#             )
#         else:
#             plt.gca().text(
#                 bar.get_x() + bar.get_width() / 2,
#                 bar.get_height() / 2,
#                 f'{mean_value:.2f}',
#                 ha='center',
#                 va='center'
#             )

#     # ... existing code ...

#     # Customize the x-axis ticks
#     xticks = np.arange(len(sorted_data)) #+ (bar_width/2.7)  # Add half of the bar width
#     plt.xticks(xticks, sorted_data['name'], rotation=36)
#     plt.gca().xaxis.set_tick_params(width=1)  # Set the tick width

#     # ... existing code ...
#     plt.xlabel('Name of Songs')
#     plt.ylabel(f'{feature.capitalize()}')
#     plt.title(title)
#     plt.tight_layout()
#     plt.savefig(title, bbox_inches='tight', dpi=150)
#     plt.show()


# # Get the unique years and countries in the data
# years = ['All'] + df['year'].unique().tolist()
# countries = sorted(['All'] + df['artist_origin'].unique().tolist())
# features = ['danceability', 'energy', 'acousticness','instrumentalness', 'key', 'liveness', 'loudness', 'mode', 'popularity','speechiness', 'tempo', 'valence', 'duration']
# tops = [5,10,15]
# # Removed Explicit: Whether or not the track has explicit lyrics (true = yes it does; false = no it does not OR unknown); Not that important
# # Removed Year as it's not a feature for a song

# # Create the interactive plot
# interact(create_plot, year=years,country=countries, feature = features, top_n=tops)
# plt.show()

Plot 4: Distribution of Features across countries

  • Able to answer questions such as which countries have the most danceable songs, the loudest songs, and so on in a given time frame.
def update_plot(feature):
    # Group the data by artist origin and calculate the mean of the selected feature per country
    feature_means = df.groupby('artist_origin')[feature].mean()
    world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))
    # Merge the data with the world shapefile based on the country names
    merged_data = world.merge(feature_means, left_on='name', right_index=True)

    # Create a choropleth map using plotly
    fig = go.Figure(data=go.Choropleth(
        locations=merged_data['iso_a3'],
        z=merged_data[feature],
        text=merged_data['name'],
        colorscale='Viridis',
        autocolorscale=False,
        marker_line_color='white',
        marker_line_width=0.5,
        colorbar_title=f'{feature.capitalize()}'
    ))
    title = f'{feature.capitalize()} Distribution by Country'
    # Customize the layout
    fig.update_layout(
        title_text=title,
        height=900,
        width=1800,
        geo=dict(
            showframe=False,
            showcoastlines=False,
            projection_type='equirectangular'
        )
    )
    # Save the plot
    fig.write_image(f'{title}.png')  # Save as PNG image
    fig.show()
    return title

features = ['danceability', 'energy', 'acousticness','instrumentalness', 'key', 'liveness', 'loudness', 'mode', 'popularity','speechiness', 'tempo', 'valence', 'duration']
# Use interact to create the interactive dropdown widget
interact(update_plot, feature=Dropdown(options=features))
plt.show()
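update_plot can likewise be called for a single feature without the dropdown (note that its fig.write_image call also needs kaleido, as mentioned earlier):

# Example direct call (feature name is illustrative)
# update_plot('danceability')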

Remaining Good Plots

import plotly.graph_objects as go
from scipy.stats import gaussian_kde

# Function to create the interactive histogram plot with KDE overlays
def plot_histogram(feature):
    fig = go.Figure()

    for year in df['year'].unique():
        year_data = df[df['year'] == year][feature].dropna()
        hist_trace = go.Histogram(x=year_data, nbinsx=30, histnorm='probability density',
                                  name=f'Year {year}', opacity=0.7)
        # Estimate the density with a Gaussian KDE and overlay it as a line
        kde = gaussian_kde(year_data)
        xs = np.linspace(float(year_data.min()), float(year_data.max()), 200)
        kde_trace = go.Scatter(x=xs, y=kde(xs), mode='lines', name=f'KDE - Year {year}')
        fig.add_trace(hist_trace)
        fig.add_trace(kde_trace)

    title = f'{feature.capitalize()} Distribution by Year'
    fig.update_layout(
        title={
            'text': title,
            'font': {'size': 24}
        },
        xaxis=dict(
            title=feature.capitalize(),
            tickfont={'size': 18},
            title_font=dict(size=20),
        ),
        yaxis=dict(
            title='Density',
            tickfont={'size': 18},
            title_font=dict(size=20),
        ),
        margin=dict(l=0, r=0, t=100, b=0),
        showlegend=True,
    )

    fig.update_layout(
        # width=1500,
        height=900,
    )

    fig.show()

# Create the interactive dropdown widget using interact
interact(plot_histogram, feature=features)
plt.show()
# Distribution of each feature over the years; an aggregated view (mean per year) appears in the plots further below

# Define the list of features
features = ['acousticness', 'danceability', 'duration', 'energy', 'explicit',
            'instrumentalness', 'liveness', 'loudness', 'popularity', 'speechiness',
            'tempo', 'valence']

# Function to create the interactive histogram plot
def plot_histogram(feature):
    # Create the histogram plot using seaborn
    plt.figure(figsize=(20, 12))
    sns.histplot(data=df, x=feature, bins=30, kde=True, hue='year', multiple='stack', palette='viridis')
    title = f'{feature.capitalize()} Distribution by Year'
    plt.title(title)
    plt.xlabel(feature.capitalize())
    plt.ylabel('Frequency')
    # seaborn adds the 'year' legend automatically when hue is set
    plt.savefig(title, bbox_inches='tight', dpi=150)
    plt.show()

# Create the interactive dropdown widget using interact
interact(plot_histogram, feature=Dropdown(options=features, description='Select Feature'))
plt.show()
import plotly.graph_objects as go

# Function to create the interactive histogram plot
def plot_histogram(feature):
    # Group the data by year and calculate the mean of the selected feature per year
    feature_means = df.groupby('year')[feature].mean()

    # Create the bar trace using Plotly
    bar_trace = go.Bar(x=feature_means.index, y=feature_means, marker_color='purple')

    # Set the plot title and labels
    title = f'Mean {feature.capitalize()} by Year'
    x_label = 'Year'
    y_label = 'Mean Value'

    # Create the layout
    layout = go.Layout(
        title=title,
        xaxis=dict(title=x_label, tickangle=45, automargin=True),
        yaxis=dict(title=y_label),
        showlegend=False
    )

    # Create the figure and add the bar trace
    fig = go.Figure(data=[bar_trace], layout=layout)
    fig.update_layout(
        title={
            'text': title,
            'font': {'size': 24}
        },
        xaxis=dict(
            title=x_label,
            tickfont={'size': 18},
            title_font=dict(size=20),
            tickangle=45,
        ),
        yaxis=dict(
            title=y_label,
            tickfont={'size': 18},
            title_font=dict(size=20),
        ),
        margin=dict(l=0, r=0, t=100, b=0),
        showlegend=False,
    )

    fig.update_layout(
        # width=1500,
        height=900,
    )

    # Show the figure
    fig.show()

# Create the interactive dropdown widget using interact
interact(plot_histogram, feature=features)
plt.show()
# # Read the data into a pandas DataFrame
# # data = df  # Replace 'your_data_file.csv' with your actual data file path

# # Define the list of features
# features = ['acousticness', 'danceability', 'duration', 'energy', 'explicit',
#             'instrumentalness', 'liveness', 'loudness', 'popularity', 'speechiness',
#             'tempo', 'valence']

# # Define the number of light colors to select
# num_colors = len(features)

# desired_colors = ['tab:purple', 'tab:orange', 'tab:green', 'tab:blue']

# # Function to create the interactive histogram plot
# def plot_histogram(feature):
#     # Select random light colors from 'tab10' palette
# #     palette = random.sample(sns.color_palette('tab10', 10)[:5], 1)

#     # Group the data by year and calculate the mean of the selected feature per year
#     feature_means = df.groupby('year')[feature].mean()
    
#     # Select a random light color from the palette
#     color = random.choice(desired_colors)
    
#     fig, ax = plt.subplots(1,1, figsize = (20,12),dpi = 120)
    
#     # Create the bar plot for the means using seaborn
#     ax = sns.barplot(x=feature_means.index, y=feature_means, color=color)
    
#     # Set the plot title and labels
#     title = f'Mean {feature.capitalize()} by Year'
#     ax.set_title(title)
#     ax.set_xlabel('Year')
#     ax.set_ylabel('Mean Value')
    
#     # Rotate and evenly space the year labels on the x-axis
#     ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
#     ax.xaxis.set_tick_params(pad=10)
    
#     # Show the plot
#     plt.tight_layout()
#     plt.savefig(title, bbox_inches='tight', dpi=150)
#     plt.show()

# # Create the interactive dropdown widget using interact
# interact(plot_histogram, feature=Dropdown(options=features, description='Select Feature'))
# plt.show()

Rough Plots

# ac = df.copy()
# # ac[['genre_1', 'genre_2', 'genre_3','rest_genres']] = ac.genres.str.split(';', n=3, expand=True)
# ac.isnull().sum()
# import geopandas as gpd
# import pandas as pd
# import plotly.graph_objects as go
# from ipywidgets import interact, Dropdown

# # Read the world shapefile
# world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))

# # Read the data containing country and genre information
# data = ac

# # Define the function to update the plot based on the selected year
# def update_plot(year):
#     # Filter the data for the selected year
#     filtered_data = data[data['year'] == year]

#     # Calculate the genre distribution by country
#     genre_distribution = filtered_data.groupby('artist_origin')['genres'].value_counts().unstack().reset_index().fillna(0)

#     # Merge the data with the world shapefile based on the country names
#     merged_data = world.merge(genre_distribution, left_on='name', right_on='artist_origin')

#     # Create a stacked bar plot using plotly
#     fig = go.Figure()

#     for genre in genre_distribution.columns[1:]:
#         fig.add_trace(go.Choropleth(
#             locations=merged_data['iso_a3'],
#             z=merged_data[genre],
#             text=merged_data['name'],
#             colorscale='Viridis',
#             autocolorscale=False,
#             marker_line_color='white',
#             marker_line_width=0.5,
#             colorbar_title='Count',
#             name=genre
#         ))

#     # Customize the layout
#     fig.update_layout(
#         title_text=f'Genre Distribution by Country in {year}',
#         height=1200,
#         width=1500,
#         barmode='stack',
#         geo=dict(
#             showframe=False,
#             showcoastlines=False,
#             projection_type='equirectangular'
#         )
#     )

#     # Show the plot
#     fig.show()

# # Get the unique years from the data
# years = data['year'].unique().tolist()

# # Use interact to create the interactive dropdown widget
# interact(update_plot, year=Dropdown(options=years))
# plt.show()