Spotify World Insights Viz

Data Viz
A data visualization project from a university course.

Installing Libraries

# %%capture
# #!pip install squarify geopandas plotly ipython ipywidgets dash

Importing Libraries

import random
import squarify
import pandas as pd
import numpy as np
import seaborn as sns
sns.set_context("talk", font_scale=1.2)
sns.set_style('white')
import geopandas as gpd
import matplotlib.pyplot as plt
%matplotlib inline
import plotly.graph_objects as go
from IPython.display import HTML
from ipywidgets import interact, Dropdown
# from googlesearch import search

Importing the Dataset and Initial Processing

data = pd.read_csv("~/side_projects/Projects/DataVizProject/spotify_with_locations.csv", index_col=0)
data.shape
(225456, 28)
data.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 225456 entries, 0 to 225455
Data columns (total 28 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   acousticness      225456 non-null  float64
 1   artists           225444 non-null  object 
 2   danceability      225456 non-null  float64
 3   duration_ms       225456 non-null  int64  
 4   energy            225456 non-null  float64
 5   explicit          225456 non-null  int64  
 6   id                225456 non-null  object 
 7   instrumentalness  225456 non-null  float64
 8   key               225456 non-null  int64  
 9   liveness          225456 non-null  float64
 10  loudness          225456 non-null  float64
 11  mode              225456 non-null  int64  
 12  name              225456 non-null  object 
 13  popularity        225456 non-null  int64  
 14  release_date      225456 non-null  object 
 15  speechiness       225456 non-null  float64
 16  tempo             225456 non-null  float64
 17  valence           225456 non-null  float64
 18  year              225456 non-null  int64  
 19  mbid              181952 non-null  object 
 20  artist_lastfm     177791 non-null  object 
 21  country_mb        171605 non-null  object 
 22  country_lastfm    132978 non-null  object 
 23  tags_mb           142502 non-null  object 
 24  tags_lastfm       160841 non-null  object 
 25  listeners_lastfm  177791 non-null  float64
 26  scrobbles_lastfm  177791 non-null  float64
 27  ambiguous_artist  181952 non-null  object 
dtypes: float64(11), int64(6), object(11)
memory usage: 49.9+ MB
# How much of the data is null
data.isnull().sum()
acousticness            0
artists                12
danceability            0
duration_ms             0
energy                  0
explicit                0
id                      0
instrumentalness        0
key                     0
liveness                0
loudness                0
mode                    0
name                    0
popularity              0
release_date            0
speechiness             0
tempo                   0
valence                 0
year                    0
mbid                43504
artist_lastfm       47665
country_mb          53851
country_lastfm      92478
tags_mb             82954
tags_lastfm         64615
listeners_lastfm    47665
scrobbles_lastfm    47665
ambiguous_artist    43504
dtype: int64
# Removing columns which are not useful
data = data.drop(['mbid','country_lastfm', 'tags_mb', 'listeners_lastfm','scrobbles_lastfm','id', 'ambiguous_artist','artist_lastfm'], axis = 1)
data = data.rename(columns = {'country_mb':'artist_origin','tags_lastfm':'genres'})
# Now checking the null counts and dropping rows with null values in any of the remaining columns
df_missing = data.isnull().sum(axis = 0).reset_index()
df_missing.columns = ['column_name', 'missing_count']
df_missing['missing_ratio'] = df_missing['missing_count']/data.shape[0]
df_missing.query('missing_ratio>0').sort_values(by = 'missing_ratio', ascending = False)
column_name missing_count missing_ratio
19 genres 64615 0.286597
18 artist_origin 53851 0.238854
1 artists 12 0.000053
df_cleaned = data.dropna(subset = ['artist_origin','artists','genres'],axis = 0)
df_cleaned.isna().sum(axis = 0)
acousticness        0
artists             0
danceability        0
duration_ms         0
energy              0
explicit            0
instrumentalness    0
key                 0
liveness            0
loudness            0
mode                0
name                0
popularity          0
release_date        0
speechiness         0
tempo               0
valence             0
year                0
artist_origin       0
genres              0
dtype: int64
# Creating a copy of data to work on
df = df_cleaned.copy()
df.shape
(156512, 20)
# Some processing: keeping only songs released after 1979 (i.e. 1980 onward), as discussed
min_year = 1979
df = df.query("year > @min_year").copy()
# Convert duration_ms to duration in seconds
df['duration'] = df['duration_ms'] / 1000

# Drop the duration_ms column
df.drop('duration_ms', axis=1, inplace=True)
df.shape
(73887, 20)
df.describe()
acousticness danceability energy explicit instrumentalness key liveness loudness mode popularity speechiness tempo valence year duration
count 73887.000000 73887.000000 73887.000000 73887.000000 73887.000000 73887.000000 73887.000000 73887.000000 73887.000000 73887.000000 73887.000000 73887.000000 73887.000000 73887.000000 73887.000000
mean 0.270194 0.572947 0.623204 0.162004 0.094045 5.261115 0.200284 -8.982739 0.683733 47.005820 0.088800 120.083159 0.520840 1999.056816 247.873261
std 0.304551 0.178310 0.244244 0.368457 0.241324 3.562242 0.175176 5.129790 0.465022 12.571747 0.101303 30.003903 0.254652 11.332604 88.925993
min 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 -60.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1980.000000 30.080000
25% 0.020500 0.456000 0.462500 0.000000 0.000000 2.000000 0.093100 -11.216000 0.000000 38.000000 0.034500 96.013500 0.318000 1989.000000 200.095500
50% 0.132000 0.587000 0.661000 0.000000 0.000024 5.000000 0.130000 -7.688000 1.000000 46.000000 0.046600 117.956000 0.526000 1999.000000 236.253000
75% 0.458000 0.705000 0.822000 0.000000 0.006290 8.000000 0.258000 -5.440000 1.000000 55.000000 0.090500 139.557000 0.729000 2009.000000 279.933000
max 0.996000 0.986000 1.000000 1.000000 0.999000 11.000000 0.998000 1.342000 1.000000 100.000000 0.960000 244.091000 0.998000 2020.000000 3816.373000
dtypes = {
    'acousticness': 'float32',
    'danceability': 'float32',
    'energy': 'float32',
    'explicit': 'bool',
    'instrumentalness': 'float32',
    'key': 'uint8',
    'liveness': 'float32',
    'loudness': 'float32',
    'mode': 'bool',
    'popularity': 'float32',
    'speechiness': 'float32',
    'tempo': 'float32',
    'valence': 'float32',
    'year': 'uint16',
    'duration': 'float32',
    'artists': 'string',
    'name': 'string',
    'release_date': 'string',
    'genres': 'string',
    'artist_origin': 'string'
}
df = df.astype(dtypes) #Changing the dtypes for efficient processing.
df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 73887 entries, 5920 to 225454
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   acousticness      73887 non-null  float32
 1   artists           73887 non-null  string 
 2   danceability      73887 non-null  float32
 3   energy            73887 non-null  float32
 4   explicit          73887 non-null  bool   
 5   instrumentalness  73887 non-null  float32
 6   key               73887 non-null  uint8  
 7   liveness          73887 non-null  float32
 8   loudness          73887 non-null  float32
 9   mode              73887 non-null  bool   
 10  name              73887 non-null  string 
 11  popularity        73887 non-null  float32
 12  release_date      73887 non-null  string 
 13  speechiness       73887 non-null  float32
 14  tempo             73887 non-null  float32
 15  valence           73887 non-null  float32
 16  year              73887 non-null  uint16 
 17  artist_origin     73887 non-null  string 
 18  genres            73887 non-null  string 
 19  duration          73887 non-null  float32
dtypes: bool(2), float32(11), string(5), uint16(1), uint8(1)
memory usage: 6.8 MB
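As an optional sanity check (not part of the original pipeline), the per-column memory footprint after the dtype conversion can be inspected directly:

# Optional check: memory per column in MB (deep=True also counts string contents)
df.memory_usage(deep=True) / 1024**2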
# df.to_csv("PowerBI_Spotify_Cleaned_data",index = 0)
# grouped_df = df.groupby("name", sort=False).apply(lambda x: x.reset_index(drop=True))
# grouped_df[grouped_df['artists'] == 'Sade']
  • Observing this, we can see that the same song is released in multiple years, possibly by different producers or on different albums.
  • So, before plotting, we group each feature by song and take its mean value (a minimal sketch of this aggregation follows below).
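For example, the per-song aggregation idea looks roughly like this (a minimal sketch; 'popularity' is only an illustrative feature and this is not the exact call used in the plots below):

# Illustrative only: average a feature over duplicate releases of the same song
song_means = df.groupby('name', as_index=False)['popularity'].mean()
song_means.head()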

Plot 1: Distribution of Artist Origins across years

Context

  • Which country do most artists come from, for a given year or across all years?

Caution

  • For any year, all countries that account for less than 1% of total origins are grouped into an 'Other' category
import plotly.graph_objects as go
import seaborn as sns
import numpy as np

def treemap(year):
    if year != 'All':
        data = df.query('year == @year')
        title = f'Artist Origins Distribution for {year} - Treemap (Thresholded)'
    else:
        data = df
        min_year = np.min(df['year'])
        max_year = np.max(df['year'])
        title = f'Artist Origins Distribution from {min_year} to {max_year} - Treemap (Thresholded)'

    # Set the threshold count
    threshold = 0.01 * data.shape[0]

    # Get the counts of artist origins
    artist_counts = data['artist_origin'].value_counts()

    # Identify countries with counts less than the threshold
    other_countries = artist_counts[artist_counts < threshold]

    # Sum the counts of other countries
    other_count = other_countries.sum()

    # Exclude other countries from the main data
    filtered_counts = artist_counts.drop(other_countries.index)

    # Add the other count as a separate entry
    filtered_counts['Other'] = other_count

    # Calculate the total size
    total_size = filtered_counts.sum()

    # Create the treemap
    labels = filtered_counts.index.tolist()
    values = filtered_counts.tolist()

    # Retrieve the muted Seaborn palette as hex strings (Plotly expects CSS-style colors)
    color_palette = sns.color_palette("muted", n_colors=len(labels)).as_hex()

    fig = go.Figure(go.Treemap(
        labels=labels,
        parents=[""] * len(labels),
        values=values,
        textinfo="label+value+percent parent",
        marker_colors=color_palette[:len(labels)],
        hovertemplate="<b>%{label}</b><br>Count: %{value}<br>Percentage: %{percentParent:.1%}<extra></extra>",
        textposition='middle center',  # Center the label text in the treemap cells
        textfont=dict(
            size=30,  # Set the font size for the text inside the treemap
            color='black',
            family='Times New Roman'
        )
    ))

    fig.update_layout(
          title={
            'text': title,
            'font': {
                'size': 24 # Set the font size for the title
            }
        },
        margin=dict(l=0, r=0, t=40, b=0),
        width=1800,
        height=900,
    )
    #fig.write_image(f'{title}.png')
    fig.show()

years = ['All'] + df['year'].unique().tolist()

# Create the interactive plot
interact(treemap, year=years)
<function __main__.treemap(year)>
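Note: the commented-out fig.write_image(...) call above exports a static PNG through Plotly's image engine, which requires the kaleido package; a minimal sketch, assuming kaleido is available:

# !pip install kaleido                        # one-time setup for Plotly static export
# fig.write_image(f'{title}.png', scale=2)    # uncomment inside treemap() to save each view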
# def treemap(year):
#     if year != 'All':
#         data = df.query('year == @year')
#         title = f'Artist Origins Distribution for {year} - Treemap (Thresholded)'
#     else:
#         data = df
#         min_year = np.min(df['year'])
#         max_year = np.max(df['year'])
#         title = f'Artist Origins Distribution from {min_year} to {max_year} - Treemap (Thresholded)'

#     # Set the threshold count
#     threshold = 0.01 * data.shape[0]

#     # Get the counts of artist origins
#     artist_counts = data['artist_origin'].value_counts()

#     # Identify countries with counts less than the threshold
#     other_countries = artist_counts[artist_counts < threshold]

#     # Sum the counts of other countries
#     other_count = other_countries.sum()

#     # Exclude other countries from the main data
#     filtered_counts = artist_counts.drop(other_countries.index)

#     # Add the other count as a separate entry
#     filtered_counts['Other'] = other_count

#     # Calculate the total size
#     total_size = filtered_counts.sum()

#     # Create the treemap
#     plt.figure(figsize=(24,12))
#     squarify.plot(sizes=filtered_counts, label=filtered_counts.index, alpha=0.8, color=sns.color_palette("muted"))
#     plt.axis('off')

#     # Add percentage labels
#     for i, patch in enumerate(plt.gca().patches):
#         x, y, dx, dy = patch.get_bbox().bounds
#         label = filtered_counts.index[i]
#         size = filtered_counts[label]
#         percentage = (size / total_size) * 100
#         plt.gca().text(x + dx / 2, y + dy / 2, f'\n\n{percentage:.1f}%', ha='center', va='center')
    
#     plt.title(title)
#     plt.savefig(title,bbox_inches = 'tight' ,dpi=150)
#     plt.show()

# years = ['All'] + df['year'].unique().tolist()

# # Create the interactive plot
# interact(treemap, year=years)
# plt.show()

Plot 1 Alternative

# # Read the data into a pandas DataFrame
# # data = df  # Replace 'your_data_file.csv' with your actual data file path

# # Group the data by artist origin and count the number of artists from each region
# artist_counts = df['artist_origin'].value_counts()

# # Normalize the counts by dividing by the maximum count value
# # normalized_counts = artist_counts / artist_counts.max()

# # Create the choropleth map using plotly.graph_objects
# fig = go.Figure(data=go.Choropleth(
#     locations=artist_counts.index,
#     z=artist_counts.values,
#     locationmode='country names',
#     colorscale='ylorrd',
#     reversescale=False,  # Reverse the colorscale to have darker blue for higher counts
# #     zmin=0,  # Set the minimum value for the color scale
# #     zmax=1,  # Set the maximum value for the color scale
# ))

# # Set the layout properties
# fig.update_layout(
#     title_text='Artist Origins Count by Country from 1980 to 2020',
#     height = 1200,
#     width = 1500,
#     geo=dict(
#         showframe=False,
#         showcoastlines=False,
#         projection_type='equirectangular'
#     ),
# )

# # Show the figure
# fig.show()
# plt.show()

Plot 3: Distribution of High Feature-Value Songs across Years and Origins, Classified by Genre

Context

  • Identify the top 5 songs with the highest value of a chosen feature for a given country and time frame
  • We can also see which genres produce the most high-feature-value songs across years and countries

Caution:

  • Feature values are obtained by aggregating (averaging), since the same song is published multiple times, possibly on different playlists or albums
  • For some countries we don't get a full top 5 because not enough data is available
import plotly.express as px
import seaborn as sns
import numpy as np
from ipywidgets import interact

def create_plot(year, country, feature, top_n=5):
    # Filter the data for the selected feature, countries, and years
    data_dummy = df.copy()
    data_dummy = data_dummy.loc[:,['name',feature,'artist_origin','year','genres']]
    
    if country != 'All':
        if year !='All':
            filtered_data = data_dummy[(data_dummy['year'] == year) & (data_dummy['artist_origin'] == country)]
            num = top_n if filtered_data.shape[0] > top_n else filtered_data.shape[0] #For some years we have even less than 5 artists
            title = f'Top {feature.capitalize()} {country} Songs in Year {year}'
        else:
            filtered_data = data_dummy[data_dummy['artist_origin'] == country]
            num = top_n if filtered_data.shape[0] > top_n else filtered_data.shape[0] #For some years we have even less than 5 artists
            min_year = np.min(filtered_data.year)
            max_year = np.max(filtered_data.year)
            title = f'Top {feature.capitalize()} Songs from {country} across {min_year} - {max_year}'
    else:
        if year!='All':
            filtered_data = data_dummy[data_dummy['year'] == year]
            num = top_n if filtered_data.shape[0] > top_n else filtered_data.shape[0] #For some years we have even less than 5 artists
            title = f'Top {feature.capitalize()} Songs from all countries in Year {year}'
        else:
            filtered_data = data_dummy
            num = top_n if filtered_data.shape[0] > top_n else filtered_data.shape[0] #For some years we have even less than 5 artists
            min_year = np.min(filtered_data.year)
            max_year = np.max(filtered_data.year)
            title = f'Top {feature.capitalize()} Songs from all countries across {min_year} - {max_year}'
    
    # Group by song name and genres and take the mean of the selected feature
    # (the same song can appear multiple times in the data)
    grouped = filtered_data.groupby(['name', 'genres'])[feature].mean().reset_index()

    # Only keep unique song names
    grouped = grouped.drop_duplicates(subset='name')

    # Sort by the selected feature and keep the top entries (num accounts for countries/years with fewer songs)
    sorted_data = grouped.sort_values(by=feature, ascending=False).head(num)
    
    # Only using first 2 genres for songs
    sorted_data.genres = sorted_data.genres.apply(lambda x: ';'.join(x.split(';')[0:2]))

    # Retrieve the muted color palette from Seaborn
    color_palette = sns.color_palette("muted").as_hex()

    # Merge the 'artist_origin' column from data_dummy based on the 'name' column
    sorted_data = sorted_data.merge(data_dummy[['name', 'artist_origin']], on='name', how='left')

    sorted_data = sorted_data.drop_duplicates(subset='name')
    
    # Concatenate 'name' and 'artist_origin' columns into a single column
    sorted_data['text'] = sorted_data['artist_origin']

    # Create the bar plot using Plotly Express
    fig = px.bar(
        sorted_data,
        x='name',
        y=feature,
        color='genres',
        color_discrete_sequence=color_palette,
        text='text',
        title=title,
        template='plotly_white'
    )

    fig.update_traces(texttemplate='%{text}<br>%{y:.2f}', textposition='inside',textfont=dict(color = 'white',family = 'Times New Roman',size=30),textangle=0)

    fig.update_layout(
         title={
            'text': title,
            'font': {
                'size': 24 # Set the font size for the title
            }
        },
        xaxis=dict(
            tickfont=dict(size=14),  # Set the font size for x-axis labels and ticks
            title='Name of Songs',
            title_font=dict(size=16),  # Set the font size for x-axis title
        ),
        yaxis=dict(
            tickfont=dict(size=14),  # Set the font size for y-axis labels and ticks
            title=f'{feature.capitalize()}',
            title_font=dict(size=16),  # Set the font size for y-axis title
        ),
        height=600,
        margin=dict(l=50, r=50, t=80, b=50),
    )
    #fig.write_image(f'{title}.png',scale = 6)
    fig.show()

# Get the unique years and countries in the data
years = ['All'] + df['year'].unique().tolist()
countries = sorted(['All'] + df['artist_origin'].unique().tolist())
features = ['danceability', 'energy', 'acousticness', 'instrumentalness', 'key', 'liveness', 'loudness', 'mode',
            'popularity', 'speechiness', 'tempo', 'valence', 'duration']
tops = [5, 10, 15]

# Create the interactive plot
interact(create_plot, year=years, country=countries, feature=features, top_n=tops)
plt.show()
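create_plot can also be called directly, without the widget; a usage example with illustrative arguments (any values from the dropdown lists above work):

# Example direct call (arguments are illustrative)
# create_plot(year='All', country='All', feature='danceability', top_n=10)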
# def create_plot(year, country, feature, top_n=5):
#         # Filter the data for the selected feature then for countries and years
#     data_dummy = df.copy()
#     data_dummy = data_dummy.loc[:,['name',feature,'artist_origin','year','genres']]
    
#     if country != 'All':
#         if year !='All':
#             filtered_data = data_dummy[(data_dummy['year'] == year) & (data_dummy['artist_origin'] == country)]
#             num = top_n if filtered_data.shape[0] > top_n else filtered_data.shape[0] #For some years we have even less than 5 artists
#             title = f'Top {feature.capitalize()} {country} Songs in Year {year}'
#         else:
#             filtered_data = data_dummy[data_dummy['artist_origin'] == country]
#             num = top_n if filtered_data.shape[0] > top_n else filtered_data.shape[0] #For some years we have even less than 5 artists
#             min_year = np.min(filtered_data.year)
#             max_year = np.max(filtered_data.year)
#             title = f'Top {feature.capitalize()} Songs from {country} across {min_year} - {max_year}'
#     else:
#         if year!='All':
#             filtered_data = data_dummy[data_dummy['year'] == year]
#             num = top_n if filtered_data.shape[0] > top_n else filtered_data.shape[0] #For some years we have even less than 5 artists
#             title = f'Top {feature.capitalize()} Songs from all countries in Year {year}'
#         else:
#             filtered_data = data_dummy
#             num = top_n if filtered_data.shape[0] > top_n else filtered_data.shape[0] #For some years we have even less than 5 artists
#             min_year = np.min(filtered_data.year)
#             max_year = np.max(filtered_data.year)
#             title = f'Top {feature.capitalize()} Songs from all countries across {min_year} - {max_year}'
    
#     # Group the data by artists and calculate the mean popularity per artist
#     grouped = filtered_data.groupby(['name','genres'])[feature].mean().reset_index()
    
#     # Only taking unique song names 
#     grouped = grouped.drop_duplicates(subset='name')
    
#     # Group the data by name of songs and calculate the mean popularity per song
#     sorted_data = grouped.sort_values(by=feature, ascending=False).head(top_n)
    
#     # Only using first 2 genres for songs
#     sorted_data.genres = sorted_data.genres.apply(lambda x: ';'.join(x.split(';')[0:2]))
#     # ... existing code ...

#     # Create the bar plot using Seaborn
#     plt.figure(figsize=(24, 12))

#     # Calculate the bar width
#     bar_width = 3/sorted_data.shape[0]

#     sns.barplot(
#         data=sorted_data,
#         x='name',
#         y=feature,
#         palette='muted',
#         hue='genres',
#         width=bar_width  # Set the bar width
#     )
    
    
#     for i, bar in enumerate(plt.gca().patches):
#         if i < top_n:
#             mean_value = sorted_data.iloc[i, 2]
#             song_name = sorted_data.iloc[i, 0]
#             origin = df.loc[df['name'] == song_name, 'artist_origin'].iloc[0]
#         if country == 'All':
#             plt.gca().text(
#                 bar.get_x() + bar.get_width() / 2,
#                 bar.get_height() / 2,
#                 f'{mean_value:.2f} in {origin}',
#                 ha='center',
#                 va='center',
#                 rotation=90
#             )
#         else:
#             plt.gca().text(
#                 bar.get_x() + bar.get_width() / 2,
#                 bar.get_height() / 2,
#                 f'{mean_value:.2f}',
#                 ha='center',
#                 va='center'
#             )

#     # ... existing code ...

#     # Customize the x-axis ticks
#     xticks = np.arange(len(sorted_data)) #+ (bar_width/2.7)  # Add half of the bar width
#     plt.xticks(xticks, sorted_data['name'], rotation=36)
#     plt.gca().xaxis.set_tick_params(width=1)  # Set the tick width

#     # ... existing code ...
#     plt.xlabel('Name of Songs')
#     plt.ylabel(f'{feature.capitalize()}')
#     plt.title(title)
#     plt.tight_layout()
#     plt.savefig(title, bbox_inches='tight', dpi=150)
#     plt.show()


# # Get the unique years and countries in the data
# years = ['All'] + df['year'].unique().tolist()
# countries = sorted(['All'] + df['artist_origin'].unique().tolist())
# features = ['danceability', 'energy', 'acousticness','instrumentalness', 'key', 'liveness', 'loudness', 'mode', 'popularity','speechiness', 'tempo', 'valence', 'duration']
# tops = [5,10,15]
# # Removed Explicit: Whether or not the track has explicit lyrics (true = yes it does; false = no it does not OR unknown); Not that important
# # Removed Year as it's not a feature for a song

# # Create the interactive plot
# interact(create_plot, year=years,country=countries, feature = features, top_n=tops)
# plt.show()

Plot 4: Distribution of Features across countries

  • Able to answer questions such as which countries have the most danceable songs, the loudest songs, and so on in a given time frame.
def update_plot(feature):
    # Group the data by artist origin and calculate the mean of the selected feature per country
    feature_means = df.groupby('artist_origin')[feature].mean()
    world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))
    # Merge the data with the world shapefile based on the country names
    merged_data = world.merge(feature_means, left_on='name', right_index=True)

    # Create a choropleth map using plotly
    fig = go.Figure(data=go.Choropleth(
        locations=merged_data['iso_a3'],
        z=merged_data[feature],
        text=merged_data['name'],
        colorscale='Viridis',
        autocolorscale=False,
        marker_line_color='white',
        marker_line_width=0.5,
        colorbar_title=f'{feature.capitalize()}'
    ))
    title = f'{feature.capitalize()} Distribution by Country'
    # Customize the layout
    fig.update_layout(
        title_text=title,
        height=900,
        width=1800,
        geo=dict(
            showframe=False,
            showcoastlines=False,
            projection_type='equirectangular'
        )
    )
    # Save the plot
    fig.write_image(f'{title}.png')  # Save as PNG image
    fig.show()
    return title

features = ['danceability', 'energy', 'acousticness','instrumentalness', 'key', 'liveness', 'loudness', 'mode', 'popularity','speechiness', 'tempo', 'valence', 'duration']
# Use interact to create the interactive dropdown widget
interact(update_plot, feature=Dropdown(options=features))
plt.show()
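update_plot can likewise be called for a single feature without the dropdown (note that its fig.write_image call also needs kaleido, as mentioned earlier):

# Example direct call (feature name is illustrative)
# update_plot('danceability')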

Remaining Good Plots

import plotly.graph_objects as go
from scipy.stats import gaussian_kde

# Function to create the interactive histogram plot with KDE overlays
def plot_histogram(feature):
    fig = go.Figure()

    for year in df['year'].unique():
        year_data = df[df['year'] == year][feature].dropna()
        hist_trace = go.Histogram(x=year_data, nbinsx=30, histnorm='probability density',
                                  name=f'Year {year}', opacity=0.7)
        # Estimate the density with a Gaussian KDE and overlay it as a line
        kde = gaussian_kde(year_data)
        xs = np.linspace(float(year_data.min()), float(year_data.max()), 200)
        kde_trace = go.Scatter(x=xs, y=kde(xs), mode='lines', name=f'KDE - Year {year}')
        fig.add_trace(hist_trace)
        fig.add_trace(kde_trace)

    title = f'{feature.capitalize()} Distribution by Year'
    fig.update_layout(
        title={
            'text': title,
            'font': {'size': 24}
        },
        xaxis=dict(
            title=feature.capitalize(),
            tickfont={'size': 18},
            title_font=dict(size=20),
        ),
        yaxis=dict(
            title='Density',
            tickfont={'size': 18},
            title_font=dict(size=20),
        ),
        margin=dict(l=0, r=0, t=100, b=0),
        showlegend=True,
    )

    fig.update_layout(
        # width=1500,
        height=900,
    )

    fig.show()

# Create the interactive dropdown widget using interact
interact(plot_histogram, feature=features)
plt.show()
# Distribution of each feature over the years; an aggregated view (mean per year) appears in the plots further below

# Define the list of features
features = ['acousticness', 'danceability', 'duration', 'energy', 'explicit',
            'instrumentalness', 'liveness', 'loudness', 'popularity', 'speechiness',
            'tempo', 'valence']

# Function to create the interactive histogram plot
def plot_histogram(feature):
    # Create the histogram plot using seaborn
    plt.figure(figsize=(20, 12))
    sns.histplot(data=df, x=feature, bins=30, kde=True, hue='year', multiple='stack', palette='viridis')
    title = f'{feature.capitalize()} Distribution by Year'
    plt.title(title)
    plt.xlabel(feature.capitalize())
    plt.ylabel('Frequency')
    # seaborn adds the 'year' legend automatically when hue is set
    plt.savefig(title, bbox_inches='tight', dpi=150)
    plt.show()

# Create the interactive dropdown widget using interact
interact(plot_histogram, feature=Dropdown(options=features, description='Select Feature'))
plt.show()
import plotly.graph_objects as go

# Function to create the interactive histogram plot
def plot_histogram(feature):
    # Group the data by year and calculate the mean of the selected feature per year
    feature_means = df.groupby('year')[feature].mean()

    # Create the bar trace using Plotly
    bar_trace = go.Bar(x=feature_means.index, y=feature_means, marker_color='purple')

    # Set the plot title and labels
    title = f'Mean {feature.capitalize()} by Year'
    x_label = 'Year'
    y_label = 'Mean Value'

    # Create the layout
    layout = go.Layout(
        title=title,
        xaxis=dict(title=x_label, tickangle=45, automargin=True),
        yaxis=dict(title=y_label),
        showlegend=False
    )

    # Create the figure and add the bar trace
    fig = go.Figure(data=[bar_trace], layout=layout)
    fig.update_layout(
        title={
            'text': title,
            'font': {'size': 24}
        },
        xaxis=dict(
            title=x_label,
            tickfont={'size': 18},
            title_font=dict(size=20),
            tickangle=45,
        ),
        yaxis=dict(
            title=y_label,
            tickfont={'size': 18},
            title_font=dict(size=20),
        ),
        margin=dict(l=0, r=0, t=100, b=0),
        showlegend=False,
    )

    fig.update_layout(
        # width=1500,
        height=900,
    )

    # Show the figure
    fig.show()

# Create the interactive dropdown widget using interact
interact(plot_histogram, feature=features)
plt.show()
# # Read the data into a pandas DataFrame
# # data = df  # Replace 'your_data_file.csv' with your actual data file path

# # Define the list of features
# features = ['acousticness', 'danceability', 'duration', 'energy', 'explicit',
#             'instrumentalness', 'liveness', 'loudness', 'popularity', 'speechiness',
#             'tempo', 'valence']

# # Define the number of light colors to select
# num_colors = len(features)

# desired_colors = ['tab:purple', 'tab:orange', 'tab:green', 'tab:blue']

# # Function to create the interactive histogram plot
# def plot_histogram(feature):
#     # Select random light colors from 'tab10' palette
# #     palette = random.sample(sns.color_palette('tab10', 10)[:5], 1)

#     # Group the data by year and calculate the mean of the selected feature per year
#     feature_means = df.groupby('year')[feature].mean()
    
#     # Select a random light color from the palette
#     color = random.choice(desired_colors)
    
#     fig, ax = plt.subplots(1,1, figsize = (20,12),dpi = 120)
    
#     # Create the bar plot for the means using seaborn
#     ax = sns.barplot(x=feature_means.index, y=feature_means, color=color)
    
#     # Set the plot title and labels
#     title = f'Mean {feature.capitalize()} by Year'
#     ax.set_title(title)
#     ax.set_xlabel('Year')
#     ax.set_ylabel('Mean Value')
    
#     # Rotate and evenly space the year labels on the x-axis
#     ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
#     ax.xaxis.set_tick_params(pad=10)
    
#     # Show the plot
#     plt.tight_layout()
#     plt.savefig(title, bbox_inches='tight', dpi=150)
#     plt.show()

# # Create the interactive dropdown widget using interact
# interact(plot_histogram, feature=Dropdown(options=features, description='Select Feature'))
# plt.show()

Rough Plots

# ac = df.copy()
# # ac[['genre_1', 'genre_2', 'genre_3','rest_genres']] = ac.genres.str.split(';', n=3, expand=True)
# ac.isnull().sum()
# import geopandas as gpd
# import pandas as pd
# import plotly.graph_objects as go
# from ipywidgets import interact, Dropdown

# # Read the world shapefile
# world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))

# # Read the data containing country and genre information
# data = ac

# # Define the function to update the plot based on the selected year
# def update_plot(year):
#     # Filter the data for the selected year
#     filtered_data = data[data['year'] == year]

#     # Calculate the genre distribution by country
#     genre_distribution = filtered_data.groupby('artist_origin')['genres'].value_counts().unstack().reset_index().fillna(0)

#     # Merge the data with the world shapefile based on the country names
#     merged_data = world.merge(genre_distribution, left_on='name', right_on='artist_origin')

#     # Create a stacked bar plot using plotly
#     fig = go.Figure()

#     for genre in genre_distribution.columns[1:]:
#         fig.add_trace(go.Choropleth(
#             locations=merged_data['iso_a3'],
#             z=merged_data[genre],
#             text=merged_data['name'],
#             colorscale='Viridis',
#             autocolorscale=False,
#             marker_line_color='white',
#             marker_line_width=0.5,
#             colorbar_title='Count',
#             name=genre
#         ))

#     # Customize the layout
#     fig.update_layout(
#         title_text=f'Genre Distribution by Country in {year}',
#         height=1200,
#         width=1500,
#         barmode='stack',
#         geo=dict(
#             showframe=False,
#             showcoastlines=False,
#             projection_type='equirectangular'
#         )
#     )

#     # Show the plot
#     fig.show()

# # Get the unique years from the data
# years = data['year'].unique().tolist()

# # Use interact to create the interactive dropdown widget
# interact(update_plot, year=Dropdown(options=years))
# plt.show()