filename = 'music_that_tickles_my_fancy.csv'

from matplotlib import pyplot
import seaborn
import pandas
from collections import defaultdict
from scipy.stats import pareto, gamma
from datetime import date

# read the data
data = pandas.read_csv(filename)
print("total songs:", data.shape[0])
print(data[:3])

total songs: 5499
                              Track URI                  Track Name  \
0  spotify:track:3T9HSgS5jBFdXIBPav51gj  Fanfare for the Common Man   
1  spotify:track:2bdZDXDoFLzazaomjzoER8            Highschool Lover   
2  spotify:track:1fE3ddAlmjJ99IIfLgZjTy             I Need a Dollar   

                                          Album Name  \
0  Copland Conducts Copland - Expanded Edition (F...   
1                                    Virgin Suicides   
2                                    I Need A Dollar   

                            Artist Name(s) Release Date  Duration (ms)  \
0  Aaron Copland,London Symphony Orchestra         1963         196466   
1                                      Air         2000         162093   
2                               Aloe Blacc   2010-03-16         244373   

   Popularity  Explicit Added By              Added At  ... Key Loudness  \
0          36     False  pvlkmrv  2014-12-28T00:57:17Z  ...  10  -15.727   
1           0     False  pvlkmrv  2014-12-28T00:59:35Z  ...   1  -15.025   
2           0     False  pvlkmrv  2014-12-28T01:03:38Z  ...   8  -11.829   

   Mode  Speechiness  Acousticness  Instrumentalness  Liveness  Valence  \
0     1       0.0381         0.986             0.954    0.0575   0.0377   
1     0       0.0302         0.952             0.959    0.2520   0.0558   
2     0       0.0387         0.178             0.000    0.0863   0.9620   

     Tempo  Time Signature  
0  104.036               4  
1  130.052               4  
2   95.509               4  

[3 rows x 24 columns]

# count songs per artist
artists = defaultdict(int)
for i,song in data.iterrows():
    if isinstance(song['Artist Name(s)'], str):
    	for musician in song['Artist Name(s)'].split(','):
    		artists[musician] += 1

# sort for chart
artists = pandas.DataFrame(artists.items(), columns=['Artist', 'Num Songs']
                          ).sort_values('Num Songs', ascending=False).reset_index(drop=True)
print("number of unique artists:", artists.shape[0])

pyplot.figure(figsize=(18, 6))
pyplot.bar(artists['Artist'], artists['Num Songs'])
pyplot.xticks(visible=False)
pyplot.xlabel(artists.columns[0])
pyplot.ylabel(artists.columns[1])
pyplot.title('everybody')
pyplot.show()

number of unique artists: 2733

# Let's find the best parameters. Need x, y data 'sampled' from the distribution for
# parameter fit.
y = []
for i in range(artists.shape[0]):
	for j in range(artists['Num Songs'][i]):
		y.append(i) # just let y have index[artist] repeated for each song 

# sanity check. If the dataframe isn't sorted properly, y isn't either.
#pyplot.figure()
#pyplot.hist(y, bins=30)
        
# The documentation is pretty bad, but this is okay:
# https://stackoverflow.com/questions/6620471/fitting-empirical-distribution-to-theoretical-
# ones-with-scipy-python
param = pareto.fit(y, 100)
pareto_fitted = len(y)*pareto.pdf(range(artists.shape[0]), *param)
# param = gamma.fit(y) # gamma fits abysmally; see for yourself by uncommenting
# gamma_fitted = len(y)*gamma.pdf(range(artists.shape[0]), *param)

pyplot.figure(figsize=(18, 6))
pyplot.bar(artists['Artist'], artists['Num Songs'])
pyplot.plot(pareto_fitted, color='r')
#pyplot.plot(gamma_fitted, color='g')
pyplot.xticks(visible=False)
pyplot.xlabel(artists.columns[0])
pyplot.ylabel(artists.columns[1])
pyplot.title('everybody');

pyplot.figure(figsize=(18, 10))
pyplot.bar(artists['Artist'][:50], artists['Num Songs'][:50])
pyplot.xticks(rotation=80)
pyplot.xlabel(artists.columns[0])
pyplot.ylabel(artists.columns[1])
pyplot.title('top 50');

from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters() # to suppress warning

# Plot of added volume over time
parse_date = lambda d:(int(d[:4]), int(d[5:7]), int(d[8:10]))
pyplot.figure(figsize=(10, 6))
pyplot.hist([date(*parse_date(d)) for d in data['Added At']], bins=30)
pyplot.title('volume added over time');

# bar chart of first bar chart == hipster diversity factor
frequency = defaultdict(int)
for n in artists['Num Songs']:
	frequency[n] += n
frequency = pandas.DataFrame(frequency.items(), columns=['Unique Count', 'Volume']
                           ).sort_values('Volume', ascending=False)
print("number of song-artist pairs represented in the eclecticness chart:",
      sum(frequency['Volume']))

pyplot.figure(figsize=(10, 6))
pyplot.bar(frequency['Unique Count'].values, frequency['Volume'].values)
pyplot.title('volume of songs binned by |songs from that artist|')
pyplot.xlabel('quasi-frequency domain')
pyplot.ylabel(frequency.columns[1]);

number of song-artist pairs represented in the eclecticness chart: 6268

# count songs per genre
genres = defaultdict(int)
for i,song in data.iterrows():
    if type(song['Genres']) is str: # some times there aren't any, and this is NaN
        for genre in song['Genres'].split(','):
            if len(genre) > 0: # empty string seems to be a legit genre
                genres[genre] += 1

# sort for chart
genres = pandas.DataFrame(genres.items(), columns=['Genre', 'Num Songs']
                          ).sort_values('Num Songs', ascending=False).reset_index(drop=True)
print("number of unique genres:", genres.shape[0])

pyplot.figure(figsize=(18, 6))
pyplot.bar(genres['Genre'], genres['Num Songs'])
pyplot.xticks(visible=False)
pyplot.xlabel(genres.columns[0])
pyplot.ylabel(genres.columns[1])
pyplot.title('All the genera');

number of unique genres: 373

y = []
for i in range(genres.shape[0]):
	for j in range(genres['Num Songs'][i]):
		y.append(i) 

# sanity check
#pyplot.figure()
#pyplot.hist(y, bins=30)

param = pareto.fit(y, 100)
pareto_fitted = len(y)*pareto.pdf(range(genres.shape[0]), *param)

pyplot.figure(figsize=(18, 6))
pyplot.bar(genres['Genre'], genres['Num Songs'])
pyplot.plot(pareto_fitted, color='r')
pyplot.xticks(visible=False)
pyplot.xlabel(genres.columns[0])
pyplot.ylabel(genres.columns[1])
pyplot.title('All the genera');

pyplot.figure(figsize=(18, 10))
pyplot.bar(genres['Genre'][:50], genres['Num Songs'][:50])
pyplot.xticks(rotation=80)
pyplot.xlabel(genres.columns[0])
pyplot.ylabel(genres.columns[1])
pyplot.title('top 50');

pyplot.figure(figsize=(18, 1))
pyplot.bar(genres['Genre'][-50:], genres['Num Songs'][-50:])
pyplot.xticks(rotation=80)
pyplot.xlabel(genres.columns[0])
pyplot.ylabel(genres.columns[1])
pyplot.title('bottom 50');

years = defaultdict(int)
for i,song in data.iterrows():
    if isinstance(song['Release Date'], str): #  somebody found a NaN release date!
        years[song['Release Date'][:4]] += 1

years = pandas.DataFrame(years.items(), columns=['Year', 'Num Songs']
                          ).sort_values('Year')

pyplot.figure(figsize=(10, 6))
pyplot.bar(years['Year'], years['Num Songs'])
pyplot.xticks(years['Year'], [y if i % 2 == 0 else '' for i,y in enumerate(years['Year'])], rotation=80)
pyplot.xlabel(years.columns[0])
pyplot.ylabel(years.columns[1])
pyplot.title('Songs per year');

# Some years are missing, so transform to a dataframe that covers full time period.
eldest = int(years['Year'].values[0])
youngest = int(years['Year'].values[-1])
missing_years = [str(x) for x in range(eldest+1, youngest) if
                 str(x) not in years['Year'].values]
ago = pandas.concat((years, pandas.DataFrame.from_dict(
    {'Year': missing_years, 'Num Songs': [0 for x in range(len(missing_years))]})
                  )).sort_values('Year', ascending=False).reset_index(drop=True)

y = []
for i in range(ago.shape[0]):
	for j in range(int(ago['Num Songs'][i])):
		y.append(i)

# sanity check histogram to make sure I'm constructing y properly
#pyplot.figure()
#pyplot.hist(y, bins=30)
        
param = gamma.fit(y, 10000)
gamma_fitted = len(y)*gamma.pdf(range(ago.shape[0]), *param)

pyplot.figure(figsize=(10, 6))
pyplot.bar(range(len(ago['Year'])), ago['Num Songs'])
pyplot.plot(gamma_fitted, color='g')
pyplot.xlabel('Years Ago')
pyplot.ylabel(ago.columns[1])
pyplot.title('Songs per year (in absolute time)');

print('Oldest Hall of Fame')
print(data[['Track Name', 'Artist Name(s)', 'Release Date']].sort_values(
    'Release Date')[:10])

Oldest Hall of Fame
                                             Track Name  \
2985                                       That's Amore   
2945                                    Autumn Nocturne   
3743        The Elements (Music By Sir Arthur Sullivan)   
2416                                          Take Five   
3131                            Skating In Central Park   
3100  I Guess I'll Hang My Tears Out To Dry - Rudy V...   
4257                                        Oye Cómo Va   
2625                                        Stand By Me   
0                            Fanfare for the Common Man   
3183                              In A Sentimental Mood   

                               Artist Name(s) Release Date  
2985                              Dean Martin         1954  
2945                            Lou Donaldson         1958  
3743                               Tom Lehrer   1959-01-01  
2416                 The Dave Brubeck Quartet   1959-12-14  
3131                      Bill Evans,Jim Hall         1962  
3100                            Dexter Gordon         1962  
4257                              Tito Puente   1962-01-01  
2625                              Ben E. King   1962-08-20  
0     Aaron Copland,London Symphony Orchestra         1963  
3183             Duke Ellington,John Coltrane      1963-02

popularity = defaultdict(int)
for i,song in data.iterrows():
    popularity[song['Popularity']] += 1

popularity = pandas.DataFrame(popularity.items(), columns=['Popularity', 'Num Songs']
                          ).sort_values('Popularity')

pyplot.figure(figsize=(10, 6))
pyplot.bar(popularity['Popularity'].values, popularity['Num Songs'].values)
pyplot.xlabel(popularity.columns[0])
pyplot.ylabel(popularity.columns[1])
pyplot.title('popularity distribution');

print("Average song popularity: ", popularity['Popularity'].mean())
print("Median song popularity: ", popularity['Popularity'].median())
print("Max song popularity: ", popularity['Popularity'].max())

Average song popularity:  46.02150537634409
Median song popularity:  46.0
Max song popularity:  94

pyplot.figure(figsize=(10,6))
pyplot.hist(data['Duration (ms)']/1000, bins=50);
pyplot.xlabel('Duration (s)')
pyplot.ylabel('Num Songs')
pyplot.title('Histogram of song lengths')

mean = data['Duration (ms)'].mean()/1000
median = data['Duration (ms)'].median()/1000
print("Average song length: " + str(int(mean//60)) + (":" if mean%60 >=10 else ":0")
      + str(mean%60))
print("Median song length: " + str(int(median//60)) + (":" if median%60 >=10 else ":0")
      + str(median%60))

Average song length: 4:01.9582615020912897
Median song length: 3:51.80000000000001

print("Longest Hall of Fame:")
print(data[['Track Name', 'Artist Name(s)', 'Release Date', 'Duration (ms)']].sort_values(
    'Duration (ms)', ascending=False)[:10])

Longest Hall of Fame:
                                             Track Name  \
5236                                             Echoes   
3150                              Concierto De Aranjuez   
691                                               Irene   
1908  The Return of the King (From The Lord of the R...   
4227                                     Boléro (Ravel)   
460                                   The Cure For Pain   
2344              Shine On You Crazy Diamond (Pts. 1-5)   
140   Two Step - Live At Piedmont Park Atlanta GA - ...   
5055                                             Rivers   
3469       Má vlast (My Country): No. 2 Vltava [Moldau]   

                                         Artist Name(s) Release Date  \
5236                                         Pink Floyd   1971-11-11   
3150                                           Jim Hall         1974   
691                                         Beach House   2012-05-15   
1908          The City of Prague Philharmonic Orchestra   2004-01-01   
4227                          London Symphony Orchestra         1995   
460                                        mewithoutYou   2002-01-01   
2344                                         Pink Floyd   1975-09-12   
140                                  Dave Matthews Band   2007-12-11   
5055                                         Tarek Musa   2010-01-30   
3469  Bedřich Smetana,Polish National Radio Symphony...   1994-08-05   

      Duration (ms)  
5236        1412451  
3150        1154040  
691         1017013  
1908         976893  
4227         934067  
460          908840  
2344         811077  
140          808226  
5055         807437  
3469         794000

pyplot.figure(figsize=(20,20))

for i,category in enumerate(['Tempo', 'Acousticness', 'Instrumentalness', 'Liveness',
                            'Valence', 'Speechiness', 'Loudness', 'Energy', 'Danceability']):
    pyplot.subplot(3, 3, i+1)
    pyplot.hist(data[category], bins=30)
    pyplot.text(pyplot.xlim()[1] - (pyplot.xlim()[1] - pyplot.xlim()[0])*0.3,
                pyplot.ylim()[1]*0.9, r'$\mu=$'+str(data[category].mean())[:7], fontsize=12)
    pyplot.xlabel('Value')
    pyplot.ylabel('Num Songs')
    pyplot.title(category)

pyplot.tight_layout(h_pad=2)

pyplot.figure(figsize=(15,4))

pyplot.subplot(1, 3, 1)
seaborn.countplot(data, x='Time Signature', hue='Time Signature', legend=False)
pyplot.xlabel('Beats per bar')
pyplot.ylabel('Num Songs')
pyplot.title('Time Signature')

pyplot.subplot(1, 3, 2)
seaborn.countplot(data, x='Key', hue='Key', palette='husl', legend=False)
pyplot.xticks(ticks=pyplot.xticks()[0], labels=['C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B'])
pyplot.ylabel('Num Songs')
pyplot.title('Key')

pyplot.subplot(1, 3, 3)
seaborn.countplot(data, x='Mode', hue='Mode', legend=False)
pyplot.xticks(ticks=pyplot.xticks()[0], labels=['minor', 'major'])
pyplot.ylabel('Num Songs')
pyplot.title('Major vs Minor Key');

pyplot.tight_layout(w_pad=2)

print('5:\n', data.loc[data['Time Signature']==5][
    ['Track Name', 'Artist Name(s)', 'Release Date']][:20])

5:
                           Track Name  \
76    Yachts - A Man Called Adam mix   
120          Good Morning Fire Eater   
223                         Carry On   
233                  Vanishing Grace   
244                          Elysium   
273                           Lately   
386                         Evenstar   
447                      Make A Fist   
459                              (B)   
567                          Animals   
726                 All That Remains   
733                 Crush The Camera   
1059                     Cold Sparks   
1160               You Are Gonna Die   
1186   Everything in Its Right Place   
1191                     The Tourist   
1192             I Am Citizen Insane   
1832        Have I Always Loved You?   
1969                       Resonance   
2130                            Pray   

                                         Artist Name(s) Release Date  
76                                  Coco Steel Lovebomb   2000-10-31  
120                                            Copeland   2008-01-01  
223                                                fun.   2012-02-21  
233                                 Gustavo Santaolalla   2013-06-07  
244   Klaus Badelt,Lisa Gerrard,Gavin Greenaway,The ...   2000-04-25  
273                                         Memoryhouse   2011-09-13  
386                                        Howard Shore   2002-12-02  
447                                          Phantogram         2011  
459                                        mewithoutYou   2002-01-01  
567                                                Muse   2012-09-24  
726                                          Rogue Wave         2010  
733                                          Rogue Wave   2005-08-23  
1059                                           Mutemath   2011-09-30  
1160                                  Marc Streitenfeld         2011  
1186                                          Radiohead   2000-10-02  
1191                                          Radiohead   1997-06-17  
1192                                          Radiohead   2003-06-09  
1832                                           Copeland   2014-11-17  
1969                                               Home   2014-07-01  
2130                                          Sam Smith   2017-10-06

print('0:\n', data.loc[data['Time Signature']==0][
    ['Track Name', 'Artist Name(s)', 'Release Date']][:10])
print('\n1:\n', data.loc[data['Time Signature']==1][
    ['Track Name', 'Artist Name(s)', 'Release Date']][:20])

0:
         Track Name Artist Name(s) Release Date
1362  Small Memory    Jon Hopkins   2009-05-05

1:
                                              Track Name  \
71                                        Clair De Lune   
119                                     Top Of The Hill   
227                     I Am the Very Model of a Modern   
239                         The Last of Us (You and Me)   
362                                              Bowery   
503                                    The Eternal City   
564                                             Prelude   
601                                       Þú ert jörðin   
604                                               Raein   
1276                                 Campfire Song Song   
1328                                        Mylo Xyloto   
1368                                            Anagram   
1911  The Fellowship (From The Lord of the Rings: Th...   
1951                                            Monsoon   
1994                               Meet Me in the Woods   
2032                                         Only Songs   
2176                                         Old Casino   
2189                                     Work This Time   
2586                                   I Don't Think So   
2665                                       Other Worlds   

                                 Artist Name(s) Release Date  
71                               Claude Debussy   2014-10-13  
119                                    Conduits   2013-04-16  
227                     The Pirates Of Penzance   1983-02-18  
239            Gustavo Santaolalla,Alan Umstead   2013-06-07  
362                               Local Natives   2013-01-29  
503                          Michele McLaughlin   2007-12-04  
564                                        Muse   2012-09-24  
601                              Ólafur Arnalds   2010-05-07  
604                              Ólafur Arnalds   2009-08-28  
1276                      Spongebob Squarepants   2009-07-14  
1328                                   Coldplay   2011-10-24  
1368                            Young the Giant   2014-01-17  
1911  The City of Prague Philharmonic Orchestra   2004-01-01  
1951                               Hippo Campus   2017-02-24  
1994                                 Lord Huron   2015-04-07  
2032                             The Wild Reeds   2017-04-07  
2176                                 Coastgaard   2016-02-26  
2189           King Gizzard & The Lizard Wizard   2014-03-07  
2586                                 Ben Phipps   2016-09-30  
2665                      Bassnectar,Dorfex Bos   2017-12-01

x = 'Energy'
y = 'Popularity'

axes = seaborn.jointplot(x=data[x], y=data[y], kind='hex', color='r')
axes.set_axis_labels(x, y, fontsize=20);

show_percent = 2

from sklearn.manifold import TSNE
from random import random
from sklearn.svm import OneClassSVM
import numpy

# Create a dataframe of only the numerical features, all normalized so embedding
# doesn't get confused by scale differences
numerical_data = data.drop(['Track URI', 'Track Name', 'Album Name', 'Artist Name(s)',
		'Explicit', 'Added By', 'Added At', 'Genres', 'Record Label'], axis=1)
numerical_data['Release Date'] = pandas.to_numeric(
    numerical_data['Release Date'].str.slice(0,4))
numerical_data = (numerical_data - numerical_data.mean())/numerical_data.std()
print('using:', list(numerical_data.columns))

# If you like, only include a subset of these, because the results with all
# is really hard to interpret
#tsne_data = numerical_data[['Popularity', 'Energy', 'Acousticness',
#                                'Valence', 'Loudness']]
#print("\nConsidering similarity with respect to the following features:")
#print(tsne_data.dtypes)

# Takes a 2D data embedding and an svm trained on it and plots the decision boundary
def plotFrontier(embedded, svm, technique_name, scale):
    # get all the points in the space, and query the svm on them
    xx, yy = numpy.meshgrid(numpy.linspace(min(embedded[:,0])*scale,
                                           max(embedded[:,0])*scale, 500),
                            numpy.linspace(min(embedded[:,1])*scale,
                                           max(embedded[:,1])*scale, 500))
    Z = svm.decision_function(numpy.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape) # positive Z means yes. negative means outliers.

    pyplot.figure(figsize=(20,20))
    pyplot.title('Decision boundary of One-class SVM in '+technique_name+' space')
    pyplot.contourf(xx, yy, Z, levels=numpy.linspace(Z.min(), 0, 7), cmap=pyplot.cm.Blues_r)
    pyplot.contour(xx, yy, Z, levels=[0], linewidths=2, colors='green') # the +/- boundary
    pyplot.contourf(xx, yy, Z, levels=[0, Z.max()], colors='lightgreen')

    pyplot.scatter(embedded[:, 0], embedded[:, 1], s=10, c='grey')
    for i,song in data.iterrows():
        if random() < show_percent*0.005: # randomly label % of points
        #if song['Artist Name(s)'] in ['Coldplay']:
            x, y = embedded[i]
            pyplot.annotate(song['Track Name'], (x,y), size=10,
                xytext=(-30,30), textcoords='offset points',
                ha='center',va='bottom',
                arrowprops={'arrowstyle':'->', 'color':'red'})

tsne_embedded = TSNE(n_components=2).fit_transform(numerical_data)

svm_tsne = OneClassSVM(gamma='scale')
svm_tsne.fit(tsne_embedded)

plotFrontier(tsne_embedded, svm_tsne, 't-SNE', 1.2)

using: ['Release Date', 'Duration (ms)', 'Popularity', 'Danceability', 'Energy', 'Key', 'Loudness', 'Mode', 'Speechiness', 'Acousticness', 'Instrumentalness', 'Liveness', 'Valence', 'Tempo', 'Time Signature']

from sklearn.decomposition import PCA

pca = PCA(n_components=2)
pca_embedded = pca.fit_transform(numerical_data)
print("% variance explained by successive PCA dimensions:",
      pca.explained_variance_ratio_)

svm_pca = OneClassSVM(gamma='scale')
svm_pca.fit(pca_embedded)

plotFrontier(pca_embedded, svm_pca, 'PCA', 1)

% variance explained by successive PCA dimensions: [0.21903801 0.09219339]

Music Taste Analysis¶

How to Use¶

Read the Data¶

Artist Bar Chart¶

Volume Added Over Time¶

Eclecticness Measure (Frequency Transform)¶

Genres Bar Chart¶

Release Dates¶

Popularity Contest¶

Track Duration¶

Musical Features¶

Joint Analysis¶

The Final Frontier¶