Objective

Spotify's editorial playlists are "carefully curated by [its] music experts and genre specialists from around the globe" (Spotify for Artists). Every artist knows the importance of being added to a Spotify editorial playlist. As an artist, I know when I get the notification that a song of mine has been added to a Spotify playlist that it will gain thousands of streams a day for the duration of its time on that playlist. One of my favorite playlists to be added to in the indie genre is 'undercurrents.' Wouldn't it be great as an artist if I could write and release songs that I knew would be best suited for this playlist? Knowing what type of song a playlist curator is looking for could help an artist choose which song to pitch. Further, I think it's important that artists know how their songs are categorized in terms of features. We may think we have an upbeat, danceable song, but if Spotify categorizes it differently, we'd want to know that. Are there certain aspects of a song that affect whether or not it is playlisted? My objective is to identify those aspects, as well as predict whether or not a song will be playlisted based on those aspects.

Creating the Data

Using spotipy, I first pulled the tracks and artist names from 'undercurrents.' Then, I created a dataset of every artist's discography with an indicator variable = 1 if the song is on 'undercurrents' and 0 otherwise. Next, I wrote a function that added features of every track to the dataset. Finally, I scaled the features and dropped any songs that did not have features available. The resulting dataset has 1547 songs by 64 unique artists with 10 song features.

In [1]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import pandas as pd
import numpy as np

auth_manager = SpotifyClientCredentials()
sp = spotipy.Spotify(auth_manager=auth_manager)

# returns dataframe of all playlist tracks, track ids, artists, and artist ids given a playlist uri

def playlist_df(playlist):
    
    results = sp.playlist_tracks(playlist, fields = 'items')
    
    A = []
    B = []
    C = []
    D = []
      
    for i in range(len(results['items'])):
        A.append(results['items'][i]['track']['name'])
        B.append(results['items'][i]['track']['id'])
        C.append(results['items'][i]['track']['artists'][0]['name'])
        D.append(results['items'][i]['track']['artists'][0]['id'])
        
    dict = {'Track': A, 'Track ID': B, 'Artist': C, 'Artist ID': D}  
    
    df = pd.DataFrame(dict)
        
    return df

# returns lists of artist's discography in tracks and ids given artist id
    
def artist_albums(artist_id):
    
    track_ids = []
    albums = []
    tracks = []
    results = sp.artist_albums(artist_id, album_type='album')
    albums.extend(results['items'])
    
    
    while results['next']:
        results = sp.next(results)
        albums.extend(results['items'])
    
    unique = set()  # skip duplicate albums
    
    for album in albums:
        name = album['name'].lower()
        if name not in unique:
            unique.add(name)
            this_tracks, this_id = show_album_tracks(album)
            tracks += this_tracks
            track_ids += this_id
            
    this_tracks, this_id = top_tracks(artist_id)
    
    tracks += this_tracks
    track_ids += this_id
            
    tracks, ids = unique_tracks(tracks, track_ids)
    
    return tracks, ids

# given lists of tracks and ids, returns lists of only unique tracks and ids

def unique_tracks(ls_t, ls_i):
    unique_t = []
    unique_i = []
    for i in range(0,len(ls_t)):
        if ls_t[i] not in unique_t:
            unique_t.append(ls_t[i])
            unique_i.append(ls_i[i])
    return unique_t, unique_i

# given album, returns list of all tracks and track ids from that album

def show_album_tracks(album):
    ids = []
    tracks = []
    track_names = []
    results = sp.album_tracks(album['id'])
    tracks.extend(results['items'])
    
    while results['next']:
        results = sp.next(results)
        tracks.extend(results['items'])
        
    for track in tracks:
        track_names.append(track['name'])
        ids.append(track['id'])
   
    return track_names, ids

# returns top tracks of artist

def top_tracks(artist_id):
    t = []
    d = []
    tracks = sp.artist_top_tracks(artist_id)
    for i in range(len(tracks['tracks'])):
        t.append(tracks['tracks'][i]['album']['name'])
        d.append(tracks['tracks'][i]['album']['id'])
    return t, d

# function that adds all artists' discographies to df given current df's artist IDs

def add_songs(df):
    old_df = df 
    
    for i in range(0, len(df['Artist ID'])):
        
        artist = df['Artist'][i]
        artist_id = df['Artist ID'][i]
        
        tracks, ids = artist_albums(artist_id)
        
        for j in range(0, len(tracks)):
            new_df = pd.DataFrame([tracks[j], ids[j], artist, artist_id])
            new_df = new_df.transpose()
            new_df = new_df.rename(columns = {0: 'Track', 1: 'Track ID', 2: 'Artist', 3: 'Artist ID'}, index = {0: len(df['Artist ID'])+ i + 1})
            
            df = df.append(new_df)
        
    df2 = df.sort_values(by = ['Artist'])
    df2 = df2.reset_index(drop = True)


    df2['Playlisted'] = df2['Track'].isin(old_df['Track']).astype(int)
    
    return df2

# function that adds features to df for every song

def features(df):
    
    danceability = []
    energy = []
    key = []
    loudness = []
    mode = []
    speechiness = []
    acousticness = []
    instrumentalness = []
    liveness = []
    valence = []
    tempo = []
    dur = []
    sig = []
    ls = [danceability, energy, key, loudness, mode, speechiness,acousticness, instrumentalness, liveness, valence,
         tempo, dur, sig]
    quotes = ['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness','acousticness', 'instrumentalness', 'liveness', 'valence',
         'tempo', 'duration_ms', 'time_signature']
    
    for id_ in df['Track ID']:
        features = sp.audio_features(id_)
        if features == [None]:
            for item in ls:
                item.append(np.nan)
        else:
            features = features[0]
            for i,j in zip(ls,quotes):
                i.append(features[j])
                
    dic = {'Danceability': danceability, 'Energy': energy, 'Key': key, 'Loudness': loudness, 'Mode': mode,
            'Speechiness': speechiness, 'Acoustic': acousticness, 'Instrumental': instrumentalness,
           'Liveness': liveness, 'Valence': valence, 'Tempo': tempo, 'Dur': dur, 'Sig': sig}
    
    return dic

from sklearn.preprocessing import StandardScaler

# function that cleans the dataframe (gets rid of null values, resets index, and sales features)

def clean(df):
    df = df.dropna()
    df = df.reset_index(drop = True)
    addback = [df['Playlisted'], df['Artist ID'], df['Artist'], df['Track ID'], df['Track'],]
    to_scale = df.drop(columns = ['Playlisted','Track', 'Track ID', 'Artist','Artist ID'])
    scaler = StandardScaler()
    data_scaled = pd.DataFrame(scaler.fit_transform(to_scale), columns = to_scale.columns)
    
    for column in addback:
        data_scaled = pd.concat([pd.DataFrame(column), data_scaled], axis = 1)
 
    return data_scaled
    
In [2]:
def main():
    playlist_id = 'spotify:playlist:37i9dQZF1DX9myttyycIxA'
    df = playlist_df(playlist_id)
    df = add_songs(df)
    feats = pd.DataFrame(features(df))
    df = pd.concat([feats, df], axis = 1)
    df = clean(df)
    df.to_csv(r'/Users/harlanhutton/Documents/spotify/undercurrentsdata.csv', index = False)
In [3]:
main()
In [4]:
from IPython.display import display, HTML

data = pd.read_csv('/Users/harlanhutton/Documents/spotify/undercurrentsdata.csv')
data = data.drop(columns = ['Track', 'Track ID', 'Artist', 'Artist ID'])

def getDfSummary(dat):
        '''
        Get descriptive stats
        '''
        #Get the names of the columns
        cols = data.columns.values

        c_summ = []
        #Outer Loop for the cols
        for c in cols:
            #Count the NAs
            missing = sum(pd.isnull(dat[c]))
            #Use describe to get summary statistics, and also drop the 'count' row
            sumval = dat[c].describe().drop(['count'])
            #Now count distinct values...note that nunique removes missing values for you
            distinct = dat[c].nunique()
            #Append missing and distinct to sumval
            sumval = sumval.append(pd.Series([missing, distinct], index=['missing', 'distinct']))
            #Add each sumval to a list and then convert the entire thing to a DS
            c_summ.append(sumval)

        return pd.DataFrame(c_summ, index=cols)

desc = getDfSummary(data)
display(HTML(desc.to_html()))
mean std min 25% 50% 75% max missing distinct
Playlisted 8.151476e-02 0.273712 0.000000 0.000000 0.000000 0.000000 1.000000 0.0 2.0
Danceability 3.015702e-16 1.000321 -3.482804 -0.687249 0.045870 0.739126 2.950613 0.0 539.0
Energy -1.767236e-16 1.000321 -2.021954 -0.780000 -0.038599 0.775657 2.172748 0.0 725.0
Key -1.656427e-16 1.000321 -1.507266 -0.940176 -0.089541 1.044640 1.611730 0.0 12.0
Loudness -6.860508e-17 1.000321 -5.516589 -0.556962 0.165599 0.717592 1.955375 0.0 1326.0
Mode -2.992899e-16 1.000321 -1.875085 0.533309 0.533309 0.533309 0.533309 0.0 2.0
Speechiness -7.820730e-17 1.000321 -0.932184 -0.296963 -0.197445 -0.023817 17.722146 0.0 424.0
Acoustic 1.226376e-16 1.000321 -1.359356 -1.024833 0.039588 1.035404 1.455514 0.0 886.0
Instrumental 1.881963e-16 1.000321 -0.650528 -0.650195 -0.616514 0.392726 2.486318 0.0 1058.0
Liveness 2.226147e-16 1.000321 -1.137052 -0.577648 -0.421489 0.218960 5.824867 0.0 517.0
Valence 5.529737e-17 1.000321 -1.747884 -0.827690 -0.111983 0.650825 2.760034 0.0 708.0
Tempo 4.204311e-17 1.000321 -4.244324 -0.698001 0.003112 0.637348 3.109286 0.0 1371.0
Dur 1.189722e-16 1.000321 -2.038556 -0.506255 -0.109321 0.344655 13.112724 0.0 1371.0
Sig -1.728043e-16 1.000321 -7.141354 0.287275 0.287275 0.287275 2.144433 0.0 5.0

Looking at the above description of the data, I can confirm that there are no missing values. and that all of the data minus the target variable is normalized with mean 0 and std 1. Looking at the number of distinct values, the variables range from binary (like mode), to almost all unique, like duration or tempo.

Feature Covariance

As another means of exploratory analysis, I want to look at the covariance matrix of the features. There are a few correlations between features that make sense intuitively, like loudness and energy, acousticness and energy, valence and danceability, and valence and energy. Valence is the how positive (happy, cheerful) or negative (sad, angry) a song sounds. Thinking ahead to model selection, if I use a tree-based algorithm I will not need to correct for these correlations, as they are generally robust enough to handle these correlations. If I use a linear algorithm which is more sensitive to highly correlated features, I may want to drop some features or transform the data using Principal Components. However, because I don't have an overwhelming number of features, and most have correlations less than 0.3, I will probably not need to use PCA.

In [5]:
Y = data['Playlisted']
X = data.drop('Playlisted',1)

corr = X.corr()
corr.style.background_gradient(cmap='coolwarm')
Out[5]:
Danceability Energy Key Loudness Mode Speechiness Acoustic Instrumental Liveness Valence Tempo Dur Sig
Danceability 1.000000 -0.004954 0.028199 0.014728 -0.027636 0.036696 -0.013600 -0.149455 -0.065346 0.364102 -0.059929 -0.135495 0.036750
Energy -0.004954 1.000000 0.017973 0.800081 -0.070082 0.054345 -0.706457 0.107840 0.194777 0.499373 0.224624 0.059056 0.135157
Key 0.028199 0.017973 1.000000 0.000292 -0.175347 0.055393 0.016808 -0.016727 0.053053 0.027112 -0.011045 0.005899 0.055128
Loudness 0.014728 0.800081 0.000292 1.000000 -0.051875 -0.049798 -0.635967 -0.041924 0.118603 0.355173 0.144878 0.024300 0.117194
Mode -0.027636 -0.070082 -0.175347 -0.051875 1.000000 -0.049942 0.042389 -0.024366 -0.036234 -0.038325 0.065568 -0.003192 -0.018277
Speechiness 0.036696 0.054345 0.055393 -0.049798 -0.049942 1.000000 0.055022 0.008878 0.120126 0.036652 0.062823 -0.076845 -0.012423
Acoustic -0.013600 -0.706457 0.016808 -0.635967 0.042389 0.055022 1.000000 0.071437 -0.117613 -0.364223 -0.149178 -0.047249 -0.115470
Instrumental -0.149455 0.107840 -0.016727 -0.041924 -0.024366 0.008878 0.071437 1.000000 -0.014726 -0.045794 -0.008875 0.019394 -0.022915
Liveness -0.065346 0.194777 0.053053 0.118603 -0.036234 0.120126 -0.117613 -0.014726 1.000000 0.094633 0.024642 -0.032123 -0.009176
Valence 0.364102 0.499373 0.027112 0.355173 -0.038325 0.036652 -0.364223 -0.045794 0.094633 1.000000 0.139771 -0.135911 0.091105
Tempo -0.059929 0.224624 -0.011045 0.144878 0.065568 0.062823 -0.149178 -0.008875 0.024642 0.139771 1.000000 -0.015585 0.007041
Dur -0.135495 0.059056 0.005899 0.024300 -0.003192 -0.076845 -0.047249 0.019394 -0.032123 -0.135911 -0.015585 1.000000 -0.006329
Sig 0.036750 0.135157 0.055128 0.117194 -0.018277 -0.012423 -0.115470 -0.022915 -0.009176 0.091105 0.007041 -0.006329 1.000000

Mutual Information

Next, I want to take a look at the mutual information between the features and target variable. I can get some clues as to the best drivers of playlisting, as well as use this if I decide to go for dimensionality reduction. In the graph below, I can see that key, mode, and signature do not have a strong relationship with playlisted. The rest of the mutual info between the features and playlisted make intuitive sense, except I was surprised to see that as loudness increases, the likelihood of being playlisted decreases.

In [6]:
import sklearn.metrics as skm
import matplotlib.pyplot as plt

def plotMI(dat, lab, width = 0.35, signed = 0):
    '''
    Draw a bar chart of the normalized MI between each X and Y
    '''
    X = dat.drop(lab, 1)
    Y = dat[[lab]].values
    cols = X.columns.values
    mis = []

    #Start by getting MI
    for c in cols:
        mis.append(skm.normalized_mutual_info_score(Y.ravel(), X[[c]].values.ravel()))

    #Get signs by correlation
    corrs = dat.corr()[lab]
    corrs[corrs.index != lab]
    df = pd.DataFrame(list(zip(mis, cols)), columns = ['MI', 'Lab'])
    df = pd.concat([df, pd.DataFrame(list(corrs), columns = ['corr'])], axis=1, ignore_index = False, join = "inner")

    if signed == 0:
        makeBar(df, 'MI', 'Lab', width)

    else:
        makeBarSigned(df, 'MI', 'Lab', width)


def makeBarSigned(df, h, lab,  width):
    '''
    Contains
    '''
    df_s = df.sort_values(by = [h], ascending = False)

    #Get a barplot
    ind = np.arange(df_s.shape[0])
    labs = df_s[[lab]].values.ravel()
    h_pos = (df_s[['corr']].values.ravel() > 0) * df_s.MI
    h_neg = (df_s[['corr']].values.ravel() < 0) * df_s.MI

    fig = plt.figure(facecolor = 'w', figsize = (12, 6))
    ax = plt.subplot(111)
    plt.subplots_adjust(bottom = 0.25)

    rec = ax.bar(ind + width, h_pos, width, color='r', label = 'Positive')
    rec = ax.bar(ind + width, h_neg, width, color='b', label = 'Negative')

    ax.set_xticks(ind + getTickAdj(labs, width))
    ax.set_xticklabels(labs, rotation = 45, size = 14)
    
    plt.legend()
    
def makeBar(df, h, lab,  width):
    '''
    Contains
    '''
    df_s = df.sort_values(by = [h], ascending = False)

    #Get a barplot
    ind = np.arange(df_s.shape[0])
    labs = df_s[[lab]].values.ravel() 

    fig = plt.figure(facecolor = 'w', figsize = (12, 6))
    ax = plt.subplot(111)
    plt.subplots_adjust(bottom = 0.25)

    rec = ax.bar(ind + width, df_s[h].values, width, color='r')

    ax.set_xticks(ind + getTickAdj(labs, width))
    ax.set_xticklabels(labs, rotation = 45, size = 14)


def getTickAdj(labs, width):
    lens = list(map(len, labs))
    lens = -1 * width * (lens - np.mean(lens)) / np.max(lens)
    return lens
    
plotMI(data, 'Playlisted', 0.4, signed=1)
plt.title('Mutual Information Between Feature and Playlisted');
plt.ylabel('Mutual Information');

Variable Importance

A final exploratory step I want to take is variable importance. This is just one more way I can see which features will add the most significance to a potential model. Because signature and mode are again the least important, I'm going to drop them from the dataset when I do dimensionality reduction with confidence that it will not affect a future model's accuracy. Because loudness did better than energy in terms of both mutual info and variable importance, I'm going to drop energy as well to avoid that correlation of 0.8.

In [7]:
# add features and their importances to a dictionary

from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier()
dt.fit(X,Y)
feature_mi = dt.feature_importances_
mi_dict = dict(zip(X.columns.values, feature_mi))

# sort dict and plot 

sorted_dict = {k: v for k, v in sorted(mi_dict.items(), reverse = True, key=lambda item: item[1])}

plt.figure(facecolor = 'w', figsize = (12, 6))
plt.bar(sorted_dict.keys(), sorted_dict.values(), color = 'cornflowerblue')
plt.xticks(range(X.shape[1]), sorted_dict.keys(), fontsize=12, rotation=45)
plt.title('Feature Importance')
plt.xlabel('Feature')
plt.ylabel('Importance')
plt.show()

Welcome to the Model Bakeoff!

Now that I feel I have a good grasp on my features, I am going to run a bakeoff for different models. Because my data is imbalanced, I am going to upsample as well. (I will first run the bakeoff on the data without upsampling, and then compare performance of models). To confirm my features analysis, I will also assess the performance of the models with the dropped features and without.

Design Parameters:

Algorithms: Logistic Regression, Naive Bayes, Random Forest, Decision Tree, K Nearest Neighbors, Gradient Boosted Tree

Hyperparameters: Need to choose appropriate ranges for every model's specific hyperparams.

Features: All included vs. without energy, signature, and mode

Imbalanced Data: Without upsampling playlisted songs vs. with upsampling playlisted songs

Evaluation:

I will use AUC, accuracy, precision, and recall to evaluate the models. I will split the data into 80% train, 20% test.

In [81]:
from sklearn.model_selection import *
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import *
from imblearn.over_sampling import SMOTE

models = [LogisticRegression(), GaussianNB(), RandomForestClassifier(), DecisionTreeClassifier(), KNeighborsClassifier(), GradientBoostingClassifier()]
labels = ["Log Reg", "Naive Bayes", "Random Forest", "Decision Tree", "K Nearest Neighbors", "Gradient Boosting" ]

def plot_roc_curve(models, X_train, y_train, X_test, y_test, dropped, upsampled, labels, design_params, ax1, ax2):
    
    fpr = []
    tpr = []
    auc = []
    acc = []
    recall = []
    precision = []
    these_labels = []
    f_scores = []
    
    if dropped == True:
        X_train = X_train.drop(['Sig', 'Mode', 'Energy'],1)
        X_test = X_test.drop(['Sig', 'Mode', 'Energy'],1)

    
    for i in range(len(models)):
        
        mod = models[i].fit(X_train, y_train)
            
        preds = mod.predict_proba(X_test)[:,1]
        this_fpr,this_tpr,thresh = roc_curve(y_test, preds)
        auc.append( roc_auc_score(y_test, preds) )
        fpr.append( this_fpr )
        tpr.append( this_tpr )
        acc.append(accuracy_score(y_test, mod.predict(X_test)))
        recall.append(recall_score(y_test,mod.predict(X_test) ))
        precision.append(precision_score(y_test, mod.predict(X_test), zero_division = 0))
        f_scores.append(f1_score(y_test, mod.predict(X_test)))
        
        axs[ax1,ax2].plot(this_fpr,this_tpr, label = labels[i] + ', AUC = {}, F1 = {}'.format(round(auc[i],2), round(f_scores[i],2)))
        
        these_labels.append(labels[i] + design_params)
        
    axs[ax1,ax2].plot([0,1],[0,1], 'b--', label = 'Random Chance')
    axs[ax1,ax2].title.set_text('ROC Curve {}'.format(design_params))
    axs[ax1,ax2].set_xlabel('False Positive Rate')
    axs[ax1,ax2].set_ylabel('True Positive Rate')
    axs[ax1,ax2].legend();
    
    df = pd.DataFrame(these_labels, columns = ['Model'])
    df['Accuracy'],df['AUC'],df['Recall'],df['Precision'],df['F1 Score'] = acc,auc,recall,precision,f_scores
    
    return df
    
In [82]:
fig, axs = plt.subplots(2, 2, figsize=(18, 12))

X_train, X_test, y_train, y_test = train_test_split(data.drop('Playlisted',1), data['Playlisted'], test_size=0.2, random_state=42)

sm = SMOTE(random_state=12, sampling_strategy = 1.0)
up_X_train, up_y_train = sm.fit_sample(X_train, y_train)

subset_imbalanced = plot_roc_curve(models, X_train, y_train, X_test, y_test, dropped = True, upsampled = False, labels = labels, design_params = ": feat. subset, not upsampled", ax1 = 0, ax2 = 0)
allfeats_imbalanced = plot_roc_curve(models, X_train, y_train, X_test, y_test, dropped = False, upsampled = False, labels = labels, design_params = ": all feats, not upsampled", ax1 = 0, ax2 = 1)
subset_upsampled = plot_roc_curve(models, up_X_train, up_y_train, X_test, y_test, dropped = True, upsampled = True, labels = labels, design_params = ": feat. subset, upsampled", ax1 = 1, ax2 = 0)
allfeats_upsampled = plot_roc_curve(models, up_X_train, up_y_train, X_test, y_test, dropped = False, upsampled = True, labels = labels, design_params = ": all feats, upsampled", ax1 = 1, ax2 = 1)
In [83]:
comparison = pd.concat([subset_upsampled, allfeats_upsampled], ignore_index = True)
comparison.sort_values(by=['AUC'], ascending = False)
Out[83]:
Model Accuracy AUC Recall Precision F1 Score
4 K Nearest Neighbors: feat. subset, upsampled 0.740385 0.846584 0.818182 0.264706 0.400000
10 K Nearest Neighbors: all feats, upsampled 0.762821 0.821060 0.757576 0.274725 0.403226
2 Random Forest: feat. subset, upsampled 0.919872 0.818236 0.484848 0.666667 0.561404
8 Random Forest: all feats, upsampled 0.932692 0.807429 0.484848 0.800000 0.603774
11 Gradient Boosting: all feats, upsampled 0.875000 0.764961 0.363636 0.400000 0.380952
5 Gradient Boosting: feat. subset, upsampled 0.823718 0.758119 0.424242 0.280000 0.337349
3 Decision Tree: feat. subset, upsampled 0.858974 0.734115 0.575758 0.387755 0.463415
9 Decision Tree: all feats, upsampled 0.849359 0.688661 0.484848 0.347826 0.405063
7 Naive Bayes: all feats, upsampled 0.503205 0.667861 0.818182 0.153409 0.258373
1 Naive Bayes: feat. subset, upsampled 0.483974 0.665689 0.818182 0.148352 0.251163
0 Log Reg: feat. subset, upsampled 0.605769 0.642012 0.575758 0.148438 0.236025
6 Log Reg: all feats, upsampled 0.573718 0.606278 0.575758 0.137681 0.222222

Based on these out of the bag models, it is apparent that upsampling is generally important for model performance, or the models will learn to only predict the majority class to score well. Based on AUC, K Nearest Neighbors, Random Forest, and Gradient Boosting models performed the best, whether using all features or the feature subset. Because dimensionality isn't a huge problem for this project and these models are pretty robust when it comes to correlated features, I'm going to tune the hyperparameters for these three models with all of the features.

In [84]:
rf_baseline = comparison.loc[comparison['Model'] == 'Random Forest: all feats, upsampled']
rf_baseline.reset_index(drop = True)
Out[84]:
Model Accuracy AUC Recall Precision F1 Score
0 Random Forest: all feats, upsampled 0.932692 0.807429 0.484848 0.8 0.603774
In [85]:
from sklearn.model_selection import RandomizedSearchCV

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]

# Number of features to consider at every split
max_features = ['auto', 'sqrt']

# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)

# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]

# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)
{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]}
In [86]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestClassifier()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(up_X_train, up_y_train)
Fitting 3 folds for each of 100 candidates, totalling 300 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   24.8s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  4.2min finished
Out[86]:
RandomizedSearchCV(cv=3, estimator=RandomForestClassifier(), n_iter=100,
                   n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100, 110,
                                                      None],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [200, 400, 600, 800,
                                                         1000, 1200, 1400, 1600,
                                                         1800, 2000]},
                   random_state=42, verbose=2)
In [87]:
print(rf_random.best_estimator_)
RandomForestClassifier(bootstrap=False, max_depth=40, n_estimators=1400)
In [88]:
# best estimator: bootstrap = False, max_depth = 40, n_estimators = 1400

rf_best = RandomForestClassifier(bootstrap=False, max_depth=40, n_estimators=1400)

fpr = []
tpr = []
auc = []
acc = []
recall = []
precision = []
these_labels = []
f_score = []

rf_best.fit(up_X_train,up_y_train)
preds = rf_best.predict_proba(X_test)[:,1]
auc.append(roc_auc_score(y_test, preds))
acc.append(accuracy_score(y_test, rf_best.predict(X_test)))
recall.append(recall_score(y_test,rf_best.predict(X_test)))
precision.append(precision_score(y_test, rf_best.predict(X_test), zero_division = 0))
f_score.append(f1_score(y_test, rf_best.predict(X_test)))
df = pd.DataFrame()
df['Model'],df['Accuracy'],df['AUC'],df['Recall'],df['Precision'],df['F1 Score'] = ['Random Forest Tuned'],acc,auc,recall,precision,f_score
In [89]:
rf_baseline.append(df)
Out[89]:
Model Accuracy AUC Recall Precision F1 Score
8 Random Forest: all feats, upsampled 0.932692 0.807429 0.484848 0.800000 0.603774
0 Random Forest Tuned 0.939103 0.823721 0.484848 0.888889 0.627451