In [2]:
import os
import pickle

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from IPython.display import Image, YouTubeVideo

def pickleload(fp):
    with open(fp, 'rb') as f:
        return pickle.load(f)
    
files_dir = './files_song_versions/'
emo_names = ['valence', 'energy', 'tension', 'anger', 'fear', 'happy', 'sad', 'tender']

Explainable Music Emotion Recognition

Comparing different versions of the same song in terms of the perceived musical emotion, explained via mid-level features

An example song from YouTube, with a motivating listener comment

In [3]:
Image('files_song_versions/hurt_yt_1.png')
Out[3]:
In [4]:
Image('files_song_versions/hurt_yt_comment.png')
Out[4]:

Nine Inch Nails version

In [5]:
YouTubeVideo('kPz21cDK7dg', height=100)
Out[5]:

Johnny Cash version

In [6]:
YouTubeVideo('vt1Pwfnh5pc', height=100)
Out[6]:

Train a VGG-style CNN to predict emotions and mid-level features...

In [78]:
Image('files_song_versions/net.png')
Out[78]:

...using the Soundtracks dataset and the Mid-level Features dataset...

Mid-level: 'melody', 'articulation', 'rhythm_complexity', 'rhythm_stability', 'dissonance', 'tonal_stability', 'minorness'

Soundtracks: 'valence', 'energy', 'tension', 'anger', 'fear', 'happy', 'sad', 'tender'

...and predict the emotion ratings for the two versions

In [80]:
Image('files_song_versions/e_hurt.png')
Out[80]:
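
For reference, a minimal sketch of how such a two-headed, VGG-style audio CNN could be set up. PyTorch, the class name MidlevelEmotionNet, and all layer sizes are assumptions made here for illustration; the actual architecture is the one shown in net.png above.

import torch
import torch.nn as nn

class MidlevelEmotionNet(nn.Module):
    """Sketch: spectrogram -> VGG-style conv stack -> 7 mid-level features
    -> linear layer -> 8 emotion ratings."""
    def __init__(self, n_midlevel=7, n_emotions=8):
        super().__init__()
        def block(c_in, c_out):
            return nn.Sequential(
                nn.Conv2d(c_in, c_out, 3, padding=1), nn.BatchNorm2d(c_out), nn.ReLU(),
                nn.Conv2d(c_out, c_out, 3, padding=1), nn.BatchNorm2d(c_out), nn.ReLU(),
                nn.MaxPool2d(2),
            )
        self.features = nn.Sequential(block(1, 16), block(16, 32), block(32, 64),
                                      nn.AdaptiveAvgPool2d(1), nn.Flatten())
        self.to_midlevel = nn.Linear(64, n_midlevel)    # mid-level head
        self.ml2e = nn.Linear(n_midlevel, n_emotions)   # linear mid-level -> emotion layer
                                                        # (the ML2Eweights loaded below appear
                                                        #  to be the weights of such a layer)

    def forward(self, spec):                  # spec: (batch, 1, n_mels, n_frames)
        midlevel = self.to_midlevel(self.features(spec))
        emotions = self.ml2e(midlevel)
        return midlevel, emotions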

Very nice. The CNN has feelings. But can it introspect?

Thanks to an entire life of reflecting on its emotions, it can.

In [81]:
Image('files_song_versions/ml_hurt.png')
Out[81]:
In [31]:
ML2Eweights = pickleload(os.path.join(files_dir, '1295_ml2e_weights_19'))
In [92]:
emotion_annotations = pickleload(os.path.join(files_dir, '1295_st_all_emo_anns'))
emotion_predictions = pickleload(os.path.join(files_dir, '1295_st_all_emo_preds'))
midlevel_annotations = pickleload(os.path.join(files_dir, '1295_st_all_ml_anns'))
midlevel_predictions = pickleload(os.path.join(files_dir, '1295_st_all_ml_preds'))
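
From the way they are used below, ML2Eweights appear to be the weights of the linear layer mapping the 7 mid-level features to the 8 emotions. A quick, hedged sanity check; the exact array shapes and the presence of a bias term are assumptions.

W = np.asarray(ML2Eweights)              # assumed shape: (8 emotions, 7 mid-level features)
M = np.asarray(midlevel_predictions)     # assumed shape: (n_songs, 7)
E = np.asarray(emotion_predictions)      # assumed shape: (n_songs, 8)
print(W.shape, M.shape, E.shape)

# If the mid-level -> emotion mapping is linear, E should roughly equal M @ W.T
# (up to a bias term, which is not stored in this pickle).
print(np.corrcoef((M @ W.T).ravel(), E.ravel())[0, 1])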

What musical characteristics make the Nine Inch Nails version sound "angrier"?

Tonal and rhythmic (in)stability, dissonance, and (un)melodiousness
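
The plots below build on a simple quantity: a mid-level prediction multiplied by its weight in the corresponding row of ML2Eweights gives that feature's contribution ("effect") to an emotion prediction. As a small sketch, the per-feature effect difference between the two versions for "anger" can be ranked directly; the column order is assumed to match the mid-level feature list above, and which row of the ml_hurt pickle is which version follows from the DataFrame's own index.

ml_hurt = pickleload(os.path.join(files_dir, 'ml_hurt'))      # mid-level predictions, one row per version
ml_names = ['melody', 'articulation', 'rhythm_complexity', 'rhythm_stability',
            'dissonance', 'tonal_stability', 'minorness']

# Anger row of the mid-level -> emotion weight matrix
w_anger = np.asarray(ML2Eweights)[emo_names.index('anger')]

# Per-feature contribution ("effect") to the anger prediction for each version
effects = np.asarray(ml_hurt) * w_anger

print(f'effect difference: {ml_hurt.index[0]} minus {ml_hurt.index[1]}')
for name, d in sorted(zip(ml_names, effects[0] - effects[1]), key=lambda nd: -abs(nd[1])):
    print(f'  {name:>17}: {d:+.3f}')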

In [105]:
ml_versions = pickleload(os.path.join(files_dir, 'ml_hurt'))
import matplotlib.patches as mpatches
blue_patch = mpatches.Patch(color='blue', label=ml_versions.index[0])
orange_patch = mpatches.Patch(color='orange', label=ml_versions.index[1])
fig, ax = plt.subplots(4,2,sharey=True,figsize=(25,30))
emotion_num = 0
vert_spacing=3.6
ml_names_plot = ['melody', 'artic.', 'rh.complx', 'rh.stblty', 'diss.', 'tonal', 'minor']
font = {'size'   : 15}
matplotlib.rc('font', **font)
for i in range(ax.shape[0]):
    for j in range(ax.shape[1]):
        # Calculate effect of midlevel features for current emotion across all songs
        effect = np.multiply(midlevel_predictions, ML2Eweights.transpose()[:,emotion_num])
        
        # Get mid-level predictions for the two versions of the song
        song1_ml = ml_versions.iloc[0]
        song2_ml = ml_versions.iloc[1]
        # Calculate effect of these midlevel predictions
        song1_ml_effect = np.multiply(song1_ml, ML2Eweights.transpose()[:,emotion_num])
        song2_ml_effect = np.multiply(song2_ml, ML2Eweights.transpose()[:,emotion_num])
        # Plot all effects in boxplot
        ax[i][j].boxplot(effect.transpose(), vert=False, positions=np.linspace(1,vert_spacing,7), showfliers=False)
        # Overlay the per-feature effects for the first version (blue)
        scat1 = ax[i][j].scatter(song1_ml_effect,np.linspace(1,vert_spacing,7),color='b', s=95, alpha=0.9)
        # Overlay the per-feature effects for the second version (orange)
        scat2 = ax[i][j].scatter(song2_ml_effect,np.linspace(1,vert_spacing,7),color='orange', s=95, alpha=1)
        ax[i][j].set_yticklabels(ml_names_plot);
        ax[i][j].tick_params(axis='y', direction='in');
#         ax[i][j].set_title(emo_names[emotion_num])
        ax[i][j].text(.9,.93,emo_names[emotion_num],horizontalalignment='center', transform=ax[i][j].transAxes)
        ax[i][j].axvline(0, alpha=0.5, linestyle='--')
        ax[i][j].yaxis.grid(True)
        emotion_num += 1
plt.legend(handles=[blue_patch, orange_patch], loc='upper center', bbox_to_anchor=(0.5, -0.1))
fig.subplots_adjust(wspace=0)
# plt.savefig('effects.pdf', dpi=1200, bbox_inches="tight", pad_inches=0)

Other songs

1. Hallelujah

In [88]:
Image('files_song_versions/e_hallelujah.png')
Out[88]:
In [87]:
Image('files_song_versions/ml_hallelujah.png')
Out[87]:
In [106]:
ml_versions = pickleload(os.path.join(files_dir, 'ml_hallelujah'))
import matplotlib.patches as mpatches
blue_patch = mpatches.Patch(color='blue', label=ml_versions.index[0])
orange_patch = mpatches.Patch(color='orange', label=ml_versions.index[1])
fig, ax = plt.subplots(4,2,sharey=True,figsize=(25,30))
emotion_num = 0
vert_spacing=3.6
ml_names_plot = ['melody', 'artic.', 'rh.complx', 'rh.stblty', 'diss.', 'tonal', 'minor']
font = {'size'   : 15}
matplotlib.rc('font', **font)
for i in range(ax.shape[0]):
    for j in range(ax.shape[1]):
        # Calculate effect of midlevel features for current emotion across all songs
        effect = np.multiply(midlevel_predictions, ML2Eweights.transpose()[:,emotion_num])
        
        # Get mid-level predictions for the two versions of the song
        song1_ml = ml_versions.iloc[0]
        song2_ml = ml_versions.iloc[1]
        # Calculate effect of these midlevel predictions
        song1_ml_effect = np.multiply(song1_ml, ML2Eweights.transpose()[:,emotion_num])
        song2_ml_effect = np.multiply(song2_ml, ML2Eweights.transpose()[:,emotion_num])
        # Plot all effects in boxplot
        ax[i][j].boxplot(effect.transpose(), vert=False, positions=np.linspace(1,vert_spacing,7), showfliers=False)
        # Overlay the per-feature effects for the first version (blue)
        scat1 = ax[i][j].scatter(song1_ml_effect,np.linspace(1,vert_spacing,7),color='b', s=95, alpha=0.9)
        # Overlay the per-feature effects for the second version (orange)
        scat2 = ax[i][j].scatter(song2_ml_effect,np.linspace(1,vert_spacing,7),color='orange', s=95, alpha=1)
        ax[i][j].set_yticklabels(ml_names_plot);
        ax[i][j].tick_params(axis='y', direction='in');
#         ax[i][j].set_title(emo_names[emotion_num])
        ax[i][j].text(.9,.93,emo_names[emotion_num],horizontalalignment='center', transform=ax[i][j].transAxes)
        ax[i][j].axvline(0, alpha=0.5, linestyle='--')
        ax[i][j].yaxis.grid(True)
        emotion_num += 1
plt.legend(handles=[blue_patch, orange_patch], loc='upper center', bbox_to_anchor=(0.5, -0.1))
fig.subplots_adjust(wspace=0)

2. The Girl from Ipanema

The model is not yet sensitive to the subtler differences between these two versions

In [95]:
Image('files_song_versions/e_ipanema.png')
Out[95]:
In [96]:
Image('files_song_versions/ml_ipanema.png')
Out[96]: