In [2]:
import os
import pickle
import joblib as jl
import librosa
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from IPython.display import Image, YouTubeVideo

def pickleload(fp):
    """Load and return a pickled object from the given file path."""
    with open(fp, 'rb') as f:
        return pickle.load(f)
    
files_dir = './files_song_versions/'
emo_names = ['valence', 'energy', 'tension', 'anger', 'fear', 'happy', 'sad', 'tender']

Explainable Music Emotion Recognition

Comparing different versions of the same song in terms of the musical emotions they convey, explained via mid-level perceptual features

An example song from YouTube, with a motivating listener comment

In [3]:
Image('files_song_versions/hurt_yt_1.png')
Out[3]:
In [4]:
Image('files_song_versions/hurt_yt_comment.png')
Out[4]:

Nine Inch Nails version

In [5]:
YouTubeVideo('kPz21cDK7dg', height=100)
Out[5]:

Johnny Cash version

In [6]:
YouTubeVideo('vt1Pwfnh5pc', height=100)
Out[6]:

Train a VGG-style CNN to predict emotions and mid-level features...

In [78]:
Image('files_song_versions/net.png')
Out[78]:
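
As a rough illustration of the idea (not the exact architecture shown in net.png), a two-headed model of this kind could be wired up in PyTorch as sketched below: a VGG-style convolutional stack predicts the seven mid-level features, and the eight emotions are a purely linear function of those mid-level predictions. The layer sizes here are illustrative assumptions; only the overall structure matters for what follows, since the weights of that final linear layer are what we load as ML2Eweights further down.

In [ ]:
import torch
import torch.nn as nn

class MidLevelEmotionCNN(nn.Module):
    """Sketch: VGG-style conv blocks -> mid-level head (7) -> linear emotion head (8)."""
    def __init__(self, n_midlevel=7, n_emotions=8):
        super().__init__()
        def block(c_in, c_out):
            # Two 3x3 convolutions followed by 2x2 max pooling, as in VGG
            return nn.Sequential(
                nn.Conv2d(c_in, c_out, 3, padding=1), nn.BatchNorm2d(c_out), nn.ReLU(),
                nn.Conv2d(c_out, c_out, 3, padding=1), nn.BatchNorm2d(c_out), nn.ReLU(),
                nn.MaxPool2d(2))
        self.features = nn.Sequential(block(1, 32), block(32, 64), block(64, 128))
        self.pool = nn.AdaptiveAvgPool2d(1)                  # global average pooling
        self.to_midlevel = nn.Linear(128, n_midlevel)        # mid-level feature head
        self.to_emotion = nn.Linear(n_midlevel, n_emotions)  # linear mid-level -> emotion layer

    def forward(self, spec):  # spec: (batch, 1, n_mels, n_frames) log-mel spectrogram
        h = self.pool(self.features(spec)).flatten(1)
        midlevel = self.to_midlevel(h)
        emotion = self.to_emotion(midlevel)  # emotions as a linear combination of mid-level features
        return midlevel, emotion

Training would then minimise a regression loss (e.g. MSE) on both heads jointly: the mid-level head against the Mid-level dataset annotations, the emotion head against the Soundtracks ratings.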

...using the Soundtracks dataset and the Mid-level dataset...

Mid-level: 'melody', 'articulation', 'rhythm_complexity', 'rhythm_stability', 'dissonance', 'tonal_stability', 'minorness'

Soundtracks: 'valence', 'energy', 'tension', 'anger', 'fear', 'happy', 'sad', 'tender'

...and predict the emotion ratings for the two versions

In [80]:
Image('files_song_versions/e_hurt.png')
Out[80]:

Very nice. The CNN has feelings. But can it introspect?

Thanks to an entire life of reflecting on its emotions, it can.

In [81]:
Image('files_song_versions/ml_hurt.png')
Out[81]:
In [31]:
ML2Eweights = pickleload(os.path.join(files_dir, '1295_ml2e_weights_19'))
In [92]:
emotion_annotations = pickleload(os.path.join(files_dir, '1295_st_all_emo_anns'))
emotion_predictions = pickleload(os.path.join(files_dir, '1295_st_all_emo_preds'))
midlevel_annotations = pickleload(os.path.join(files_dir, '1295_st_all_ml_anns'))
midlevel_predictions = pickleload(os.path.join(files_dir, '1295_st_all_ml_preds'))
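
Since the emotion outputs are a linear function of the seven mid-level features, each emotion prediction can be decomposed into per-feature contributions ("effects"): the predicted mid-level value multiplied by its weight. A small sketch of that decomposition for 'anger' is below, assuming midlevel_predictions has shape (n_songs, 7) and ML2Eweights has shape (8, 7) with rows ordered like emo_names (the same layout the plotting cells below rely on).

In [ ]:
# Sketch: per-feature "effects" on the 'anger' prediction (assumed shapes, see above).
anger_idx = emo_names.index('anger')
anger_weights = np.asarray(ML2Eweights)[anger_idx]                 # (7,) weights for 'anger'
anger_effects = np.asarray(midlevel_predictions) * anger_weights   # (n_songs, 7) effects
# Summing the effects (plus any bias term) recovers the linear 'anger' prediction:
# anger_pred ≈ anger_effects.sum(axis=1)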

What musical characteristics make the Nine Inch Nails version sound "angrier"?

Tonal and rhythmic (in)stability, dissonance, and (un)melodiousness

In [105]:
ml_versions = pickleload(os.path.join(files_dir, 'ml_hurt'))
import matplotlib.patches as mpatches
blue_patch = mpatches.Patch(color='blue', label=ml_versions.index[0])
orange_patch = mpatches.Patch(color='orange', label=ml_versions.index[1])
fig, ax = plt.subplots(4,2,sharey=True,figsize=(25,30))
emotion_num = 0
vert_spacing=3.6
ml_names_plot = ['melody', 'artic.', 'rh.complx', 'rh.stblty', 'diss.', 'tonal', 'minor']
font = {'size'   : 15}
matplotlib.rc('font', **font)
for i in range(ax.shape[0]):
    for j in range(ax.shape[1]):
        # Calculate effect of midlevel features for current emotion across all songs
        effect = np.multiply(midlevel_predictions, ML2Eweights.transpose()[:,emotion_num])
        
        # Get mid-level predictions for the two versions
        song1_ml = ml_versions.iloc[0]
        song2_ml = ml_versions.iloc[1]
        # Calculate effect of these midlevel predictions
        song1_ml_effect = np.multiply(song1_ml, ML2Eweights.transpose()[:,emotion_num])
        song2_ml_effect = np.multiply(song2_ml, ML2Eweights.transpose()[:,emotion_num])
        # Plot all effects in boxplot
        ax[i][j].boxplot(effect.transpose(), vert=False, positions=np.linspace(1,vert_spacing,7), showfliers=False)
        # Overlay the mid-level effects of version 1 (blue)
        scat1 = ax[i][j].scatter(song1_ml_effect,np.linspace(1,vert_spacing,7),color='b', s=95, alpha=0.9)
        # Overlay the mid-level effects of version 2 (orange)
        scat2 = ax[i][j].scatter(song2_ml_effect,np.linspace(1,vert_spacing,7),color='orange', s=95, alpha=1)
        ax[i][j].set_yticklabels(ml_names_plot);
        ax[i][j].tick_params(axis='y', direction='in');
#         ax[i][j].set_title(emo_names[emotion_num])
        ax[i][j].text(.9,.93,emo_names[emotion_num],horizontalalignment='center', transform=ax[i][j].transAxes)
        ax[i][j].axvline(0, alpha=0.5, linestyle='--')
        ax[i][j].yaxis.grid(True)
        emotion_num += 1
plt.legend(handles=[blue_patch, orange_patch], loc='upper center', bbox_to_anchor=(0.5, -0.1))
fig.subplots_adjust(wspace=0)
# plt.savefig('effects.pdf', dpi=1200, bbox_inches="tight", pad_inches=0)

Other songs

1. Hallelujah

In [88]:
Image('files_song_versions/e_hallelujah.png')
Out[88]:
In [87]:
Image('files_song_versions/ml_hallelujah.png')
Out[87]:
In [106]:
ml_versions = pickleload(os.path.join(files_dir, 'ml_hallelujah'))
import matplotlib.patches as mpatches
blue_patch = mpatches.Patch(color='blue', label=ml_versions.index[0])
orange_patch = mpatches.Patch(color='orange', label=ml_versions.index[1])
fig, ax = plt.subplots(4,2,sharey=True,figsize=(25,30))
emotion_num = 0
vert_spacing=3.6
ml_names_plot = ['melody', 'artic.', 'rh.complx', 'rh.stblty', 'diss.', 'tonal', 'minor']
font = {'size'   : 15}
matplotlib.rc('font', **font)
for i in range(ax.shape[0]):
    for j in range(ax.shape[1]):
        # Calculate effect of midlevel features for current emotion across all songs
        effect = np.multiply(midlevel_predictions, ML2Eweights.transpose()[:,emotion_num])
        
        # Get mid-level predictions for the two versions
        song1_ml = ml_versions.iloc[0]
        song2_ml = ml_versions.iloc[1]
        # Calculate effect of these midlevel predictions
        song1_ml_effect = np.multiply(song1_ml, ML2Eweights.transpose()[:,emotion_num])
        song2_ml_effect = np.multiply(song2_ml, ML2Eweights.transpose()[:,emotion_num])
        # Plot all effects in boxplot
        ax[i][j].boxplot(effect.transpose(), vert=False, positions=np.linspace(1,vert_spacing,7), showfliers=False)
        # Overlay the mid-level effects of version 1 (blue)
        scat1 = ax[i][j].scatter(song1_ml_effect,np.linspace(1,vert_spacing,7),color='b', s=95, alpha=0.9)
        # Overlay the mid-level effects of version 2 (orange)
        scat2 = ax[i][j].scatter(song2_ml_effect,np.linspace(1,vert_spacing,7),color='orange', s=95, alpha=1)
        ax[i][j].set_yticklabels(ml_names_plot);
        ax[i][j].tick_params(axis='y', direction='in');
#         ax[i][j].set_title(emo_names[emotion_num])
        ax[i][j].text(.9,.93,emo_names[emotion_num],horizontalalignment='center', transform=ax[i][j].transAxes)
        ax[i][j].axvline(0, alpha=0.5, linestyle='--')
        ax[i][j].yaxis.grid(True)
        emotion_num += 1
plt.legend(handles=[blue_patch, orange_patch], loc='upper center', bbox_to_anchor=(0.5, -0.1))
fig.subplots_adjust(wspace=0)

2. The Girl from Ipanema

The model is not yet sensitive to subtle differences between song versions

In [95]:
Image('files_song_versions/e_ipanema.png')
Out[95]:
In [96]:
Image('files_song_versions/ml_ipanema.png')
Out[96]:
In [108]:
ml_versions = pickleload(os.path.join(files_dir, 'ml_ipanema'))
import matplotlib.patches as mpatches
red_patch = mpatches.Patch(color='red', label=ml_versions.index[0])
blue_patch = mpatches.Patch(color='blue', label=ml_versions.index[1])
green_patch = mpatches.Patch(color='green', label=ml_versions.index[2])
fig, ax = plt.subplots(4,2,sharey=True,figsize=(25,30))
emotion_num = 0
vert_spacing=3.6
ml_names_plot = ['melody', 'artic.', 'rh.complx', 'rh.stblty', 'diss.', 'tonal', 'minor']
font = {'size'   : 15}
matplotlib.rc('font', **font)
for i in range(ax.shape[0]):
    for j in range(ax.shape[1]):
        # Calculate effect of midlevel features for current emotion across all songs
        effect = np.multiply(midlevel_predictions, ML2Eweights.transpose()[:,emotion_num])
        
        # Get mid-level predictions for the three versions
        song1_ml = ml_versions.iloc[0]
        song2_ml = ml_versions.iloc[1]
        song3_ml = ml_versions.iloc[2]
        # Calculate effect of these midlevel predictions
        song1_ml_effect = np.multiply(song1_ml, ML2Eweights.transpose()[:,emotion_num])
        song2_ml_effect = np.multiply(song2_ml, ML2Eweights.transpose()[:,emotion_num])
        song3_ml_effect = np.multiply(song3_ml, ML2Eweights.transpose()[:,emotion_num])
        # Plot all effects in boxplot
        ax[i][j].boxplot(effect.transpose(), vert=False, positions=np.linspace(1,vert_spacing,7), showfliers=False)
        # Overlay the mid-level effects of version 1 (red)
        scat1 = ax[i][j].scatter(song1_ml_effect,np.linspace(1,vert_spacing,7),color='r', s=95, alpha=0.7)
        # Overlay the mid-level effects of version 2 (blue)
        scat2 = ax[i][j].scatter(song2_ml_effect,np.linspace(1,vert_spacing,7),color='b', s=95, alpha=1)
        # Overlay the mid-level effects of version 3 (green)
        scat3 = ax[i][j].scatter(song3_ml_effect,np.linspace(1,vert_spacing,7),color='g', s=95, alpha=0.7)
        ax[i][j].set_yticklabels(ml_names_plot);
        ax[i][j].tick_params(axis='y', direction='in');
#         ax[i][j].set_title(emo_names[emotion_num])
        ax[i][j].text(.9,.93,emo_names[emotion_num],horizontalalignment='center', transform=ax[i][j].transAxes)
        ax[i][j].axvline(0, alpha=0.5, linestyle='--')
        ax[i][j].yaxis.grid(True)
        emotion_num += 1
plt.legend(handles=[red_patch, blue_patch, green_patch], loc='upper center', bbox_to_anchor=(0.5, -0.1))
fig.subplots_adjust(wspace=0)
In [ ]: