Visualize clusters of the weird noise#

1. Import modules#

import numpy as np
import pandas as pd
import warnings
import gc

import librosa
import librosa.display
from IPython.display import Audio

import scipy as sp 
import scipy.io as scio
from scipy import signal

from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.manifold import MDS

from tqdm.notebook import tqdm
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import rcParams

rcParams['font.family'] = 'Overpass Nerd Font'
rcParams['font.size'] = 18
rcParams['axes.titlesize'] = 20
rcParams['axes.labelsize'] = 18
rcParams['axes.linewidth'] = 1.5
rcParams['lines.linewidth'] = 1.5
rcParams['lines.markersize'] = 20
rcParams['patch.linewidth'] = 1.5
rcParams['xtick.labelsize'] = 18
rcParams['ytick.labelsize'] = 18
rcParams['xtick.major.width'] = 2
rcParams['xtick.minor.width'] = 2
rcParams['ytick.major.width'] = 2
rcParams['ytick.minor.width'] = 2
rcParams['savefig.dpi'] = 300
rcParams['savefig.transparent'] = False
rcParams['savefig.facecolor'] = 'white'
rcParams['savefig.format'] = 'svg'
rcParams['savefig.pad_inches'] = 0.5
rcParams['savefig.bbox'] = 'tight'

2. Import data#

wav_file = 'data/recording.wav'
Fs, data = scio.wavfile.read(wav_file)
Audio(data=data, rate=Fs)
t_range = [5, 115] # limit time after recording and before ending

y = data.astype(float)
t = np.arange(len(data)) / Fs

select_loc = (t >= t_range[0]) & (t <= t_range[1])
t, y = t[select_loc], y[select_loc]
t = t - t[0]
N = len(y)
y = (y - np.mean(y)) / np.std(y)
S_db = librosa.amplitude_to_db(
    np.abs(librosa.stft(y)),
    ref=np.max
)

def _plt_spectrum():
    librosa.display.specshow(
        S_db,
        sr=Fs, 
        y_axis='linear',
        x_axis='s',
        ax=plt.gca()
    )
    plt.ylim([500, 12000])
    ttl_add = ''
    plt.title('Frequency spectrum' + ttl_add)

plt.figure(figsize=(18,6))

_plt_spectrum()
plt.savefig('figures/spectrum-full.svg')

plt.figure(figsize=(10,6))
_plt_spectrum()

for t_range in [
    [21, 23],
    [81, 83],
    [93, 95]
]:
    t0, tf = t_range
    plt.xlim(t_range)
    plt.title('Frequency spectrum' + ' (selected time)')
    plt.xlabel('Time (second)')
    plt.savefig(f'figures/spectrum-selected_{t0}-{tf}.svg')
_images/d0d288f371edcd9e3dfff0c0f321f10082d0aac766e6ae2a4fc325b88a9babc6.png _images/5dd828a9801739bf658556958f6904e16b336677baccf406d11c789f54b60fff.png
gc.collect()
6656

3. Construct features#

win_anly_sec = 0.8
win_step_sec = 0.2

freq_range = [800, 16000]

win_anly_pnt = int(win_anly_sec * Fs)
win_step_pnt = int(win_step_sec * Fs)

Pyy_collection = []

for ind_start in range(0, len(y)-win_anly_pnt, win_step_pnt):
    ind_end = ind_start + win_anly_pnt
    y_win = y[ind_start:ind_end]
    f, Pyy = signal.periodogram(y_win, Fs)
    loc_f = (f >= freq_range[0]) & (f <= freq_range[1])
    f = f[loc_f]
    Pyy = Pyy[loc_f]
    Pyy_collection.append(dict(
        t0 = ind_start / Fs,
        f = f,
        Pyy = Pyy
    ))

Pyy_df = pd.DataFrame(Pyy_collection)
num_bands = 200

freq_bands = np.logspace(
    np.log10(freq_range[0]-1e-5), 
    np.log10(freq_range[1]+1e-5), 
    num_bands+1
)

freq_feat_df = []

for Pyy_dict in Pyy_collection:
    py_df = pd.DataFrame(Pyy_dict).assign(
        log_P = lambda x: np.log10(x['Pyy']),
        band = lambda x: np.digitize(x['f'], bins=freq_bands)
    )
    
    freq_feat_df.append(dict(
        t0 = Pyy_dict['t0'],
        band_index = range(num_bands),
        X = py_df.groupby('band').log_P.agg('mean').to_numpy(),  
        # X = py_df.groupby('band').Pyy.agg('median').to_numpy()     
    ))
    
freq_feat_df = pd.DataFrame(freq_feat_df)

4. Clustering#

cluster_num_vec = range(2, 10)

X = np.vstack(freq_feat_df.X.to_list())
cluster_metrics = []
for k in tqdm(cluster_num_vec):
    clusterer = KMeans(n_clusters=k, n_init=20, max_iter=1000).fit(X)
    out_labels = clusterer.labels_
    out_inertia = clusterer.inertia_
    out_silhouette = metrics.silhouette_score(X, out_labels, metric='euclidean')
    cluster_metrics.append(dict(
        k = k, 
        labels = out_labels,
        inertia = out_inertia,
        silhouette = out_silhouette
    ))
    
cluster_metrics = pd.DataFrame(cluster_metrics)
cluster_metrics['num_clusters'] = cluster_metrics['k']
sns.relplot(
    data = cluster_metrics.melt(
        id_vars='num_clusters',
        value_vars=['inertia', 'silhouette'],
        var_name='metric'
    ),
    x = 'num_clusters',
    y = 'value',
    kind = 'line',
    color='k',
    marker='o',
    markeredgecolor='none',
    markersize=15,
    lw=3,
    row = 'metric',
    facet_kws = {'sharey': False},
    height=4,
    aspect=2
)
sns.despine(trim=True, offset=10)
plt.savefig('figures/kmeans-metrics.svg')
plt.show()
_images/ce79231d5269c5f5fa23881e58104aab4f0d2354c5847babecb95f235cf7cb02.png
chosen_k = 4
freq_feat_df['cluster'] = cluster_metrics.query('k == @chosen_k').labels.to_list()[0]
warnings.filterwarnings('ignore') 

plt.figure(figsize=(18,10))
plt.subplot2grid((4,1), (0,0), rowspan=3)
sns.lineplot(
    data=freq_feat_df.explode(['X', 'band_index']),
    x='band_index', 
    y='X',
    hue='cluster',
    palette='Dark2',
    lw=2,
    alpha=0.8,
    # ci='sd'
)

plt.title('Mean feature vector based on spectrum')
plt.xlabel('$X$')
plt.xlabel('band index')
plt.subplot2grid((4,1), (3,0))

sns.scatterplot(
    data=freq_feat_df, 
    x="t0",
    y="cluster",
    s=100,
    marker='|',
    hue='cluster',
    palette='Dark2',
    edgecolors=None,
    alpha=0.99,
    ci=None,
    legend=False
)
plt.ylim([-1, chosen_k+1])
plt.xlabel('time window t$_0$ (second)')
plt.title('K-means clustering')

sns.despine(trim=True, offset=10)
plt.tight_layout()

plt.savefig('figures/features-and-clusters.svg')
plt.show()
_images/36baa9fc1415bfc8d5d59b41acfb1e829a26e27fc1bbe8438cc7ffca28351bac.png

5. Embedding#

embedding = MDS(n_components=2, normalized_stress='auto')
X_proj = embedding.fit_transform(X)
plt.figure(figsize=(10,8))
sns.scatterplot(
    x = X_proj[:,0],
    y = X_proj[:,1],
    hue=freq_feat_df['cluster'],
    # hue=freq_feat_df['t0'],
    palette='Dark2',
    s=120,
    edgecolor='none',
    alpha=0.5,
    ci=None,
)

plt.title('MDS of spectrum features, colored by K-means clusters')
plt.xlabel('PC$_1$')
plt.ylabel('PC$_2$')
sns.despine(trim=True, offset=5)
plt.tight_layout()
plt.savefig('figures/mds-colored-by-kmeans.svg')

plt.show()
_images/b7462046f8d1d7ce65015778eb969f4ccd8fbe0314a2de71c75bb7b48a94161b.png