
Visualization of words used in House of the Dragon Season 2 and their “rarity”

This notebook assumes the steps in collect.ipynb have already been run.

The necessary data files are in the data folder:

  • hotd-s2-words.csv: processed word usage from the show's Season 2 scripts.

  • data4seo-word-trends-[90d,12m].json: keyword trend data retrieved with the DataForSEO API (90-day and 12-month windows).

The figure outputs are saved in the figures folder:

  • hotds2-rare-word-trends.svg: Rare word trends (90d)

  • hotds2-stacked-word-trends-colored-by-base_freq_quartile.svg: Individual word trends (coarser time resolution), colored and sorted by word rareness

  • hotds2-stacked-word-trends-colored-by-log10_ratio_quartile.svg: Individual word trends (coarser time resolution), colored and sorted by their relative usage in the show scripts

  • hotds2-bulk-agg-word-trends.svg: Bulk trends across time

  • hotds2-after-air-trends-vs-usage.svg: Aggregate trends after show air versus each word's usage metrics

Import packages & define paths

import os
import re
import json

import numpy as np
import pandas as pd
from scipy import signal, interpolate

import matplotlib.pyplot as plt
from matplotlib import rcParams
import matplotlib.dates as mdates
import seaborn as sns
# plot configs
rcParams['font.family'] = 'Overpass Nerd Font'
rcParams['font.size'] = 18
rcParams['axes.titlesize'] = 20
rcParams['axes.labelsize'] = 18
rcParams['axes.linewidth'] = 1.5
rcParams['lines.linewidth'] = 1.5
rcParams['lines.markersize'] = 20
rcParams['patch.linewidth'] = 1.5
rcParams['xtick.labelsize'] = 18
rcParams['ytick.labelsize'] = 18
rcParams['xtick.major.width'] = 2
rcParams['xtick.minor.width'] = 2
rcParams['ytick.major.width'] = 2
rcParams['ytick.minor.width'] = 2
rcParams['savefig.dpi'] = 300
rcParams['savefig.transparent'] = False
rcParams['savefig.facecolor'] = 'white'
rcParams['savefig.format'] = 'svg'
rcParams['savefig.pad_inches'] = 0.5
rcParams['savefig.bbox'] = 'tight'
# data file for word data frame
word_file_path = 'data/hotd-s2-words.csv'
# data file for dataforseo trend queries
trend_90d_file_path = 'data/data4seo-word-trends-90d.json'
trend_12m_file_path = 'data/data4seo-word-trends-12m.json'
# output figure directory
fig_dir = 'figures'
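Since the notebook assumes collect.ipynb has already been run, a quick optional check can fail fast if any input file is missing and make sure the figure directory exists:

# optional sanity check using the paths defined above
for p in [word_file_path, trend_90d_file_path, trend_12m_file_path]:
    assert os.path.exists(p), f'missing input: {p} -- run collect.ipynb first'
os.makedirs(fig_dir, exist_ok=True)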

Load word data

word_df = (
    pd.read_csv(word_file_path)
    .query('is_selected')
    .reset_index(drop=True)
)

word_df['air_date'] = pd.to_datetime(word_df['air_date'])
num_episodes = word_df['episode'].nunique()
word_df
word_first_appearance = (
    word_df.sort_values(['episode'])
    .groupby('word').head(1)
    .reset_index(drop=True)
)

word_first_appearance
show_start_date = word_df['air_date'].min()
show_end_date = word_df['air_date'].max()

show_start_date, show_end_date
(Timestamp('2024-06-16 00:00:00'), Timestamp('2024-08-04 00:00:00'))

Define time ranges

# summer start date
summer_startdate = pd.to_datetime('2024-06-01')

# max date of interest (relevant for 90d)
max_date = pd.to_datetime('2024-08-15')

# shift back a week for some leeway (relevant for 12m)
base_date = show_start_date - pd.Timedelta(7, unit='day')
def process_trends(
    trend_file,
    base_date=None,
):

    # load raw data
    with open(trend_file, 'r') as f:
        raw_data = json.load(f)

    assert all([len(x['result']) == 1 for x in raw_data])

    # get keyword trends data
    trend_df = []

    for rd in raw_data:
        assert len(rd['result']) == 1
        trends = [
            x for x in rd['result'][0]['items']
            if x['type'] == 'dataforseo_trends_graph'
        ]
        assert len(trends) == 1
        trends = trends[0]
        trends_words = trends['keywords']

        trends_data = pd.DataFrame(trends['data'])
        trends_data['word'] = [trends_words] * len(trends_data)
        trends_data = (
            trends_data.explode(['word', 'values'])
            .rename(columns={'values': 'value'})
            .astype({'value':'float'})
            .reset_index(drop=True)
        )
        trend_df.append(trends_data)

    trend_df = pd.concat(trend_df, ignore_index=True)

    # process dates
    trend_df['date_from'] = pd.to_datetime(trend_df['date_from'])
    trend_df['date_to'] = pd.to_datetime(trend_df['date_to'])
    trend_df['num_days'] = (trend_df['date_to'] - trend_df['date_from']).dt.days
    trend_df['date_center'] = (
        trend_df['date_from'] + 
        pd.to_timedelta(np.round(trend_df['num_days'] / 2), unit='d')
    )

    # stats for norm
    min_max_df = (
        trend_df.groupby('word')
        ['value'].agg(['min', 'max'])
        .add_suffix('_value')
        .astype('float')
        .reset_index()
    )

    # other stats with optionally from base
    stat_df_base_opt = (
        (
            trend_df.query('date_to < @base_date')
            if base_date is not None
            else trend_df
        )
        .groupby('word')
        ['value'].agg(
            mean = 'mean',
            std = 'std',
            median = 'median',
            iqr = lambda x: np.subtract(*np.percentile(x, [75, 25]))
        )
        .add_suffix('_value')
        .astype('float')
        .reset_index()
    )

    # merge stats
    trend_df = (
        trend_df
        .merge(
            min_max_df,
            how='left'
        )
        .merge(
            stat_df_base_opt,
            how='left'
        )
        .query('max_value > 0')
        .reset_index(drop=True)
    )

    # min-max normalize
    trend_df['mm_value'] = (
        (trend_df['value'] - trend_df['min_value']) / 
        (trend_df['max_value'] - trend_df['min_value'])
    )

    # zscore scaling
    trend_df['z_value'] = (
        (trend_df['value'] - trend_df['mean_value']) / 
        (trend_df['std_value'])
    )
    
    # robust scaling
    trend_df['r_value'] = (
        (trend_df['value'] - trend_df['median_value']) / 
        (trend_df['iqr_value'])
    )
    
    trend_df = trend_df.drop(columns=[
        'min_value', 'max_value',
        'mean_value', 'std_value',
        'median_value', 'iqr_value'
    ])
    
    return trend_df
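For reference, here is a minimal synthetic sketch of the nested response structure that process_trends expects, inferred from the fields it accesses above; the keywords and values are made up purely for illustration:

# minimal synthetic example of the DataForSEO trends response shape parsed above
# (field names follow the accesses in process_trends; values are made up)
import json
import tempfile

fake_response = [{
    'result': [{
        'items': [{
            'type': 'dataforseo_trends_graph',
            'keywords': ['usurper', 'treachery'],
            'data': [
                {'date_from': '2024-06-16', 'date_to': '2024-06-16', 'values': [10, 0]},
                {'date_from': '2024-06-17', 'date_to': '2024-06-17', 'values': [42, 7]},
                {'date_from': '2024-06-18', 'date_to': '2024-06-18', 'values': [18, 3]},
            ],
        }],
    }],
}]

with tempfile.NamedTemporaryFile('w', suffix='.json', delete=False) as f:
    json.dump(fake_response, f)

# yields one row per (word, date) with mm_value / z_value / r_value columns added
process_trends(f.name).head()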
# note: don't need zscore for this one
trend_df = process_trends(trend_90d_file_path)
trend_df

Select & process dates

# select date range and convert date to summer / week days
trend_df = (
    trend_df
    .assign(date = pd.to_datetime(trend_df['date_from']))
    .query('date <= @max_date')
    .reset_index(drop=True)
)

# within the selected 90-day range each sample should span a single day;
# if that ever changes, this assert can be removed
assert all(trend_df['date_from'] == trend_df['date_to'])

# keep only the needed columns, add summer-day / week-day indices,
# and use the min-max normalized values (mm_value) from here on
trend_df = (
    trend_df
    .filter(['word', 'date', 'mm_value'])
    .assign(
        summer_day = (trend_df['date'] - summer_startdate).dt.days + 1,
        week_day = trend_df['date'].dt.strftime('%w').astype('int') + 1
    )
    .rename(columns={'mm_value': 'value'})
    .sort_values(['word', 'date'])
    .fillna({'value': 0})
    .reset_index(drop=True)
)

day_vec = np.arange(trend_df['summer_day'].min(), trend_df['summer_day'].max()+1)

# peak detection below assumes consecutive daily samples
assert all(1 == (
    trend_df
    .groupby('word')
    ['date'].diff()
    .dropna()
    .dt.days
))

trend_df

Detect & process peaks

peak_kws = dict(
    prominence=0.2, 
    height=0.3
)
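A toy illustration (made-up values) of how these find_peaks thresholds behave: the small early bump fails the height threshold, the shoulder next to the main peak fails the prominence threshold, and only the main peak is kept.

# toy series: index 2 (0.25) fails height >= 0.3, index 8 (0.5) fails
# prominence >= 0.2 (it sits on the main peak's shoulder), so only index 6 remains
toy = np.array([0.0, 0.1, 0.25, 0.1, 0.0, 0.4, 0.9, 0.35, 0.5, 0.1, 0.0])
peaks, props = signal.find_peaks(toy, **peak_kws)
peaks, props['peak_heights'], props['prominences']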
def binarize_locations(locs, length):
    x = np.full(length, False)
    x[locs] = True
    return x

# find peaks
trend_df['is_peak'] = (
    trend_df.groupby('word')
    ['value'].apply(
        lambda x: binarize_locations(
            locs = signal.find_peaks(np.array(x), **peak_kws)[0],
            length = len(x)
        )
    )
    .explode()
    .astype('bool')
    .values
)

trend_df

Merge with word_df for air dates

peak_df = (
    trend_df.query('is_peak')
    .merge(
        word_first_appearance,
        how='left'
    )
    .rename(columns={
        'air_date': 'first_air_date'
    })
)
assert peak_df.isna().sum().sum() == 0

peak_df['first_air_summer_day'] = (peak_df['first_air_date'] - summer_startdate).dt.days + 1
peak_df['peak_delay'] = (peak_df['date'] - peak_df['first_air_date']).dt.days

# ratio of days remaining after vs. before the word's first on-air appearance;
# used below as the baseline the after/before peak-count ratio is compared against
peak_df['appear_after_before_ratio'] = (
    (day_vec.max() - peak_df['first_air_summer_day']) /
    (peak_df['first_air_summer_day'] - day_vec.min())
)

# tag each peak with the episode of the word's first appearance,
# or -1 if the peak occurred before that first appearance
peak_df['first_episode'] = peak_df.apply(
    lambda x: x['episode'] if x['peak_delay'] >= 0 else -1,
    axis=1
)
# based on first episode appearances
word_order = list(
    peak_df
    .query('peak_delay >= 0')
    .sort_values(['episode', 'peak_delay'])
    ['word'].unique()
)

word_order.extend(list(set(peak_df['word']) - set(word_order)))

Select words that may be influenced by the show

peak_count_df = (
    (
        peak_df.set_index([
            'word', 'appear_after_before_ratio',
        ])['peak_delay'] >= 0
    )
    .reset_index()
    .value_counts()
    .reset_index()
    .replace({'peak_delay': {True: 'after_air', False: 'before_air'}})
    .pivot(
        index=['word', 'appear_after_before_ratio'],
        columns='peak_delay',
        values='count'
    )
    .fillna(0).astype(int)
    .reset_index()
)
peak_count_df.columns.name = None

peak_count_df['total'] = peak_count_df['after_air'] + peak_count_df['before_air'] 
peak_count_df['trend_after_before_ratio'] = peak_count_df['after_air'] / peak_count_df['before_air'] 

# peak_count_df['trend_after_air_ratio'] = peak_count_df['after_air'] / peak_count_df['total'] 

peak_count_df
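# selection rule used below: keep a word if all of its peaks fall after air,
# or if its after/before peak-count ratio exceeds thres_ratio times the ratio
# expected from the window lengths alone (appear_after_before_ratio), then
# restrict to the highly-used & rare selection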
thres_ratio = 2

selected_high_rare = word_df.query('is_high_rare_selected')['word'].unique()

selected_word_df = (
    peak_count_df
    .query(
        '(after_air > 0 and before_air == 0) or '
        '(trend_after_before_ratio > @thres_ratio * appear_after_before_ratio)'
    )
    .sort_values(
        ['trend_after_before_ratio', 'after_air', 'before_air'],
        ascending=[False, False, True]
    )
    .query('word in @selected_high_rare')
    .reset_index(drop=True)
)

selected_words = selected_word_df['word'].to_list()
selected_word_df

Visualize words that are possibly influenced by the show

selected_word_order = [
    x for x in word_order if x in selected_words
][::-1]

selected_trends = (
    trend_df
    .query('word in @selected_words')
    .reset_index(drop=True)
)

selected_trends['word_idx'] = selected_trends['word'].map({
    x: i for i, x in enumerate(selected_word_order)
})
    
selected_trends['value'] = selected_trends['value']*0.8 + selected_trends['word_idx'] 
plt.figure(figsize=(19,10))

first_ep_cmap = dict(zip(
    [-1] + list(range(1,num_episodes+1,1)),
    ['#afafaf'] + sns.color_palette('Paired', num_episodes, desat=0.8)
))

# plot peak markers (ticks colored by first episode)
sns.stripplot(
    peak_df.query('word in @selected_words'),
    x='date',
    y='word',
    hue='first_episode',
    order=selected_word_order,
    orient='y',
    hue_order=first_ep_cmap.keys(),
    palette=first_ep_cmap,
    marker=2,
    s=15,
    linewidth=3,
    jitter=0,
    zorder=3,
)

# overlay (shifted) trend series
sns.lineplot(
    selected_trends,
    x='date',
    y='value',
    units='word',
    estimator=None,
    lw=1,
    c='.2',
    zorder=2,
)

# when word appears
sns.scatterplot(
    word_df.query('word in @selected_words'),
    x='air_date',
    y='word',
    c='k',
    zorder=3,
    marker='.',
    edgecolor='none',
    s=60,
)

# when episode airs
[
    plt.axvline(x=x, color='.5', ls='-', lw=0.5, zorder=1)
    for x in word_df['air_date'].drop_duplicates()
]

# put episode names
[
    plt.text(
        x=r['air_date'] + pd.Timedelta('12h'),
        y=len(selected_words) + 0.25, 
        s='EP. %d'%(r['episode']),
        fontstyle='italic',
        fontsize=15
    )
    for _, r in word_df[['episode','air_date',]].drop_duplicates().iterrows()
]

plt.text(
    x=show_start_date - pd.Timedelta('12h'),
    y=-1,
    s=show_start_date.strftime('Premiere %b %d, %Y'), 
    fontstyle='italic',
    fontsize=15,
    ha='right'
)

# place word labels right next to their trends (the y-axis itself is removed)
[
    plt.text(
        x=trend_df['date'].min() - pd.Timedelta('1D'),
        y=w,
        s=w,
        fontstyle='italic',
        ha='right',
        fontsize=16
    )
    for w in selected_words
]

# legends
plt.legend(
    handles=[
        plt.Line2D(
            [0],[0],
            label='Word appears in episode',
            color='k',
            marker='.',
            ls='none',
            markersize=10
        )
    ] + [
        plt.Line2D(
            [0],[0],
            label=(
                'Trend peak AWFA in EP. %d' %(k) if k > 0 
                else 'Trend peak before S2'
            ),
            color=v,
            marker='|',
            ls='none',
            markersize=10,
            markeredgewidth=3
        )
        for k, v in first_ep_cmap.items()
    ] + [
        plt.Line2D(
            [0], [0],
            label = "(AWFA: after word's 1$^{st}$ appear.)",
            color = 'none',
            ls='none',
        )
    ],
    bbox_to_anchor = (0.95,1),
    frameon=False,
    fontsize=18,
)


plt.gca().invert_yaxis()
plt.yticks([])
plt.xlabel(None)
plt.ylabel(None)
sns.despine(trim=True, offset=10, left=True)

plt.title(
    'Highly used rare words in House of the Dragon Season 2, ' \
    'with high search trends (possibly due to it)', 
    y=1.02
)

plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%b %d'))

plt.tight_layout()

plt.savefig('figures/hotds2-rare-word-trends.png')
plt.savefig('figures/hotds2-rare-word-trends.svg')
plt.show()
<Figure size 1368x720 with 1 Axes>
def interpolate_trends(
    trend_df,
    freq='7d',
    xcol='date_from',
    interp1d_kws=dict()
):
    date_vec = pd.date_range(
        trend_df['date_from'].min(),
        trend_df['date_to'].max(),
        freq=freq
    )

    trend_df = (
        trend_df
        .groupby(['word'])
        .apply(
            lambda x: pd.DataFrame({
                k: interpolate.interp1d(
                    x=(x[xcol] - date_vec.min()).dt.days.to_numpy(),
                    y=x[k],
                    **interp1d_kws
                )(
                    (date_vec - date_vec.min()).days   
                )
                for k in trend_df.filter(regex='.*value.*').columns
            }).assign(date = date_vec),
            include_groups=False
        )
        .reset_index()
    )

    trend_df = trend_df.drop(
        columns=trend_df.filter(regex=r'level_\d+').columns
    )
    
    return trend_df
# need to normalize by the base period (i.e. before the show)
# in order to plot the comparisons
trend_df = process_trends(
    trend_12m_file_path,
    base_date=base_date,
)

trend_df
# interpolate so that taking averages / medians
# in `sns.relplot/lineplot` is valid
# to avoid cases with sparse time points
trend_df = interpolate_trends(
    trend_df,
    freq='7d',
    xcol='date_from',
    interp1d_kws=dict(
        kind='slinear',
        fill_value='extrapolate'
    )
)
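A quick toy check (made-up points, independent of the real data) of what this resampling does: three samples are placed onto a regular 7-day grid with slinear interpolation, extrapolating at the ends.

# toy check of interpolate_trends on three made-up samples
toy = pd.DataFrame({
    'word': 'example',
    'date_from': pd.to_datetime(['2024-01-03', '2024-01-14', '2024-01-20']),
    'mm_value': [0.0, 1.0, 0.5],
})
toy['date_to'] = toy['date_from']

interpolate_trends(
    toy,
    freq='7d',
    interp1d_kws=dict(kind='slinear', fill_value='extrapolate'),
)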

Merge with word metadata

word_meta_df = (
    word_df
    .sort_values('log10_ratio')
    .groupby('word').tail(1)
    .filter(regex='word|freq|ratio|is_.*_selected')
)

assert (
    word_meta_df
    .filter(regex='is_.*')
    .any(axis=1)
    .all()
)

word_meta_df = (
    word_meta_df
    .melt(
        id_vars=['word', 'script_freq', 'base_freq', 'log10_ratio'],
        var_name='category'
    )
    .query('value')
    .drop(columns='value')
    .groupby(['word', 'category'])
    .agg('median')
    .reset_index()
)
word_meta_df['category'] = (
    word_meta_df['category']
    .str.replace('is_', '')
    .str.replace('_selected', '')
    .map({
        'other': 'other',
        'hotd': 'show-specific',
        'high_rare': 'highly used & rare',
    })
)
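# bin each usage metric into quartiles (Q1 = lowest quartile, so Q1 of
# base_freq corresponds to the rarest words); these labels set the colors
# of the stacked-trend figures below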
for k in ['script_freq', 'base_freq', 'log10_ratio']:
    word_meta_df[k + '_quartile'] = pd.qcut(
        word_meta_df[k],
        4,
        labels=['Q' + str(x+1) for x in range(4)],
        precision=10
    )
word_meta_df
trend_df = trend_df.merge(
    word_meta_df,    
    how='left',
    on='word'
)

trend_df

Prepare for visualization

viz_start_date = pd.Timestamp('2024-02-01')
viz_stop_date = pd.Timestamp('2024-09-07')
cat_cmap = {
    'show-specific': '#b2182b',
    'highly used & rare': '#2166ac',
    'other': '#969696',
}
trend_df = trend_df.merge(
    trend_df
    .groupby(['category', 'word'])
    ['z_value'].max()
    .reset_index()
    .set_index('word')
    .groupby('category')
    .rank(method='dense')
    .rename(columns={'z_value': 'rank_z'})
    .reset_index()
)
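# within each category, dense-rank words by each usage metric (ascending only
# for base_freq); the ranks are used below to vertically offset the stacked traces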
for k in ['script_freq', 'base_freq', 'log10_ratio']:
    trend_df['rank_' + k] = (
        trend_df
        .groupby('category')
        [k].rank(method='dense', ascending='base' in k)
    )
usage_order = ['Word rareness','Relative usage']
coarse_cat_cmap = {
    'show-specific': '#b2182b',
    'non show-specific': '#969696',
}
agg_show_df = (
    trend_df.query('date >= @show_start_date')
    .groupby(['word', 'category', 'base_freq', 'log10_ratio'])
    ['z_value'].agg('median')
    .reset_index()
    .rename(columns={'log10_ratio': 'Relative usage'})
)

# rareness := -log10(base_freq), so rarer (lower base-frequency) words get larger values
agg_show_df['Word rareness'] = -np.log10(agg_show_df.pop('base_freq'))

agg_show_df['category'] = agg_show_df['category'].apply(
    lambda x: 'non show-specific' if 'show' not in x else x
)   

agg_show_df = agg_show_df.melt(
    id_vars=['word', 'category', 'z_value'],
    var_name='word_metric',
)

agg_show_df
for hue_col, cmap in zip(
    ['base_freq_quartile','log10_ratio_quartile'], 
    ['Greys_r', 'Greys']
):
    rank_vec = trend_df['rank_' + hue_col.replace('_quartile', '')]

    trend_df['shifted_z_value'] = (
        trend_df['z_value'] * 0.8 
        - rank_vec
    )
    
    g = sns.relplot(
        trend_df,
        x='date',
        # y='z_value',
        y='shifted_z_value',
        hue=hue_col, 
        palette=cmap,
        units='word',estimator=None,
        row_order=cat_cmap.keys(),
        row='category',
        kind='line',
        n_boot=10,
        zorder=2,
        height=3,
        aspect=2,
        alpha=0.8,
    )

    [
        g.refline(x=x, color='k', ls=':', lw=0.5, zorder=1)
        for x in word_df['air_date'].drop_duplicates()
    ]
    plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%b'))

    g.set_titles('')
    g.set_xlabels('')
    g.set_ylabels('')

    plt.xlim([viz_start_date,viz_stop_date])

    for i, (k, ax) in enumerate(g.axes_dict.items()):
        hide_xaxis = i < (len(g.axes) - 1)
        sns.despine(
            trim=True, offset=5,
            left=True,
            bottom = hide_xaxis, 
            ax=ax
        )
        ax.label_outer(True)
        ax.set_yticks([])
        ax.text(
            x=viz_start_date + pd.Timedelta('5d'),
            y=20,
            s=k + ' words',
            fontstyle='italic',
            fontsize=18
        )

    g.legend.get_title().set_fontsize(15)
    g.legend.get_title().set_text(hue_col.replace('_', ' ').title())
    [l.set_linewidth(10) for l in g.legend.get_lines()]
    [t.set_fontsize(15) for t in g.legend.texts]

    g.tight_layout(h_pad=-2)
    
    plt.savefig(f'figures/hotds2-stacked-word-trends-colored-by-{hue_col}.png')
    plt.savefig(f'figures/hotds2-stacked-word-trends-colored-by-{hue_col}.svg')
<Figure size 524.375x648 with 3 Axes>
<Figure size 530.125x648 with 3 Axes>
plt.figure(figsize=(10,8))

for i, est_fn in enumerate([
    np.nanmedian, np.nanmean
]):

    plt.subplot(2,1,i+1)
    sns.lineplot(
        trend_df,
        x='date',
        y='z_value',
        hue='category',
        palette=cat_cmap,
        errorbar=('ci', 95),
        estimator=est_fn,
        marker='o',
        markersize=10,
        markeredgecolor='none',
        n_boot=50,
        zorder=2,
        lw=2,
        legend=False,
        err_kws={'alpha':0.1}
    )
    
    [
        plt.axvline(x=x, color='k', ls=':', lw=0.5, zorder=1)
        for x in word_df['air_date'].drop_duplicates()
    ]

    plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%b %d'))
    plt.xlim([viz_start_date,viz_stop_date])
    
    plt.xlabel(None)
    ylim_vec = plt.gca().get_ylim()
    
    
    if i == 0:
        plt.title('Trends of words used in Season 2')
        plt.xticks([])
    else:
        [
            plt.text(
                x=viz_start_date + pd.Timedelta('30d'),
                y=max(ylim_vec) - 0.85*j - 2,
                s=cat + ' words',
                c=col,
                fontsize=18,
                fontstyle='italic'
            ) for j, (cat, col) in enumerate(cat_cmap.items())
        ]
    
        
    plt.ylabel(f'{est_fn.__name__.replace("nan","")} of z-scored trends')
    sns.despine(trim=True, offset=5, ax=plt.gca(), bottom=i==0)
    
plt.tight_layout()

plt.savefig('figures/hotds2-bulk-agg-word-trends.png')
plt.savefig('figures/hotds2-bulk-agg-word-trends.svg')

plt.show()
<Figure size 720x576 with 2 Axes>

Visualize aggregate trends after show air

g = sns.lmplot(
    agg_show_df,
    x='value',
    y='z_value',
    hue='category',
    col='word_metric',
    col_order=usage_order,
    palette=coarse_cat_cmap,
    hue_order=coarse_cat_cmap.keys(),
    markers='.',
    scatter_kws={'s': 30, 'alpha': 0.6},
    aspect=0.8,
    height=5,
    legend=False,
    facet_kws={'sharex':False},
)

plt.ylim([-3, 8])

g.set_titles('')
g.set_ylabels('median z-scored trends')

[
    g.axes[0,0].text(
        x=3, y=8-i, s=cat + ' words', c=col,
        fontsize=18, fontstyle='italic', va='top'
    ) for i, (cat, col) in enumerate(coarse_cat_cmap.items())
]

[
    ax.set_xlabel(k) for k, ax in g.axes_dict.items()
]

g.fig.suptitle(
    'Relationship between word usage & searches after show air',
    fontsize=18, y=1, x=0.52,
)
sns.despine(trim=True, offset=5)
g.tight_layout(w_pad=1)


plt.savefig('figures/hotds2-after-air-trends-vs-usage.png')
plt.savefig('figures/hotds2-after-air-trends-vs-usage.svg')

plt.show()
<Figure size 576x360 with 2 Axes>