Visualization of words used in House of the Dragon Season 2 and their “rarity”
This notebook assumes the steps from collect.ipynb have already been run.
The necessary data files are in the data folder:
hotd-s2-words.csv: processed word usage from the show's S2 scripts.
data4seo-word-trends-[90d,12m].json: keyword trends data from the DataforSEO API.
The figure outputs are saved in the figures folder:
hotds2-rare-word-trends.svg: rare word trends (90d)
hotds2-stacked-word-trends-colored-by-base_freq_quartile.svg: individual word trends (coarser), colored and sorted by word rareness
hotds2-stacked-word-trends-colored-by-log10_ratio_quartile.svg: individual word trends (coarser), colored and sorted by their relative usage in the show scripts
hotds2-bulk-agg-word-trends.svg: bulk trends across time
hotds2-after-air-trends-vs-usage.svg: aggregate trends after the show airs vs. each word's usage metrics
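As a quick sanity check before running anything below (a minimal sketch; the paths mirror the ones defined under "Import packages & define paths"), verify that the inputs produced by collect.ipynb exist:

import os
input_paths = [
'data/hotd-s2-words.csv',
'data/data4seo-word-trends-90d.json',
'data/data4seo-word-trends-12m.json',
]
missing = [p for p in input_paths if not os.path.exists(p)]
assert not missing, f'missing inputs: {missing}'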
Import packages & define paths¶
Source
import os
import re
import json
import numpy as np
import pandas as pd
from scipy import signal, interpolate
import matplotlib.pyplot as plt
from matplotlib import rcParams
import matplotlib.dates as mdates
import seaborn as sns
Source
# plot configs
rcParams['font.family'] = 'Overpass Nerd Font'
rcParams['font.size'] = 18
rcParams['axes.titlesize'] = 20
rcParams['axes.labelsize'] = 18
rcParams['axes.linewidth'] = 1.5
rcParams['lines.linewidth'] = 1.5
rcParams['lines.markersize'] = 20
rcParams['patch.linewidth'] = 1.5
rcParams['xtick.labelsize'] = 18
rcParams['ytick.labelsize'] = 18
rcParams['xtick.major.width'] = 2
rcParams['xtick.minor.width'] = 2
rcParams['ytick.major.width'] = 2
rcParams['ytick.minor.width'] = 2
rcParams['savefig.dpi'] = 300
rcParams['savefig.transparent'] = False
rcParams['savefig.facecolor'] = 'white'
rcParams['savefig.format'] = 'svg'
rcParams['savefig.pad_inches'] = 0.5
rcParams['savefig.bbox'] = 'tight'
# data file for word data frame
word_file_path = 'data/hotd-s2-words.csv'
# data file for dataforseo trend queries
trend_90d_file_path = 'data/data4seo-word-trends-90d.json'
trend_12m_file_path = 'data/data4seo-word-trends-12m.json'
# output figure directory
fig_dir = 'figures'
Load word data¶
word_df = (
pd.read_csv(word_file_path)
.query('is_selected')
.reset_index(drop=True)
)
word_df['air_date'] = pd.to_datetime(word_df['air_date'])
num_episodes = word_df['episode'].nunique()
word_df
Source
word_first_appearance = (
word_df.sort_values(['episode'])
.groupby('word').head(1)
.reset_index(drop=True)
)
word_first_appearance
show_start_date = word_df['air_date'].min()
show_end_date = word_df['air_date'].max()
show_start_date, show_end_date
(Timestamp('2024-06-16 00:00:00'), Timestamp('2024-08-04 00:00:00'))
Define time ranges¶
# summer start date
summer_startdate = pd.to_datetime('2024-06-01')
# max date of interest (relevant for 90d)
max_date = pd.to_datetime('2024-08-15')
# shift back a week for some leeway (relevant for 12m)
base_date = show_start_date - pd.Timedelta(7, unit='day')
Fine-grained 90-day trends for highly used rare words¶
Process trends data¶
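Before walking through process_trends below, here is a toy illustration (made-up numbers, not the real trend data) of the three per-word scalings it computes: min-max, z-score, and robust (median/IQR):

v = np.array([0., 1., 2., 3., 10.])  # toy series with one spike
mm = (v - v.min()) / (v.max() - v.min())  # min-max: bounded to [0, 1]
z = (v - v.mean()) / v.std(ddof=1)  # z-score: the spike inflates the std
r = (v - np.median(v)) / np.subtract(*np.percentile(v, [75, 25]))  # robust: far less sensitive to the spike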
Source
def process_trends(
trend_file,
base_date=None,
):
# load raw data
with open(trend_file, 'r') as f:
raw_data = json.load(f)
assert all([len(x['result']) == 1 for x in raw_data])
# get keyword trends data
trend_df = []
for rd in raw_data:
assert len(rd['result']) == 1
trends = [
x for x in rd['result'][0]['items']
if x['type'] == 'dataforseo_trends_graph'
]
assert len(trends) == 1
trends = trends[0]
trends_words = trends['keywords']
trends_data = pd.DataFrame(trends['data'])
trends_data['word'] = [trends_words] * len(trends_data)
trends_data = (
trends_data.explode(['word', 'values'])
.rename(columns={'values': 'value'})
.astype({'value':'float'})
.reset_index(drop=True)
)
trend_df.append(trends_data)
trend_df = pd.concat(trend_df, ignore_index=True)
# process dates
trend_df['date_from'] = pd.to_datetime(trend_df['date_from'])
trend_df['date_to'] = pd.to_datetime(trend_df['date_to'])
trend_df['num_days'] = (trend_df['date_to'] - trend_df['date_from']).dt.days
trend_df['date_center'] = (
trend_df['date_from'] +
pd.to_timedelta(np.round(trend_df['num_days'] / 2), unit='d')
)
# stats for norm
min_max_df = (
trend_df.groupby('word')
['value'].agg(['min', 'max'])
.add_suffix('_value')
.astype('float')
.reset_index()
)
# other stats with optionally from base
stat_df_base_opt = (
(
trend_df.query('date_to < @base_date')
if base_date is not None
else trend_df
)
.groupby('word')
['value'].agg(
mean = 'mean',
std = 'std',
median = 'median',
iqr = lambda x: np.subtract(*np.percentile(x, [75, 25]))
)
.add_suffix('_value')
.astype('float')
.reset_index()
)
# merge stats
trend_df = (
trend_df
.merge(
min_max_df,
how='left'
)
.merge(
stat_df_base_opt,
how='left'
)
.query('max_value > 0')
.reset_index(drop=True)
)
# min-max normalize
trend_df['mm_value'] = (
(trend_df['value'] - trend_df['min_value']) /
(trend_df['max_value'] - trend_df['min_value'])
)
# zscore scaling
trend_df['z_value'] = (
(trend_df['value'] - trend_df['mean_value']) /
(trend_df['std_value'])
)
# robust scaling
trend_df['r_value'] = (
(trend_df['value'] - trend_df['median_value']) /
(trend_df['iqr_value'])
)
trend_df = trend_df.drop(columns=[
'min_value', 'max_value',
'mean_value', 'std_value',
'median_value', 'iqr_value'
])
return trend_df
# note: don't need zscore for this one
trend_df = process_trends(trend_90d_file_path)
trend_df
Select & process dates¶
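A quick check (using the premiere date found above) of the "summer day" convention used in the next cell: days are counted from summer_startdate (2024-06-01), starting at 1:

(pd.Timestamp('2024-06-16') - summer_startdate).days + 1  # the S2 premiere falls on summer day 16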
Source
# select date range and convert date to summer / week days
trend_df = (
trend_df
.assign(date = pd.to_datetime(trend_df['date_from']))
.query('date <= @max_date')
.reset_index(drop=True)
)
# the selected date range should allow this
# if not, can remove
assert all(trend_df['date_from'] == trend_df['date_to'])
# filter only needed columns
# and add summer / week days if needed
# use `mm_value` aka min-max norm values onwards
trend_df = (
trend_df
.filter(['word', 'date', 'mm_value'])
.assign(
summer_day = (trend_df['date'] - summer_startdate).dt.days + 1,
week_day = trend_df['date'].dt.strftime('%w').astype('int') + 1
)
.rename(columns={'mm_value': 'value'})
.sort_values(['word', 'date'])
.fillna({'value': 0})
.reset_index(drop=True)
)
day_vec = np.arange(trend_df['summer_day'].min(), trend_df['summer_day'].max()+1)
# this is for peak detection
assert all(1 == (
trend_df
.groupby('word')
['date'].diff()
.dropna()
.dt.days
))
trend_df
Detect & process peaks¶
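Peaks are detected per word with scipy.signal.find_peaks on the min-max normalized values. A minimal sketch on a toy series (hypothetical numbers) of what the prominence and height thresholds in peak_kws do:

toy = np.array([0.0, 0.1, 0.6, 0.15, 0.25, 0.15, 0.1, 1.0, 0.2])
toy_peaks, toy_props = signal.find_peaks(toy, prominence=0.2, height=0.3)
# toy_peaks -> array([2, 7]); the small bump at index 4 is rejected
# (its height 0.25 < 0.3 and its prominence 0.1 < 0.2)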
peak_kws = dict(
prominence=0.2,
height=0.3
)
Source
def binarize_locations(locs, length):
x = np.full(length, False)
x[locs] = True
return x
# find peaks
trend_df['is_peak'] = (
trend_df.groupby('word')
['value'].apply(
lambda x: binarize_locations(
locs = signal.find_peaks(np.array(x), **peak_kws)[0],
length = len(x)
)
)
.explode()
.astype('bool')
.values
)
trend_df
Merge with word_df for air dates
Source
peak_df = (
trend_df.query('is_peak')
.merge(
word_first_appearance,
how='left'
)
.rename(columns={
'air_date': 'first_air_date'
})
)
assert peak_df.isna().sum().sum() == 0
peak_df['first_air_summer_day'] = (peak_df['first_air_date'] - summer_startdate).dt.days + 1
peak_df['peak_delay'] = (peak_df['date'] - peak_df['first_air_date']).dt.days
peak_df['appear_after_before_ratio'] = (
(day_vec.max() - peak_df['first_air_summer_day']) /
(peak_df['first_air_summer_day'] - day_vec.min())
)
peak_df['first_episode'] = peak_df.apply(
lambda x: x['episode'] if x['peak_delay'] >= 0 else -1,
axis=1
)
Source
# based on first episode appearances
word_order = list(
peak_df
.query('peak_delay >= 0')
.sort_values(['episode', 'peak_delay'])
['word'].unique()
)
word_order.extend(list(set(peak_df['word']) - set(word_order)))
Select words that may be influenced by the show¶
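The selection below keeps a word if all of its trend peaks fall after its first appearance, or if its after/before peak ratio beats thres_ratio times the ratio expected from the window split alone. A worked toy example with hypothetical numbers:

appear_after_before_ratio = 30 / 60  # hypothetical: 60 window days before first appearance, 30 after
after_air, before_air = 3, 1  # hypothetical peak counts
thres_ratio = 2
keep = ((before_air == 0 and after_air > 0) or
(after_air / before_air > thres_ratio * appear_after_before_ratio))
# 3 / 1 = 3.0 > 2 * 0.5 = 1.0, so this hypothetical word would be kept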
Source
peak_count_df = (
(
peak_df.set_index([
'word', 'appear_after_before_ratio',
])['peak_delay'] >= 0
)
.reset_index()
.value_counts()
.reset_index()
.replace({'peak_delay': {True: 'after_air', False: 'before_air'}})
.pivot(
index=['word', 'appear_after_before_ratio'],
columns='peak_delay',
values='count'
)
.fillna(0).astype(int)
.reset_index()
)
peak_count_df.columns.name = None
peak_count_df['total'] = peak_count_df['after_air'] + peak_count_df['before_air']
peak_count_df['trend_after_before_ratio'] = peak_count_df['after_air'] / peak_count_df['before_air']
# peak_count_df['trend_after_air_ratio'] = peak_count_df['after_air'] / peak_count_df['total']
peak_count_df
thres_ratio = 2
selected_high_rare = word_df.query('is_high_rare_selected')['word'].unique()
selected_word_df = (
peak_count_df
.query(
'(after_air > 0 and before_air == 0) or '
'(trend_after_before_ratio > @thres_ratio * appear_after_before_ratio)'
)
.sort_values(
['trend_after_before_ratio', 'after_air', 'before_air'],
ascending=[False, False, True]
)
.query('word in @selected_high_rare')
.reset_index(drop=True)
)
selected_words = selected_word_df['word'].to_list()
selected_word_df
Visualize words that are possibly influenced by the show¶
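The traces below are stacked ridge-style: each word's 0-1 normalized trend is compressed slightly (* 0.8) and offset by an integer row index, so many words can share a single axis without overlapping. A tiny sketch with fake values:

toy = pd.DataFrame({'word': ['a', 'a', 'b', 'b'], 'value': [0.0, 1.0, 0.5, 0.9]})
toy['word_idx'] = toy['word'].map({'a': 0, 'b': 1})
toy['value'] = toy['value'] * 0.8 + toy['word_idx']  # 'a' now spans 0-0.8, 'b' sits in the 1-1.8 band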
Source
selected_word_order = [
x for x in word_order if x in selected_words
][::-1]
selected_trends = (
trend_df
.query('word in @selected_words')
.reset_index(drop=True)
)
selected_trends['word_idx'] = selected_trends['word'].map({
x: i for i, x in enumerate(selected_word_order)
})
selected_trends['value'] = selected_trends['value']*0.8 + selected_trends['word_idx']
Source
plt.figure(figsize=(19,10))
first_ep_cmap = dict(zip(
[-1] + list(range(1,num_episodes+1,1)),
['#afafaf'] + sns.color_palette('Paired', num_episodes, desat=0.8)
))
# plot peak ~ markers
sns.stripplot(
peak_df.query('word in @selected_words'),
x='date',
y='word',
hue='first_episode',
order=selected_word_order,
orient='y',
hue_order=first_ep_cmap.keys(),
palette=first_ep_cmap,
marker=2,
s=15,
linewidth=3,
jitter=0,
zorder=3,
)
# overlay (shifted) trend series
sns.lineplot(
selected_trends,
x='date',
y='value',
units='word',
estimator=None,
lw=1,
c='.2',
zorder=2,
)
# when word appears
sns.scatterplot(
word_df.query('word in @selected_words'),
x='air_date',
y='word',
c='k',
zorder=3,
marker='.',
edgecolor='none',
s=60,
)
# when episode airs
[
plt.axvline(x=x, color='.5', ls='-', lw=0.5, zorder=1)
for x in word_df['air_date'].drop_duplicates()
]
# put episode names
[
plt.text(
x=r['air_date'] + pd.Timedelta('12h'),
y=len(selected_words) + 0.25,
s='EP. %d'%(r['episode']),
fontstyle='italic',
fontsize=15
)
for _, r in word_df[['episode','air_date',]].drop_duplicates().iterrows()
]
plt.text(
x=show_start_date - pd.Timedelta('12h'),
y=-1,
s=show_start_date.strftime('Premiere %b %d, %Y'),
fontstyle='italic',
fontsize=15,
ha='right'
)
# put words right next to trends (aka y-axis removed)
[
plt.text(
x=trend_df['date'].min() - pd.Timedelta('1D'),
y=w,
s=w,
fontstyle='italic',
ha='right',
fontsize=16
)
for w in selected_words
]
# legends
plt.legend(
handles=[
plt.Line2D(
[0],[0],
label='Word appears in episode',
color='k',
marker='.',
ls='none',
markersize=10
)
] + [
plt.Line2D(
[0],[0],
label=(
'Trend peak AWFA in EP. %d' %(k) if k > 0
else 'Trend peak before S2'
),
color=v,
marker='|',
ls='none',
markersize=10,
markeredgewidth=3
)
for k, v in first_ep_cmap.items()
] + [
plt.Line2D(
[0], [0],
label = "(AWFA: after word's 1$^{st}$ appear.)",
color = 'none',
ls='none',
)
],
bbox_to_anchor = (0.95,1),
frameon=False,
fontsize=18,
)
plt.gca().invert_yaxis()
plt.yticks([])
plt.xlabel(None)
plt.ylabel(None)
sns.despine(trim=True, offset=10, left=True)
plt.title(
'Highly used rare words in House of the Dragon Season 2, ' \
'with high search trends (possibly due to it)',
y=1.02
)
plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%b %d'))
plt.tight_layout()
plt.savefig('figures/hotds2-rare-word-trends.png')
plt.savefig('figures/hotds2-rare-word-trends.svg')
plt.show()

Coarse 12-month trends for different highly used words¶
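In this section the trends are z-scored against a pre-show baseline (base_date, one week before the premiere): the mean and std are computed only from dates before base_date, so post-premiere values read as standard deviations above each word's own pre-show level. A toy sketch with hypothetical weekly values:

pre_show = np.array([1., 2., 1., 2., 1.])  # hypothetical values before base_date
post_show = np.array([5., 7.])  # hypothetical values after the premiere
z_post = (post_show - pre_show.mean()) / pre_show.std(ddof=1)  # clearly elevated relative to the baseline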
Process trends data¶
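The interpolate_trends helper defined below resamples each word's irregularly spaced trend observations onto a regular weekly grid, so that the per-date averages taken later compare like with like. A minimal sketch of the same idea on made-up points:

obs_days = np.array([0, 9, 20, 35])  # irregular observation days
obs_vals = np.array([1.0, 3.0, 2.0, 4.0])
grid_days = np.arange(0, 36, 7)  # regular 7-day grid
weekly_vals = interpolate.interp1d(obs_days, obs_vals, kind='slinear', fill_value='extrapolate')(grid_days)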
Source
def interpolate_trends(
trend_df,
freq='7d',
xcol='date_from',
interp1d_kws=dict()
):
date_vec = pd.date_range(
trend_df['date_from'].min(),
trend_df['date_to'].max(),
freq=freq
)
trend_df = (
trend_df
.groupby(['word'])
.apply(
lambda x: pd.DataFrame({
k: interpolate.interp1d(
x=(x[xcol] - date_vec.min()).dt.days.to_numpy(),
y=x[k],
**interp1d_kws
)(
(date_vec - date_vec.min()).days
)
for k in trend_df.filter(regex='.*value.*').columns
}).assign(date = date_vec),
include_groups=False
)
.reset_index()
)
trend_df = trend_df.drop(
columns=trend_df.filter(regex=r'level_\d+').columns
)
return trend_df
# need to normalize by the base period (i.e. before the show)
# in order to plot the comparisons
trend_df = process_trends(
trend_12m_file_path,
base_date=base_date,
)
trend_df
# interpolate so that taking averages / medians
# in `sns.relplot/lineplot` is valid
# to avoid cases with sparse time points
trend_df = interpolate_trends(
trend_df,
freq='7d',
xcol='date_from',
interp1d_kws=dict(
kind='slinear',
fill_value='extrapolate'
)
)
Merge with word metadata¶
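A small reminder (toy values) of what pd.qcut does a few cells below: it splits a numeric column into four equal-sized bins labeled Q1-Q4, which are later used as hue levels:

pd.qcut(pd.Series([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]), 4, labels=['Q1', 'Q2', 'Q3', 'Q4'])  # two values per bin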
Source
word_meta_df = (
word_df
.sort_values('log10_ratio')
.groupby('word').tail(1)
.filter(regex='word|freq|ratio|is_.*_selected')
)
assert (
word_meta_df
.filter(regex='is_.*')
.any(axis=1)
.all()
)
word_meta_df = (
word_meta_df
.melt(
id_vars=['word', 'script_freq', 'base_freq', 'log10_ratio'],
var_name='category'
)
.query('value')
.drop(columns='value')
.groupby(['word', 'category'])
.agg('median')
.reset_index()
)
Source
word_meta_df['category'] = (
word_meta_df['category']
.str.replace('is_', '')
.str.replace('_selected', '')
.map({
'other': 'other',
'hotd': 'show-specific',
'high_rare': 'highly used & rare',
})
)
Source
for k in ['script_freq', 'base_freq', 'log10_ratio']:
word_meta_df[k + '_quartile'] = pd.qcut(
word_meta_df[k],
4,
labels=['Q' + str(x+1) for x in range(4)],
precision=10
)
word_meta_df
trend_df = trend_df.merge(
word_meta_df,
how='left',
on='word'
)
trend_df
Prepare for visualization¶
viz_start_date = pd.Timestamp('2024-02-01')
viz_stop_date = pd.Timestamp('2024-09-07')
Source
cat_cmap = {
'show-specific': '#b2182b',
'highly used & rare': '#2166ac',
'other': '#969696',
}
Source
trend_df = trend_df.merge(
trend_df
.groupby(['category', 'word'])
['z_value'].max()
.reset_index()
.set_index('word')
.groupby('category')
.rank(method='dense')
.rename(columns={'z_value': 'rank_z'})
.reset_index()
)
Source
for k in ['script_freq', 'base_freq', 'log10_ratio']:
trend_df['rank_' + k] = (
trend_df
.groupby('category')
[k].rank(method='dense', ascending='base' in k)
)
Source
usage_order = ['Word rareness','Relative usage']
coarse_cat_cmap = {
'show-specific': '#b2182b',
'non show-specific': '#969696',
}
Source
agg_show_df = (
trend_df.query('date >= @show_start_date')
.groupby(['word', 'category', 'base_freq', 'log10_ratio'])
['z_value'].agg('median')
.reset_index()
.rename(columns={'log10_ratio': 'Relative usage'})
)
agg_show_df['Word rareness'] = -np.log10(agg_show_df.pop('base_freq'))
agg_show_df['category'] = agg_show_df['category'].apply(
lambda x: 'non show-specific' if 'show' not in x else x
)
agg_show_df = agg_show_df.melt(
id_vars=['word', 'category', 'z_value'],
var_name='word_metric',
)
agg_show_df
Visualize individual trends¶
Source
for hue_col, cmap in zip(
['base_freq_quartile','log10_ratio_quartile'],
['Greys_r', 'Greys']
):
rank_vec = trend_df['rank_' + hue_col.replace('_quartile', '')]
trend_df['shifted_z_value'] = (
trend_df['z_value'] * 0.8
- rank_vec
)
g = sns.relplot(
trend_df,
x='date',
# y='z_value',
y='shifted_z_value',
hue=hue_col,
palette=cmap,
units='word', estimator=None,
row_order=cat_cmap.keys(),
row='category',
kind='line',
n_boot=10,
zorder=2,
height=3,
aspect=2,
alpha=0.8,
)
[
g.refline(x=x, color='k', ls=':', lw=0.5, zorder=1)
for x in word_df['air_date'].drop_duplicates()
]
plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%b'))
g.set_titles('')
g.set_xlabels('')
g.set_ylabels('')
plt.xlim([viz_start_date,viz_stop_date])
for i, (k, ax) in enumerate(g.axes_dict.items()):
hide_xaxis = i < (len(g.axes) - 1)
sns.despine(
trim=True, offset=5,
left=True,
bottom = hide_xaxis,
ax=ax
)
ax.label_outer(True)
ax.set_yticks([])
ax.text(
x=viz_start_date + pd.Timedelta('5d'),
y=20,
s=k + ' words',
fontstyle='italic',
fontsize=18
)
g.legend.get_title().set_fontsize(15)
g.legend.get_title().set_text(hue_col.replace('_', ' ').title())
[l.set_linewidth(10) for l in g.legend.get_lines()]
[t.set_fontsize(15) for t in g.legend.texts]
g.tight_layout(h_pad=-2)
plt.savefig(f'figures/hotds2-stacked-word-trends-colored-by-{hue_col}.png')
plt.savefig(f'figures/hotds2-stacked-word-trends-colored-by-{hue_col}.svg')


Visualize bulk trends¶
Source
plt.figure(figsize=(10,8))
for i, est_fn in enumerate([
np.nanmedian, np.nanmean
]):
plt.subplot(2,1,i+1)
sns.lineplot(
trend_df,
x='date',
y='z_value',
hue='category',
palette=cat_cmap,
errorbar=('ci', 95),
estimator=est_fn,
marker='o',
markersize=10,
markeredgecolor='none',
n_boot=50,
zorder=2,
lw=2,
legend=False,
err_kws={'alpha':0.1}
)
[
plt.axvline(x=x, color='k', ls=':', lw=0.5, zorder=1)
for x in word_df['air_date'].drop_duplicates()
]
plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%b %d'))
plt.xlim([viz_start_date,viz_stop_date])
plt.xlabel(None)
ylim_vec = plt.gca().get_ylim()
if i == 0:
plt.title('Trends of words used in Season 2')
plt.xticks([])
else:
[
plt.text(
x=viz_start_date + pd.Timedelta('30d'),
y=max(ylim_vec) - 0.85*j - 2,
s=cat + ' words',
c=col,
fontsize=18,
fontstyle='italic'
) for j, (cat, col) in enumerate(cat_cmap.items())
]
plt.ylabel(f'{est_fn.__name__.replace("nan","")} of z-scored trends')
sns.despine(trim=True, offset=5, ax=plt.gca(), bottom=i==0)
plt.tight_layout()
plt.savefig('figures/hotds2-bulk-agg-word-trends.png')
plt.savefig('figures/hotds2-bulk-agg-word-trends.svg')
plt.show()
Visualize aggregate after show¶
Source
g = sns.lmplot(
agg_show_df,
x='value',
y='z_value',
hue='category',
col='word_metric',
col_order=usage_order,
palette=coarse_cat_cmap,
hue_order=coarse_cat_cmap.keys(),
markers='.',
scatter_kws={'s': 30, 'alpha': 0.6},
aspect=0.8,
height=5,
legend=False,
facet_kws={'sharex':False},
)
plt.ylim([-3, 8])
g.set_titles('')
g.set_ylabels('median z-scored trends')
[
g.axes[0,0].text(
x=3, y=8-i, s=cat + ' words', c=col,
fontsize=18, fontstyle='italic', va='top'
) for i, (cat, col) in enumerate(coarse_cat_cmap.items())
]
[
ax.set_xlabel(k) for k, ax in g.axes_dict.items()
]
g.fig.suptitle(
'Relationship between word usage & searches after show air',
fontsize=18, y=1, x=0.52,
)
sns.despine(trim=True, offset=5)
g.tight_layout(w_pad=1)
plt.savefig('figures/hotds2-after-air-trends-vs-usage.png')
plt.savefig('figures/hotds2-after-air-trends-vs-usage.svg')
plt.show()
