Visualization of words used in House of the Dragon Season 2 and their “rarity”#

This notebook assumes the steps from collect.ipynb have already been run.

The necessary data are in the data folder.

  • hotd-s2-words.csv: processed word usage from the show's Season 2 scripts.

  • data4seo-word-trends-[90d,12m].json: keyword trend data retrieved with the DataForSEO API.

The figure outputs are saved in the figures folder:

  • hotds2-rare-word-trends.svg: Rare word trends (90d)

  • hotds2-stacked-word-trends-colored-by-base_freq_quartile.svg: Individual word trends (coarser), colored and sorted by word rarity

  • hotds2-stacked-word-trends-colored-by-log10_ratio_quartile.svg: Individual word trends (coarser), colored and sorted by their relative usage in the show scripts

  • hotds2-bulk-agg-word-trends.svg: Bulk trends across time

  • hotds2-after-air-trends-vs-usage.svg: Aggregate trends after each episode aired versus the words’ usage metrics

Import packages & define paths#

Hide code cell source
import os
import re
import json

import numpy as np
import pandas as pd
from scipy import signal, interpolate

import matplotlib.pyplot as plt
from matplotlib import rcParams
import matplotlib.dates as mdates
import seaborn as sns
Hide code cell source
# Notebook-wide matplotlib styling, applied in one pass.
rcParams.update({
    # typography
    'font.family': 'Overpass Nerd Font',
    'font.size': 18,
    'axes.titlesize': 20,
    'axes.labelsize': 18,
    # stroke widths and marker size
    'axes.linewidth': 1.5,
    'lines.linewidth': 1.5,
    'lines.markersize': 20,
    'patch.linewidth': 1.5,
    # tick labels and tick marks
    'xtick.labelsize': 18,
    'ytick.labelsize': 18,
    'xtick.major.width': 2,
    'xtick.minor.width': 2,
    'ytick.major.width': 2,
    'ytick.minor.width': 2,
    # defaults used whenever a figure is saved
    'savefig.dpi': 300,
    'savefig.transparent': False,
    'savefig.facecolor': 'white',
    'savefig.format': 'svg',
    'savefig.pad_inches': 0.5,
    'savefig.bbox': 'tight',
})
# Input: table of processed words from the show scripts.
word_file_path = 'data/hotd-s2-words.csv'

# Inputs: DataForSEO trend query results for the 90-day and 12-month windows.
trend_90d_file_path, trend_12m_file_path = (
    'data/data4seo-word-trends-90d.json',
    'data/data4seo-word-trends-12m.json',
)

# Output: directory that receives the generated figures.
fig_dir = 'figures'

Load word data#

# Read the processed word table, keep only the rows flagged `is_selected`,
# and renumber the index from zero.
all_words = pd.read_csv(word_file_path)
word_df = all_words.query('is_selected').reset_index(drop=True)

# read_csv is called without date parsing, so convert air dates explicitly.
word_df['air_date'] = pd.to_datetime(word_df['air_date'])

# Number of distinct episodes among the selected rows.
num_episodes = word_df['episode'].nunique()

word_df
word script_freq base_freq est_syll log10_ratio is_high_rare season episode episode_title is_hotd is_high_rare_selected is_hotd_selected is_other_selected is_selected air_date num_episodes
0 advance 0.000226 4.470000e-05 3 0.703583 False 2 1 A Son for a Son False False False True True 2024-06-16 3
1 aegon 0.007454 4.370000e-07 2 4.231923 True 2 1 A Son for a Son True False True False True 2024-06-16 8
2 andal 0.000452 7.590000e-08 2 3.774679 True 2 1 A Son for a Son True False True False True 2024-06-16 1
3 answer 0.000226 1.480000e-04 2 0.183629 False 2 1 A Son for a Son False False False True True 2024-06-16 7
4 arryn 0.000226 1.740000e-07 2 3.113341 True 2 1 A Son for a Son True False True False True 2024-06-16 3
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
592 victorious 0.000192 3.720000e-06 3 1.712286 False 2 8 The Queen Who Ever Was False False False True True 2024-08-04 1
593 warrior 0.000192 1.450000e-05 2 1.121461 False 2 8 The Queen Who Ever Was True False True False True 2024-08-04 2
594 waver 0.000192 5.620000e-07 2 2.533093 True 2 8 The Queen Who Ever Was False False False True True 2024-08-04 2
595 winter 0.000192 7.760000e-05 2 0.392967 False 2 8 The Queen Who Ever Was True False True False True 2024-08-04 3
596 wylde 0.000192 2.000000e-07 2 2.981799 True 2 8 The Queen Who Ever Was True False True False True 2024-08-04 3

597 rows × 16 columns

Hide code cell source
# One row per word: the row from the lowest-numbered episode in which it occurs.
# Sorting by episode first makes that row lead each word's group.
episode_ordered = word_df.sort_values(['episode'])
first_rows = episode_ordered.groupby('word').head(1)
word_first_appearance = first_rows.reset_index(drop=True)

word_first_appearance
word script_freq base_freq est_syll log10_ratio is_high_rare season episode episode_title is_hotd is_high_rare_selected is_hotd_selected is_other_selected is_selected air_date num_episodes
0 advance 0.000226 4.470000e-05 3 0.703583 False 2 1 A Son for a Son False False False True True 2024-06-16 3
1 rumble 0.000226 2.690000e-06 2 1.924138 False 2 1 A Son for a Son False False False True True 2024-06-16 4
2 riverland 0.000678 7.940000e-08 3 3.931191 True 2 1 A Son for a Son False True False False True 2024-06-16 7
3 refuse 0.000678 2.450000e-05 3 1.441846 False 2 1 A Son for a Son False False False True True 2024-06-16 4
4 ratcatcher 0.000452 2.290000e-08 3 4.295085 True 2 1 A Son for a Son True False True False True 2024-06-16 4
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
260 humor 0.000192 1.550000e-05 2 1.092497 False 2 8 The Queen Who Ever Was False False False True True 2024-08-04 1
261 headsman 0.000192 3.470000e-08 2 3.742499 True 2 8 The Queen Who Ever Was False True False False True 2024-08-04 1
262 halfheartedly 0.000192 6.170000e-08 4 3.492544 True 2 8 The Queen Who Ever Was False True False False True 2024-08-04 1
263 godswood 0.000192 1.510000e-08 2 4.103852 True 2 8 The Queen Who Ever Was False True False False True 2024-08-04 1
264 futilely 0.000192 7.590000e-08 4 3.402587 True 2 8 The Queen Who Ever Was False True False False True 2024-08-04 1

265 rows × 16 columns

# Earliest and latest air dates found in the selected word table.
air_dates = word_df['air_date']
show_start_date = air_dates.min()
show_end_date = air_dates.max()

show_start_date, show_end_date
(Timestamp('2024-06-16 00:00:00'), Timestamp('2024-08-04 00:00:00'))

Define time ranges#

# Calendar anchors used when windowing the trend data.

# start of the summer window
summer_startdate = pd.to_datetime('2024-06-01')

# upper bound on dates of interest (only matters for the 90d trends)
max_date = pd.to_datetime('2024-08-15')

# baseline for the 12m trends: one week of leeway before the earliest air date
one_week = pd.Timedelta(7, unit='day')
base_date = show_start_date - one_week