Construct word lists and collect trend data#
0. Prerequisites#
This notebook assumes the requirements have been installed from requirements.txt, for example with:
pip install -r requirements.txt
This also assumes make is installed to run the steps below. However, this can be bypassed by manually copying the commands from the Makefile.
Then the scripts of season 2 of House of the Dragon can be downloaded using:
make download-scripts
The scripts are downloaded into data/house-of-the-dragon-2022_scripts.json.
Additionally, some NLP models and data are also needed:
make download-nlp-essentials
1. Initialize#
1.1. Import packages#
import os
import re
import json
from collections import Counter
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
import spacy
import nltk
from nltk.corpus import stopwords
import syllables
from wordfreq import word_frequency
import requests
from requests.auth import HTTPBasicAuth
from imdb import Cinemagoer
1.2. Objects & Parameters#
1.2.1. NLP essentials#
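# assumes the NLTK stopwords and the spaCy en_core_web_lg model were fetched via make download-nlp-essentials (see prerequisites)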
# Load stop words
eng_stopwords = stopwords.words('english')
# Load spacy model
nlp = spacy.load("en_core_web_lg")
1.2.2. IMDB / Cinemagoer#
# Initialize IMDB getter
ia = Cinemagoer()
# HotD IMDB ID
imdb_id = '11198330'
1.2.3. DataforSEO API#
%load_ext dotenv
%dotenv
# API key
LOGIN = os.environ['DATA4SEO_API_LOGIN']
PASSWORD = os.environ['DATA4SEO_API_PASSWORD']
AUTHORIZATION = HTTPBasicAuth(LOGIN, PASSWORD)
API_URL = 'https://api.dataforseo.com/v3/keywords_data/dataforseo_trends/merged_data/live'
MAX_BATCH_SIZE = 5 # max 5 per query
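Note: the %dotenv magic reads the credentials from a local .env file, which is assumed to contain the two variables used above, along these lines (placeholder values):
DATA4SEO_API_LOGIN=<your-dataforseo-login>
DATA4SEO_API_PASSWORD=<your-dataforseo-password>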
1.3. Define paths#
# input file of show scripts
script_file_path = 'data/house-of-the-dragon-2022_scripts.json'
# output file for word data frame
word_file_path = 'data/hotd-s2-words.csv'
# output file for lists of words to query
word_list_90d_file_path = 'data/list-of-search-words-90d.txt' # finer
word_list_12m_file_path = 'data/list-of-search-words-12m.txt' # coarse
# output file for dataforseo trend queries
trend_90d_file_path = 'data/data4seo-word-trends-90d.json'
trend_12m_file_path = 'data/data4seo-word-trends-12m.json'
2. Process script data#
2.1. Load data#
script_df = (
    pd.read_json(script_file_path)
    .sort_values(['season', 'episode'])
    .reset_index(drop=True)
)
script_df
2.2. Process & tag highly used rare words#
def process_scripts(
    script,
    lower_log10_ratio = 2,
    upper_log10_base = -6,
    upper_num_syllables = 1
):
    """
    Process scripts to extract words.
    Also, tag rare words that are highly used.
    Rare words here means low base frequency.
    Highly used means high frequency ratios (script / base).
    Avoid words with few syllables if need be.
    """
    # tokenize and counts
    text = re.sub(r'\W+', ' ', script.lower())
    freq = Counter(text.split())
    freq = pd.DataFrame(dict(
        word = freq.keys(),
        script_freq = freq.values()
    ))
    freq['word'] = freq['word'].apply(
        lambda x: [xi.lemma_.lower() for xi in nlp(x)]
    )
    freq = (
        freq
        .explode('word')
        .groupby('word')
        .sum()
        .reset_index()
    )
    # get frequencies
    freq['script_freq'] = freq['script_freq'] / freq['script_freq'].sum()
    freq['base_freq'] = freq['word'].apply(
        lambda x: word_frequency(x, 'en', wordlist='best')
    )
    # estimate syllables
    freq['est_syll'] = freq['word'].apply(
        lambda x: syllables.estimate(x)
    )
    # filter
    freq = (
        freq
        # remove stop words
        .query('word not in @eng_stopwords')
        # keep only words that are used in scripts more than baseline
        .query('script_freq > base_freq')
        # keep only base frequencies > 0
        .query('base_freq > 0')
        .reset_index(drop=True)
    )
    # log 10 ratio
    freq['log10_ratio'] = np.log10(
        freq['script_freq'] / freq['base_freq']
    )
    # tag highly used & rare
    high_rare_idx = freq.query(
        'log10_ratio > @lower_log10_ratio and '
        'log10(base_freq) < @upper_log10_base and '
        'est_syll > @upper_num_syllables'
    ).index
    freq['is_high_rare'] = False
    freq.loc[high_rare_idx, 'is_high_rare'] = True
    # remove digits
    freq['is_digit'] = freq['word'].str.isdigit()
    freq = (
        freq.query('~is_digit')
        .drop(columns='is_digit')
        .reset_index(drop=True)
    )
    return freq
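As an illustrative sanity check of these criteria (the word below is an arbitrary example, not necessarily one from the scripts): a word is tagged when its base frequency is low, its script/base ratio is high, and it has more than one syllable.
# illustrative check of the thresholds on a single example word
w = 'usurper'  # arbitrary example word
base = word_frequency(w, 'en', wordlist='best')
print('base_freq:', base)
print('log10(base_freq):', np.log10(base))   # "rare" means this is below upper_log10_base
print('est_syll:', syllables.estimate(w))    # kept only if above upper_num_syllables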
results = []
thresholds = dict(
    lower_log10_ratio = 2.5,
    upper_log10_base = -6,
    upper_num_syllables = 1
)
metadata_keys = ['season', 'episode', 'episode_title']
for _, row in tqdm(script_df.iterrows(), total=len(script_df)):
    results.append(
        process_scripts(
            row['script'],
            **thresholds
        ).assign(**{
            k: row[k] for k in metadata_keys
        })
    )
word_df = pd.concat(results, ignore_index=True)
word_df
3. Get show-specific words#
There are rare words that do not appear in the base-frequency dictionary because they are show-specific terms or names.
Detecting all of them is not straightforward, but one way is to get the characters’ names from the show via IMDB data, which is done in the following cells.
Note that an alternative is to pull words from the wiki covering both HotD and GoT, aka A Wiki of Ice and Fire. However, I was too lazy to dig through their API.
3.1. Get characters’ names from IMDB#
series = ia.get_movie(imdb_id)
ia.update(series, 'full credits')
characters = [x.currentRole for x in series['cast']]
# some minor processing
char_word_df = pd.DataFrame(list(np.concatenate([
    [dict(x)] if not isinstance(x, list)
    else [dict(xi) for xi in x]
    for x in characters
])))
characters = list(
    char_word_df['name']
    .str.lower()
    .str.replace(r'[\W\d]+', ' ', regex=True)
    .str.split()
    .explode()
    .dropna()
    .unique()
)
len(characters)
3.2. Additional words#
These are from reviewing the data.
hotd_words = [
    'andal',
    'asshai',
    'braavos',
    'valyria',
    'targaryens',
    'dracarys',
    'brackens',
    'daeron',
    'davo',
    'essos',
    'dragonfire',
    'dragonheart',
    'dragonlord',
    'graybeards',
    'riverland',
    'harrenhal',
    'riverman',
    'selyse',
    'harren',
    'highgarden',
    'torrhen',
    'pentos',
    'tyrells',
    'lannisport',
    'silverwing'
]
3.3. Tag in the data#
word_df['is_hotd'] = word_df['word'].isin(characters + hotd_words)
4. Select words#
4.1. Highly used rare words (non show-specific)#
# limit to very rare since API has limits
selected_high_rare_words = sorted(
    word_df
    # highly used & rare, non-show specific
    .query('is_high_rare and ~is_hotd')
    # limit to even more rare
    .query('log10(base_freq) < -7')
    .drop_duplicates('word')
    .reset_index(drop=True)
    ['word'].to_list()
)
len(selected_high_rare_words)
word_df['is_high_rare_selected'] = word_df['word'].isin(selected_high_rare_words)
Note:
This serves as the word list for the 90-day queries.
The first version included 2 show-specific words.
with open(word_list_90d_file_path, 'w') as f:
    f.write('\n'.join(selected_high_rare_words))
4.2. Show-specific words#
Since the trends API has limits, sampling is needed.
Note: I forgot to set a seed, so the sampling won’t be reproducible.
num_samples = 100
sample_hotd_words = np.random.choice(
    (
        word_df
        .query('is_hotd and est_syll > 1')
        ['word'].unique()
    ),
    num_samples,
    replace=False
)
word_df['is_hotd_selected'] = word_df['word'].isin(sample_hotd_words)
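For reference, the sampling could be made reproducible in future runs by drawing from a seeded generator instead; a minimal sketch (the seed value and variable name are arbitrary):
# sketch: same sampling with a fixed seed for reproducibility
rng = np.random.default_rng(2024)  # arbitrary seed
reproducible_hotd_sample = rng.choice(
    word_df.query('is_hotd and est_syll > 1')['word'].unique(),
    num_samples,
    replace=False
)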
4.3. Other words#
These words serve as alternatives to compare against both (1) the highly used rare words and (2) the show-specific words.
sample_other_words = np.random.choice(
    (
        word_df
        .query('~is_hotd and ~is_high_rare and est_syll > 1')
        ['word'].unique()
    ),
    num_samples,
    replace=False
)
word_df['is_other_selected'] = word_df['word'].isin(sample_other_words)
4.4. Finalize word list#
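# a word is selected if any of its is_*_selected flags is True (collapsed per unique word, then merged back in)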
word_df = word_df.merge(
    word_df
    .drop_duplicates('word')
    .set_index('word')
    .filter(regex='is_.*_selected')
    .any(axis=1)
    .to_frame('is_selected')
    .reset_index()
)
coarse_words = list(word_df.query('is_selected')['word'].unique())
len(coarse_words)
Note:
This serves as the word list for the 12-month queries (and tends to be coarser than the 90-day list).
with open(word_list_12m_file_path, 'w') as f:
    f.write('\n'.join(coarse_words))
5. Append air-date data & save word_df#
ep2date = {
    # from wikipedia
    'S2E1': 'June 16, 2024',
    'S2E2': 'June 23, 2024',
    'S2E3': 'June 30, 2024',
    'S2E4': 'July 7, 2024',
    'S2E5': 'July 14, 2024',
    'S2E6': 'July 21, 2024',
    'S2E7': 'July 28, 2024',
    'S2E8': 'August 4, 2024',
}
word_df['air_date'] = pd.to_datetime(word_df.apply(
    lambda x: ep2date[f'S{x["season"]}E{x["episode"]}'],
    axis=1
))
word_df = word_df.merge(
    word_df.groupby('word')
    ['episode'].nunique()
    .to_frame('num_episodes')
    .reset_index()
)
word_df
word_df.to_csv(word_file_path, index=False)
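When loading this CSV back later, the air dates need to be parsed explicitly; a minimal sketch (the variable name is just illustrative):
# reload sketch: parse air_date back into datetimes when reading the CSV
word_df_reloaded = pd.read_csv(word_file_path, parse_dates=['air_date'])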
6. Get trend data from DataforSEO#
def query_word_trends(
    input_file,
    time_range,
    output_file
):
    pay_load = {
        "time_range": time_range,
        "type": "web",
    }
    # get word list
    with open(input_file, 'r') as f:
        words = f.read().split('\n')
    print('Input file:', input_file)
    print('Number of words:', len(words))
    # split into batches
    word_batches = list(map(
        list, np.array_split(words, np.ceil(len(words) / MAX_BATCH_SIZE))
    ))
    assert all([len(x) <= MAX_BATCH_SIZE for x in word_batches])
    print(
        f'Total {len(words)} words split into {len(word_batches)} batches, '
        f'with max {MAX_BATCH_SIZE} per batch'
    )
    # query
    results = []
    for batch in tqdm(word_batches):
        r = requests.post(
            API_URL,
            auth=AUTHORIZATION,
            data=json.dumps([{
                "keywords": batch,
                **pay_load
            }])
        )
        if r.status_code == 200:
            results.append(r.json())
    # process
    data = []
    assert len(results) == len(word_batches)
    for r, b in zip(results, word_batches):
        assert len(r['tasks']) == 1
        assert r['tasks'][0]['data']['keywords'] == b
        data.append({
            k: v for k, v in r['tasks'][0].items()
            if k in ['path', 'data', 'result']
        })
    with open(output_file, 'w') as f:
        json.dump(data, f, indent=4)
    print('File saved at:', output_file)
6.1. Query trends 90 days (finer)#
query_word_trends(
    input_file=word_list_90d_file_path,
    time_range='past_90_days',
    output_file=trend_90d_file_path
)
6.2. Query trends 12 months (coarse)#
query_word_trends(
    input_file=word_list_12m_file_path,
    time_range='past_12_months',
    output_file=trend_12m_file_path
)
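A quick way to sanity-check a saved output file (a minimal sketch; assumes the 90-day query above completed and wrote its file):
# load a saved trends file and check its shape
with open(trend_90d_file_path, 'r') as f:
    saved = json.load(f)
print('Batches saved:', len(saved))
print('Keys per entry:', list(saved[0].keys()))  # expected: path, data, result (per the filtering above)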