Construct word lists and collect trend data#
0. Prerequisites#
This notebook assumes the requirements have been installed from requirements.txt, for example with:
pip install -r requirements.txt
This also assumes make is installed to run the steps below. However, this can be bypassed by manually copying the commands from the Makefile.
Then the scripts of season 2 of House of the Dragon can be downloaded using:
make download-scripts
The scripts are downloaded into data/house-of-the-dragon-2022_scripts.json.
Additionally, some NLP models and data are also needed:
make download-nlp-essentials
1. Initialize#
1.1. Import packages#
import os
import re
import json
from collections import Counter
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
import spacy
import nltk
from nltk.corpus import stopwords
import syllables
from wordfreq import word_frequency
import requests
from requests.auth import HTTPBasicAuth
from imdb import Cinemagoer
1.2. Objects & Parameters#
1.2.1. NLP essentials#
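# assumes the NLTK stopwords and the spaCy en_core_web_lg model were fetched via make download-nlp-essentials (see prerequisites)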
# Load stop words
eng_stopwords = stopwords.words('english')
# Load spacy model
nlp = spacy.load("en_core_web_lg")
1.2.2. IMDB / Cinemagoer#
# Initialize IMDB getter
ia = Cinemagoer()
# HotD IMDB ID
imdb_id = '11198330'
1.2.3. DataforSEO API#
%load_ext dotenv
%dotenv
# API key
LOGIN = os.environ['DATA4SEO_API_LOGIN']
PASSWORD = os.environ['DATA4SEO_API_PASSWORD']
AUTHORIZATION = HTTPBasicAuth(LOGIN, PASSWORD)
API_URL = 'https://api.dataforseo.com/v3/keywords_data/dataforseo_trends/merged_data/live'
MAX_BATCH_SIZE = 5 # max 5 per query
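Note: the %dotenv magic reads the credentials from a local .env file, which is assumed to contain the two variables used above, along these lines (placeholder values):
DATA4SEO_API_LOGIN=<your-dataforseo-login>
DATA4SEO_API_PASSWORD=<your-dataforseo-password>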
1.3. Define paths#
# input file of show scripts
script_file_path = 'data/house-of-the-dragon-2022_scripts.json'
# output file for word data frame
word_file_path = 'data/hotd-s2-words.csv'
# output file for lists of words to query
word_list_90d_file_path = 'data/list-of-search-words-90d.txt' # finer
word_list_12m_file_path = 'data/list-of-search-words-12m.txt' # coarse
# output file for dataforseo trend queries
trend_90d_file_path = 'data/data4seo-word-trends-90d.json'
trend_12m_file_path = 'data/data4seo-word-trends-12m.json'
2. Process script data#
2.1. Load data#
script_df = (
    pd.read_json(script_file_path)
    .sort_values(['season', 'episode'])
    .reset_index(drop=True)
)
script_df
2.2. Process & tag highly used rare words#
def process_scripts(
    script,
    lower_log10_ratio = 2,
    upper_log10_base = -6,
    upper_num_syllables = 1
):
    """
    Process scripts to extract words.
    Also, tag rare words that are highly used.
    Rare words here means low base frequency.
    Highly used means high frequency ratios (script / base).
    Avoid words with few syllables if need be.
    """
    # tokenize and counts
    text = re.sub(r'\W+', ' ', script.lower())
    freq = Counter(text.split())
    freq = pd.DataFrame(dict(
        word = freq.keys(),
        script_freq = freq.values()
    ))
    freq['word'] = freq['word'].apply(
        lambda x: [xi.lemma_.lower() for xi in nlp(x)]
    )
    freq = (
        freq
        .explode('word')
        .groupby('word')
        .sum()
        .reset_index()
    )
    # get frequencies
    freq['script_freq'] = freq['script_freq'] / freq['script_freq'].sum()
    freq['base_freq'] = freq['word'].apply(
        lambda x: word_frequency(x, 'en', wordlist='best')
    )
    # estimate syllables
    freq['est_syll'] = freq['word'].apply(
        lambda x: syllables.estimate(x)
    )
    # filter
    freq = (
        freq
        # remove stop words
        .query('word not in @eng_stopwords')
        # keep only words that are used in scripts more than baseline
        .query('script_freq > base_freq')
        # keep only base frequencies > 0
        .query('base_freq > 0')
        .reset_index(drop=True)
    )
    # log 10 ratio
    freq['log10_ratio'] = np.log10(
        freq['script_freq'] / freq['base_freq']
    )
    # tag highly used & rare
    high_rare_idx = freq.query(
        'log10_ratio > @lower_log10_ratio and '
        'log10(base_freq) < @upper_log10_base and '
        'est_syll > @upper_num_syllables'
    ).index
    freq['is_high_rare'] = False
    freq.loc[high_rare_idx, 'is_high_rare'] = True
    # remove digits
    freq['is_digit'] = freq['word'].str.isdigit()
    freq = (
        freq.query('~is_digit')
        .drop(columns='is_digit')
        .reset_index(drop=True)
    )
    return freq
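As an illustrative sanity check of these criteria (the word below is an arbitrary example, not necessarily one from the scripts): a word is tagged when its base frequency is low, its script/base ratio is high, and it has more than one syllable.
# illustrative check of the thresholds on a single example word
w = 'usurper'  # arbitrary example word
base = word_frequency(w, 'en', wordlist='best')
print('base_freq:', base)
print('log10(base_freq):', np.log10(base))   # "rare" means this is below upper_log10_base
print('est_syll:', syllables.estimate(w))    # kept only if above upper_num_syllables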
results = []
thresholds = dict(
    lower_log10_ratio = 2.5,
    upper_log10_base = -6,
    upper_num_syllables = 1
)
metadata_keys = ['season', 'episode', 'episode_title']
for _, row in tqdm(script_df.iterrows(), total=len(script_df)):
    results.append(
        process_scripts(
            row['script'],
            **thresholds
        ).assign(**{
            k: row[k] for k in metadata_keys
        })
    )
word_df = pd.concat(results, ignore_index=True)
word_df
3. Get show-specific words#
There are rare words that do not appear in the base-frequency dictionary because they are show-specific terms or names.
Detecting all of them is not straightforward, but one way is to get the characters’ names from the show via IMDB data, which is done in the following cells.
Note that an alternative is to pull words from the wiki covering both HotD and GoT, aka A Wiki of Ice and Fire. However, I was too lazy to dig through their API.
3.1. Get characters’ names from IMDB#
series = ia.get_movie(imdb_id)
ia.update(series, 'full credits')
characters = [x.currentRole for x in series['cast']]
# some minor processing
char_word_df = pd.DataFrame(list(np.concatenate([
    [dict(x)] if not isinstance(x, list)
    else [dict(xi) for xi in x]
    for x in characters
])))
characters = list(
    char_word_df['name']
    .str.lower()
    .str.replace(r'[\W\d]+', ' ', regex=True)
    .str.split()
    .explode()
    .dropna()
    .unique()
)
len(characters)
3.2. Additional words#
These are from reviewing the data.
hotd_words = [
    'andal',
    'asshai',
    'braavos',
    'valyria',
    'targaryens',
    'dracarys',
    'brackens',
    'daeron',
    'davo',
    'essos',
    'dragonfire',
    'dragonheart',
    'dragonlord',
    'graybeards',
    'riverland',
    'harrenhal',
    'riverman',
    'selyse',
    'harren',
    'highgarden',
    'torrhen',
    'pentos',
    'tyrells',
    'lannisport',
    'silverwing'
]
3.3. Tag in the data#
word_df['is_hotd'] = word_df['word'].isin(characters + hotd_words)
4. Select words#
4.1. Highly used rare words (non show-specific)#
# limit to very rare since API has limits
selected_high_rare_words = sorted(
    word_df
    # highly used & rare, non-show specific
    .query('is_high_rare and ~is_hotd')
    # limit to even more rare
    .query('log10(base_freq) < -7')
    .drop_duplicates('word')
    .reset_index(drop=True)
    ['word'].to_list()
)
len(selected_high_rare_words)
word_df['is_high_rare_selected'] = word_df['word'].isin(selected_high_rare_words)
Note:
This serves as the word list for the 90-day queries.
The first version included 2 show-specific words.
with open(word_list_90d_file_path, 'w') as f:
    f.write('\n'.join(selected_high_rare_words))
4.2. Show-specific words#
Since the trends API has limits, sampling is needed.
Note: I forgot to set a seed, so the sampling won’t be reproducible.
num_samples = 100
sample_hotd_words = np.random.choice(
    (
        word_df
        .query('is_hotd and est_syll > 1')
        ['word'].unique()
    ),
    num_samples,
    replace=False
)
word_df['is_hotd_selected'] = word_df['word'].isin(sample_hotd_words)
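For reference, the sampling could be made reproducible in future runs by drawing from a seeded generator instead; a minimal sketch (the seed value and variable name are arbitrary):
# sketch: same sampling with a fixed seed for reproducibility
rng = np.random.default_rng(2024)  # arbitrary seed
reproducible_hotd_sample = rng.choice(
    word_df.query('is_hotd and est_syll > 1')['word'].unique(),
    num_samples,
    replace=False
)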
4.3. Other words#
These words serve as alternatives to compare against both (1) the highly used rare words and (2) the show-specific words.
sample_other_words = np.random.choice(
    (
        word_df
        .query('~is_hotd and ~is_high_rare and est_syll > 1')
        ['word'].unique()
    ),
    num_samples,
    replace=False
)
word_df['is_other_selected'] = word_df['word'].isin(sample_other_words)
4.4. Finalize word list#
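# a word is selected if any of its is_*_selected flags is True (collapsed per unique word, then merged back in)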
word_df = word_df.merge(
    word_df
    .drop_duplicates('word')
    .set_index('word')
    .filter(regex='is_.*_selected')
    .any(axis=1)
    .to_frame('is_selected')
    .reset_index()
)
coarse_words = list(word_df.query('is_selected')['word'].unique())
len(coarse_words)
Note:
This serves as the word list for the 12-month queries (and tends to be coarser than the 90-day list).
with open(word_list_12m_file_path, 'w') as f:
    f.write('\n'.join(coarse_words))
5. Append air-date data & save word_df#
ep2date = {
    # from wikipedia
    'S2E1': 'June 16, 2024',
    'S2E2': 'June 23, 2024',
    'S2E3': 'June 30, 2024',
    'S2E4': 'July 7, 2024',
    'S2E5': 'July 14, 2024',
    'S2E6': 'July 21, 2024',
    'S2E7': 'July 28, 2024',
    'S2E8': 'August 4, 2024',
}
word_df['air_date'] = pd.to_datetime(word_df.apply(
    lambda x: ep2date[f'S{x["season"]}E{x["episode"]}'],
    axis=1
))
word_df = word_df.merge(
    word_df.groupby('word')
    ['episode'].nunique()
    .to_frame('num_episodes')
    .reset_index()
)
word_df
word_df.to_csv(word_file_path, index=False)
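When loading this CSV back later, the air dates need to be parsed explicitly; a minimal sketch (the variable name is just illustrative):
# reload sketch: parse air_date back into datetimes when reading the CSV
word_df_reloaded = pd.read_csv(word_file_path, parse_dates=['air_date'])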
6. Get trend data from DataforSEO#
def query_word_trends(
    input_file,
    time_range,
    output_file
):
    pay_load = {
        "time_range": time_range,
        "type": "web",
    }
    # get word list
    with open(input_file, 'r') as f:
        words = f.read().split('\n')
    print('Input file:', input_file)
    print('Number of words:', len(words))
    # split into batches
    word_batches = list(map(
        list, np.array_split(words, np.ceil(len(words) / MAX_BATCH_SIZE))
    ))
    assert all([len(x) <= MAX_BATCH_SIZE for x in word_batches])
    print(
        f'Total {len(words)} words split into {len(word_batches)} batches, '
        f'with max {MAX_BATCH_SIZE} per batch'
    )
    # query
    results = []
    for batch in tqdm(word_batches):
        r = requests.post(
            API_URL,
            auth=AUTHORIZATION,
            data=json.dumps([{
                "keywords": batch,
                **pay_load
            }])
        )
        if r.status_code == 200:
            results.append(r.json())
    # process
    data = []
    assert len(results) == len(word_batches)
    for r, b in zip(results, word_batches):
        assert len(r['tasks']) == 1
        assert r['tasks'][0]['data']['keywords'] == b
        data.append({
            k: v for k, v in r['tasks'][0].items()
            if k in ['path', 'data', 'result']
        })
    with open(output_file, 'w') as f:
        json.dump(data, f, indent=4)
    print('File saved at:', output_file)
6.1. Query trends 90 days (finer)#
query_word_trends(
    input_file=word_list_90d_file_path,
    time_range='past_90_days',
    output_file=trend_90d_file_path
)
6.2. Query trends 12 months (coarse)#
query_word_trends(
    input_file=word_list_12m_file_path,
    time_range='past_12_months',
    output_file=trend_12m_file_path
)
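A quick way to sanity-check a saved output file (a minimal sketch; assumes the 90-day query above completed and wrote its file):
# load a saved trends file and check its shape
with open(trend_90d_file_path, 'r') as f:
    saved = json.load(f)
print('Batches saved:', len(saved))
print('Keys per entry:', list(saved[0].keys()))  # expected: path, data, result (per the filtering above)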