Scraping movies by years (produced/released vs. set in)

1. Import and define paths

import os, re, pickle, time
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import wikipediaapi as wkapi

# Output CSV locations, relative to the working directory.
# Films together with the year they were produced/released.
produced_data_path = 'data/films_by_produced.csv'
# Films together with the year/decade/century labels they are set in.
setin_data_path = 'data/films_by_set_in.csv'
# Merged table holding both the produced year and the set-in time.
processed_data_path = 'data/films_set_in_and_produced.csv'

2. Define scraping functions

# Module-level Wikipedia API client used by scrape_film_list.
# NOTE(review): the first argument is presumably the user-agent string and
# 'en' the language edition -- confirm against the wikipedia-api docs.
wk = wkapi.Wikipedia('Data of movies by years (penguins@antarct.ica)', 'en')

def traverse_category(category, output=None, level=0, max_level=1):
    """Collect the members of a category, recursing into its subcategories.

    Parameters
    ----------
    category
        Category page object exposing ``title`` and ``categorymembers``
        (a mapping whose values expose ``title``, ``ns`` and ``pageid``).
    output : list of dict, optional
        List the member records are appended to. A new list is created when
        it is omitted, so independent calls never share results.
    level : int
        Depth of ``category`` relative to the category the walk started from.
    max_level : int
        Subcategories are expanded only while ``level < max_level``.

    Returns
    -------
    list of dict
        ``output`` with one record per visited member, holding the keys
        ``title``, ``ns``, ``pageid``, ``level`` and ``parent`` (the title of
        the category the member was found in).
    """
    # A mutable default (``output=[]``) is created once and shared by every
    # call, which made each scrape accumulate the rows of the previous ones.
    if output is None:
        output = []

    parent = category.title
    for member in category.categorymembers.values():
        output.append(dict(
            title=member.title,
            ns=member.ns,
            pageid=member.pageid,
            level=level,
            parent=parent,
        ))

        # Members that are themselves categories are walked one level deeper;
        # the recursive call appends to the same list.
        if member.ns == wkapi.Namespace.CATEGORY and level < max_level:
            traverse_category(
                member,
                output,
                level=level + 1,
                max_level=max_level,
            )
    return output

def scrape_film_list(category_seed, max_try = 2):
    """Scrape a category tree into a DataFrame, retrying on failure.

    Parameters
    ----------
    category_seed : str
        Title of the category page to start from, e.g.
        ``'Category:Films_by_year'``.
    max_try : int
        Maximum number of attempts before giving up.

    Returns
    -------
    pandas.DataFrame
        One row per record returned by ``traverse_category``.

    Raises
    ------
    RuntimeError
        If no attempt succeeded; the last underlying error is chained as the
        cause.
    """
    t0 = time.time()
    print(f'Scrape {category_seed}')

    last_error = None
    for _ in range(max_try):
        try:
            cat_page = wk.page(category_seed)
            # Pass a fresh list explicitly so rows never leak between scrapes.
            cat_tree = traverse_category(cat_page, [])
            df = pd.DataFrame(cat_tree)
        except Exception as err:
            # ``Exception`` rather than a bare ``except`` so Ctrl-C still
            # interrupts a long-running scrape.
            last_error = err
            print('\tElapsed %.2f seconds' %(time.time() - t0))
            print(f'\tError with {category_seed}. Will try again!')
        else:
            print('Elapsed %.2f seconds' %(time.time() - t0))
            return df

    print('Elapsed %.2f seconds' %(time.time() - t0))
    # Previously this path hit ``return df`` with ``df`` never assigned.
    raise RuntimeError(
        f'Could not scrape {category_seed} after {max_try} attempt(s)'
    ) from last_error

3. Scrape movies by produced year

# Walk the "Films by year" category tree, allowing up to three attempts.
produced_seed = 'Category:Films_by_year'
df_produced = scrape_film_list(produced_seed, max_try=3)

Scrape Category:Films_by_year
Elapsed 146.34 seconds

df_produced

# Matches parent titles of the form "Category:<4-digit year> ..." and captures
# the year. Written as a raw string: "\:", "\d" and "\s" are invalid escape
# sequences in a normal string literal and trigger a warning on newer Pythons.
year_pattern = re.compile(r'Category\:(?P<year>\d{4})\s+(films)?')

# Parents that do not match yield None, which the next step drops.
df_produced['year_produced'] = df_produced['parent'].apply(
    lambda x: int(d['year'])
    if (d := year_pattern.search(x)) is not None
    else None
)

# Keep only ns == 0 rows whose parent category yielded a production year.
has_year = df_produced['year_produced'].notna()
is_ns_zero = df_produced['ns'] == 0
df_produced = df_produced.loc[has_year & is_ns_zero]

# With the missing values gone the year can be stored as an integer.
df_produced = df_produced.astype({'year_produced': 'int'})

# The scraping bookkeeping columns are no longer needed.
df_produced = df_produced.drop(columns=['parent', 'ns', 'level'])
df_produced = df_produced.reset_index(drop=True)

df_produced

# to_csv does not create missing folders, so make sure the target exists.
os.makedirs(os.path.dirname(produced_data_path) or '.', exist_ok=True)
df_produced.to_csv(produced_data_path, index=False)

4. Scrape movies by set-in year/decade/century

# Seed categories, one per granularity of the setting time.
categories = [
    'Category:Films_by_century_of_setting',
    'Category:Films_by_decade_of_setting',
    'Category:Films_by_year_of_setting'
]

# Scrape each seed (up to three attempts apiece), then stack the results.
dfs_setin = [scrape_film_list(seed, max_try=3) for seed in categories]
df_setin = pd.concat(dfs_setin, ignore_index=True)

Scrape Category:Films_by_century_of_setting
Elapsed 11.18 seconds
Scrape Category:Films_by_decade_of_setting
Elapsed 20.52 seconds
Scrape Category:Films_by_year_of_setting
Elapsed 82.91 seconds

df_setin

4.1. Checking a few things

Check the variable ns

# Count the namespace codes among members of "Films set in ..." categories.
set_in_rows = df_setin['parent'].str.contains("Films set in")
df_setin.loc[set_in_rows, 'ns'].value_counts()

0     32037
14     2136
2         2
Name: ns, dtype: int64

Check that every ns=14 entry has a title starting with 'Category:', which means these entries should be filtered out

# Titles of the ns == 14 members of "Films set in ..." categories.
ns14_titles = df_setin.loc[
    df_setin['parent'].str.contains("Films set in") & (df_setin['ns'] == 14),
    'title',
]
# True when every one of them starts with the 'Category:' prefix.
ns14_titles.str.startswith('Category:').all()

True

Check that no ns=0 entry has a title starting with 'Category:'. These entries should be kept

# Titles of the ns == 0 members of "Films set in ..." categories.
ns0_titles = df_setin.loc[
    df_setin['parent'].str.contains("Films set in") & (df_setin['ns'] == 0),
    'title',
]
# True if any of them starts with the 'Category:' prefix.
ns0_titles.str.startswith('Category:').any()

False

4.2. Process and save data

# Start again from the raw scrapes.
df_setin = pd.concat(dfs_setin, ignore_index=True)

# Row selectors: member of a "Films set in ..." category, not under an
# "Infamous Decade" category, and in namespace 0.
in_set_in_category = df_setin['parent'].str.contains("Films set in")
in_infamous_decade = df_setin['parent'].str.contains("Infamous Decade")
is_ns_zero = df_setin['ns'] == 0

df_setin = df_setin.loc[in_set_in_category & ~in_infamous_decade & is_ns_zero]
df_setin = df_setin.drop(columns=['ns', 'level'])

# Obtain the year/decade/century the film is set in by stripping the fixed
# wording from the parent category title, e.g.
# "Category:Films set in the 19th century" -> "19th".
# regex=False makes the literal replacement explicit; the default value of
# `regex` is not the same across pandas versions.
df_setin['year_setin'] = (
    df_setin.pop('parent')
    .str.replace('Category:Films set in ', '', regex=False)
    .str.replace('the ', '', regex=False)
    .str.replace(' century', '', regex=False)
)

# One row per film: gather the distinct time labels attached to each page.
labels_per_film = df_setin.groupby(['pageid', 'title']).agg(set)
df_setin = labels_per_film.reset_index()

# Store the labels as lists rather than sets.
setin_labels = df_setin['year_setin']
df_setin['year_setin'] = setin_labels.apply(list)

df_setin

# to_csv does not create missing folders, so make sure the target exists.
os.makedirs(os.path.dirname(setin_data_path) or '.', exist_ok=True)
df_setin.to_csv(setin_data_path, index=False)

5. Combine and process

# Keep films present in both tables, matched on page id and title.
films_merged = pd.merge(df_setin, df_produced, on=['pageid', 'title'])

# One row per (film, set-in label) pair, with a fresh 0..n-1 index.
df_films = films_merged.explode('year_setin').reset_index(drop=True)

def convert_non_standard_time(x):
    """Convert a set-in time label into a numeric value and a granularity.

    Parameters
    ----------
    x : int, float or str
        A number (returned unchanged as a year) or a label such as
        ``'1999'``, ``'19th'``, ``'5th BC'``, ``'79 AD'`` or ``'1990s'``.

    Returns
    -------
    dict or None
        ``{'value': ..., 'type': ...}`` where ``type`` is ``'year'``,
        ``'century'`` or ``'decade'``. Centuries are mapped to their first
        year (``'19th'`` -> 1800, ``'5th BC'`` -> -500) and decades to the
        year in the label (``'1990s'`` -> 1990). ``None`` is returned when the
        label matches none of the known forms.
    """
    # Already numeric: treat as a plain year.
    if isinstance(x, (int, float)):
        return dict(value=x, type='year')

    # The patterns are raw strings: "\d" and "\s" are invalid escape
    # sequences in a normal string literal.
    # Exactly four digits, e.g. "1999".
    if (found := re.match(r'^\d{4}$', x)) is not None:
        return dict(value=int(found.group()), type='year')

    # Ordinal century, e.g. "19th" or "5th BC".
    if (found := re.search(r'(\d+)\s*(st|nd|rd|th)', x)) is not None:
        value = int(found.group(1)) * 100
        if 'BC' in x:
            value = -value
        else:
            # The Nth century AD starts at (N - 1) * 100.
            value -= 100
        return dict(value=value, type='century')

    # Explicit AD year, e.g. "79 AD".
    if (found := re.search(r'(\d+)\s+AD', x)) is not None:
        return dict(value=int(found.group(1)), type='year')

    # Decade, e.g. "1990s".
    if (found := re.search(r'(\d{4})s', x)) is not None:
        return dict(value=int(found.group(1)), type='decade')

    # Unrecognized label.
    return None

# Turn every set-in label into a {'value', 'type'} record.
df_year_aux = df_films['year_setin'].apply(convert_non_standard_time)

# Expand the records into two columns named after their final meaning.
year_columns = pd.DataFrame(df_year_aux.to_list()).rename(
    columns={'value': 'year_setin', 'type': 'year_setin_type'}
)

# Keep the original label under a new name next to the converted columns.
films_with_source = df_films.rename(columns={'year_setin': 'source_year_setin'})
df_films = pd.concat([films_with_source, year_columns], axis=1)

df_films['year_setin'].dtype

dtype('int64')

df_films

# to_csv does not create missing folders, so make sure the target exists.
os.makedirs(os.path.dirname(processed_data_path) or '.', exist_ok=True)
df_films.to_csv(processed_data_path, index=False)

6. Simple plot

# Colour of the points for each granularity of the set-in time.
type_colors = {
    'year': '#ababab',
    'decade': 'b',
    'century': 'g'
}

# Set-in time on x against production year on y; low alpha so dense
# regions show up as darker areas.
plt.figure(figsize=(20, 15))
ax = sns.scatterplot(
    data=df_films,
    x='year_setin',
    y='year_produced',
    hue='year_setin_type',
    palette=type_colors,
    s=25,
    alpha=0.05,
    edgecolor='none',
)
ax.set_aspect(5, 'box')
# plt.xlim([1900, 2000])