Skip to article frontmatterSkip to article content
Site not loading correctly?

This may be due to an incorrect BASE_URL configuration. See the MyST Documentation for reference.

Scraping movies by years (produced/released vs. set in)

1. Import and define paths

import os, re, pickle, time
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import wikipediaapi as wkapi
# Output locations (relative to the working directory) for the two raw
# scrapes and for the merged, processed table.
produced_data_path, setin_data_path, processed_data_path = (
    'data/films_by_produced.csv',
    'data/films_by_set_in.csv',
    'data/films_set_in_and_produced.csv',
)

2. Define scraping functions

# Shared Wikipedia API client; scrape_film_list fetches category pages through it.
# NOTE(review): the first argument looks like the user-agent string and 'en'
# the language edition -- confirm against the installed wikipedia-api version.
wk = wkapi.Wikipedia('Data of movies by years (penguins@antarct.ica)', 'en')
def traverse_category(category, output=None, level=0, max_level=1):
    """Collect the members of a category, descending into subcategories.

    Parameters
    ----------
    category
        Category page object exposing ``title`` and ``categorymembers``
        (a mapping whose values expose ``title``, ``ns`` and ``pageid``).
    output : list of dict, optional
        List that records are appended to. When omitted a fresh list is
        created for this call, so separate calls never share results.
        A list passed in explicitly is extended in place and returned.
    level : int
        Depth of ``category`` relative to the category the walk started at.
    max_level : int
        Members are recorded down to this level; subcategories found at a
        shallower level are expanded recursively.

    Returns
    -------
    list of dict
        One record per member with the keys ``title``, ``ns``, ``pageid``,
        ``level`` and ``parent`` (title of the containing category).
    """
    # A literal ``[]`` default would be created once and shared by every
    # call, leaking records from one scrape (or failed attempt) into the next.
    if output is None:
        output = []

    parent = category.title
    for member in category.categorymembers.values():
        output.append(dict(
            title = member.title,
            ns = member.ns,
            pageid = member.pageid,
            level = level,
            parent = parent
        ))

        # Cheap depth check first; only category members are descended into.
        if level < max_level and member.ns == wkapi.Namespace.CATEGORY:
            traverse_category(
                member,
                output,
                level=level + 1,
                max_level=max_level
            )
    return output
def scrape_film_list(category_seed, max_try = 2):
    """Scrape a category tree into a DataFrame, retrying on failure.

    Parameters
    ----------
    category_seed : str
        Title of the category page to start from,
        e.g. ``'Category:Films_by_year'``.
    max_try : int
        Maximum number of attempts before giving up.

    Returns
    -------
    pandas.DataFrame
        One row per record returned by ``traverse_category``.

    Raises
    ------
    RuntimeError
        If every attempt failed; the last underlying error is chained
        as the cause.
    """
    t0 = time.time()
    print(f'Scrape {category_seed}')

    df = None
    last_error = None
    for _ in range(max_try):
        try:
            cat_page = wk.page(category_seed)
            # Start every attempt from an empty list so the partial records
            # of a failed attempt are never mixed into the next one.
            cat_tree = traverse_category(cat_page, [])
            df = pd.DataFrame(cat_tree)
            break
        except Exception as err:
            # ``Exception`` rather than a bare ``except`` so that Ctrl-C
            # (KeyboardInterrupt) still stops a long-running scrape.
            last_error = err
            print('\tElapsed %.2f seconds' %(time.time() - t0))
            print(f'\tError with {category_seed}. Will try again!')

    print('Elapsed %.2f seconds' %(time.time() - t0))
    if df is None:
        raise RuntimeError(
            f'Failed to scrape {category_seed} after {max_try} attempt(s)'
        ) from last_error
    return df

3. Scrape movies by produced year

# Walk the per-year film categories, allowing up to three attempts.
df_produced = scrape_film_list('Category:Films_by_year', max_try=3)
Scrape Category:Films_by_year
Elapsed 146.34 seconds
df_produced
Loading...
# The production year is read from the parent category title, e.g.
# 'Category:1994 films'. The trailing '(films)?' group is optional, so any
# parent of the form 'Category:YYYY <whitespace>...' matches; parents that do
# not match yield None and are dropped below.
# Raw string: '\:', '\d' and '\s' are invalid escapes in a normal literal.
df_produced['year_produced'] = df_produced['parent'].apply(
    lambda x: int(d['year']) if
        (d := re.search(r'Category\:(?P<year>\d{4})\s+(films)?', x)) is not None
        else None
)

df_produced = (
    df_produced
    # rows whose parent carried no year
    .dropna(subset=['year_produced'])
    # keep only namespace-0 members
    .query('ns == 0')
    # the None values above force a float column; restore integers
    .astype({'year_produced': 'int'})
    .drop(columns=['parent', 'ns', 'level'])
    .reset_index(drop=True)
)
df_produced
Loading...
# Persist the cleaned produced-year table; the row index is not written.
df_produced.to_csv(produced_data_path, index=False)

4. Scrape movies by set-in year/decade/century

# Seed categories for the three granularities of setting (century, decade, year).
categories = [
    'Category:Films_by_century_of_setting',
    'Category:Films_by_decade_of_setting',
    'Category:Films_by_year_of_setting'
]

# One raw DataFrame per seed, kept separately so later steps can rebuild from them.
dfs_setin = [scrape_film_list(seed, max_try=3) for seed in categories]

df_setin = pd.concat(dfs_setin, ignore_index=True)
Scrape Category:Films_by_century_of_setting
Elapsed 11.18 seconds
Scrape Category:Films_by_decade_of_setting
Elapsed 20.52 seconds
Scrape Category:Films_by_year_of_setting
Elapsed 82.91 seconds
df_setin
Loading...

4.1. Checking a few things

Check the variable ns

# Namespace distribution of everything listed under a "Films set in ..." parent.
df_setin.loc[df_setin['parent'].str.contains("Films set in"), 'ns'].value_counts()
0 32037 14 2136 2 2 Name: ns, dtype: int64

Check that all rows with ns=14 have a title starting with 'Category:' (i.e. they are subcategories rather than films), which means they should be filtered out

# True when every ns == 14 row under a "Films set in ..." parent is a category page.
(
    df_setin
    .query('parent.str.contains("Films set in") and ns == 14')
    ['title'].str.startswith('Category:')
    .all()
)
True

Check that no row with ns=0 has a title starting with 'Category:'. These rows should be kept

# True if any ns == 0 row under a "Films set in ..." parent looks like a category page.
(
    df_setin
    .query('parent.str.contains("Films set in")')
    .query('ns == 0')
    ['title'].str.startswith('Category:')
    .any()
)
False

4.2. Process and save data

# Rebuild from the raw scrapes so this step can be re-run on its own.
df_setin = pd.concat(dfs_setin, ignore_index=True)
df_setin = (
    df_setin
    .query(
        # Trailing spaces keep the concatenated expression readable
        # ('... ) and not ...' instead of '...)and not ...').
        'parent.str.contains("Films set in") '
        'and not parent.str.contains("Infamous Decade") '
        'and ns == 0'
    )
    .drop(columns=['ns','level'])
)

# obtain year/decade/century it is set in
# regex=False: these are literal substrings, and the default of `regex`
# differs between pandas versions.
df_setin['year_setin'] = (
    df_setin.pop('parent')
    .str.replace('Category:Films set in ', '', regex=False)
    .str.replace('the ', '', regex=False)
    .str.replace(' century', '', regex=False)
)

# A film can be listed under several setting categories; collapse them to
# one row per film holding the distinct settings.
df_setin = (
    df_setin
    .groupby(['pageid', 'title'])
    .agg(set)
    .reset_index()
)

# Sorted so the list order (and therefore the saved data) is reproducible;
# plain set iteration order can differ between runs.
df_setin['year_setin'] = df_setin['year_setin'].apply(sorted)
df_setin
Loading...
# Persist the set-in table; the row index is not written.
# NOTE(review): 'year_setin' holds Python lists at this point, which presumably
# end up as their string form in the CSV -- confirm before reading the file back.
df_setin.to_csv(setin_data_path, index=False)

5. Combine and process

# Keep only films present in both tables (default inner join on page id and title).
df_films = df_setin.merge(df_produced, on=['pageid', 'title'])
# One row per (film, setting) pair instead of one row per film with a list.
df_films = df_films.explode('year_setin').reset_index(drop=True)
def convert_non_standard_time(x):
    """Convert a setting label into a numeric value and a granularity.

    Parameters
    ----------
    x : int, float or str
        Numbers are passed through as years. Strings are matched, in this
        order, against: a bare four-digit year (``'1995'``), an ordinal
        century (``'21st'``, ``'5th BC'``), an explicit AD year
        (``'79 AD'``) and a four-digit decade (``'1990s'``).

    Returns
    -------
    dict or None
        ``{'value': ..., 'type': 'year' | 'decade' | 'century'}``.
        Centuries map to their first year as computed here: ``(n - 1) * 100``
        for AD and ``-n * 100`` for BC. ``None`` is returned when the string
        fits none of the recognised formats.
    """
    if isinstance(x, (int, float)):
        return dict(value = x, type = 'year')

    # Raw strings throughout: '\d' and '\s' are invalid escapes in a
    # normal string literal.
    if (year_match := re.match(r'^\d{4}$', x)) is not None:
        return dict(value = int(year_match.group()), type = 'year')

    if (century_match := re.search(r'(\d+)\s*(st|nd|rd|th)', x)) is not None:
        value = int(century_match.groups()[0]) * 100
        if 'BC' in x:
            value = -value
        else:
            value -= 100
        return dict(value = value, type = 'century')

    if (ad_match := re.search(r'(\d+)\s+AD', x)) is not None:
        return dict(value = int(ad_match.groups()[0]), type = 'year')

    if (decade_match := re.search(r'(\d{4})s', x)) is not None:
        return dict(value = int(decade_match.groups()[0]), type = 'decade')

    # Unrecognised label: made explicit rather than falling off the end.
    return None

# Parse every setting label into a {'value', 'type'} record.
df_year_aux = df_films['year_setin'].apply(convert_non_standard_time)

# Keep the raw label under a new name and put the parsed columns next to it.
source_columns = df_films.rename(columns={'year_setin': 'source_year_setin'})
parsed_columns = pd.DataFrame(df_year_aux.to_list()).rename(
    columns={
        'value': 'year_setin',
        'type': 'year_setin_type'
    }
)
df_films = pd.concat([source_columns, parsed_columns], axis=1)
df_films['year_setin'].dtype
dtype('int64')
df_films
Loading...
# Persist the merged and parsed table; the row index is not written.
df_films.to_csv(processed_data_path,index=False)

6. Simple plot

# Production year against setting year, coloured by the granularity of the setting.
plt.figure(figsize=(20,15))
setting_palette = {
    'year': '#ababab',
    'decade': 'b',
    'century': 'g'
}
ax = sns.scatterplot(
    df_films,
    x = 'year_setin',
    y = 'year_produced',
    hue = 'year_setin_type',
    palette = setting_palette,
    s = 25,
    alpha = 0.05,
    edgecolor = 'none',
)
ax.set_aspect(5, 'box')
# plt.xlim([1900, 2000])
<Figure size 1440x1080 with 1 Axes>