Scraping movies by years (produced/released vs. set in)#

1. Import and define paths#

import os, re, pickle, time
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import wikipediaapi as wkapi
# Output CSV locations: films by production year, films by setting period,
# and the merged/processed table. Each is written by a `to_csv` call further
# down in this notebook.
produced_data_path = 'data/films_by_produced.csv'
setin_data_path = 'data/films_by_set_in.csv'
processed_data_path = 'data/films_set_in_and_produced.csv'

2. Define scraping functions#

# Shared Wikipedia client used by the scraping helpers below.
# NOTE(review): the first argument is presumably the user-agent string and
# 'en' the wiki language -- confirm against the installed wikipediaapi version.
wk = wkapi.Wikipedia('Data of movies by years (penguins@antarct.ica)', 'en')
def traverse_category(category, output=None, level=0, max_level=1):
    """Recursively collect the members of a Wikipedia category.

    Every member of ``category`` is recorded as a dict with the keys
    ``title``, ``ns``, ``pageid``, ``level`` (depth at which the member was
    found) and ``parent`` (title of the category it was listed under).
    Members that are themselves categories are descended into while
    ``level < max_level``.

    Parameters
    ----------
    category
        Page object exposing ``title`` and ``categorymembers`` (a mapping
        whose values expose ``title``, ``ns`` and ``pageid``).
    output : list, optional
        List to append records to. A fresh list is created when omitted, so
        separate calls never share results.
    level : int
        Depth recorded for the direct members of ``category``.
    max_level : int
        Sub-categories are only traversed while ``level < max_level``.

    Returns
    -------
    list of dict
        ``output`` (the same list object when one was supplied).
    """
    # A mutable default (`output=[]`) is created once and shared by every
    # call, which made later scrapes inherit all rows of earlier ones.
    if output is None:
        output = []

    parent = category.title
    for c in category.categorymembers.values():
        output.append(dict(
            title = c.title,
            ns = c.ns,
            pageid = c.pageid,
            level = level,
            parent = parent
        ))

        # Depth check first: it is the cheap test and decides on its own
        # whether recursion is allowed at all.
        if level < max_level and c.ns == wkapi.Namespace.CATEGORY:
            # `output` is extended in place by the recursive call.
            traverse_category(
                c,
                output,
                level=level + 1,
                max_level=max_level
            )
    return output
def scrape_film_list(category_seed, max_try = 2):
    """Scrape a Wikipedia category tree into a DataFrame, retrying on errors.

    Parameters
    ----------
    category_seed : str
        Title of the root category, e.g. ``'Category:Films_by_year'``.
    max_try : int
        Maximum number of scraping attempts.

    Returns
    -------
    pandas.DataFrame
        One row per record produced by ``traverse_category``.

    Raises
    ------
    RuntimeError
        If no attempt succeeded (previously this surfaced as an
        ``UnboundLocalError`` on the ``return`` statement).
    """
    t0 = time.time()
    print(f'Scrape {category_seed}')

    last_error = None
    for _ in range(max_try):
        try:
            cat_page = wk.page(category_seed)
            # Pass a fresh list explicitly so rows from earlier scrapes or
            # from a failed, partially completed attempt never leak in.
            cat_tree = traverse_category(cat_page, [])
            df = pd.DataFrame(cat_tree)
        except Exception as err:
            # `Exception` rather than a bare `except:` so Ctrl-C
            # (KeyboardInterrupt) can still abort a long scrape.
            last_error = err
            print('\tElapsed %.2f seconds' %(time.time() - t0))
            print(f'\tError with {category_seed}. Will try again!')
        else:
            print('Elapsed %.2f seconds' %(time.time() - t0))
            return df

    print('Elapsed %.2f seconds' %(time.time() - t0))
    raise RuntimeError(
        f'Failed to scrape {category_seed} after {max_try} attempt(s)'
    ) from last_error

3. Scrape movies by produced year#

# Scrape the "films by year" tree (year categories plus their direct
# members), allowing up to three attempts.
df_produced = scrape_film_list('Category:Films_by_year', max_try=3)
Scrape Category:Films_by_year
Elapsed 146.34 seconds
df_produced
title ns pageid level parent
0 Category:3D films by year 14 47763114 0 Category:Films_by_year
1 Category:1953 3D films 14 70933910 1 Category:3D films by year
2 Category:1954 3D films 14 70933932 1 Category:3D films by year
3 Category:1982 3D films 14 70926523 1 Category:3D films by year
4 Category:1983 3D films 14 65491608 1 Category:3D films by year
... ... ... ... ... ...
166171 Category:2025 films 14 74026790 0 Category:Films_by_year
166172 Avatar 3 0 27442998 1 Category:2025 films
166173 Category:2025 science fiction films 14 74026793 1 Category:2025 films
166174 Category:2115 films 14 56366623 0 Category:Films_by_year
166175 100 Years (film) 0 48973524 1 Category:2115 films

166176 rows × 5 columns

# Extract the 4-digit production year from parent categories such as
# 'Category:2025 films'; parents without a leading year yield None.
# The pattern is a raw string: '\:', '\d' and '\s' are invalid escape
# sequences in a normal string literal and trigger warnings on newer Pythons.
df_produced['year_produced'] = df_produced['parent'].apply(
    lambda x: int(d['year']) if 
        (d:=re.search(r'Category\:(?P<year>\d{4})\s+(films)?', x)) is not None
        else None
)

# Keep only rows with a recognised year that live in namespace 0 (kept as
# the article rows elsewhere in this notebook), then drop the helper columns.
df_produced = (
    df_produced
    .dropna(subset='year_produced')
    .query('ns == 0')
    .astype({'year_produced': 'int'})
    .drop(columns=['parent', 'ns', 'level'])
    .reset_index(drop=True)
)
df_produced
title pageid year_produced
0 Passage de Vénus 47863608 1874
1 The Horse in Motion 26044155 1878
2 Man Walking Around a Corner 14360688 1887
3 Accordion Player 14360591 1888
4 Roundhay Garden Scene 1082508 1888
... ... ... ...
161762 Wicked (2024 film) 60455274 2024
161763 Wise Guys (2024 film) 71537782 2024
161764 Witchboard (2024 film) 73430842 2024
161765 Avatar 3 27442998 2025
161766 100 Years (film) 48973524 2115

161767 rows × 3 columns

# Persist the cleaned (title, pageid, year_produced) table without the index.
df_produced.to_csv(produced_data_path, index=False)

4. Scrape movies by set-in year/decade/century#

# Root categories for the three granularities of setting period.
categories = [
    'Category:Films_by_century_of_setting',
    'Category:Films_by_decade_of_setting',
    'Category:Films_by_year_of_setting',
]

# Scrape each root in turn (up to three attempts each) and stack the results.
dfs_setin = []
for c in categories:
    dfs_setin += [scrape_film_list(c, max_try=3)]

df_setin = pd.concat(dfs_setin, ignore_index=True)
Scrape Category:Films_by_century_of_setting
Elapsed 11.18 seconds
Scrape Category:Films_by_decade_of_setting
Elapsed 20.52 seconds
Scrape Category:Films_by_year_of_setting
Elapsed 82.91 seconds
df_setin
title ns pageid level parent
0 Category:3D films by year 14 47763114 0 Category:Films_by_year
1 Category:1953 3D films 14 70933910 1 Category:3D films by year
2 Category:1954 3D films 14 70933932 1 Category:3D films by year
3 Category:1982 3D films 14 70926523 1 Category:3D films by year
4 Category:1983 3D films 14 65491608 1 Category:3D films by year
... ... ... ... ... ...
533442 The Last Days of Pompeii (1950 film) 0 27139182 1 Category:Films set in 79 AD
533443 The Last Days of Pompeii (1959 film) 0 25972799 1 Category:Films set in 79 AD
533444 Pompeii (film) 0 39083980 1 Category:Films set in 79 AD
533445 Pompeii: The Last Day 0 1005379 1 Category:Films set in 79 AD
533446 Up Pompeii (film) 0 22812968 1 Category:Films set in 79 AD

533447 rows × 5 columns

4.1. Checking a few things#

Check the variable ns

# Count rows per namespace among members of "Films set in ..." categories.
# ns == 14 is compared against wkapi.Namespace.CATEGORY in traverse_category;
# NOTE(review): ns == 0 presumably marks regular articles -- confirm.
df_setin.query('parent.str.contains("Films set in")')['ns'].value_counts()
0     32037
14     2136
2         2
Name: ns, dtype: int64

Check that every row with ns=14 has a title starting with 'Category:', which means these rows should be filtered out

# True when every ns == 14 member of a "Films set in ..." category has a
# title beginning with 'Category:'.
(
    df_setin
    .query('parent.str.contains("Films set in")')
    .query('ns == 14')
    ['title']
    .str.startswith('Category:')
    .all()
)
True

Check that no row with ns=0 has a title starting with 'Category:'. These rows should be kept

# True if any ns == 0 member of a "Films set in ..." category has a title
# beginning with 'Category:' (expected: False).
(
    df_setin
    .query('parent.str.contains("Films set in") and ns == 0')
    ['title']
    .str.startswith('Category:')
    .any()
)
False

4.2. Process and save data#

# Rebuild from the raw scrapes so this cell can be re-run on its own.
df_setin = pd.concat(dfs_setin, ignore_index=True)

# Keep namespace-0 rows listed under a "Films set in ..." category.
# Parents containing "Infamous Decade" are excluded.
# NOTE(review): presumably because that name carries no numeric period -- confirm.
# The fragments now end with a space so the concatenated expression no longer
# relies on ')' directly abutting 'and'.
df_setin = (
    df_setin
    .query(
        'parent.str.contains("Films set in") '
        'and not parent.str.contains("Infamous Decade") '
        'and ns == 0'
    )
    .drop(columns=['ns','level'])
)

# obtain year/decade/century it is set in
# regex=False: these are literal substrings, and the default of `regex`
# differs between pandas versions.
df_setin['year_setin'] = (
    df_setin.pop('parent')
    .str.replace('Category:Films set in ', '', regex=False)
    .str.replace('the ', '', regex=False)
    .str.replace(' century', '', regex=False)
)

# One row per film; collecting into a set removes duplicate periods that
# arise when the same film/category pair was scraped more than once.
df_setin = (
    df_setin
    .groupby(['pageid', 'title'])
    .agg(set)
    .reset_index()
)

# Sort while converting to a list: set iteration order is arbitrary, so a
# plain list() made the stored order vary between runs.
df_setin['year_setin'] = df_setin['year_setin'].apply(sorted)
df_setin
pageid title year_setin
0 3217 Army of Darkness [13th]
1 3333 The Birth of a Nation [1860s, 19th, 1870s]
2 3746 Blade Runner [2019]
3 3837 Blazing Saddles [1856, 1874]
4 4227 Barry Lyndon [1763, 1773, 1780s, 1750s]
... ... ... ...
13837 74597836 The Sacrifice Game [1971]
13838 74607682 Joachim and the Apocalypse [12th]
13839 74612478 The Kitchen (2023 film) [2040s]
13840 74619750 Jailer (2023 Malayalam film) [1950s]
13841 74621258 Robot Dreams (film) [1980s]

13842 rows × 3 columns

# Persist the per-film setting periods without the index.
# NOTE(review): 'year_setin' holds Python lists at this point, so the CSV
# presumably stores their string representation -- confirm before re-loading.
df_setin.to_csv(setin_data_path, index=False)

5. Combine and process#

# Join setting periods with production years on (pageid, title) using the
# default merge type, then expand each film to one row per setting period.
df_films = pd.merge(df_setin, df_produced, on=['pageid', 'title'])
df_films = df_films.explode('year_setin').reset_index(drop=True)
def convert_non_standard_time(x):
    """Convert a setting-period label into a numeric start year and a type.

    Recognised inputs (checked in this order):

    * ``int`` / ``float`` -- returned unchanged as a ``'year'``.
    * exactly four digits, e.g. ``'2019'`` -- ``'year'``.
    * an ordinal such as ``'13th'`` or ``'1st BC'`` -- ``'century'``; the
      value is the first year of the century (``'13th'`` -> 1200,
      ``'1st BC'`` -> -100).
    * ``'<n> AD'``, e.g. ``'79 AD'`` -- ``'year'``.
    * four digits followed by ``s``, e.g. ``'1860s'`` -- ``'decade'``.

    Returns
    -------
    dict or None
        ``{'value': <number>, 'type': 'year' | 'century' | 'decade'}``, or
        ``None`` when the label matches none of the patterns.
    """
    # All patterns are raw strings: '\d' and '\s' are invalid escape
    # sequences in ordinary string literals.
    if isinstance(x, (int, float)):
        return dict(value = x, type = 'year')

    if (m := re.match(r'^\d{4}$', x)) is not None:
        return dict(value = int(m.group()), type = 'year')

    if (m := re.search(r'(\d+)\s*(st|nd|rd|th)', x)) is not None:
        century = int(m.group(1))
        if 'BC' in x:
            # n-th century BC starts at year -n*100.
            value = -century * 100
        else:
            # n-th century AD starts at year (n-1)*100.
            value = century * 100 - 100
        return dict(value = value, type = 'century')

    if (m := re.search(r'(\d+)\s+AD', x)) is not None:
        return dict(value = int(m.group(1)), type = 'year')

    if (m := re.search(r'(\d{4})s', x)) is not None:
        return dict(value = int(m.group(1)), type = 'decade')

    # Unrecognised label: make the fall-through explicit.
    return None

# Parse every setting label into a {'value', 'type'} record.
df_year_aux = df_films['year_setin'].apply(convert_non_standard_time)

# Keep the raw label as 'source_year_setin' and place the parsed value and
# type next to it as 'year_setin' / 'year_setin_type'.
df_films = pd.concat(
    [
        df_films.rename(columns={'year_setin': 'source_year_setin'}),
        pd.DataFrame(df_year_aux.tolist()).rename(
            columns={'value': 'year_setin', 'type': 'year_setin_type'}
        ),
    ],
    axis=1,
)
df_films['year_setin'].dtype
dtype('int64')
df_films
pageid title source_year_setin year_produced year_setin year_setin_type
0 3217 Army of Darkness 13th 1992 1200 century
1 3333 The Birth of a Nation 1860s 1915 1860 decade
2 3333 The Birth of a Nation 19th 1915 1800 century
3 3333 The Birth of a Nation 1870s 1915 1870 decade
4 3746 Blade Runner 2019 1982 2019 year
... ... ... ... ... ... ...
20150 74597836 The Sacrifice Game 1971 2023 1971 year
20151 74607682 Joachim and the Apocalypse 12th 2023 1100 century
20152 74612478 The Kitchen (2023 film) 2040s 2023 2040 decade
20153 74619750 Jailer (2023 Malayalam film) 1950s 2023 1950 decade
20154 74621258 Robot Dreams (film) 1980s 2023 1980 decade

20155 rows × 6 columns

# Persist the combined production-year / setting-period table without the index.
df_films.to_csv(processed_data_path,index=False)

6. Simple plot#

# Scatter of setting year (x) against production year (y), coloured by the
# granularity of the setting label.
plt.figure(figsize=(20,15))
sns.scatterplot(
    df_films,
    y = 'year_produced',
    x = 'year_setin',
    hue = 'year_setin_type',
    palette={
        'year': '#ababab',
        'decade': 'b',
        'century': 'g'
    },
    s = 25,
    # Very low alpha so dense regions show up through overplotting.
    alpha = 0.05,
    edgecolor = 'none',
)
# Fixed aspect ratio of 5 on the current axes, adjusting the axes box.
plt.gca().set_aspect(5, 'box')
# plt.xlim([1900, 2000])
../_images/6369527910ea1f47a707f513b6b55af24f989d5862629f727603e9a896ecc826.png