Scraping movies by years (produced/released vs. set in)

Scraping movies by years (produced/released vs. set in)#

1. Import and define paths#

import os, re, pickle, time
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import wikipediaapi as wkapi

produced_data_path = 'data/films_by_produced.csv'
setin_data_path = 'data/films_by_set_in.csv'
processed_data_path = 'data/films_set_in_and_produced.csv'

2. Define scraping functions#

# Shared Wikipedia client used by the scraping functions below.
# NOTE(review): per the wikipedia-api package, the first argument is the
# User-Agent string and the second the language edition — confirm against
# the installed version.
wk = wkapi.Wikipedia('Data of movies by years (penguins@antarct.ica)', 'en')

def traverse_category(category, output=None, level=0, max_level=1):
    """Collect the members of a category, recursing into subcategories.

    Parameters
    ----------
    category
        Category page object exposing ``title`` and ``categorymembers``
        (a mapping whose values expose ``title``, ``ns`` and ``pageid``).
    output : list of dict, optional
        List the records are appended to. A new list is created when it is
        omitted, so independent calls never share results. A list passed in
        explicitly is extended in place.
    level : int
        Depth of ``category`` relative to the category the walk started from.
    max_level : int
        Subcategories are only descended into while ``level < max_level``.

    Returns
    -------
    list of dict
        One record per member with the keys ``title``, ``ns``, ``pageid``,
        ``level`` and ``parent`` (title of the containing category).
    """
    # A mutable default (``output=[]``) is created once and shared by every
    # call, which leaks the rows of one scrape into the next one.
    if output is None:
        output = []

    parent = category.title
    for c in category.categorymembers.values():
        output.append(dict(
            title = c.title,
            ns = c.ns,
            pageid = c.pageid,
            level = level,
            parent = parent
        ))

        # Test the depth bound first so the deepest level skips the
        # namespace comparison altogether.
        if level < max_level and c.ns == wkapi.Namespace.CATEGORY:
            # ``output`` is extended in place by the recursive call.
            traverse_category(
                c,
                output,
                level=level + 1,
                max_level=max_level
            )
    return output

def scrape_film_list(category_seed, max_try = 2):
    """Scrape a category tree into a DataFrame, retrying on failure.

    Parameters
    ----------
    category_seed : str
        Title of the root category page, e.g. ``'Category:Films_by_year'``.
    max_try : int
        Maximum number of scraping attempts.

    Returns
    -------
    pandas.DataFrame
        One row per record returned by ``traverse_category``.

    Raises
    ------
    RuntimeError
        If every attempt fails; chained to the last underlying error.
    """
    t0 = time.time()
    print(f'Scrape {category_seed}')
    last_error = None
    for _ in range(max_try):
        try:
            cat_page = wk.page(category_seed)
            # Hand over a fresh list on every attempt so that neither a
            # failed, partial attempt nor an earlier scrape can leak rows
            # into this result.
            cat_tree = traverse_category(cat_page, [])
            df = pd.DataFrame(cat_tree)
        except Exception as err:
            # ``Exception`` (not a bare ``except``) keeps Ctrl-C working
            # during these long-running scrapes.
            last_error = err
            print('\tElapsed %.2f seconds' %(time.time() - t0))
            print(f'\tError with {category_seed}. Will try again!')
        else:
            print('Elapsed %.2f seconds' %(time.time() - t0))
            return df

    print('Elapsed %.2f seconds' %(time.time() - t0))
    raise RuntimeError(
        f'Could not scrape {category_seed} after {max_try} attempt(s)'
    ) from last_error

3. Scrape movies by produced year#

df_produced = scrape_film_list(
    'Category:Films_by_year', 
    max_try=3
)

Scrape Category:Films_by_year
Elapsed 146.34 seconds

df_produced

	title	ns	pageid	level	parent
0	Category:3D films by year	14	47763114	0	Category:Films_by_year
1	Category:1953 3D films	14	70933910	1	Category:3D films by year
2	Category:1954 3D films	14	70933932	1	Category:3D films by year
3	Category:1982 3D films	14	70926523	1	Category:3D films by year
4	Category:1983 3D films	14	65491608	1	Category:3D films by year
...	...	...	...	...	...
166171	Category:2025 films	14	74026790	0	Category:Films_by_year
166172	Avatar 3	0	27442998	1	Category:2025 films
166173	Category:2025 science fiction films	14	74026793	1	Category:2025 films
166174	Category:2115 films	14	56366623	0	Category:Films_by_year
166175	100 Years (film)	0	48973524	1	Category:2115 films

166176 rows × 5 columns

# Read the 4-digit production year off parent names such as
# 'Category:2025 films'; parents that do not start with a year give None.
# The pattern is a raw string so that \d and \s reach the regex engine
# without triggering Python's invalid-escape warning.
df_produced['year_produced'] = df_produced['parent'].apply(
    lambda x: int(d['year']) if
        (d:=re.search(r'Category:(?P<year>\d{4})\s+(films)?', x)) is not None
        else None
)

# Keep the rows that have a production year and ns == 0 (pages rather than
# categories), then drop the helper columns and renumber the rows.
has_year = df_produced['year_produced'].notna()
is_page = df_produced['ns'] == 0
df_produced = df_produced.loc[has_year & is_page]
df_produced = df_produced.astype({'year_produced': 'int'})
df_produced = df_produced.drop(columns=['parent', 'ns', 'level'])
df_produced = df_produced.reset_index(drop=True)

df_produced

	title	pageid	year_produced
0	Passage de Vénus	47863608	1874
1	The Horse in Motion	26044155	1878
2	Man Walking Around a Corner	14360688	1887
3	Accordion Player	14360591	1888
4	Roundhay Garden Scene	1082508	1888
...	...	...	...
161762	Wicked (2024 film)	60455274	2024
161763	Wise Guys (2024 film)	71537782	2024
161764	Witchboard (2024 film)	73430842	2024
161765	Avatar 3	27442998	2025
161766	100 Years (film)	48973524	2115

161767 rows × 3 columns

df_produced.to_csv(produced_data_path, index=False)

4. Scrape movies by set-in year/decade/century#

# Root categories for the three granularities of "set in" information.
categories = [
    'Category:Films_by_century_of_setting',
    'Category:Films_by_decade_of_setting',
    'Category:Films_by_year_of_setting',
]

# Scrape each root in turn and stack the results into one frame.
dfs_setin = [scrape_film_list(seed, max_try=3) for seed in categories]
df_setin = pd.concat(dfs_setin, ignore_index=True)

Scrape Category:Films_by_century_of_setting
Elapsed 11.18 seconds
Scrape Category:Films_by_decade_of_setting
Elapsed 20.52 seconds
Scrape Category:Films_by_year_of_setting
Elapsed 82.91 seconds

df_setin

	title	ns	pageid	level	parent
0	Category:3D films by year	14	47763114	0	Category:Films_by_year
1	Category:1953 3D films	14	70933910	1	Category:3D films by year
2	Category:1954 3D films	14	70933932	1	Category:3D films by year
3	Category:1982 3D films	14	70926523	1	Category:3D films by year
4	Category:1983 3D films	14	65491608	1	Category:3D films by year
...	...	...	...	...	...
533442	The Last Days of Pompeii (1950 film)	0	27139182	1	Category:Films set in 79 AD
533443	The Last Days of Pompeii (1959 film)	0	25972799	1	Category:Films set in 79 AD
533444	Pompeii (film)	0	39083980	1	Category:Films set in 79 AD
533445	Pompeii: The Last Day	0	1005379	1	Category:Films set in 79 AD
533446	Up Pompeii (film)	0	22812968	1	Category:Films set in 79 AD

533447 rows × 5 columns

4.1. Checking a few things#

Check the distribution of the ns (namespace) column

df_setin.query('parent.str.contains("Films set in")')['ns'].value_counts()

   32037
   2136
       2
Name: ns, dtype: int64

Check that every row with ns=14 has a title starting with 'Category:', which means these rows should be filtered out

(
    df_setin.query('parent.str.contains("Films set in")')
    .query('ns == 14')
    ['title'].apply(lambda x: x.startswith('Category:'))
    .all()
)

True

Check that no row with ns=0 has a title starting with 'Category:'. These rows should be kept

(
    df_setin.query('parent.str.contains("Films set in") and ns == 0')
    ['title'].apply(lambda x: x.startswith('Category:'))
    .any()
)

False

4.2. Process and save data#

# Start again from the raw scrape results.
df_setin = pd.concat(dfs_setin, ignore_index=True)

# Keep ns == 0 rows whose parent is a "Films set in ..." category, leaving
# out the "Infamous Decade" one.
parent_name = df_setin['parent']
keep_row = (
    parent_name.str.contains("Films set in")
    & ~parent_name.str.contains("Infamous Decade")
    & (df_setin['ns'] == 0)
)
df_setin = df_setin[keep_row].drop(columns=['ns', 'level'])

# obtain year/decade/century it is set in by stripping the fixed wording
# from the parent category, e.g. 'Category:Films set in 79 AD' -> '79 AD'.
# regex=False pins plain-text matching: the default of `regex` is not the
# same across pandas versions.
df_setin['year_setin'] = (
    df_setin.pop('parent')
    .str.replace('Category:Films set in ', '', regex=False)
    .str.replace('the ', '', regex=False)
    .str.replace(' century', '', regex=False)
)

# Collapse to one row per film, gathering its distinct setting labels.
df_setin = (
    df_setin
    .groupby(['pageid', 'title'])
    .agg(set)
    .reset_index()
)

# Turn each set into a sorted list: plain list(set) order depends on string
# hashing and would make the saved CSV differ from run to run.
df_setin['year_setin'] = df_setin['year_setin'].apply(sorted)

df_setin

	pageid	title	year_setin
0	3217	Army of Darkness	[13th]
1	3333	The Birth of a Nation	[1860s, 19th, 1870s]
2	3746	Blade Runner	[2019]
3	3837	Blazing Saddles	[1856, 1874]
4	4227	Barry Lyndon	[1763, 1773, 1780s, 1750s]
...	...	...	...
13837	74597836	The Sacrifice Game	[1971]
13838	74607682	Joachim and the Apocalypse	[12th]
13839	74612478	The Kitchen (2023 film)	[2040s]
13840	74619750	Jailer (2023 Malayalam film)	[1950s]
13841	74621258	Robot Dreams (film)	[1980s]

13842 rows × 3 columns

df_setin.to_csv(setin_data_path, index=False)

5. Combine and process#

# Keep the films found in both tables (same pageid and title), then give
# every setting label its own row.
df_films = pd.merge(df_setin, df_produced, on=['pageid', 'title'])
df_films = df_films.explode('year_setin')
df_films = df_films.reset_index(drop=True)

def convert_non_standard_time(x):
    """Convert a setting label into a numeric year and a precision tag.

    Recognised inputs, checked in this order:

    * ``int`` / ``float``            -> returned unchanged, type ``'year'``
    * ``'2019'`` (exactly 4 digits)  -> 2019, type ``'year'``
    * ``'13th'`` / ``'1st BC'``      -> first year of the century
      (``(N - 1) * 100`` for AD, ``-N * 100`` for BC), type ``'century'``
    * ``'79 AD'``                    -> 79, type ``'year'``
    * ``'44 BC'``                    -> -44, type ``'year'``
    * ``'1860s'``                    -> 1860, type ``'decade'``

    Returns
    -------
    dict or None
        ``{'value': ..., 'type': ...}``, or ``None`` when the label matches
        none of the formats above.
    """
    if isinstance(x, (int, float)):
        return dict(value = x, type = 'year')

    if (value:=re.match(r'^\d{4}$', x)) is not None:
        return dict(value = int(value.group()), type = 'year')

    if (value:=re.search(r'(\d+)\s*(st|nd|rd|th)', x)) is not None:
        value = int(value.groups()[0]) * 100
        if 'BC' in x:
            value = -value
        else:
            # The Nth century AD starts at year (N - 1) * 100.
            value -= 100
        return dict(value = value, type = 'century')

    if (value:=re.search(r'(\d+)\s+AD', x)) is not None:
        return dict(value = int(value.groups()[0]), type = 'year')

    # BC years are stored as negative numbers, matching the sign convention
    # used for BC centuries above.
    if (value:=re.search(r'(\d+)\s+BC', x)) is not None:
        return dict(value = -int(value.groups()[0]), type = 'year')

    if (value:=re.search(r'(\d{4})s', x)) is not None:
        return dict(value = int(value.groups()[0]), type = 'decade')

    # Unrecognised label.
    return None

# Parse every setting label into a {'value', 'type'} record.
df_year_aux = df_films['year_setin'].apply(convert_non_standard_time)

# Keep the raw label under a new name and put the parsed columns next to it.
source_columns = df_films.rename(columns={'year_setin': 'source_year_setin'})
parsed_columns = pd.DataFrame(df_year_aux.to_list()).rename(
    columns={'value': 'year_setin', 'type': 'year_setin_type'}
)
df_films = pd.concat([source_columns, parsed_columns], axis=1)

df_films['year_setin'].dtype

dtype('int64')

df_films

	pageid	title	source_year_setin	year_produced	year_setin	year_setin_type
0	3217	Army of Darkness	13th	1992	1200	century
1	3333	The Birth of a Nation	1860s	1915	1860	decade
2	3333	The Birth of a Nation	19th	1915	1800	century
3	3333	The Birth of a Nation	1870s	1915	1870	decade
4	3746	Blade Runner	2019	1982	2019	year
...	...	...	...	...	...	...
20150	74597836	The Sacrifice Game	1971	2023	1971	year
20151	74607682	Joachim and the Apocalypse	12th	2023	1100	century
20152	74612478	The Kitchen (2023 film)	2040s	2023	2040	decade
20153	74619750	Jailer (2023 Malayalam film)	1950s	2023	1950	decade
20154	74621258	Robot Dreams (film)	1980s	2023	1980	decade

20155 rows × 6 columns

df_films.to_csv(processed_data_path,index=False)

6. Simple plot#

# Production year against setting year, coloured by the precision of the
# setting label (year / decade / century).
setin_type_colors = {
    'year': '#ababab',
    'decade': 'b',
    'century': 'g',
}

plt.figure(figsize=(20,15))
ax = sns.scatterplot(
    data=df_films,
    x='year_setin',
    y='year_produced',
    hue='year_setin_type',
    palette=setin_type_colors,
    s=25,
    alpha=0.05,
    edgecolor='none',
)
ax.set_aspect(5, 'box')
# Optional zoom on 20th-century settings: plt.xlim([1900, 2000])

../_images/6369527910ea1f47a707f513b6b55af24f989d5862629f727603e9a896ecc826.png