# Scraping movies by years (produced/released vs. set in)
# 1. Import and define paths
import os, re, pickle, time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import wikipediaapi as wkapi

# CSV files written by the later sections of this notebook:
#   - films with their production year,
#   - films with the year/decade/century they are set in,
#   - the merged and normalized table of both.
produced_data_path = 'data/films_by_produced.csv'
setin_data_path = 'data/films_by_set_in.csv'
processed_data_path = 'data/films_set_in_and_produced.csv'

# 2. Define scraping functions
# Shared client for the English Wikipedia; the first argument identifies this
# script to the API (user agent).
wk = wkapi.Wikipedia('Data of movies by years (penguins@antarct.ica)', 'en')


def traverse_category(category, output=None, level=0, max_level=1):
    """Collect the members of a Wikipedia category, descending into sub-categories.

    Args:
        category: a wikipediaapi page object for a category; its ``title`` and
            ``categorymembers`` attributes are read.
        output: list to append records to. A fresh list is created when omitted,
            so independent calls no longer share (and accumulate into) one
            default list.
        level: depth of ``category`` relative to the seed (0 for the seed).
        max_level: sub-categories are followed only while ``level < max_level``.

    Returns:
        The ``output`` list. Each record is a dict with the member's ``title``,
        ``ns`` (namespace), ``pageid``, the ``level`` it was found at, and the
        ``parent`` category title.
    """
    if output is None:
        output = []
    parent = category.title
    for c in category.categorymembers.values():
        output.append(dict(
            title=c.title,
            ns=c.ns,
            pageid=c.pageid,
            level=level,
            parent=parent,
        ))
        if c.ns == wkapi.Namespace.CATEGORY and level < max_level:
            # The recursive call appends to the same list object.
            traverse_category(
                c,
                output,
                level=level + 1,
                max_level=max_level,
            )
    return output


def scrape_film_list(category_seed, max_try=2):
    """Scrape a category tree into a DataFrame, retrying on failure.

    Args:
        category_seed: category page title, e.g. 'Category:Films_by_year'.
        max_try: maximum number of attempts.

    Returns:
        A DataFrame with one row per record produced by ``traverse_category``.

    Raises:
        RuntimeError: if no attempt succeeds (chained to the last error). The
            original code fell through to ``return df`` with ``df`` unbound.
    """
    t0 = time.time()
    last_error = None
    print(f'Scrape {category_seed}')
    for _ in range(max_try):
        try:
            cat_page = wk.page(category_seed)
            # Every attempt starts from an empty list, so a failed attempt
            # leaves no partial rows behind.
            cat_tree = traverse_category(cat_page)
            df = pd.DataFrame(cat_tree)
        except Exception as err:
            # `Exception` rather than a bare `except`, so Ctrl-C still
            # interrupts a long-running scrape instead of triggering a retry.
            last_error = err
            print('\tElapsed %.2f seconds' % (time.time() - t0))
            print(f'\tError with {category_seed}. Will try again!')
        else:
            print('Elapsed %.2f seconds' % (time.time() - t0))
            return df
    print('Elapsed %.2f seconds' % (time.time() - t0))
    raise RuntimeError(
        f'Failed to scrape {category_seed} after {max_try} attempt(s)'
    ) from last_error

# 3. Scrape movies by produced year
# Scrape every member of the per-year film categories.
df_produced = scrape_film_list(
    'Category:Films_by_year',
    max_try=3
)
# Notebook output of the call above:
#   Scrape Category:Films_by_year
#   Elapsed 146.34 seconds

df_produced  # notebook display (rendered table not captured: "Loading...")

# Extract the 4-digit year from the parent category title (the pattern expects
# titles shaped like 'Category:<YYYY> films'); rows whose parent does not match
# get None and are dropped below. Raw string: the original literal contained
# invalid escape sequences (`\:`, `\d`, `\s`).
df_produced['year_produced'] = df_produced['parent'].apply(
    lambda x: int(d['year']) if
    (d := re.search(r'Category:(?P<year>\d{4})\s+(films)?', x)) is not None
    else None
)
df_produced = (
    df_produced
    # list form of `subset` works on every pandas version
    .dropna(subset=['year_produced'])
    # keep rows in namespace 0 only (sub-categories have a different ns)
    .query('ns == 0')
    # the None placeholders made the column float; restore integers
    .astype({'year_produced': 'int'})
    .drop(columns=['parent', 'ns', 'level'])
    .reset_index(drop=True)
)

df_produced  # notebook display (rendered table not captured: "Loading...")

df_produced.to_csv(produced_data_path, index=False)

# 4. Scrape movies by set-in year/decade/century
# Seed categories: films grouped by the century, decade and year of setting.
categories = [
    'Category:Films_by_century_of_setting',
    'Category:Films_by_decade_of_setting',
    'Category:Films_by_year_of_setting',
]

# One DataFrame per seed, in the same order as `categories`, then stacked.
dfs_setin = [scrape_film_list(seed, max_try=3) for seed in categories]
df_setin = pd.concat(dfs_setin, ignore_index=True)
# Notebook output of the scraping above:
#   Scrape Category:Films_by_century_of_setting
#   Elapsed 11.18 seconds
#   Scrape Category:Films_by_decade_of_setting
#   Elapsed 20.52 seconds
#   Scrape Category:Films_by_year_of_setting
#   Elapsed 82.91 seconds

df_setin  # notebook display (rendered table not captured: "Loading...")
# 4.1. Checking a few things
# Check the values of the `ns` (namespace) column
# How many rows of each namespace sit under a "Films set in ..." parent?
df_setin.loc[df_setin['parent'].str.contains("Films set in"), 'ns'].value_counts()
# Notebook output:
#   0     32037
#   14     2136
#   2         2
#   Name: ns, dtype: int64

# Check that every ns == 14 row has a title starting with 'Category:'.
# These are sub-categories rather than films, so they should be filtered out.
(
    df_setin.loc[
        df_setin['parent'].str.contains("Films set in") & (df_setin['ns'] == 14),
        'title',
    ]
    .apply(lambda title: title.startswith('Category:'))
    .all()
)
# Notebook output: True

# Check that no ns == 0 row has a title starting with 'Category:'.
# These are the rows to keep.
(
    df_setin.loc[
        df_setin['parent'].str.contains("Films set in") & (df_setin['ns'] == 0),
        'title',
    ]
    .apply(lambda title: title.startswith('Category:'))
    .any()
)
# Notebook output: False

# 4.2. Process and save data
# Rebuild from the raw scrapes so this step can be re-run on its own.
df_setin = pd.concat(dfs_setin, ignore_index=True)

# Keep film pages (ns == 0) listed directly under a "Films set in ..."
# category, excluding the "Infamous Decade" category.
df_setin = (
    df_setin
    .query(
        'parent.str.contains("Films set in")'
        'and not parent.str.contains("Infamous Decade")'
        'and ns == 0'
    )
    .drop(columns=['ns', 'level'])
)

# obtain year/decade/century it is set in
# The patterns are plain text, so `regex=False` is passed explicitly: the
# default value of `regex` differs between pandas 1.x and 2.x.
df_setin['year_setin'] = (
    df_setin.pop('parent')
    .str.replace('Category:Films set in ', '', regex=False)
    .str.replace('the ', '', regex=False)
    .str.replace(' century', '', regex=False)
)

# One row per film, with the distinct labels it is filed under.
df_setin = (
    df_setin
    .groupby(['pageid', 'title'])
    .agg(set)
    .reset_index()
)
# `sorted` instead of `list`: set iteration order is arbitrary, which made the
# saved CSV differ from run to run.
df_setin['year_setin'] = df_setin['year_setin'].apply(sorted)

df_setin  # notebook display (rendered table not captured: "Loading...")

# NOTE: the list-valued column is written to CSV as its Python repr.
df_setin.to_csv(setin_data_path, index=False)

# 5. Combine and process
# Films that appear in both tables (inner join on page id and title), with one
# row per (film, set-in label) after exploding the label lists.
df_films = (
    df_setin
    .merge(df_produced, on=['pageid', 'title'])
    .explode('year_setin')
    .reset_index(drop=True)
)


def convert_non_standard_time(x):
    """Convert one set-in label into a numeric value and its granularity.

    Patterns are tried in this order:
      * int or float          -> value unchanged, type 'year'
      * 'YYYY' (4 digits)     -> that year, type 'year'
      * '<n>st|nd|rd|th ...'  -> start of the century, type 'century':
                                 (n - 1) * 100, or -n * 100 if the label
                                 contains 'BC' (e.g. '19th' -> 1800,
                                 '1st BC' -> -100)
      * '<n> AD'              -> n, type 'year'
      * 'YYYYs'               -> YYYY, type 'decade'

    Args:
        x: a number or a label string.

    Returns:
        ``dict(value=..., type=...)``, or None when no pattern matches.
    """
    if isinstance(x, (int, float)):
        return dict(value=x, type='year')
    # Raw strings throughout: `\d` and `\s` are invalid escape sequences in
    # ordinary string literals.
    if (value := re.match(r'^\d{4}$', x)) is not None:
        value = int(value.group())
        return dict(value=value, type='year')
    if (value := re.search(r'(\d+)\s*(st|nd|rd|th)', x)) is not None:
        value = int(value.groups()[0]) * 100
        if 'BC' in x:
            value = -value
        else:
            value -= 100
        return dict(value=value, type='century')
    if (value := re.search(r'(\d+)\s+AD', x)) is not None:
        value = int(value.groups()[0])
        return dict(value=value, type='year')
    if (value := re.search(r'(\d{4})s', x)) is not None:
        value = int(value.groups()[0])
        return dict(value=value, type='decade')
    # Unrecognized label (the original fell off the end of the function).
    return None

# Series of {'value': ..., 'type': ...} dicts, one per row of df_films.
df_year_aux = df_films['year_setin'].apply(convert_non_standard_time)

# Keep the original label as 'source_year_setin' and put the parsed value and
# granularity next to it. Both frames are joined column-wise on their index.
df_films = pd.concat(
    [
        df_films.rename(columns={'year_setin': 'source_year_setin'}),
        pd.DataFrame(df_year_aux.to_list()).rename(
            columns={'value': 'year_setin', 'type': 'year_setin_type'}
        ),
    ],
    axis=1,
)

df_films['year_setin'].dtype
# Notebook output: dtype('int64')

df_films  # notebook display (rendered table not captured: "Loading...")

df_films.to_csv(processed_data_path, index=False)

# 6. Simple plot
# Scatter of production year against set-in year, colored by the granularity
# of the set-in label. Points are small and nearly transparent so that dense
# regions show up as darker areas.
plt.figure(figsize=(20, 15))
ax = sns.scatterplot(
    data=df_films,
    x='year_setin',
    y='year_produced',
    hue='year_setin_type',
    palette={
        'year': '#ababab',
        'decade': 'b',
        'century': 'g',
    },
    s=25,
    alpha=0.05,
    edgecolor='none',
)
ax.set_aspect(5, 'box')
# plt.xlim([1900, 2000])