Obtain robots.txt of select sites from Wayback#
1. Import#
import os
import time
import datetime
import glob
import re
import warnings
import gzip
import urllib
import requests
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
import wayback

# Run from the repository root rather than the notebooks/ directory
if 'notebooks' in os.getcwd():
    os.chdir('..')
2. Select sites and format#
main_urls = [
    'https://abc12.com',
    'https://arstechnica.com',
    'https://cell.com',
    'https://www.cnn.com',
    'https://foxbaltimore.com',
    'https://mynbc5.com',
    'https://www.nationalgeographic.com',
    'https://www.npr.org',
    'https://www.nytimes.com',
    'https://www.reuters.com',
    'https://science.org',
    'https://www.theonion.com',
    'https://the-sun.com',
    'https://www.thesun.co.uk',
    'https://www.vice.com',
    'https://www.vox.com',
    'https://www.who.int',
    'https://www.zdnet.com',
    'https://joerogan.com',
]
def stringify_url(url):
    """Encode a URL as a filesystem-safe key: ':' -> '[C]', '/' -> '[S]', '.' -> '[D]'."""
    if url.endswith('/'):
        url = url[:-1]
    url = (
        url.replace('https:', '')
        .replace('http:', '')
        .replace('//www.', '')
        .replace('//', '')
    )
    s = (
        url.lower()
        .replace(':', '[C]')
        .replace('/', '[S]')
        .replace('.', '[D]')
    )
    return s


robot_urls = {
    stringify_url(x): urllib.parse.urljoin(x, 'robots.txt')
    for x in main_urls
}
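As a quick sanity check, the key/value pair produced for one of the sites looks like this (outputs shown as comments):
stringify_url('https://www.nytimes.com')
# -> 'nytimes[D]com'
robot_urls['nytimes[D]com']
# -> 'https://www.nytimes.com/robots.txt'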
3. Select date range#
To avoid making too many requests to the Wayback Machine, query only in one-week increments, from the beginning of the current year up until today.
today_date = datetime.datetime.now()
start_dates = [
    datetime.datetime(today_date.year, 1, 1) + datetime.timedelta(weeks=i)
    for i in range(today_date.isocalendar().week)
]
stop_dates = [x - datetime.timedelta(days=1) for x in start_dates[1:]] + [today_date]
date_ranges = list(zip(start_dates, stop_dates))
date_ranges
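For a hypothetical run on 2023-02-15, the resulting ranges would look roughly like this (the exact list depends on the run date; the final range always ends at the current time):
# Illustrative only, assuming the notebook is run on 2023-02-15:
# [(datetime.datetime(2023, 1, 1, 0, 0), datetime.datetime(2023, 1, 7, 0, 0)),
#  (datetime.datetime(2023, 1, 8, 0, 0), datetime.datetime(2023, 1, 14, 0, 0)),
#  ...
#  (datetime.datetime(2023, 2, 12, 0, 0), datetime.datetime(2023, 2, 15, ...))]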
4. Obtain wayback links#
client = wayback.WaybackClient()
links = dict()
site_is_done = dict()
sleep_between_site = 10
for site in tqdm(list(robot_urls.keys()), leave=True, desc='Site'):
    site_robot_url = robot_urls[site]
    if site_is_done.get(site, False):
        continue
    found_dates = 0
    for from_date, to_date in tqdm(list(date_ranges), leave=False, desc=site):
        range_str = from_date.strftime('%Y%m%d') + '::' + to_date.strftime('%Y%m%d')
        link_key = site + '@' + range_str
        if link_key in links:
            continue
        # Query the Wayback CDX index for captures of this robots.txt in the weekly range
        for r in client.search(
            site_robot_url,
            from_date=from_date,
            to_date=to_date,
            limit=-2
        ):
            wb_ts = r.timestamp.strftime('%Y%m%d-%H%M%S')
            # One entry per weekly range; later captures in the range overwrite earlier ones
            links[link_key] = dict(
                site=site,
                robot_url=r.raw_url,
                status_code=r.status_code,
                date_range=range_str,
                timestamp=wb_ts
            )
            found_dates += 1
    site_is_done[site] = True
    time.sleep(sleep_between_site)
    print(f'For "{site}", found {found_dates} "robots.txt" links on wayback')
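As a quick spot check before bulk downloading, a single collected snapshot can also be fetched directly with requests (already imported above); the entry picked here is arbitrary:
# Sketch: fetch one collected snapshot directly (alternative to aria2c for a single file)
example = next(iter(links.values()))
resp = requests.get(example['robot_url'], timeout=30)
print(example['site'], resp.status_code)
print(resp.text[:200])  # first lines of the archived robots.txt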
5. Prepare aria input files#
select_done_sites = list(site_is_done.keys())
df_wb = pd.DataFrame(links.values()).query('site in @select_done_sites').reset_index(drop=True)
df_wb
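Each row corresponds to one site and weekly range with at least one capture; the columns mirror the keys stored in links:
# Columns: site, robot_url, status_code, date_range, timestamp
df_wb.head()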
data_dir = 'data/robots-wayback/'
link_dir = 'links/wayback/'

# aria2 input-file format: the out= option line must be indented (here with a tab) under its URL
df_wb['out_aria'] = df_wb.apply(
    lambda x: '\tout=' + os.path.join(data_dir, x['site'], x['site'] + '_' + x['timestamp']),
    axis=1
)
# Ensure the link directory exists before writing the per-site input files
os.makedirs(link_dir, exist_ok=True)

for site in df_wb['site'].unique():
    link_file = os.path.join(link_dir, site + '-links.aria')
    if os.path.exists(link_file):
        continue
    df_wb_site = df_wb.query('site == @site').reset_index(drop=True)
    with open(link_file, 'w') as f:
        for _, row in df_wb_site.iterrows():
            f.write(row['robot_url'] + '\n')
            f.write(row['out_aria'] + '\n')
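Each generated file pairs a Wayback raw URL with a tab-indented out= option; the content looks roughly like this (URL and timestamp are illustrative):
https://web.archive.org/web/20230101000000id_/https://www.nytimes.com/robots.txt
	out=data/robots-wayback/nytimes[D]com/nytimes[D]com_20230101-000000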
Then head to the CLI and run aria2c on each input file separately (or script the calls; see the sketch below):
aria2c -x 16 -i links/wayback/<INPUT-FILE>
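If staying in Python is preferable to a shell loop, a minimal sketch using subprocess over the generated input files (this assumes aria2c is installed and on the PATH):
import subprocess

# Run aria2c once per generated per-site input file
for link_file in sorted(glob.glob(os.path.join(link_dir, '*-links.aria'))):
    subprocess.run(['aria2c', '-x', '16', '-i', link_file], check=True)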