Obtain robots.txt of select sites from Wayback#

1. Import#

import os
import time
import datetime
import glob
import re
import warnings
import gzip

import urllib
import requests

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

import wayback

if 'notebooks' in os.getcwd():
    os.chdir('..')

2. Select sites and format#

main_urls = [
    'https://abc12.com',
    'https://arstechnica.com',
    'https://cell.com',
    'https://www.cnn.com',
    'https://foxbaltimore.com',
    'https://mynbc5.com',
    'https://www.nationalgeographic.com',
    'https://www.npr.org',
    'https://www.nytimes.com',
    'https://www.reuters.com',
    'https://science.org',
    'https://www.theonion.com',
    'https://the-sun.com',
    'https://www.thesun.co.uk',
    'https://www.vice.com',
    'https://www.vox.com',
    'https://www.who.int',
    'https://www.zdnet.com',
    'https://joerogan.com',
]

def stringify_url(url):
    """Turn a site URL into a filesystem-safe key, e.g. 'https://www.cnn.com' -> 'cnn[D]com'."""
    # Drop any trailing slash
    if url.endswith('/'):
        url = url[:-1]

    # Strip the scheme and a leading 'www.'
    url = (
        url.replace('https:', '')
        .replace('http:', '')
        .replace('//www.', '')
        .replace('//', '')
    )
    # Encode characters that are awkward in filenames:
    # ':' -> [C]olon, '/' -> [S]lash, '.' -> [D]ot
    s = (
        url.lower()
        .replace(':', '[C]')
        .replace('/', '[S]')
        .replace('.', '[D]')
    )
    return s

robot_urls = {
    stringify_url(x): urllib.parse.urljoin(x, 'robots.txt')
    for x in main_urls
}
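
A quick illustrative check (not part of the original notebook): each site maps to a filesystem-safe key and its robots.txt URL.

print(stringify_url('https://www.nytimes.com'))
# 'nytimes[D]com'
print(robot_urls['nytimes[D]com'])
# 'https://www.nytimes.com/robots.txt'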

3. Select date range#

To avoid making too many requests to the Wayback Machine, only query in one-week increments from the beginning of 2023 up until today.

# One start date per ISO week elapsed so far this year, stepping weekly from Jan 1
today_date = datetime.datetime.now()
start_dates = [
    datetime.datetime(today_date.year, 1, 1) + datetime.timedelta(weeks=i)
    for i in range(today_date.isocalendar().week)
]
# Each range ends the day before the next one starts; the final range ends today
stop_dates = [x - datetime.timedelta(days=1) for x in start_dates[1:]] + [today_date]
date_ranges = list(zip(start_dates, stop_dates))
date_ranges
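
The querying step itself is not shown in this section. As a rough, hedged sketch of how one of these ranges could be used with the wayback package (the from_date/to_date parameters and the CdxRecord fields below are my recollection of the wayback client API, so double-check against its documentation):

# Illustrative sketch only, not the notebook's actual query code:
# search one site's robots.txt snapshots over the first weekly range.
client = wayback.WaybackClient()
start, stop = date_ranges[0]
for record in client.search(robot_urls['nytimes[D]com'], from_date=start, to_date=stop):
    print(record.timestamp, record.raw_url)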

5. Prepare aria input files#

select_done_sites = list(site_is_done.keys())
# Keep only snapshot records for sites whose Wayback queries finished
df_wb = pd.DataFrame(links.values()).query('site in @select_done_sites').reset_index(drop=True)
df_wb

data_dir = 'data/robots-wayback/'
link_dir = 'links/wayback/'

# aria2c input files pair each URL with a tab-indented 'out=' option line
# giving the local path to save that snapshot to
df_wb['out_aria'] = df_wb.apply(
    lambda x: '\tout=' + os.path.join(data_dir, x['site'], x['site'] + '_' + x['timestamp']),
    axis=1
)

# Make sure the output directories exist before writing / downloading
os.makedirs(link_dir, exist_ok=True)
for site in df_wb['site'].unique():
    os.makedirs(os.path.join(data_dir, site), exist_ok=True)
    link_file = os.path.join(link_dir, site + '-links.aria')
    if os.path.exists(link_file):
        continue
    df_wb_site = df_wb.query('site == @site').reset_index(drop=True)
    with open(link_file, 'w') as f:
        for _, row in df_wb_site.iterrows():
            f.write(row['robot_url'] + '\n')
            f.write(row['out_aria'] + '\n')
            
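To see what a generated input file looks like (a URL line followed by a whitespace-indented out= option, which aria2c treats as a per-URI option), here is a small preview sketch, not part of the original notebook:

# Preview the first two entries (four lines) of one generated aria2 input file
for link_file in sorted(glob.glob(os.path.join(link_dir, '*-links.aria')))[:1]:
    print(link_file)
    with open(link_file) as f:
        for line in f.readlines()[:4]:
            print(line.rstrip('\n'))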

Then head to the CLI and run aria2c on each input file separately (or write a small shell loop over them):

aria2c -x 16 -i links/wayback/<INPUT-FILE>
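
Alternatively, a small illustrative Python helper (not in the original notebook) that prints one such command per generated link file, ready to paste into a shell:

# Print one aria2c command per generated link file
for link_file in sorted(glob.glob(os.path.join(link_dir, '*-links.aria'))):
    print(f'aria2c -x 16 -i {link_file}')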