Obtain robots.txt of sites listed on mediabiasfactcheck

1. Import

import os
import time
import re
import warnings
import gzip

import urllib.parse
import requests
from bs4 import BeautifulSoup

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

if 'notebooks' in os.getcwd():
    os.chdir('..')

2. Media sites from mediabiasfactcheck

2.1. Obtain sites

This scrapes each category listing page, e.g. https://mediabiasfactcheck.com/center, and saves the site names.

main_url = 'https://mediabiasfactcheck.com'
categories = [
    'center',
    'left',
    'leftcenter',
    'right-center',
    'right',
    'conspiracy',
    'fake-news',
    'pro-science',
    'satire'
]
results = dict()

for cat in tqdm(categories):
    url = urllib.parse.urljoin(main_url, cat)
    if cat in results:
        continue
    response = requests.get(url)
    assert response.status_code == 200

    soup = BeautifulSoup(response.content, 'lxml')
    
    if cat != "satire":
        table = soup.find_all('table', {'id': 'mbfc-table'})
        assert len(table) == 1
        table = table[0]

        sites = [
            s for x in table.find_all('td') 
            if len(s:=x.text.strip()) > 0
        ]
    else:
        # The satire page has no table; its sites are links inside the entry content
        div_content = soup.find_all('div', {'class': 'entry-content'})
        assert len(div_content) == 1
        div_content = div_content[0]
        
        spans = [
            sp for a in div_content.find_all('a') 
            if (sp:=a.find('span')) is not None
        ]
        
        sites = [
            s for sp in spans 
            if len(s:=sp.text.strip()) > 0
        ]
        
    print(f'For {url}, there are {len(sites)} named sites found (unprocessed)')
    results[cat] = sites
    time.sleep(2)
df = (
    pd.DataFrame([
        dict(category=k, site=v) 
        for k, v in results.items()
    ])
    .explode('site')
    .reset_index(drop=True)
)
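As a quick sanity check, the per-category counts and a few rows can be inspected; the exact numbers will vary as the listings change over time.

df.groupby('category').size()
df.head()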

2.2. Parse site names

This parses the actual site URL out of each listed site name, e.g. NYTimes (nytimes.com) becomes https://nytimes.com/.

def parse_site(s):
    """Extract a domain from a listed site name and normalise it to 'scheme://netloc/'."""
    if '.' not in s:
        print(f'No "." detected for "{s}". Skipping')
        return None
    
    url = re.findall(r'\(([^\)]+)\)', s)
    url = [u for u in url if '.' in u]
    
    if len(url) == 0:
        url = [s]
    
    if len(url) > 1:
        print(f'For "{s}", multiple urls found {url}. Will use the last')
        
    url = url[-1].strip()
    
    if not url.startswith('http'):
        assert not url.startswith('/')
        url = 'https://' + url
        
    parsed_url = urllib.parse.urlparse(url)
    url = f'{parsed_url.scheme}://{parsed_url.netloc}/'
    return url
    
df['url'] = df['site'].apply(parse_site)
df.to_csv('data/mbfc/sites.csv', index=False)
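For illustration, parse_site behaves as follows on a few made-up inputs (these names are not from the scraped table):

parse_site('Example Times (example.com)')   # -> 'https://example.com/'
parse_site('example.org')                   # -> 'https://example.org/'
parse_site('Example Daily')                 # prints a warning and returns None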

3. Prepare list of URLs to download

This filters out rows where URL parsing failed, narrows down to the unique URLs, then writes links/robotstxt-from-mbfc.aria, an aria2c input file listing each robots.txt URL together with its output path.

num_sites = len(df)
num_err = df['url'].isna().sum()
df = df.dropna(subset='url').reset_index(drop=True)
print(f'{num_err} / {num_sites} rows dropped due to initial parsing issues')
unique_urls = df['url'].unique()
with open('links/robotstxt-from-mbfc.aria', 'w') as f:
    for url in unique_urls:
        robot_url = urllib.parse.urljoin(url, 'robots.txt')
        f.write(robot_url + '\n')
        # Encode the URL into a filesystem-safe name: ':' -> [C], '/' -> [S], '.' -> [D]
        file_name = (
            url.replace(':', '[C]')
            .replace('/', '[S]')
            .replace('.', '[D]')
        )
        f.write(f'\tout=data/robotstxt/{file_name}\n')
    
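In the aria2c input-file format, each URI line can be followed by indented option lines that apply to it; the out= option sets the output path for that download. For a hypothetical site, the file would contain entries like:

https://example.com/robots.txt
	out=data/robotstxt/https[C][S][S]example[D]com[S]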

Then use aria2c on the CLI to download the robots.txt files into data/robotstxt/.

The following was run:

aria2c -i links/robotstxt-from-mbfc.aria \
    -j 12 \
    --console-log-level=error \
    --log=logs/robotstxt-from-mbfc.log
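When loading the downloaded files later, the original URL can be recovered by reversing the character substitutions. A minimal sketch, with an illustrative helper name that is not part of the notebook:

def decode_file_name(file_name):
    # Reverse the substitutions used when writing the .aria file
    return (
        file_name.replace('[C]', ':')
        .replace('[S]', '/')
        .replace('[D]', '.')
    )

# decode_file_name('https[C][S][S]example[D]com[S]') -> 'https://example.com/'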