# Obtain robots.txt of sites listed on mediabiasfactcheck

## 1. Import
```python
import os
import time
import re
import warnings
import gzip
import urllib
import requests
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

# Work from the repository root if running inside notebooks/
if 'notebooks' in os.getcwd():
    os.chdir('..')
```
## 2. Media sites from mediabiasfactcheck

### 2.1. Obtain sites
Essentially, this scrapes the site list from each category page, e.g. https://mediabiasfactcheck.com/center, and saves the site names.
```python
main_url = 'https://mediabiasfactcheck.com'
categories = [
    'center',
    'left',
    'leftcenter',
    'right-center',
    'right',
    'conspiracy',
    'fake-news',
    'pro-science',
    'satire',
]

results = dict()
for cat in tqdm(categories):
    url = urllib.parse.urljoin(main_url, cat)
    if cat in results:
        continue
    request = requests.get(url)
    assert request.status_code == 200
    soup = BeautifulSoup(request.content, 'lxml')
    if cat != 'satire':
        # Most category pages list their sites in a single table
        table = soup.find_all('table', {'id': 'mbfc-table'})
        assert len(table) == 1
        table = table[0]
        sites = [
            s for x in table.find_all('td')
            if len(s := x.text.strip()) > 0
        ]
    else:
        # The satire page instead lists its sites as links in the entry content
        div_content = soup.find_all('div', {'class': 'entry-content'})
        assert len(div_content) == 1
        div_content = div_content[0]
        spans = [
            sp for a in div_content.find_all('a')
            if (sp := a.find('span')) is not None
        ]
        sites = [
            s for sp in spans
            if len(s := sp.text.strip()) > 0
        ]
    print(f'For {url}, there are {len(sites)} named sites found (unprocessed)')
    results[cat] = sites
    time.sleep(2)

# One row per (category, site)
df = (
    pd.DataFrame([
        dict(category=k, site=v)
        for k, v in results.items()
    ])
    .explode('site')
    .reset_index(drop=True)
)
```
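As a quick illustration of the reshaping above (using a toy dict with made-up names rather than the scraped results), `explode` turns each category's list of sites into one row per site:

```python
import pandas as pd

# Toy stand-in for the scraped `results` dict (hypothetical names)
toy = {'center': ['Site A (a.com)', 'Site B (b.org)'], 'satire': ['Site C (c.net)']}
toy_df = (
    pd.DataFrame([dict(category=k, site=v) for k, v in toy.items()])
    .explode('site')
    .reset_index(drop=True)
)
# toy_df now has one row per site:
#   ('center', 'Site A (a.com)'), ('center', 'Site B (b.org)'), ('satire', 'Site C (c.net)')
```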
### 2.2. Parse site names

This parses the actual site URL out of each listed site name, e.g. "NYTimes (nytimes.com)" becomes nytimes.com.
```python
def parse_site(s):
    if '.' not in s:
        print(f'No "." detected for "{s}". Skipping')
        return
    # Prefer a URL given in parentheses, e.g. "NYTimes (nytimes.com)"
    url = re.findall(r'\(([^\)]+)\)', s)
    url = [u for u in url if '.' in u]
    if len(url) == 0:
        url = [s]
    if len(url) > 1:
        print(f'For "{s}", multiple urls found {url}. Will use the last')
    url = url[-1].strip()
    if not url.startswith('http'):
        assert not url.startswith('/')
        url = 'https://' + url
    # Keep only the scheme and host
    parsed_url = urllib.parse.urlparse(url)
    url = f'{parsed_url.scheme}://{parsed_url.netloc}/'
    return url


df['url'] = df['site'].apply(parse_site)
df.to_csv('data/mbfc/sites.csv', index=False)
```
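A quick sanity check of `parse_site` on a couple of illustrative names (hypothetical inputs, not necessarily entries from the scraped list):

```python
print(parse_site('NYTimes (nytimes.com)'))   # -> https://nytimes.com/
print(parse_site('A Site Without Any URL'))  # prints the skip message, returns None
```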
## 3. Prepare list of URLs to download

This filters out missing links, narrows down to unique links, then constructs an `.aria` input file for `aria2c` to download.
```python
num_sites = len(df)
num_err = df['url'].isna().sum()
df = df.dropna(subset='url').reset_index(drop=True)
print(f'{num_err} / {num_sites} rows dropped due to initial parsing issues')

unique_urls = df['url'].unique()
with open('links/robotstxt-from-mbfc.aria', 'w') as f:
    for url in unique_urls:
        robot_url = urllib.parse.urljoin(url, 'robots.txt')
        f.write(robot_url + '\n')
        # Encode the URL into a filesystem-safe name: ':' -> [C], '/' -> [S], '.' -> [D]
        file_name = (
            url.replace(':', '[C]')
            .replace('/', '[S]')
            .replace('.', '[D]')
        )
        f.write(f'\tout=data/robotstxt/{file_name}\n')
```
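For a hypothetical entry such as `https://nytimes.com/`, the generated file contains a URI line followed by an indented `out=` option, which is the per-URI option syntax that aria2c accepts in its input file:

```text
https://nytimes.com/robots.txt
	out=data/robotstxt/https[C][S][S]nytimes[D]com[S]
```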
Then use `aria2c` on the CLI to download into `data/robotstxt/`. The following was run:

```bash
aria2c -i robotstxt-from-mbfc.aria \
    -j 12 \
    --console-log-level=error \
    --log=logs/robotstxt-from-mbfc.log
```
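Later, to map a downloaded file back to its source site, the character substitutions can simply be reversed. A minimal sketch (this `decode_file_name` helper is an assumption, not part of the notebook above):

```python
def decode_file_name(name):
    # Reverse the ':' -> [C], '/' -> [S], '.' -> [D] substitutions used above
    return name.replace('[C]', ':').replace('[S]', '/').replace('[D]', '.')

assert decode_file_name('https[C][S][S]nytimes[D]com[S]') == 'https://nytimes.com/'
```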