Process robots.txt files#

1. Import#

import os
import glob

import numpy as np
import pandas as pd
import yaml

# Run from the repository root so the relative data/ paths below resolve.
if 'notebooks' in os.getcwd():
    os.chdir('..')

from src.preprocess import *
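
The heavy lifting below is done by obtain_user_agents from src.preprocess. Its internals are not shown in this notebook; the sketch below is illustrative only and shows the kind of parsing assumed here: read one robots.txt file and pull out its User-agent tokens.

# Illustrative sketch only; obtain_user_agents in src.preprocess may differ.
def extract_user_agents(path):
    """Return the User-agent tokens found in one robots.txt file."""
    agents = []
    with open(path, errors='ignore') as f:
        for line in f:
            line = line.split('#', 1)[0].strip()   # drop inline comments
            if line.lower().startswith('user-agent:'):
                agents.append(line.split(':', 1)[1].strip())
    return agents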

2. Define user agent tags#

These user agents could be considered AI-related crawlers, so they are tagged first.

# https://neil-clarke.com/block-the-bots-that-feed-ai-models-by-scraping-your-website/
# https://www.nytimes.com/robots.txt
known_ai_bots = [
    'aibot',
    'aihitbot', # https://www.aihitdata.com/about
    'anthropic-ai',
    'claude-web',
    'cohere-ai',
    'google-extended', 
    'openai',
    'gptbot',
    'chatgpt-user', 
    'chatgpt',
    'ccbot',
    'ccbots',
    'facebookbot',
    'perplexity.ai', 
    'jasper.ai',
    'omgili', 
    'omgilibot',
    'amazonbot', # not sure whether Amazon would use this for Alexa
    'sentibot' # based on https://user-agents.net/bots/sentibot
]

bot_tag_dict = {x: 'possible-ai-crawler' for x in known_ai_bots}
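
As a quick check of what the tag dictionary does, the sketch below assumes matching is a case-insensitive exact lookup (the real matching lives in src.preprocess and may differ).

# Sketch assuming a case-insensitive exact lookup; src.preprocess may match differently.
def tag_agent(agent, tags=bot_tag_dict):
    return tags.get(agent.strip().lower(), 'untagged')

tag_agent('GPTBot'), tag_agent('Googlebot')  # -> ('possible-ai-crawler', 'untagged')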

3. Process the sites from the MBFC list#

mbfc_file = 'data/mbfc/sites.csv'
robottxt_files = glob.glob('data/robotstxt/*')
df_agents, info = obtain_user_agents(
    robottxt_files, 
    mbfc_file, 
    agent_tags=bot_tag_dict,
    from_wayback=False,
    drop_dup_sites=True
)
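
A quick look at what came back; this only relies on df_agents being a DataFrame and info being a plain dict, as the save cells below require.

# Quick sanity check of the returned objects.
print(info)             # processing summary dict (saved to YAML below)
print(df_agents.shape)
df_agents.head()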

Save outputs#

with open('data/proc/mbfc_sites_user_agents_proc_info.yml', 'w') as f:
    yaml.safe_dump(info, f)
df_agents.to_csv('data/proc/mbfc_sites_user_agents.csv', index=False)

4. Process the select sites from Wayback#

# Keep only files whose name starts with the name of their parent directory.
wayback_robottxt_files = [
    x for x in glob.glob('data/robots-wayback/*/*')
    if os.path.basename(x).startswith(os.path.basename(os.path.dirname(x)))
]

df_agents, info = obtain_user_agents(
    wayback_robottxt_files,
    mbfc_file,
    agent_tags=bot_tag_dict,
    from_wayback=True,
    drop_dup_sites=False
)

Save outputs#

with open('data/proc/wayback_select_sites_user_agents_proc_info.yml', 'w') as f:
    yaml.safe_dump(info, f)
df_agents.to_csv('data/proc/wayback_select_sites_user_agents.csv', index=False)
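
As an optional check, the two CSVs written above can be read back to confirm they round-trip cleanly.

# Optional round-trip check on the files written in this notebook.
for path in ['data/proc/mbfc_sites_user_agents.csv',
             'data/proc/wayback_select_sites_user_agents.csv']:
    print(path, pd.read_csv(path).shape)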