Process robots.txt files
1. Import
import os
import glob
import numpy as np
import pandas as pd
import yaml

# Run from the repository root so the relative data paths below resolve.
if 'notebooks' in os.getcwd():
    os.chdir('..')

from src.preprocess import *
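The wildcard import is expected to supply obtain_user_agents and bot_tag_dict, both used below; their actual definitions live in src/preprocess.py. As a purely hypothetical sketch of the assumed shape, agent_tags maps a tag to the user-agent tokens it covers:

# Hypothetical illustration only -- the real bot_tag_dict is defined in
# src/preprocess.py and may be structured differently.
bot_tag_dict_example = {
    'ai-crawler': ['GPTBot', 'CCBot'],
    'search-engine': ['Googlebot', 'Bingbot'],
}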
3. Process the sites from the MBFC list
mbfc_file = 'data/mbfc/sites.csv'
robottxt_files = glob.glob('data/robotstxt/*')

# Parse the user agents named in each robots.txt, tagging known bots
# via bot_tag_dict.
df_agents, info = obtain_user_agents(
    robottxt_files,
    mbfc_file,
    agent_tags=bot_tag_dict,
    from_wayback=False,
    drop_dup_sites=True
)
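Before saving, the two returned objects can be spot-checked: df_agents is a pandas DataFrame (written to CSV below) and info is a metadata mapping (dumped to YAML below). A minimal sketch; the actual columns are whatever obtain_user_agents produces:

# Quick sanity check on the outputs.
print(info)
print(df_agents.shape)
df_agents.head()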
Save outputs
with open('data/proc/mbfc_sites_user_agents_proc_info.yml', 'w') as f:
    yaml.safe_dump(info, f)
df_agents.to_csv('data/proc/mbfc_sites_user_agents.csv', index=False)
4. Process the select sites from Wayback
# Keep only files whose name starts with the name of their parent directory.
wayback_robottxt_files = [
    x for x in glob.glob('data/robots-wayback/*/*')
    if os.path.basename(x).startswith(os.path.basename(os.path.dirname(x)))
]
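To illustrate the filter, a file passes only when its basename starts with its parent directory's name; with invented example paths:

# Invented paths; only the first passes the filter.
for x in ['data/robots-wayback/example.com/example.com-20200101.txt',
          'data/robots-wayback/example.com/notes.txt']:
    keep = os.path.basename(x).startswith(os.path.basename(os.path.dirname(x)))
    print(x, '->', keep)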
# Wayback can include several captures per site, so duplicate sites are
# kept here.
df_agents, info = obtain_user_agents(
    wayback_robottxt_files,
    mbfc_file,
    agent_tags=bot_tag_dict,
    from_wayback=True,
    drop_dup_sites=False
)
Save outputs
with open('data/proc/wayback_select_sites_user_agents_proc_info.yml', 'w') as f:
    yaml.safe_dump(info, f)
df_agents.to_csv('data/proc/wayback_select_sites_user_agents.csv', index=False)
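As an optional round-trip check that both artifacts were written correctly (paths as above):

# Reload the saved metadata and table to confirm the round trip.
with open('data/proc/wayback_select_sites_user_agents_proc_info.yml') as f:
    print(yaml.safe_load(f))
pd.read_csv('data/proc/wayback_select_sites_user_agents.csv').head()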