forked from jcchouinard/SEO-Projects
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrobotstxt_to_df.py
84 lines (71 loc) · 2.67 KB
/
robotstxt_to_df.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
'''
Parse Robots.txt to a DataFrame with Python
This script downloads a site's robots.txt file, parses its
user-agent, Allow and Disallow lines, and loads the rules into a DataFrame.
@author: Jean-Christophe Chouinard. Technical SEO and Python for SEO.
@website: jcchouinard.com
@original post: https://www.jcchouinard.com/robots-txt-parsing-with-python/
@other work: https://www.jcchouinard.com/python-for-seo, https://www.jcchouinard.com/google-search-console-api
@LinkedIn: https://www.linkedin.com/in/jeanchristophechouinard/
@Twitter: https://www.twitter.com/@ChouinardJC
'''
import pandas as pd
import os
from urllib.parse import urlparse
# Prefix of the robots.txt lines that name a user agent. The same string is
# reused as the top-level key of the parsed dictionary and as the name of the
# user-agent column in the resulting DataFrame.
ua = 'User-agent'
# Site whose robots.txt is processed by the module-level robots_to_df call.
url = 'https://www.example.com'
def get_robots_url(url):
    '''Return the robots.txt URL of the site that serves *url*.

    Only the scheme and the network location of *url* are kept; any path,
    query string or fragment is discarded before '/robots.txt' is appended.
    '''
    parts = urlparse(url)
    return f'{parts.scheme}://{parts.netloc}/robots.txt'
def read_robots_txt(url):
    '''Download the robots.txt file of the site that serves *url*.

    The file is fetched with the ``curl`` command-line tool. curl is started
    directly from an argument list, without a shell, so characters in the URL
    such as ';', '&' or '$(...)' are passed to curl verbatim and are never
    interpreted as shell syntax.

    Args:
        url: Any URL on the target site; only its scheme and host are used.

    Returns:
        The text curl wrote to standard output, or an empty string when the
        curl executable could not be found.
    '''
    # Function-scope import so the block does not rely on a file-level import.
    import subprocess

    robot_url = get_robots_url(url)
    try:
        completed = subprocess.run(
            ['curl', robot_url],
            stdout=subprocess.PIPE,  # stderr is left alone, as with a plain pipe
            text=True,
            check=False,  # a failed download yields whatever curl printed
        )
    except FileNotFoundError:
        # curl is not installed: report "nothing downloaded" instead of
        # crashing, so callers simply see an empty robots.txt.
        return ''
    return completed.stdout
def initialize_dict(url):
    '''Download a robots.txt file and list the user agents it names.

    A line counts as a user-agent line when the text before its first colon
    equals the ``ua`` directive name, ignoring case and surrounding
    whitespace ('User-agent', 'User-Agent' and 'user-agent' all match).
    Lines without a colon and lines with an empty agent name are skipped.

    Args:
        url: Any URL on the target site.

    Returns:
        A 3-tuple ``(result_data_set, keys, robot_file)``:
        * result_data_set -- ``{ua: {agent_name: {}}}`` with one empty dict
          per distinct agent, in order of first appearance;
        * keys -- list of the agent names, in the same order;
        * robot_file -- the raw robots.txt text that was downloaded.
    '''
    robot_file = read_robots_txt(url)
    directive = ua.lower()
    agents = {}
    for line in robot_file.split("\n"):
        field, colon, value = line.partition(':')
        # No colon means there is no value to read; a different field name
        # means this is not a user-agent line.
        if not colon or field.strip().lower() != directive:
            continue
        agent = value.strip()
        if agent:
            # Re-assigning an existing name keeps its original position.
            agents[agent] = {}
    result_data_set = {ua: agents}
    keys = list(agents)
    return result_data_set, keys, robot_file
def parse_robot(url):
    '''Group the Allow/Disallow rules of a robots.txt file by user agent.

    The file is read once, top to bottom. Each line is split at its first
    colon into a field name and a value:

    * a user-agent line starts a new group of agents, or extends the group
      when it directly follows other user-agent lines, so consecutive
      user-agent lines share the rules that follow them;
    * an Allow or Disallow line is added to every agent of the current
      group; the full text after the first colon is kept, so patterns that
      themselves contain ':' are not truncated;
    * every other line (comments, Sitemap, blank lines, ...) is ignored.

    Field names are compared case-insensitively. Only real user-agent lines
    act as section boundaries, so a rule such as 'Disallow: /*.pdf' is never
    mistaken for the '*' user agent.

    Args:
        url: Any URL on the target site.

    Returns:
        ``{ua: {agent_name: {'Disallow': [...], 'Allow': [...]}}}`` where the
        lists hold the rule patterns in file order.
    '''
    result_data_set, keys, robot_file = initialize_dict(url)
    groups = result_data_set[ua]
    # Every agent reported by initialize_dict gets both lists, even when the
    # file holds no rules for it.
    for agent in keys:
        groups[agent]['Disallow'] = []
        groups[agent]['Allow'] = []

    agent_field = ua.lower()
    current_agents = []     # agents the upcoming rule lines apply to
    reading_agents = False  # True while inside a run of user-agent lines
    for line in robot_file.split("\n"):
        field, colon, value = line.partition(':')
        if not colon:
            continue
        field = field.strip().lower()
        value = value.strip()
        if field == agent_field:
            if not reading_agents:
                # First user-agent line after some rules: a new group begins.
                current_agents = []
                reading_agents = True
            if value and value not in current_agents:
                # Create the entry on demand so an agent that was not
                # pre-registered still receives its rules.
                groups.setdefault(value, {'Disallow': [], 'Allow': []})
                current_agents.append(value)
        elif field in ('disallow', 'allow'):
            reading_agents = False
            status = field.capitalize()  # 'Disallow' or 'Allow'
            for agent in current_agents:
                groups[agent][status].append(value)
    return result_data_set
def robots_to_df(url):
    '''Return the parsed robots.txt rules of a site as a DataFrame.

    The nested dictionary produced by parse_robot is flattened into one row
    per rule pattern, with three columns: the user agent (named after the
    module-level ``ua`` string), 'Status' (the rule kind) and 'Pattern'.
    '''
    parsed = parse_robot(url)
    agent_col, status_col, pattern_col = [], [], []
    for groups in parsed.values():
        for agent, rules in groups.items():
            for status, patterns in rules.items():
                for pattern in patterns:
                    agent_col.append(agent)
                    status_col.append(status)
                    pattern_col.append(pattern)
    columns = {ua: agent_col, 'Status': status_col, 'Pattern': pattern_col}
    return pd.DataFrame.from_dict(columns)
# Runs the whole pipeline for the module-level `url` whenever this file is
# executed or imported; the returned DataFrame is not stored or printed here.
robots_to_df(url)