link_extractor.py
#!/usr/bin/env python3
import argparse
import os
import re
import sys
from collections import OrderedDict

import requests
from bs4 import BeautifulSoup


def main(args):
    """
    Command line tool to extract links from web pages.
    """
    parser = argparse.ArgumentParser(description="Command line tool to extract links from web pages")
    parser.add_argument("-i", "--input-file", required=True,
                        help="Location of csv file containing pages to scrape, and what to scrape for "
                             "(see sample_input.csv for format)")
    parser.add_argument("-o", "--output-dir", required=True,
                        help="Location of output directory where results.csv should be written")
    parsed_args = parser.parse_args(args=args)

    output_dir = parsed_args.output_dir
    if not os.path.isdir(output_dir):
        print("Output directory should exist before running this script")
        sys.exit(1)
    output_file = os.path.join(output_dir, "results.csv")

    # Column 0 of the input csv holds the pages to scrape, column 1 the URL
    # patterns to look for; the first row is treated as a header and skipped.
    try:
        with open(parsed_args.input_file) as in_file:
            input_lines = [line.strip().split(",") for line in in_file]
    except IOError:
        print("There was a problem opening the input file")
        sys.exit(1)
    source_urls = [line[0] for line in input_lines[1:] if line[0]]
    look_for_urls = [line[1] for line in input_lines[1:] if len(line) > 1 and line[1]]

    source_urls_to_results = OrderedDict()
    for source_url in source_urls:
        try:
            r = requests.get("http://{0}?".format(source_url), allow_redirects=True)
            source_urls_to_results[source_url] = OrderedDict()
        except requests.exceptions.ConnectionError:
            print("Connection error for {0}, skipping".format(source_url))
            continue
        soup = BeautifulSoup(r.text, "html.parser")
        for look_for_url in look_for_urls:
            matches = soup.find_all("a", href=re.compile(look_for_url))
            # Do some basic deduplication of links that are found: lowercase,
            # strip the scheme and "www." prefix, and collect into a set
            source_urls_to_results[source_url][look_for_url] = {
                match["href"].lower().replace("https://", "").replace("http://", "").replace("www.", "")
                for match in matches
            }
    print(source_urls_to_results)

    with open(output_file, "w") as out_file:
        out_file.write("urls," + ",".join(look_for_urls) + "\n")
        for source_url, results in source_urls_to_results.items():
            out_file.write(source_url + ",")
            for cur_url_set in results.values():
                # Wrap cell results in '' so that we can easily import into Excel
                out_file.write("'" + ",".join(cur_url_set) + "',")
            out_file.write("\n")


if __name__ == "__main__":
    main(sys.argv[1:])
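
The referenced sample_input.csv is not included here, but its layout can be inferred from the parsing above: a header row, then one source page per row in column 0 and a URL pattern to match in column 1. Below is a minimal, hypothetical end-to-end sketch; the example hosts, the "look_for" patterns, and the file names (other than results.csv) are illustrative assumptions, not taken from the repository.

# Hypothetical usage sketch (not part of the original script). Input layout
# inferred from the parsing code above; URLs and patterns are placeholders.
import pathlib

import link_extractor

pathlib.Path("out").mkdir(exist_ok=True)
pathlib.Path("sample_input.csv").write_text(
    "urls,look_for\n"            # header row (skipped by the script)
    "example.com,twitter.com\n"  # scrape example.com for hrefs matching "twitter.com"
    "example.org,facebook.com\n"
)
link_extractor.main(["-i", "sample_input.csv", "-o", "out"])
# out/results.csv now holds one row per source page, one quoted cell per pattern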