
Commit

Much needed cleanup
root committed Jun 2, 2013
1 parent df21f82 commit 0ef9c13
Showing 13 changed files with 63 additions and 1,867 deletions.
26 changes: 6 additions & 20 deletions comments.py
100755 → 100644
@@ -8,22 +8,11 @@
from lxml.html.soupparser import fromstring

from datetime import datetime
import unicodecsv

from BeautifulSoup import UnicodeDammit

def decode_html(html_string):
converted = UnicodeDammit(html_string, isHTML=True)
if not converted.unicode:
raise UnicodeDecodeError(
"Failed to detect encoding, tried [%s]",
', '.join(converted.triedEncodings))
# print converted.originalEncoding
return converted.unicode
import csv

def get_comments(torrent_id, protocol):
print "Getting comments:",
r = requests.get(protocol + "://thepiratebay.se/ajax_details_comments.php?id=" + str(torrent_id), headers={'user-agent': 'Archiving The Pirate Bay!'})
r = requests.get(protocol + "://thepiratebay.sx/ajax_details_comments.php?id=" + str(torrent_id), headers={'user-agent': 'Archiving The Pirate Bay!'})
if (r.status_code == 200):
if (r.content == ''):
print str(r.status_code) + ", but no comments"
@@ -50,10 +39,10 @@ def get_comments(torrent_id, protocol):

comments_csv = open(path + "/comments.csv", 'w')
comments_csv.write(u'\ufeff'.encode('utf-8')) # BOM
csv_writer = unicodecsv.writer(comments_csv, encoding='utf-8')
csv_writer = csv.writer(comments_csv)
csv_writer.writerow(['User Type', 'Username', 'Date', 'Text'])

root = fromstring(decode_html(r.content))
root = fromstring(unicode(r.content, 'utf-8'))

comment_array = root.xpath('.//div[starts-with(@id, "comment-")]')

@@ -66,14 +55,11 @@ def get_comments(torrent_id, protocol):
try:
username = comment.find('p/a').get('title')[7:]
timestamp = comment.find('p').text_content()[-22:-6]
text = unicode(comment.find('div[@class="comment"]').text_content()[1:-1]).replace(u'\xa0', ' ')
text = unicode(comment.find('div[@class="comment"]').text_content())[1:-1].replace(u'\xa0', ' ')

comment_date = datetime.strptime(timestamp, "%Y-%m-%d %H:%M")

cleaned_up_data = []
for i in [usertype, username, comment_date.isoformat()[:-3] + "Z", text]:
cleaned_up_data.append(i.encode('utf-8', 'replace'))
csv_writer.writerow(cleaned_up_data)
csv_writer.writerow([entry.encode('utf-8') for entry in [usertype, username, comment_date.isoformat()[:-3] + "Z", text]])
except AttributeError:
pass

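For readers following the comments.py hunks above, here is a minimal, self-contained sketch (Python 2, matching the repository) of the CSV pattern the file now relies on: write a UTF-8 BOM, then encode every cell to UTF-8 before handing rows to the standard-library csv writer. The function name and sample data below are illustrative, not taken from the repository.

import csv

def write_rows_utf8(path, header, rows):
    # Binary mode, as the Python 2 csv docs recommend; the BOM lets
    # spreadsheet applications detect the UTF-8 encoding.
    out = open(path, 'wb')
    out.write(u'\ufeff'.encode('utf-8'))
    writer = csv.writer(out)
    writer.writerow(header)
    for row in rows:
        # Python 2's csv module only handles byte strings, so encode each cell.
        writer.writerow([cell.encode('utf-8') for cell in row])
    out.close()

if __name__ == '__main__':
    write_rows_utf8('comments_sample.csv',
                    ['User Type', 'Username', 'Date', 'Text'],
                    [[u'VIP', u'example_user', u'2013-06-02T00:00Z', u'Caf\xe9 comment']])
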
73 changes: 0 additions & 73 deletions comments_bs.py

This file was deleted.

74 changes: 0 additions & 74 deletions comments_old.py

This file was deleted.

57 changes: 7 additions & 50 deletions download.py
100755 → 100644
@@ -18,15 +18,11 @@
import sys
import os
import HTMLParser

import torrent_page
import torrent_page_old
import torrent_page_bs
import filelist
import filelist_old
import filelist_bs
import comments
import comments_old
import comments_bs

import requests
import datetime

@@ -40,38 +36,10 @@ def main():

while True:
try:
try:
tp_status_code = torrent_page.get_torrent_page(torrent_id, protocol)

except IndexError:
print "Falling back to BeautifulSoup for torrent page:"
tp_status_code = torrent_page_bs.get_torrent_page(torrent_id, protocol)
if (tp_status_code == 200):
try:
filelist.get_filelist(torrent_id, protocol)
except ValueError:
print "Falling back to BeautifulSoup for filelist:"
filelist_bs.get_filelist(torrent_id, protocol)
try:
comments.get_comments(torrent_id, protocol)
except ValueError:
print "Falling back to BeautifulSoup for comments:"
comments_bs.get_comments(torrent_id, protocol)

except ValueError:
print "Falling back to BeautifulSoup for torrent page:"
tp_status_code = torrent_page_old.get_torrent_page(torrent_id, protocol)
tp_status_code = torrent_page.get_torrent_page(torrent_id, protocol)
if (tp_status_code == 200):
try:
filelist.get_filelist(torrent_id, protocol)
except ValueError:
print "Falling back to BeautifulSoup for filelist:"
filelist_old.get_filelist(torrent_id, protocol)
try:
comments.get_comments(torrent_id, protocol)
except ValueError:
print "Falling back to BeautifulSoup for comments:"
comments_old.get_comments(torrent_id, protocol)
filelist.get_filelist(torrent_id, protocol)
comments.get_comments(torrent_id, protocol)
elif (tp_status_code == 404):
print "Skipping filelist..."
print "Skipping comments..."
@@ -90,11 +58,7 @@ def main():
tp_status_code = torrent_page.get_torrent_page(torrent_id, protocol)
if (tp_status_code == 200):
filelist.get_filelist(torrent_id, protocol)
try:
comments.get_comments(torrent_id, protocol)
except ValueError:
print "Falling back to BeautifulSoup for comments:"
comments_old.get_comments(torrent_id, protocol)
comments.get_comments(torrent_id, protocol)
else:
print "Skipping filelist..."
print "Skipping comments..."
@@ -108,11 +72,7 @@ def main():
tp_status_code = torrent_page.get_torrent_page(torrent_id, protocol)
if (tp_status_code == 200):
filelist.get_filelist(torrent_id, protocol)
try:
comments.get_comments(torrent_id, protocol)
except ValueError:
print "Falling back to BeautifulSoup for comments:"
comments_old.get_comments(torrent_id, protocol)
comments.get_comments(torrent_id, protocol)
else:
print "Skipping filelist..."
print "Skipping comments..."
@@ -142,19 +102,16 @@ def main():
torrent_id = sys.argv[1+offset]
print torrent_id
main()
print str(float(100000 - (torrent_id % 100000) ) / 1000.0) + "%"

elif (len(sys.argv) == 3+offset):
if (int(sys.argv[1+offset]) > (int(sys.argv[2+offset])+1)):
for torrent_id in range(int(sys.argv[1+offset]),int(sys.argv[2+offset])-1, -1):
print torrent_id
main()
print str(float(100000 - (torrent_id % 100000) ) / 1000.0) + "%"
else:
for torrent_id in range(int(sys.argv[1+offset]),int(sys.argv[2+offset])+1):
print torrent_id
main()
print str(float(100000 - (torrent_id % 100000) ) / 1000.0) + "%"

elif (len(sys.argv) > 3 and not https):
print "ERROR: Too many arguments"
10 changes: 5 additions & 5 deletions filelist.py
100755 → 100644
@@ -22,13 +22,13 @@
import lxml.html
from lxml.html.soupparser import fromstring

import unicodecsv
import csv

empty_filelist = u'<div style="background:#FFFFFF none repeat scroll 0%clear:left;margin:0;min-height:0px;padding:0;width:100%;">\n<table style="border:0pt none;width:100%;font-family:verdana,Arial,Helvetica,sans-serif;font-size:11px;">\n</table>\n</div>\n'

def get_filelist(torrent_id, protocol):
print "Getting filelist:",
r = requests.get(protocol + "://thepiratebay.se/ajax_details_filelist.php?id=" + str(torrent_id), headers={'user-agent': 'Archiving The Pirate Bay!'})
r = requests.get(protocol + "://thepiratebay.sx/ajax_details_filelist.php?id=" + str(torrent_id), headers={'user-agent': 'Archiving The Pirate Bay!'})
if (r.status_code == 200):
if (unicode(r.content.decode('ISO-8859-1')) == empty_filelist):
print str(r.status_code) + ", but no filelist"
@@ -54,14 +54,14 @@ def get_filelist(torrent_id, protocol):

filelist_csv = open(path + "/filelist.csv", 'w')
filelist_csv.write(u'\ufeff'.encode('utf-8')) # BOM
csv_writer = unicodecsv.writer(filelist_csv, encoding='utf-8')
csv_writer = csv.writer(filelist_csv)
csv_writer.writerow(['Filename','Size','Unit'])
html = fromstring(r.content.decode('ISO-8859-1').replace('</td><td align="right">',u'\xa0'))
html = fromstring(unicode(r.content, 'utf-8').replace('</td><td align="right">',u'\xa0'))
filetable = [fileentry.split(u'\xa0') for fileentry in html.xpath('div/table')[0].text_content().split('\n')[1:-1]]
for entry in filetable:
entry[-1] = entry[-1][0]
for entry in filetable:
csv_writer.writerow([column.encode('utf-8', 'replace') for column in entry])
csv_writer.writerow([column.encode('utf-8') for column in entry])
else:
print r.status_code

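As a rough sketch of the delimiter trick the updated filelist.py keeps using (Python 2): the boundary between the filename cell and the size cell is replaced with a non-breaking space before parsing, each row's text then splits into filename, size, and unit, and only the unit's first letter is kept. The sample markup and function name below are made up, and the sketch walks the table rows directly, whereas the script itself splits the flattened table text on newlines.

from lxml.html.soupparser import fromstring

# Made-up markup in the same shape as the ajax filelist response.
SAMPLE = (u'<div><table>\n'
          u'<tr><td>example.iso</td><td align="right">700.23\xa0MiB</td></tr>\n'
          u'</table></div>\n')

def parse_filelist(markup):
    # Replacing the cell boundary with \xa0 makes each row collapse to
    # one delimited string: "filename\xa0size\xa0unit".
    html = fromstring(markup.replace('</td><td align="right">', u'\xa0'))
    rows = []
    for tr in html.xpath('.//tr'):
        fields = tr.text_content().split(u'\xa0')
        fields[-1] = fields[-1][0]  # keep only the unit's leading letter (B, K, M, G, T)
        rows.append(fields)
    return rows

print parse_filelist(SAMPLE)  # [[u'example.iso', u'700.23', u'M']]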
