
Commit

Much needed cleanup
root committed Jun 2, 2013
1 parent df21f82 commit 0ef9c13
Showing 13 changed files with 63 additions and 1,867 deletions.
26 changes: 6 additions & 20 deletions comments.py
100755 → 100644
@@ -8,22 +8,11 @@
from lxml.html.soupparser import fromstring

from datetime import datetime
import unicodecsv

from BeautifulSoup import UnicodeDammit

def decode_html(html_string):
converted = UnicodeDammit(html_string, isHTML=True)
if not converted.unicode:
raise UnicodeDecodeError(
"Failed to detect encoding, tried [%s]",
', '.join(converted.triedEncodings))
# print converted.originalEncoding
return converted.unicode
import csv

def get_comments(torrent_id, protocol):
print "Getting comments:",
r = requests.get(protocol + "://thepiratebay.se/ajax_details_comments.php?id=" + str(torrent_id), headers={'user-agent': 'Archiving The Pirate Bay!'})
r = requests.get(protocol + "://thepiratebay.sx/ajax_details_comments.php?id=" + str(torrent_id), headers={'user-agent': 'Archiving The Pirate Bay!'})
if (r.status_code == 200):
if (r.content == ''):
print str(r.status_code) + ", but no comments"
@@ -50,10 +39,10 @@ def get_comments(torrent_id, protocol):

comments_csv = open(path + "/comments.csv", 'w')
comments_csv.write(u'\ufeff'.encode('utf-8')) # BOM
csv_writer = unicodecsv.writer(comments_csv, encoding='utf-8')
csv_writer = csv.writer(comments_csv)
csv_writer.writerow(['User Type', 'Username', 'Date', 'Text'])

root = fromstring(decode_html(r.content))
root = fromstring(unicode(r.content, 'utf-8'))

comment_array = root.xpath('.//div[starts-with(@id, "comment-")]')

@@ -66,14 +55,11 @@ def get_comments(torrent_id, protocol):
try:
username = comment.find('p/a').get('title')[7:]
timestamp = comment.find('p').text_content()[-22:-6]
text = unicode(comment.find('div[@class="comment"]').text_content()[1:-1]).replace(u'\xa0', ' ')
text = unicode(comment.find('div[@class="comment"]').text_content())[1:-1].replace(u'\xa0', ' ')

comment_date = datetime.strptime(timestamp, "%Y-%m-%d %H:%M")

cleaned_up_data = []
for i in [usertype, username, comment_date.isoformat()[:-3] + "Z", text]:
cleaned_up_data.append(i.encode('utf-8', 'replace'))
csv_writer.writerow(cleaned_up_data)
csv_writer.writerow([entry.encode('utf-8') for entry in [usertype, username, comment_date.isoformat()[:-3] + "Z", text]])
except AttributeError:
pass

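For readers following the comments.py hunks above, here is a minimal, self-contained sketch (Python 2, matching the repository) of the CSV pattern the file now relies on: write a UTF-8 BOM, then encode every cell to UTF-8 before handing rows to the standard-library csv writer. The function name and sample data below are illustrative, not taken from the repository.

import csv

def write_rows_utf8(path, header, rows):
    # Binary mode, as the Python 2 csv docs recommend; the BOM lets
    # spreadsheet applications detect the UTF-8 encoding.
    out = open(path, 'wb')
    out.write(u'\ufeff'.encode('utf-8'))
    writer = csv.writer(out)
    writer.writerow(header)
    for row in rows:
        # Python 2's csv module only handles byte strings, so encode each cell.
        writer.writerow([cell.encode('utf-8') for cell in row])
    out.close()

if __name__ == '__main__':
    write_rows_utf8('comments_sample.csv',
                    ['User Type', 'Username', 'Date', 'Text'],
                    [[u'VIP', u'example_user', u'2013-06-02T00:00Z', u'Caf\xe9 comment']])
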
73 changes: 0 additions & 73 deletions comments_bs.py

This file was deleted.

74 changes: 0 additions & 74 deletions comments_old.py

This file was deleted.

57 changes: 7 additions & 50 deletions download.py
100755 → 100644
@@ -18,15 +18,11 @@
import sys
import os
import HTMLParser

import torrent_page
import torrent_page_old
import torrent_page_bs
import filelist
import filelist_old
import filelist_bs
import comments
import comments_old
import comments_bs

import requests
import datetime

@@ -40,38 +36,10 @@ def main():

while True:
try:
try:
tp_status_code = torrent_page.get_torrent_page(torrent_id, protocol)

except IndexError:
print "Falling back to BeautifulSoup for torrent page:"
tp_status_code = torrent_page_bs.get_torrent_page(torrent_id, protocol)
if (tp_status_code == 200):
try:
filelist.get_filelist(torrent_id, protocol)
except ValueError:
print "Falling back to BeautifulSoup for filelist:"
filelist_bs.get_filelist(torrent_id, protocol)
try:
comments.get_comments(torrent_id, protocol)
except ValueError:
print "Falling back to BeautifulSoup for comments:"
comments_bs.get_comments(torrent_id, protocol)

except ValueError:
print "Falling back to BeautifulSoup for torrent page:"
tp_status_code = torrent_page_old.get_torrent_page(torrent_id, protocol)
tp_status_code = torrent_page.get_torrent_page(torrent_id, protocol)
if (tp_status_code == 200):
try:
filelist.get_filelist(torrent_id, protocol)
except ValueError:
print "Falling back to BeautifulSoup for filelist:"
filelist_old.get_filelist(torrent_id, protocol)
try:
comments.get_comments(torrent_id, protocol)
except ValueError:
print "Falling back to BeautifulSoup for comments:"
comments_old.get_comments(torrent_id, protocol)
filelist.get_filelist(torrent_id, protocol)
comments.get_comments(torrent_id, protocol)
elif (tp_status_code == 404):
print "Skipping filelist..."
print "Skipping comments..."
@@ -90,11 +58,7 @@ def main():
tp_status_code = torrent_page.get_torrent_page(torrent_id, protocol)
if (tp_status_code == 200):
filelist.get_filelist(torrent_id, protocol)
try:
comments.get_comments(torrent_id, protocol)
except ValueError:
print "Falling back to BeautifulSoup for comments:"
comments_old.get_comments(torrent_id, protocol)
comments.get_comments(torrent_id, protocol)
else:
print "Skipping filelist..."
print "Skipping comments..."
@@ -108,11 +72,7 @@ def main():
tp_status_code = torrent_page.get_torrent_page(torrent_id, protocol)
if (tp_status_code == 200):
filelist.get_filelist(torrent_id, protocol)
try:
comments.get_comments(torrent_id, protocol)
except ValueError:
print "Falling back to BeautifulSoup for comments:"
comments_old.get_comments(torrent_id, protocol)
comments.get_comments(torrent_id, protocol)
else:
print "Skipping filelist..."
print "Skipping comments..."
@@ -142,19 +102,16 @@ def main():
torrent_id = sys.argv[1+offset]
print torrent_id
main()
print str(float(100000 - (torrent_id % 100000) ) / 1000.0) + "%"

elif (len(sys.argv) == 3+offset):
if (int(sys.argv[1+offset]) > (int(sys.argv[2+offset])+1)):
for torrent_id in range(int(sys.argv[1+offset]),int(sys.argv[2+offset])-1, -1):
print torrent_id
main()
print str(float(100000 - (torrent_id % 100000) ) / 1000.0) + "%"
else:
for torrent_id in range(int(sys.argv[1+offset]),int(sys.argv[2+offset])+1):
print torrent_id
main()
print str(float(100000 - (torrent_id % 100000) ) / 1000.0) + "%"

elif (len(sys.argv) > 3 and not https):
print "ERROR: Too many arguments"
10 changes: 5 additions & 5 deletions filelist.py
100755 → 100644
@@ -22,13 +22,13 @@
import lxml.html
from lxml.html.soupparser import fromstring

import unicodecsv
import csv

empty_filelist = u'<div style="background:#FFFFFF none repeat scroll 0%clear:left;margin:0;min-height:0px;padding:0;width:100%;">\n<table style="border:0pt none;width:100%;font-family:verdana,Arial,Helvetica,sans-serif;font-size:11px;">\n</table>\n</div>\n'

def get_filelist(torrent_id, protocol):
print "Getting filelist:",
r = requests.get(protocol + "://thepiratebay.se/ajax_details_filelist.php?id=" + str(torrent_id), headers={'user-agent': 'Archiving The Pirate Bay!'})
r = requests.get(protocol + "://thepiratebay.sx/ajax_details_filelist.php?id=" + str(torrent_id), headers={'user-agent': 'Archiving The Pirate Bay!'})
if (r.status_code == 200):
if (unicode(r.content.decode('ISO-8859-1')) == empty_filelist):
print str(r.status_code) + ", but no filelist"
@@ -54,14 +54,14 @@ def get_filelist(torrent_id, protocol):

filelist_csv = open(path + "/filelist.csv", 'w')
filelist_csv.write(u'\ufeff'.encode('utf-8')) # BOM
csv_writer = unicodecsv.writer(filelist_csv, encoding='utf-8')
csv_writer = csv.writer(filelist_csv)
csv_writer.writerow(['Filename','Size','Unit'])
html = fromstring(r.content.decode('ISO-8859-1').replace('</td><td align="right">',u'\xa0'))
html = fromstring(unicode(r.content, 'utf-8').replace('</td><td align="right">',u'\xa0'))
filetable = [fileentry.split(u'\xa0') for fileentry in html.xpath('div/table')[0].text_content().split('\n')[1:-1]]
for entry in filetable:
entry[-1] = entry[-1][0]
for entry in filetable:
csv_writer.writerow([column.encode('utf-8', 'replace') for column in entry])
csv_writer.writerow([column.encode('utf-8') for column in entry])
else:
print r.status_code

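As a rough sketch of the delimiter trick the updated filelist.py keeps using (Python 2): the boundary between the filename cell and the size cell is replaced with a non-breaking space before parsing, each row's text then splits into filename, size, and unit, and only the unit's first letter is kept. The sample markup and function name below are made up, and the sketch walks the table rows directly, whereas the script itself splits the flattened table text on newlines.

from lxml.html.soupparser import fromstring

# Made-up markup in the same shape as the ajax filelist response.
SAMPLE = (u'<div><table>\n'
          u'<tr><td>example.iso</td><td align="right">700.23\xa0MiB</td></tr>\n'
          u'</table></div>\n')

def parse_filelist(markup):
    # Replacing the cell boundary with \xa0 makes each row collapse to
    # one delimited string: "filename\xa0size\xa0unit".
    html = fromstring(markup.replace('</td><td align="right">', u'\xa0'))
    rows = []
    for tr in html.xpath('.//tr'):
        fields = tr.text_content().split(u'\xa0')
        fields[-1] = fields[-1][0]  # keep only the unit's leading letter (B, K, M, G, T)
        rows.append(fields)
    return rows

print parse_filelist(SAMPLE)  # [[u'example.iso', u'700.23', u'M']]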
