-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpychan_utils.py
80 lines (69 loc) · 2.73 KB
/
pychan_utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
import urllib2
from re import sub, match
from HTMLParser import HTMLParser
######################## Text Processing Utilities ########################
class PyChanUtils():
    """
    Static text-processing helpers that turn a raw HTML comment into
    plain, normalised text.

    All methods take a comment string and return a new string.
    """

    @staticmethod
    def strip_html(comment):
        """
        Unescape HTML entities and replace the known markup tags.

        <br> / <wbr> tags become newlines; opening and closing <a>, <span>
        and <pre> tags become single spaces. Any other markup is left as-is.
        """
        # HTMLParser.unescape was removed in Python 3.9; prefer html.unescape
        # when it exists and fall back to the Python 2 spelling otherwise.
        try:
            from html import unescape
        except ImportError:
            from HTMLParser import HTMLParser
            unescape = HTMLParser().unescape

        comment = unescape(comment)
        comment = sub(r"<w?br/?>", "\n", comment)
        # The href value is matched with a negated class rather than a greedy
        # ".+": with two links on one line the greedy form spans from the
        # first href to the last class attribute and swallows the text between.
        comment = sub(r'<a href="[^"\n]+" class="\w+">', " ", comment)
        comment = sub(r"</a>", " ", comment)
        comment = sub(r'<span class="\w+">', " ", comment)
        comment = sub(r"</span>", " ", comment)
        comment = sub(r'<pre class="\w+">', " ", comment)
        comment = sub(r"</pre>", " ", comment)
        return comment

    @staticmethod
    def exclude_replies(comment):
        """
        Remove reply references of the form >>12345.

        Lines that start with a reference (ignoring surrounding whitespace)
        are dropped entirely; a reference elsewhere in a line that is
        followed by a space is replaced by a single space.
        """
        kept = [line for line in comment.split("\n")
                if match(r">>\d+", line.strip()) is None]
        return sub(r">>\d+ ", " ", "\n".join(kept))

    @staticmethod
    def exclude_greentext_lines(comment):
        """
        Drop greentext lines: lines whose stripped form starts with a single
        '>' followed by at least one character other than '>'.
        """
        kept = [line for line in comment.split("\n")
                if match(r"^>[^>]+", line.strip()) is None]
        return "\n".join(kept)

    @staticmethod
    def exclude_normal_lines(comment):
        """
        Keep only greentext lines, dropping every other line.

        This is the complement of exclude_greentext_lines and uses the same
        greentext test. The previous implementation had the filter inverted:
        it kept the normal lines it was named to exclude.
        """
        kept = [line for line in comment.split("\n")
                if match(r"^>[^>]+", line.strip()) is not None]
        return "\n".join(kept)

    @staticmethod
    def full_preprocess(comment, include_greentext=True):
        """
        Run the whole cleaning pipeline and return lowercase text made of
        the letters a-z separated by single spaces.

        comment           -- raw HTML comment text
        include_greentext -- when False, greentext lines are removed before
                             normalisation
        """
        comment = PyChanUtils.strip_html(comment)
        comment = PyChanUtils.exclude_replies(comment)
        if not include_greentext:
            comment = PyChanUtils.exclude_greentext_lines(comment)

        comment = sub(r"[^\x00-\x7F]", " ", comment)
        comment = comment.lower()
        # Leftover entity names that survived unescaping.
        comment = sub(r"&(amp|lt|gt|ge|le)(;|)", " ", comment)
        # Stop a URL at any whitespace. Stopping only at a space let a URL at
        # the end of a line also consume the first word of the next line.
        comment = sub(r"http\S*", " ", comment)
        # Keep hyphens and apostrophes through the character filter so they
        # can be deleted (not spaced) next: "don't" -> "dont".
        comment = sub(r"[^a-z \-']+", " ", comment)
        comment = sub(r"'|-", "", comment)
        # Only spaces remain as whitespace here (newlines were replaced by the
        # character filter), so collapsing runs is all that is left to do.
        comment = sub(r"\s\s+", " ", comment)
        return str(comment).strip()
######################## HTTP Request Utilities ########################
class PyChanRequest():
    """
    Minimal HTTP helper for fetching a resource by URL.
    """

    @staticmethod
    def get(url, user_agent="pychan"):
        """
        Naive URL handler; retrieves a resource from a URL.

        url        -- address to fetch
        user_agent -- value sent in the User-Agent request header

        Returns the response body as read from the connection.
        Raises URLError when the response status code is not 200; errors
        raised by urlopen itself propagate unchanged.
        """
        # Resolve the urllib names locally so the method works with either
        # the Python 2 or the Python 3 module layout.
        try:
            from urllib2 import Request, urlopen, URLError
        except ImportError:
            from urllib.request import Request, urlopen
            from urllib.error import URLError

        request = Request(url)
        request.add_header("User-Agent", user_agent)
        response = urlopen(request)
        # Close the response on every path; previously it stayed open when
        # the status was not 200 or when read() raised.
        try:
            if response.code != 200:
                raise URLError("Status: %s" % response.code)
            return response.read()
        finally:
            response.close()