-
Notifications
You must be signed in to change notification settings - Fork 5
/
utils.py
34 lines (25 loc) · 1008 Bytes
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
# -*- coding: utf-8 -*-
import re
# TODO: add allowed starting punctuation
# From boltons
#
# Both patterns only fire at the start of the text or right after
# whitespace, which keeps URL anchors ("http://site/#frag") and e-mail
# addresses ("me@example.com") from being reported.
#
# The full-width marks are spelled as explicit escapes (U+FF03 FULLWIDTH
# NUMBER SIGN, U+FF20 FULLWIDTH COMMERCIAL AT) inside unicode literals so
# they cannot be silently folded into their ASCII look-alikes by an editor
# or an encoding round-trip.
HASHTAG_RE = re.compile(u"(?:^|\\s)[#\uff03](\\w+)", re.UNICODE)
MENTION_RE = re.compile(u"(?:^|\\s)[@\uff20]([^\\s#<>\\[\\]|{}]+)", re.UNICODE)
def to_unicode(obj):
    """Coerce ``obj`` to a text (unicode) string.

    On Python 2 the conversion first uses the interpreter's default
    codec and, if that raises ``UnicodeDecodeError``, retries decoding
    as UTF-8.  On Python 3, where the ``unicode`` builtin does not
    exist, byte strings are decoded as UTF-8 and everything else goes
    through ``str``.

    :param obj: a text string, a byte string, or any object convertible
        to text.
    :returns: the text representation of ``obj``.
    :raises UnicodeDecodeError: if a byte string is not valid UTF-8.
    """
    try:
        text_type = unicode  # noqa: F821 -- only defined on Python 2
    except NameError:
        text_type = str

    # On Python 3, str(b"...") would return the "b'...'" repr instead of
    # decoding, so bytes must be decoded explicitly.  On Python 2
    # text_type is ``unicode`` and this branch is never taken.
    if text_type is str and isinstance(obj, (bytes, bytearray)):
        return obj.decode('utf8')

    try:
        return text_type(obj)
    except UnicodeDecodeError:
        return text_type(obj, encoding='utf8')
def find_hashtags(string):
    """Return every hashtag found in ``string``, without its hashmark.

    A hashtag is only recognised at the very start of the text or
    directly after whitespace (the accepted hashmark characters are
    defined by ``HASHTAG_RE``), so URL anchors are not reported.

    >>> find_hashtags('#atag http://asite/#ananchor')
    ['atag']

    Hashtags made of non-ASCII word characters are matched as well.
    """
    # This example works too; doctest just has trouble with it:
    # >>> find_hashtags(u"can't get enough of that dignity chicken #肯德基 woo")
    # [u'\u80af\u5fb7\u57fa']
    text = to_unicode(string)
    return HASHTAG_RE.findall(text)
def find_mentions(string):
    """Return every mention found in ``string``, without its leading mark.

    The input is first converted to text with ``to_unicode``.  A mention
    is only recognised at the very start of the text or directly after
    whitespace; the name runs until the next whitespace character or
    one of ``# < > [ ] | { }`` (see ``MENTION_RE``).
    """
    text = to_unicode(string)
    return MENTION_RE.findall(text)