-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathcleaning.py
61 lines (47 loc) · 2.02 KB
/
cleaning.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
# Implementation from https://github.com/Hironsan/natural-language-preprocessings/blob/master/preprocessings/ja/cleaning.py
import re
from bs4 import BeautifulSoup
from utils_nlp.dataset.livedoor import load_pandas_df
def clean_text(text):
    """Normalize raw text by lowercasing it and stripping noisy tokens.

    The following steps are applied in order:

    1. lowercase the whole string;
    2. replace lenticular brackets, parentheses and square brackets
       (both ASCII and full-width forms) with a single space;
    3. delete mentions (``@name`` written with an ASCII or full-width at sign);
    4. delete ``http://`` / ``https://`` URLs together with the one
       whitespace character (CR, LF or space) that terminates them, if any;
    5. convert full-width (ideographic) spaces to ASCII spaces.

    :param text: the string to clean.
    :return: the cleaned string.
    """
    # NOTE: header lines are not skipped here; an earlier variant that dropped
    # the first two lines of the input is disabled.
    replaced_text = text.lower()
    # Lenticular brackets.
    replaced_text = re.sub(r'[【】]', ' ', replaced_text)
    # Parentheses: ASCII plus full-width U+FF08 / U+FF09.
    replaced_text = re.sub(r'[()\uff08\uff09]', ' ', replaced_text)
    # Square brackets: ASCII plus full-width U+FF3B / U+FF3D.
    replaced_text = re.sub(r'[\[\]\uff3b\uff3d]', ' ', replaced_text)
    # Mentions: ASCII "@" or full-width U+FF20 followed by word characters.
    replaced_text = re.sub(r'[@\uff20]\w+', '', replaced_text)
    # URLs: consume up to and including the terminating CR/LF/space, or up to
    # the end of the string so that a trailing URL is removed as well.
    replaced_text = re.sub(
        r'https?://[^\r\n ]*(?:[\r\n ]|$)', '', replaced_text
    )
    # Full-width (ideographic) space U+3000 -> ASCII space.
    replaced_text = re.sub(r'\u3000', ' ', replaced_text)
    return replaced_text
def clean_html_tags(html_text):
    """Return the text content of ``html_text`` with markup removed.

    The input is parsed with BeautifulSoup's ``html.parser`` backend, the
    document text is extracted, and all line breaks are dropped by joining
    the individual lines back together without a separator.

    :param html_text: HTML source as a string.
    :return: the tag-free text collapsed onto a single line.
    """
    parsed = BeautifulSoup(html_text, 'html.parser')
    text_lines = parsed.get_text().splitlines()
    return ''.join(text_lines)
def clean_html_and_js_tags(html_text):
    """Return the text of ``html_text`` without markup, scripts or styles.

    ``<script>`` and ``<style>`` elements are detached from the parsed tree
    before the text is extracted, so their contents do not appear in the
    result. Line breaks in the extracted text are then dropped by joining
    the lines without a separator.

    :param html_text: HTML source as a string.
    :return: the visible text collapsed onto a single line.
    """
    soup = BeautifulSoup(html_text, 'html.parser')
    # Plain loop instead of a throwaway list comprehension; ``find_all`` is
    # the current name of the deprecated ``findAll`` alias.
    for element in soup.find_all(['script', 'style']):
        element.extract()
    cleaned_text = soup.get_text()
    cleaned_text = ''.join(cleaned_text.splitlines())
    return cleaned_text
def clean_url(html_text):
    """Remove URL-like tokens from ``html_text``.

    Every occurrence of ``http`` followed by one or more non-whitespace
    characters is deleted; the match runs up to the next whitespace
    character or the end of the string.

    :param html_text: the string to process.
    :return: the string with those tokens removed.
    """
    url_pattern = re.compile(r'http\S+')
    without_urls = url_pattern.sub('', html_text)
    return without_urls
def clean_code(html_text):
    """Remove Qiita code blocks and return the remaining text.

    Every element carrying the CSS class ``code-frame`` is detached from the
    parsed tree before the text is extracted. Line breaks in the extracted
    text are then dropped by joining the lines without a separator.

    :param html_text: HTML source as a string.
    :return: the text without code blocks, collapsed onto a single line.
    """
    soup = BeautifulSoup(html_text, 'html.parser')
    # Plain loop instead of a throwaway list comprehension; ``find_all`` is
    # the current name of the deprecated ``findAll`` alias.
    for element in soup.find_all(class_="code-frame"):
        element.extract()
    cleaned_text = soup.get_text()
    cleaned_text = ''.join(cleaned_text.splitlines())
    return cleaned_text
if __name__ == '__main__':
    # Smoke run: load ten rows, clean the 'text' column, and print the head
    # of the resulting frame.
    sample = load_pandas_df(nrows=10)
    cleaned_column = sample['text'].map(clean_text)
    sample['text'] = cleaned_column
    print(sample.head())