-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgenerate_features.py
136 lines (105 loc) · 4.51 KB
/
generate_features.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
import argparse
import logging
import pandas as pd
import spacy
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
import config
def possible_negation_prefix(text: str) -> bool:
    """
    Determine whether a token begins with a prefix that may signal negation.

    :param text: string containing a single token (expected lowercase)
    :return: True if the token starts with a known negation prefix, False otherwise
    """
    # Tokens shorter than 5 characters are rejected up front to limit wrong
    # prefix recognition such as "none" or "mist".
    if len(text) < 5:
        return False
    negation_prefixes = ("de", "dis", "im", "in", "ir", "il", "non", "un", "mis")
    return text.startswith(negation_prefixes)
def possible_negation_suffix(text: str) -> bool:
    """
    Determine whether a token ends with a suffix that may signal negation.

    :param text: string containing a single token (expected lowercase)
    :return: True if the token ends with a known negation suffix, False otherwise
    """
    # The minimum-length guard prevents the bare word "less" itself from
    # being flagged as containing the suffix.
    if len(text) < 5:
        return False
    negation_suffixes = ("less",)
    return text.endswith(negation_suffixes)
def generate_features(
    df: pd.DataFrame, spacy_model: str, language: str
) -> pd.DataFrame:
    """
    Extends the dataframe by adding columns for newly generated features.
    Lemma, pos-tag, snowballstem, porterstem, if it contains a possible negation prefix or suffix, next and previous lemma, next and previous pos-tag
    :param df: dataframe that contains the presented data in conll-format
    :param spacy_model: name of SpaCy model used for feature extraction
    :param language: language used as parameter of Snowball Stemmer
    :return: Pandas dataframe that contains 11 more columns containing the aforementioned features
    """
    logging.info("Loading Spacy model...")
    nlp = spacy.load(spacy_model)
    # Makes all tokens lowercase
    logging.info("Lowercase")
    df["token_lower"] = df["token"].str.lower()
    logging.info("Lemma, pos")
    # Each token is processed as its own doc; NER and parser are disabled
    # since only the tagger/lemmatizer output is used.
    spacy_pipe = nlp.pipe(df["token_lower"].values, disable=["ner", "parser"])
    features_gen = ((doc[0].lemma_, doc[0].pos_) for doc in spacy_pipe)
    df["lemma"], df["pos"] = zip(*features_gen)
    # Prepare stemmers
    logging.info("Loading Snowball Stemmer...")
    snow = SnowballStemmer(language=language)
    logging.info("Snowball stemmer")
    # Series.apply on the single column is much faster than a row-wise
    # DataFrame.apply(axis=1) and produces identical values.
    df["snowballStemmer"] = df["token_lower"].apply(snow.stem)
    logging.info("Loading Porter Stemmer...")
    port = PorterStemmer()
    logging.info("Porter stemmer")
    df["porterStemmer"] = df["token_lower"].apply(port.stem)
    # Adds columns with a binary if the word contains a possible negation prefix or suffix
    logging.info("Prefix")
    df["possible_prefix"] = df["token_lower"].apply(possible_negation_prefix)
    logging.info("Suffix")
    df["possible_suffix"] = df["token_lower"].apply(possible_negation_suffix)
    # Adds new columns for the previous and next lemma and pos-tag; shifting
    # leaves NaN at the boundaries (first row has no previous, last no next).
    logging.info("Add prev/next shifts")
    df["prev_Lemma"] = df["lemma"].shift(periods=1)
    df["next_Lemma"] = df["lemma"].shift(periods=-1)
    df["prev_pos"] = df["pos"].shift(periods=1)
    df["next_pos"] = df["pos"].shift(periods=-1)
    return df
def run_generate_features(
    data_file, features_file, spacy_model="en_core_web_sm", language="english"
):
    """
    Read a tab-separated CoNLL-format file, enrich it with lexical features
    and write the resulting dataframe to a tab-separated output file.

    :param data_file: path to the input data in CoNLL format
    :param features_file: path where the feature-extended file is written
    :param spacy_model: name of the SpaCy model used for feature extraction
    :param language: language passed through to the Snowball stemmer
    """
    logging.info(f"Loading data: {data_file}")
    column_names = ["corpus", "n_sent", "n_word", "token", "tag"]
    df = pd.read_csv(data_file, delimiter="\t", names=column_names)
    logging.info("Generating features...")
    enriched = generate_features(df, spacy_model=spacy_model, language=language)
    # Persist the enriched dataframe in the same tab-separated format.
    logging.info(f"Saving features to: {features_file}")
    enriched.to_csv(features_file, sep="\t")
    logging.info("\n" + str(enriched.head()))
if __name__ == "__main__":
    logging.basicConfig(level=config.LOG_LEVEL, format="%(asctime)s: %(message)s")
    parser = argparse.ArgumentParser(description="Generate lexical features")
    parser.add_argument("data_file", type=str, help="Data in CoNLL format.")
    # Fixed typo in help text ("Path out output file").
    parser.add_argument("features_file", type=str, help="Path to output file")
    parser.add_argument(
        "--spacy-model",
        required=False,
        type=str,
        default="en_core_web_sm",
        help="Name of SpaCy model",
    )
    parser.add_argument(
        "--language",
        required=False,
        type=str,
        default="english",
        # The language option is consumed by the SnowballStemmer
        # (PorterStemmer is English-only and takes no language argument).
        help="Language used by the Snowball stemmer",
    )
    args = parser.parse_args()
    # argparse converts "--spacy-model" to the "spacy_model" attribute,
    # so vars(args) maps directly onto the function's keyword parameters.
    run_generate_features(**vars(args))