Skip to content

Commit

Permalink
function words map
Browse files Browse the repository at this point in the history
  • Loading branch information
vito-go committed Mar 3, 2024
1 parent c59cb71 commit 9130f80
Show file tree
Hide file tree
Showing 2 changed files with 192 additions and 5 deletions.
7 changes: 2 additions & 5 deletions mywords-go/artical/article.go
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ func ParseSourceUrl(sourceUrl string, expr string, proxyUrl *url.URL) (*Article,
}

// ParseVersion is the parser version. If an article file's version differs
// from it, the article is re-parsed when its page is opened, but the parse
// time is not updated.
const ParseVersion = "0.0.6"
const ParseVersion = "0.0.7"

// regSentenceSplit matches four consecutive non-space characters, a literal
// period, a single space, and then either an ASCII uppercase letter or a left
// double quotation mark (“).
// NOTE(review): presumably used to locate sentence boundaries while skipping
// short abbreviations — confirm against its call sites, which are not shown here.
var regSentenceSplit = regexp.MustCompile(`[^ ][^ ][^ ][^ ]\. [A-Z“]`)

Expand Down Expand Up @@ -162,7 +162,7 @@ loopSentences:
continue
}
//word = strings.TrimPrefix(word, "’")
if _, ok := meaninglessMap[strings.ToLower(word)]; ok {
if _, ok := functionWordsMap[strings.ToLower(word)]; ok {
continue
}
if len(word) < minLen {
Expand Down Expand Up @@ -271,6 +271,3 @@ func getRespBody(www string, proxyUrl *url.URL) ([]byte, error) {
}
return body, nil
}

// define meaninglessMap
var meaninglessMap = map[string]struct{}{}
190 changes: 190 additions & 0 deletions mywords-go/artical/function_words.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,190 @@
package artical

/*
英语中的虚词(Function Words)包括几类词汇,这些词在句子中起着重要的结构或者语法作用,但它们通常不带有实际的内容信息。虚词包括了以下几种词类:
1. 代词(Pronouns): 用于代替名词的词,如 I, you, he, she, it, we, they, me, him, her, us, them.
2. 冠词(Articles): 用于修饰名词,标明特指或泛指,如 the, a, an.
3. 介词(Prepositions): 表示时间、位置、方向、原因等关系的词,如 in, on, at, to, for, with, by, over, under.
4. 连词(Conjunctions): 用于连接词、短语或从句,如 and, but, or, because, although, if, when, while.
5. 助动词(Auxiliary Verbs): 用于形成时态、语态或语气的动词,如 be, do, have, will, can, could, should, might.
6. 情态动词(Modal Verbs): 表示可能性、能力、许可等概念的特殊助动词,如 can, could, may, might, must, shall, should, will, would.
7. 量词(Quantifiers): 用于表示数量或者程度的词,如 some, any, many, much, few, little, enough.
8. 限定词(Determiners): 用来限定名词的词,常见的有 this, that, these, those, my, your, his, her, its, our, their.
9. 疑问词(Interrogatives): 用于构成疑问的词,如 who, what, where, when, why, how.
10. 叹词(Interjections): 用来表达情感的词,如 oh, ah, wow, ouch.
虚词通常在句子中不承担主要的语义功能,但对于构建句子的语法结构是必不可少的。
在一些自然语言处理或文本分析的应用中,虚词往往在某些情况下会被过滤掉,以便更集中地分析内容词(Content Words)所携带的信息。
*/

// functionWordsMap is a set of English function words, keyed by the word in
// lowercase. Membership is the only information carried; the values are empty
// structs. Lookups should therefore be done with a lowercased word.
//
// The set is assembled once, at package initialisation, from the word list
// below. The grouping is for readability only and is approximate: several
// words belong to more than one grammatical category.
var functionWordsMap = func() map[string]struct{} {
	words := []string{
		// Articles.
		"the", "a", "an",

		// Personal, reflexive and indefinite pronouns.
		"they", "them",
		"myself", "yourself", "yourselves", "himself", "herself", "itself",
		"ourselves", "themselves",
		"anybody", "everybody", "somebody", "nobody", "everyone", "someone",
		"anything", "everything", "something", "nothing",

		// Determiners and quantifiers.
		"their", "this", "that", "these", "those", "such",
		"other", "another", "each", "every", "either", "neither",
		"some", "any", "many", "much", "few", "several", "plenty", "enough",
		"more", "most", "less", "least", "all", "both", "half",

		// Prepositions.
		"about", "above", "across", "after", "against", "along", "amid",
		"among", "around", "before", "behind", "below", "beneath", "beside",
		"between", "beyond", "during", "except", "inside", "outside", "on",
		"over", "through", "throughout", "toward", "towards", "under",
		"within", "without",

		// Conjunctions and connectives.
		"and", "but", "for", "nor", "or", "yet", "so",
		"because", "although", "unless", "however", "whereas",
		"whenever", "wherever", "while", "since",

		// Auxiliary and modal verbs, including inflected forms.
		"be", "is", "am", "are", "was", "were", "being", "been",
		"have", "has", "had", "having",
		"do", "does", "did", "doing",
		"can", "could", "may", "might", "must", "shall", "should",
		"will", "would", "ought", "need",

		// Interrogative and relative words.
		"which", "what", "whose", "who", "whom", "where", "when", "why", "how",
		"whoever", "whomever", "whatever", "whichever",

		// Existential "there".
		"there",

		// Interjections and fillers.
		"wow", "gee", "gosh", "ugh", "yikes", "oops", "alas",
		"hey", "ah", "oh", "ouch", "hmm",
		"yay", "phew", "bummer", "woah", "whoa", "hurray", "er", "um",
	}

	set := make(map[string]struct{}, len(words))
	for _, w := range words {
		set[w] = struct{}{}
	}
	return set
}()

0 comments on commit 9130f80

Please sign in to comment.