Refactor with the Mixin design pattern: slim down the main file and adjust the code hierarchy to improve readability
blmoistawinde committed Oct 7, 2020
1 parent 642320a commit bdd1098
Showing 19 changed files with 808 additions and 721 deletions.
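
The commit moves feature methods out of harvesttext/harvesttext.py into per-feature mixin classes (EntNetworkMixin and EntRetrieveMixin appear below) that the main class composes through multiple inheritance. The following is a minimal, self-contained sketch of the pattern; the toy entity_linking stands in for the real linker, and the one-mixin inheritance list here is illustrative rather than taken from this commit:

from collections import defaultdict

class EntRetrieveMixin:
    # feature methods live in the mixin but call state/methods
    # that the composing class must supply (here: entity_linking)
    def build_index(self, docs):
        inv_index = defaultdict(set)
        for i, doc in enumerate(docs):
            for entity in self.entity_linking(doc):
                inv_index[entity].add(i)
        return inv_index

class HarvestText(EntRetrieveMixin):
    # the main file stays slim: core state and methods only
    def entity_linking(self, doc):
        return doc.split()  # toy stand-in for the real linker

ht = HarvestText()
print(dict(ht.build_index(["A B", "B C"])))  # {'A': {0}, 'B': {0, 1}, 'C': {1}}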
3 changes: 2 additions & 1 deletion docs/conf.py
@@ -13,6 +13,7 @@
import os
import sys
sys.path.insert(0, os.path.abspath('..'))
+from harvesttext import __version__


# -- Project information -----------------------------------------------------
@@ -22,7 +23,7 @@
author = 'blmoistawinde'

# The full version, including alpha/beta/rc tags
-release = '0.7.4.2'
+release = __version__

github_doc_root = 'https://github.com/blmoistawinde/HarvestText/tree/master/doc'

2 changes: 1 addition & 1 deletion examples/basics.py
@@ -189,7 +189,7 @@ def test_case(text0,entity_mention_dict,strategy,entity_type_dict=None,**kwargs)


def find_with_rules():
-    from harvesttext.match_patterns import UpperFirst, AllEnglish, Contains, StartsWith, EndsWith
+    from harvesttext.algorithms.match_patterns import UpperFirst, AllEnglish, Contains, StartsWith, EndsWith
    # some more patterns are provided
text0 = "我喜欢Python,因为requests库很适合爬虫"
ht0 = HarvestText()
2 changes: 1 addition & 1 deletion examples/naiveKGQA.py
@@ -4,7 +4,7 @@
"""

from harvesttext.harvesttext import HarvestText
-from rdflib import URIRef,Graph,Namespace,Literal
+from rdflib import URIRef, Graph, Namespace, Literal
from pyxdameraulevenshtein import damerau_levenshtein_distance as edit_dis
import numpy as np

4 changes: 3 additions & 1 deletion harvesttext/__init__.py
@@ -1,9 +1,11 @@
#coding=utf-8
+#!/usr/bin/env python
+import pickle
from .harvesttext import HarvestText
from .resources import *

-import pickle
+__version__ = '0.8'

def saveHT(htModel,filename):
with open(filename, "wb") as f:
htModel.prepared = False
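
A usage sketch for the save helper above; loadHT is assumed to be its counterpart, defined in the part of this file that the diff truncates:

from harvesttext import HarvestText, saveHT, loadHT

ht = HarvestText()
saveHT(ht, "ht_model.pkl")    # resets ht.prepared, then pickles the model
ht2 = loadHT("ht_model.pkl")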
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
123 changes: 123 additions & 0 deletions harvesttext/ent_network.py
@@ -0,0 +1,123 @@
import networkx as nx
from itertools import combinations

class EntNetworkMixin:
"""
实体网络模块:
- 根据实体在文档中的共现关系
- 建立全局社交网络
- 建立以某一个实体为中心的社交网络
"""
def build_entity_graph(self, docs, min_freq=0, inv_index={}, used_types=[]):
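        '''Build a global co-occurrence graph over the entities linked in docs.

        :param docs: list of texts
        :param min_freq: minimum co-occurrence count for an edge to be kept
        :param inv_index: inverted index of the docs; when given, edge counts are read from it instead of re-linking every text
        :param used_types: only keep entities of these types (empty list = no filter)
        :return: G (a networkx Graph), also stored as self.entity_graph
        '''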
G = nx.Graph()
links = {}
if len(inv_index) == 0:
for i, sent in enumerate(docs):
entities_info = self.entity_linking(sent)
if len(used_types) == 0:
entities = set(entity for span, (entity, type0) in entities_info)
else:
entities = set(entity for span, (entity, type0) in entities_info if type0[1:-1] in used_types)
for u, v in combinations(entities, 2):
pair0 = tuple(sorted((u, v)))
if pair0 not in links:
links[pair0] = 1
else:
links[pair0] += 1
        else:  # an inverted index is already available, so retrieval is faster
if len(used_types) == 0:
entities = self.entity_type_dict.keys()
else:
entities = iter(entity for (entity, type0) in self.entity_type_dict.items() if type0 in used_types)
for u, v in combinations(entities, 2):
pair0 = tuple(sorted((u, v)))
ids = inv_index[u] & inv_index[v]
if len(ids) > 0:
links[pair0] = len(ids)
for (u, v) in links:
if links[(u, v)] >= min_freq:
G.add_edge(u, v, weight=links[(u, v)])
self.entity_graph = G
return G

def build_word_ego_graph(self, docs, word, standard_name=True, min_freq=0, other_min_freq=-1, stopwords=None):
        '''Build a graph of word relations centered on a given keyword, based on the given texts.
        The keyword can stand for a particular aspect (for documents about things like food, clothing, housing and transport),
        in which case the ego graph gives a quick summary of that aspect.

        :param docs: list of texts
        :param word: the central keyword
        :param standard_name: map all entity mentions to their standard entity names
        :param stopwords: stopwords to filter out
        :param min_freq: minimum co-occurrence count with the central word for an edge to be added, used to prune a possible excess of edges
        :param other_min_freq: minimum co-occurrence count for relations between non-central words
        :return: G (a networkx Graph)
        '''
G = nx.Graph()
links = {}
if other_min_freq == -1:
other_min_freq = min_freq
for doc in docs:
if stopwords:
words = set(x for x in self.seg(doc, standard_name=standard_name) if x not in stopwords)
else:
words = self.seg(doc, standard_name=standard_name)
if word in words:
for u, v in combinations(words, 2):
pair0 = tuple(sorted((u, v)))
if pair0 not in links:
links[pair0] = 1
else:
links[pair0] += 1

        used_nodes = set([word])  # words in a relation pair must be connected to the central word (co-occurrence >= min_freq)
for (u, v) in links:
w = links[(u, v)]
if word in (u, v) and w >= min_freq:
used_nodes.add(v if word == u else u)
G.add_edge(u, v, weight=w)
elif w >= other_min_freq:
G.add_edge(u, v, weight=w)
G = G.subgraph(used_nodes).copy()
return G

def build_entity_ego_graph(self, docs, word, min_freq=0, other_min_freq=-1, inv_index={}, used_types=[]):
        '''Entity-only version of build_word_ego_graph(): builds the ego network over linked entities instead of words.
        '''
G = nx.Graph()
links = {}
if other_min_freq == -1:
other_min_freq = min_freq
if len(inv_index) != 0:
related_docs = self.search_entity(word, docs, inv_index)
else:
related_docs = []
for doc in docs:
entities_info = self.entity_linking(doc)
entities = [entity0 for [[l,r], (entity0,type0)] in entities_info]
if word in entities:
related_docs.append(doc)

for i, sent in enumerate(related_docs):
entities_info = self.entity_linking(sent)
if len(used_types) == 0:
entities = set(entity for span, (entity, type0) in entities_info)
else:
entities = set(entity for span, (entity, type0) in entities_info if type0[1:-1] in used_types)
for u, v in combinations(entities, 2):
pair0 = tuple(sorted((u, v)))
if pair0 not in links:
links[pair0] = 1
else:
links[pair0] += 1

        used_nodes = set([word])  # entities in a relation pair must be connected to the central entity (co-occurrence >= min_freq)
for (u, v) in links:
w = links[(u, v)]
if word in (u, v) and w >= min_freq:
used_nodes.add(v if word == u else u)
G.add_edge(u, v, weight=w)
elif w >= other_min_freq:
G.add_edge(u, v, weight=w)
G = G.subgraph(used_nodes).copy()
return G
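
A usage sketch for the methods above, assuming HarvestText composes EntNetworkMixin after this refactor; the sample entities follow the add_entities API from the project README, and the printed weight depends on how linking resolves the mentions:

from harvesttext import HarvestText

ht = HarvestText()
ht.add_entities({"武磊": ["武球王"], "郜林": ["郜飞机"]},
                {"武磊": "球员", "郜林": "球员"})
docs = ["武磊和郜林都是前锋", "武球王进球了", "郜林助攻武磊"]
G = ht.build_entity_graph(docs, used_types=["球员"])
print(G.edges(data=True))  # e.g. [('武磊', '郜林', {'weight': 2})]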
38 changes: 38 additions & 0 deletions harvesttext/ent_retrieve.py
@@ -0,0 +1,38 @@
import numpy as np
from collections import defaultdict

class EntRetrieveMixin:
"""
实体检索模块:
- 基于倒排索引快速检索包括某个实体的文档,以及统计出现某实体的文档数目
"""
def build_index(self, docs, with_entity=True, with_type=True):
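        '''Build an inverted index over docs, mapping each linked entity (with_entity) and/or its type tag (with_type) to the set of ids of the docs that contain it.'''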
inv_index = defaultdict(set)
for i, sent in enumerate(docs):
entities_info = self.entity_linking(sent)
for span, (entity, type0) in entities_info:
if with_entity:
inv_index[entity].add(i)
if with_type:
inv_index[type0].add(i)
return inv_index

def get_entity_counts(self, docs, inv_index, used_type=[]):
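        '''Count, for each known entity (optionally restricted to the types in used_type), the number of docs it appears in according to inv_index.'''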
if len(used_type) > 0:
entities = iter(x for x in self.entity_type_dict
if self.entity_type_dict[x] in used_type)
else:
entities = self.entity_type_dict.keys()
cnt = {enty: len(inv_index[enty]) for enty in entities if enty in inv_index}
return cnt

def search_entity(self, query, docs, inv_index):
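        '''Return the docs that contain every whitespace-separated term of query; terms may be entity names or type tags present in inv_index.'''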
words = query.split()
if len(words) > 0:
ids = inv_index[words[0]]
for word in words[1:]:
ids = ids & inv_index[word]
np_docs = np.array(docs)[list(ids)]
return np_docs.tolist()
else:
return []
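
A similar sketch for the retrieval helpers, under the same assumptions as the entity-network example above; exact counts depend on the linking step:

from harvesttext import HarvestText

ht = HarvestText()
ht.add_entities({"武磊": ["武球王"], "郜林": ["郜飞机"]},
                {"武磊": "球员", "郜林": "球员"})
docs = ["武球王进球了", "郜林传中", "武磊和郜林配合"]
inv_index = ht.build_index(docs)
print(ht.get_entity_counts(docs, inv_index))          # e.g. {'武磊': 2, '郜林': 2}
print(ht.search_entity("武磊 郜林", docs, inv_index))  # e.g. ['武磊和郜林配合']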