From bdd109872ec8763759b5abd196ca661215982952 Mon Sep 17 00:00:00 2001 From: Zhiling Zhang <1840962220@qq.com> Date: Wed, 7 Oct 2020 17:09:35 +0800 Subject: [PATCH] Refactor with the Mixin design pattern to slim down the main file and adjust the code hierarchy for better readability MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/conf.py | 3 +- examples/basics.py | 2 +- examples/naiveKGQA.py | 2 +- harvesttext/__init__.py | 4 +- .../{ => algorithms}/entity_discoverer.py | 0 .../{ => algorithms}/match_patterns.py | 0 harvesttext/{ => algorithms}/sent_dict.py | 0 harvesttext/{ => algorithms}/texttile.py | 0 harvesttext/{ => algorithms}/utils.py | 0 .../{ => algorithms}/word_discoverer.py | 0 harvesttext/ent_network.py | 123 +++ harvesttext/ent_retrieve.py | 38 + harvesttext/harvesttext.py | 783 ++---------------- harvesttext/parsing.py | 189 +++++ harvesttext/resources.py | 29 +- harvesttext/sentiment.py | 44 + harvesttext/summary.py | 72 ++ harvesttext/word_discover.py | 237 ++++++ setup.py | 3 +- 19 files changed, 808 insertions(+), 721 deletions(-) rename harvesttext/{ => algorithms}/entity_discoverer.py (100%) rename harvesttext/{ => algorithms}/match_patterns.py (100%) rename harvesttext/{ => algorithms}/sent_dict.py (100%) rename harvesttext/{ => algorithms}/texttile.py (100%) rename harvesttext/{ => algorithms}/utils.py (100%) rename harvesttext/{ => algorithms}/word_discoverer.py (100%) create mode 100644 harvesttext/ent_network.py create mode 100644 harvesttext/ent_retrieve.py create mode 100644 harvesttext/parsing.py create mode 100644 harvesttext/sentiment.py create mode 100644 harvesttext/summary.py create mode 100644 harvesttext/word_discover.py diff --git a/docs/conf.py b/docs/conf.py index d64d782..1d8fb36 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -13,6 +13,7 @@ import os import sys sys.path.insert(0, os.path.abspath('..')) +from harvesttext import __version__ # -- Project information ----------------------------------------------------- @@ -22,7 +23,7 @@ author = 'blmoistawinde' # The full version, including alpha/beta/rc tags -release = '0.7.4.2' +release = __version__ github_doc_root = 'https://github.com/blmoistawinde/HarvestText/tree/master/doc' diff --git a/examples/basics.py b/examples/basics.py index 5c24978..ab538e0 100644 --- a/examples/basics.py +++ b/examples/basics.py @@ -189,7 +189,7 @@ def test_case(text0,entity_mention_dict,strategy,entity_type_dict=None,**kwargs) def find_with_rules(): - from harvesttext.match_patterns import UpperFirst, AllEnglish, Contains, StartsWith, EndsWith + from harvesttext.algorithms.match_patterns import UpperFirst, AllEnglish, Contains, StartsWith, EndsWith # some more patterns is provided text0 = "我喜欢Python,因为requests库很适合爬虫" ht0 = HarvestText() diff --git a/examples/naiveKGQA.py b/examples/naiveKGQA.py index 9a32cfc..81cdc1f 100644 --- a/examples/naiveKGQA.py +++ b/examples/naiveKGQA.py @@ -4,7 +4,7 @@ """ from harvesttext.harvesttext import HarvestText -from rdflib import URIRef,Graph,Namespace,Literal +from rdflib import URIRef, Graph, Namespace, Literal from pyxdameraulevenshtein import damerau_levenshtein_distance as edit_dis import numpy as np diff --git a/harvesttext/__init__.py b/harvesttext/__init__.py index
1c6c05b..f5ce1aa 100644 --- a/harvesttext/__init__.py +++ b/harvesttext/__init__.py @@ -1,9 +1,11 @@ #coding=utf-8 #!/usr/bin/env python +import pickle from .harvesttext import HarvestText from .resources import * -import pickle +__version__ = '0.8' + def saveHT(htModel,filename): with open(filename, "wb") as f: htModel.prepared = False diff --git a/harvesttext/entity_discoverer.py b/harvesttext/algorithms/entity_discoverer.py similarity index 100% rename from harvesttext/entity_discoverer.py rename to harvesttext/algorithms/entity_discoverer.py diff --git a/harvesttext/match_patterns.py b/harvesttext/algorithms/match_patterns.py similarity index 100% rename from harvesttext/match_patterns.py rename to harvesttext/algorithms/match_patterns.py diff --git a/harvesttext/sent_dict.py b/harvesttext/algorithms/sent_dict.py similarity index 100% rename from harvesttext/sent_dict.py rename to harvesttext/algorithms/sent_dict.py diff --git a/harvesttext/texttile.py b/harvesttext/algorithms/texttile.py similarity index 100% rename from harvesttext/texttile.py rename to harvesttext/algorithms/texttile.py diff --git a/harvesttext/utils.py b/harvesttext/algorithms/utils.py similarity index 100% rename from harvesttext/utils.py rename to harvesttext/algorithms/utils.py diff --git a/harvesttext/word_discoverer.py b/harvesttext/algorithms/word_discoverer.py similarity index 100% rename from harvesttext/word_discoverer.py rename to harvesttext/algorithms/word_discoverer.py diff --git a/harvesttext/ent_network.py b/harvesttext/ent_network.py new file mode 100644 index 0000000..df80c45 --- /dev/null +++ b/harvesttext/ent_network.py @@ -0,0 +1,123 @@ +import networkx as nx +from itertools import combinations + +class EntNetworkMixin: + """ + 实体网络模块: + - 根据实体在文档中的共现关系 + - 建立全局社交网络 + - 建立以某一个实体为中心的社交网络 + """ + def build_entity_graph(self, docs, min_freq=0, inv_index={}, used_types=[]): + G = nx.Graph() + links = {} + if len(inv_index) == 0: + for i, sent in enumerate(docs): + entities_info = self.entity_linking(sent) + if len(used_types) == 0: + entities = set(entity for span, (entity, type0) in entities_info) + else: + entities = set(entity for span, (entity, type0) in entities_info if type0[1:-1] in used_types) + for u, v in combinations(entities, 2): + pair0 = tuple(sorted((u, v))) + if pair0 not in links: + links[pair0] = 1 + else: + links[pair0] += 1 + else: # 已经有倒排文档,可以更快速检索 + if len(used_types) == 0: + entities = self.entity_type_dict.keys() + else: + entities = iter(entity for (entity, type0) in self.entity_type_dict.items() if type0 in used_types) + for u, v in combinations(entities, 2): + pair0 = tuple(sorted((u, v))) + ids = inv_index[u] & inv_index[v] + if len(ids) > 0: + links[pair0] = len(ids) + for (u, v) in links: + if links[(u, v)] >= min_freq: + G.add_edge(u, v, weight=links[(u, v)]) + self.entity_graph = G + return G + + def build_word_ego_graph(self, docs, word, standard_name=True, min_freq=0, other_min_freq=-1, stopwords=None): + '''根据文本和指定限定词,获得以限定词为中心的各词语的关系。 + 限定词可以是一个特定的方面(衣食住行这类文档),这样就可以从词语中心图中获得关于这个方面的简要信息 + + :param docs: 文本的列表 + :param word: 限定词 + :param standard_name: 把所有实体的指称化为标准实体名 + :param stopwords: 需要过滤的停用词 + :param min_freq: 作为边加入到图中的与中心词最小共现次数,用于筛掉可能过多的边 + :param other_min_freq: 中心词以外词语关系的最小共现次数 + :return: G(networxX中的Graph) + + ''' + G = nx.Graph() + links = {} + if other_min_freq == -1: + other_min_freq = min_freq + for doc in docs: + if stopwords: + words = set(x for x in self.seg(doc, standard_name=standard_name) if x not in stopwords) + else: + words = self.seg(doc, 
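# A minimal usage sketch of the EntNetworkMixin methods above, assuming
# HarvestText is installed; the documents and the entity dictionaries below
# are made-up examples.
from harvesttext import HarvestText

ht = HarvestText()
ht.add_entities(entity_mention_dict={"刘备": ["刘备", "玄德"], "曹操": ["曹操", "孟德"]},
                entity_type_dict={"刘备": "人名", "曹操": "人名"})
docs = ["刘备和曹操青梅煮酒论英雄", "曹操率军南下,玄德退守江夏"]
G = ht.build_entity_graph(docs, min_freq=1)              # edge weight = number of co-occurring docs
ego = ht.build_word_ego_graph(docs, "刘备", min_freq=1)  # word graph centered on 刘备
print(list(G.edges(data=True)), list(ego.edges(data=True)))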
standard_name=standard_name) + if word in words: + for u, v in combinations(words, 2): + pair0 = tuple(sorted((u, v))) + if pair0 not in links: + links[pair0] = 1 + else: + links[pair0] += 1 + + used_nodes = set([word]) # 关系对中涉及的词语必须与实体有关(>= min_freq) + for (u, v) in links: + w = links[(u, v)] + if word in (u, v) and w >= min_freq: + used_nodes.add(v if word == u else u) + G.add_edge(u, v, weight=w) + elif w >= other_min_freq: + G.add_edge(u, v, weight=w) + G = G.subgraph(used_nodes).copy() + return G + + def build_entity_ego_graph(self, docs, word, min_freq=0, other_min_freq=-1, inv_index={}, used_types=[]): + '''Entity only version of build_word_ego_graph() + ''' + G = nx.Graph() + links = {} + if other_min_freq == -1: + other_min_freq = min_freq + if len(inv_index) != 0: + related_docs = self.search_entity(word, docs, inv_index) + else: + related_docs = [] + for doc in docs: + entities_info = self.entity_linking(doc) + entities = [entity0 for [[l,r], (entity0,type0)] in entities_info] + if word in entities: + related_docs.append(doc) + + for i, sent in enumerate(related_docs): + entities_info = self.entity_linking(sent) + if len(used_types) == 0: + entities = set(entity for span, (entity, type0) in entities_info) + else: + entities = set(entity for span, (entity, type0) in entities_info if type0[1:-1] in used_types) + for u, v in combinations(entities, 2): + pair0 = tuple(sorted((u, v))) + if pair0 not in links: + links[pair0] = 1 + else: + links[pair0] += 1 + + used_nodes = set([word]) # 关系对中涉及的词语必须与实体有关(>= min_freq) + for (u, v) in links: + w = links[(u, v)] + if word in (u, v) and w >= min_freq: + used_nodes.add(v if word == u else u) + G.add_edge(u, v, weight=w) + elif w >= other_min_freq: + G.add_edge(u, v, weight=w) + G = G.subgraph(used_nodes).copy() + return G \ No newline at end of file diff --git a/harvesttext/ent_retrieve.py b/harvesttext/ent_retrieve.py new file mode 100644 index 0000000..319b0cd --- /dev/null +++ b/harvesttext/ent_retrieve.py @@ -0,0 +1,38 @@ +import numpy as np +from collections import defaultdict + +class EntRetrieveMixin: + """ + 实体检索模块: + - 基于倒排索引快速检索包括某个实体的文档,以及统计出现某实体的文档数目 + """ + def build_index(self, docs, with_entity=True, with_type=True): + inv_index = defaultdict(set) + for i, sent in enumerate(docs): + entities_info = self.entity_linking(sent) + for span, (entity, type0) in entities_info: + if with_entity: + inv_index[entity].add(i) + if with_type: + inv_index[type0].add(i) + return inv_index + + def get_entity_counts(self, docs, inv_index, used_type=[]): + if len(used_type) > 0: + entities = iter(x for x in self.entity_type_dict + if self.entity_type_dict[x] in used_type) + else: + entities = self.entity_type_dict.keys() + cnt = {enty: len(inv_index[enty]) for enty in entities if enty in inv_index} + return cnt + + def search_entity(self, query, docs, inv_index): + words = query.split() + if len(words) > 0: + ids = inv_index[words[0]] + for word in words[1:]: + ids = ids & inv_index[word] + np_docs = np.array(docs)[list(ids)] + return np_docs.tolist() + else: + return [] \ No newline at end of file diff --git a/harvesttext/harvesttext.py b/harvesttext/harvesttext.py index 99bf494..5698414 100644 --- a/harvesttext/harvesttext.py +++ b/harvesttext/harvesttext.py @@ -3,28 +3,34 @@ import re import json import numpy as np -import scipy.special import pandas as pd import html import urllib -from itertools import combinations import jieba import jieba.posseg as pseg -from collections import defaultdict -from .word_discoverer import WordDiscoverer 
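# A minimal sketch of the EntRetrieveMixin API above; the entities and
# documents are made-up examples.
from harvesttext import HarvestText

ht = HarvestText()
ht.add_entities(entity_mention_dict={"曹操": ["曹操"], "袁绍": ["袁绍"]},
                entity_type_dict={"曹操": "人名", "袁绍": "人名"})
docs = ["曹操在官渡击败袁绍", "袁绍兵败退回河北", "曹操进位魏王"]
inv_index = ht.build_index(docs)                  # entity / #type# -> set of doc ids
print(ht.get_entity_counts(docs, inv_index))      # document frequency per registered entity
print(ht.search_entity("曹操", docs, inv_index))  # documents mentioning 曹操
# The same inv_index can also be passed to build_entity_graph(docs, inv_index=inv_index)
# so entity linking does not need to be re-run on every document.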
-from .sent_dict import SentDict -from .resources import get_qh_sent_dict, get_baidu_stopwords -from .texttile import TextTile -from .entity_discoverer import NFLEntityDiscoverer, NERPEntityDiscover -from .utils import sent_sim_textrank, sent_sim_cos import w3lib.html import logging import warnings from tqdm import tqdm from pypinyin import lazy_pinyin, pinyin from opencc import OpenCC - -class HarvestText: +from collections import defaultdict +from .ent_network import EntNetworkMixin +from .ent_retrieve import EntRetrieveMixin +from .parsing import ParsingMixin +from .sentiment import SentimentMixin +from .summary import SummaryMixin +from .word_discover import WordDiscoverMixin +from .resources import get_baidu_stopwords + +class HarvestText(EntNetworkMixin, EntRetrieveMixin, ParsingMixin, SentimentMixin, SummaryMixin, WordDiscoverMixin): + """ + 主模块: + - 主要保留了与实体分词、分句,预处理相关的代码 + - 还有存取、状态管理等基础代码 + - 其他功能在各个mixin里面 + - 主模块的功能是会被各个子模块最频繁调用的,也体现了本库以实体为核心,基于实体展开分析或改进算法的理念 + """ def __init__(self, standard_name=False, language='zh_CN'): self.standard_name = standard_name # 是否使用连接到的实体名来替换原文 self.entity_types = set() @@ -46,9 +52,17 @@ def __init__(self, standard_name=False, language='zh_CN'): with open(pwd + "/resources/pinyin_adjlist.json", "r", encoding="utf-8") as f: self.pinyin_adjlist = json.load(f) self.language = language - # - # 实体分词模块 - # + if language == "en": + try: + nltk.data.find('taggers/averaged_perceptron_tagger') + except: + nltk.download('averaged_perceptron_tagger') + try: + nltk.data.find('taggers/universal_tagset') + except: + nltk.download('universal_tagset') + + def build_trie(self, new_word, entity, entity_type): type0 = "#%s#" % entity_type if not type0 in self.entity_types: @@ -105,6 +119,15 @@ def remove_entity(self, entity): trie_node["leaf"].remove((entity0, type0)) break + def _add_entities(self, type_entity_mention_dict): + for type0 in type_entity_mention_dict: + entity_mention_dict0 = type_entity_mention_dict[type0] + for entity0 in entity_mention_dict0: + mentions = entity_mention_dict0[entity0] + for mention0 in mentions: + self.build_trie(mention0, entity0, type0) + self.prepare() + def add_entities(self, entity_mention_dict=None, entity_type_dict=None, override=False, load_path=None): '''登录的实体信息到ht,或者从save_entities保存的文件中读取(如果指定了load_path) @@ -180,14 +203,40 @@ def add_typed_words(self, type_word_dict): self.type_entity_mention_dict = type_entity_mention_dict self._add_entities(type_entity_mention_dict) - def _add_entities(self, type_entity_mention_dict): - for type0 in type_entity_mention_dict: - entity_mention_dict0 = type_entity_mention_dict[type0] - for entity0 in entity_mention_dict0: - mentions = entity_mention_dict0[entity0] - for mention0 in mentions: - self.build_trie(mention0, entity0, type0) - self.prepare() + def add_new_words(self, new_words): + for word in new_words: + self.build_trie(word, word, "新词") + self.entity_mention_dict[word] = set([word]) + self.entity_type_dict[word] = "新词" + if word not in self.type_entity_mention_dict["新词"]: + self.type_entity_mention_dict["新词"][word] = set([word]) + else: + self.type_entity_mention_dict["新词"][word].add(word) + self.check_prepared() + + def add_new_mentions(self, entity_mention_dict): # 添加链接到已有实体的新别称,一般在新词发现的基础上筛选得到 + for entity0 in entity_mention_dict: + type0 = self.entity_type_dict[entity0] + for mention0 in entity_mention_dict[entity0]: + self.entity_mention_dict[entity0].add(mention0) + self.build_trie(mention0, entity0, type0) + self.type_entity_mention_dict[type0][entity0] = 
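# A toy illustration of the Mixin layout adopted above (not HarvestText code):
# each mixin contributes one feature area and relies on methods/attributes that
# the host class provides, so the host stays small while features live in
# separate modules.
class GreetMixin:
    def greet(self):                        # uses self.name, provided by the host class
        return "hello, " + self.name

class ShoutMixin:
    def shout(self):                        # mixins can also build on each other's methods
        return self.greet().upper()

class Host(GreetMixin, ShoutMixin):
    def __init__(self, name):
        self.name = name

print(Host("harvesttext").shout())          # -> HELLO, HARVESTTEXT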
self.entity_mention_dict[entity0] + self.check_prepared() + + def add_new_entity(self, entity0, mention0=None, type0="添加词"): + if mention0 is None: + mention0 = entity0 + self.entity_type_dict[entity0] = type0 + if entity0 in self.entity_mention_dict: + self.entity_mention_dict[entity0].add(mention0) + else: + self.entity_mention_dict[entity0] = set([mention0]) + self.build_trie(mention0, entity0, type0) + if entity0 not in self.type_entity_mention_dict[type0]: + self.type_entity_mention_dict[type0][entity0] = set([mention0]) + else: + self.type_entity_mention_dict[type0][entity0].add(mention0) + self.check_prepared() def prepare(self): self.prepared = True @@ -636,77 +685,6 @@ def load_entities(self, load_path='./ht_entities.txt', override=True): self.type_entity_mention_dict = type_entity_mention_dict self._add_entities(type_entity_mention_dict) - def entity_discover(self, text, return_count=False, method="NFL", min_count=5, pinyin_tolerance=0, **kwargs): - """无监督地从较大量文本中发现实体的类别和多个同义mention。建议对千句以上的文本来挖掘,并且文本的主题比较集中。 - 效率:在测试环境下处理一个约10000句的时间大约是20秒。另一个约200000句的语料耗时2分半 - 精度:算法准确率不高,但是可以初步聚类,建议先save_entities后, 再进行手动进行调整,然后load_entities再用于进一步挖掘 - - ref paper: Mining Entity Synonyms with Efficient Neural Set Generation(https://arxiv.org/abs/1811.07032v1) - - :param text: string or list of string - :param return_count: (default False) 是否再返回每个mention的出现次数 - :param method: 使用的算法, 目前可选 "NFL" (NER+Fasttext+Louvain+模式修复,基于语义和规则发现同义实体,但可能聚集过多错误实体), "NERP"(NER+模式修复, 仅基于规则发现同义实体) - :param min_count: (default 5) mininum freq of word to be included - :param pinyin_tolerance: {None, 0, 1} 合并拼音相同(取0时)或者差别只有一个(取1时)的候选词到同一组实体,默认使用(0) - :param kwargs: 根据算法决定的参数,目前, "NERP"不需要额外参数,而"NFL"可接受的额外参数有: - - emb_dim: (default 50) fasttext embedding's dimensions - - threshold: (default 0.98) [比较敏感,调参重点]larger for more entities, threshold for add an edge between 2 entities if cos_dim exceeds - - ft_iters: (default 20) larger for more entities, num of iterations used by fasttext - - use_subword: (default True) whether to use fasttext's subword info - - min_n: (default 1) min length of used subword - - max_n: (default 4) max length of used subword - - :return: entity_mention_dict, entity_type_dict - """ - text = text if type(text) == str else "\n".join(text) - method = method.upper() - assert method in {"NFL", "NERP"} - # discover candidates with NER - print("Doing NER") - sent_words = [] - type_entity_dict = defaultdict(set) - entity_count = defaultdict(int) - wd_count = defaultdict(int) - for sent in tqdm(self.cut_sentences(text)): - NERs0, possegs = self.named_entity_recognition(sent, return_posseg=True) - sent_wds0 = [] - for wd, pos in possegs: - if wd in NERs0: - zh_pos = NERs0[wd] - entity_name = wd.lower() + "_" + zh_pos - type_entity_dict[zh_pos].add(entity_name) - sent_wds0.append(entity_name) - entity_count[entity_name] += 1 - else: - sent_wds0.append(wd) - wd_count[wd] += 1 - sent_words.append(sent_wds0) - - entity_count = pd.Series(entity_count) - entity_count = entity_count[entity_count >= min_count] - pop_words_cnt = {wd:cnt for wd, cnt in wd_count.items() if cnt >= min_count} - id2word = entity_count.index.tolist() - word2id = {wd: i for (i, wd) in enumerate(id2word)} - - type_entity_dict2 = {k: list(v) for k, v in type_entity_dict.items()} - if method == "NFL": - discoverer = NFLEntityDiscoverer(sent_words, type_entity_dict2, entity_count, pop_words_cnt, word2id, id2word, - min_count, pinyin_tolerance, self.pinyin_adjlist, **kwargs) - elif method == "NERP": - discoverer = NERPEntityDiscover(sent_words, 
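# A short sketch of the entity-registration methods above, using made-up data.
from harvesttext import HarvestText

ht = HarvestText()
ht.add_entities(entity_mention_dict={"武磊": ["武磊", "武球王"]},
                entity_type_dict={"武磊": "人名"})
ht.add_new_entity("上海上港", mention0="上港", type0="球队")  # register one extra entity
ht.add_new_mentions({"武磊": ["武7"]})                        # extra alias for a known entity
print(ht.seg("武球王是上港的前锋", standard_name=True))       # mentions map back to standard names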
type_entity_dict2, entity_count, pop_words_cnt, word2id, id2word, - min_count, pinyin_tolerance, self.pinyin_adjlist, **kwargs) - entity_mention_dict, entity_type_dict = discoverer.entity_mention_dict, discoverer.entity_type_dict - mention_count = discoverer.mention_count # 新添加的mention的count在discoverer里更新 - if return_count: - return entity_mention_dict, entity_type_dict, mention_count - else: - return entity_mention_dict, entity_type_dict def cut_sentences(self, para, drop_empty_line=True, strip=True, deduplicate=False): '''cut_sentences @@ -743,71 +721,6 @@ def cut_sentences(self, para, drop_empty_line=True, strip=True, deduplicate=Fals sentences = [sent for sent in sentences if len(sent.strip()) > 0] return sentences - def cut_paragraphs(self, text, num_paras=None, block_sents=3, std_weight=0.5, - align_boundary=True, stopwords='baidu', remove_puncts=True, - seq_chars=-1, **kwargs): - ''' - - :param text: - :param num_paras: (默认为None)可以手动设置想要划分的段落数,也可以保留默认值None,让算法自动确定 - :param block_sents: 算法的参数,将几句句子分为一个block。一般越大,算法自动划分的段落越少 - :param std_weight: 算法的参数。一般越大,算法自动划分的段落越多 - :param align_boundary: 新划分的段落是否要与原有的换行处对齐 - :param stopwords: 字符串列表/元组/集合,或者'baidu'为默认百度停用词,在算法中引入的停用词,一般能够提升准确度 - :param remove_puncts: (默认为True)是否在算法中去除标点符号,一般能够提升准确度 - :param seq_chars: (默认为-1)如果设置为>=1的值,则以包含这个数量的字符为基本单元,代替默认的句子。 - :param **kwargs: passed to ht.cut_sentences, like deduplicate - :return: - ''' - if num_paras is not None: - assert num_paras > 0, "Should give a positive number of num_paras" - assert stopwords == 'baidu' or (hasattr(stopwords, '__iter__') and type(stopwords) != str) - stopwords = get_baidu_stopwords() if stopwords == 'baidu' else stopwords - if seq_chars < 1: - cut_seqs = lambda x: self.cut_sentences(x, **kwargs) - else: - seq_chars = int(seq_chars) - def _cut_seqs(text, len0, strip=True, deduplicate=False): - if deduplicate: - text = re.sub(r"([。!?\!\?])\1+", r"\1", text) - if strip: - text = text.strip() - seqs = [text[i:i+len0] for i in range(0, len(text), len0)] - return seqs - cut_seqs = lambda x: _cut_seqs(x, seq_chars, **kwargs) - - if align_boundary: - paras = [para.strip() for para in text.split("\n") if len(para.strip()) > 0] - if num_paras is not None: - # assert num_paras <= len(paras), "The new segmented paragraphs must be no less than the original ones" - if num_paras >= len(paras): - return paras - original_boundary_ids = [] - sentences = [] - for para in paras: - sentences.extend(cut_seqs(para)) - original_boundary_ids.append(len(sentences)) - else: - original_boundary_ids = None - sentences = cut_seqs(text, **kwargs) - # with entity resolution, can better decide similarity - if remove_puncts: - allpuncs = re.compile( - r"[,\_《。》、?;:‘’"“”【「】」、·!@¥…()—\,\<\.\>\/\?\;\:\'\"\[\]\{\}\~\`\!\@\#\$\%\^\&\*\(\)\-\=\+]") - sent_words = [re.sub(allpuncs, "", - self.seg(sent, standard_name=True, stopwords=stopwords, return_sent=True) - ).split() - for sent in sentences] - else: - sent_words = [self.seg(sent, standard_name=True, stopwords=stopwords) - for sent in sentences] - texttiler = TextTile() - predicted_boundary_ids = texttiler.cut_paragraphs(sent_words, num_paras, block_sents, std_weight, - align_boundary, original_boundary_ids) - jointer = " " if (self.language == 'en' and seq_chars > 1) else "" - predicted_paras = [jointer.join(sentences[l:r]) for l, r in zip([0]+predicted_boundary_ids[:-1], predicted_boundary_ids)] - return predicted_paras - def clean_text(self, text, remove_url=True, email=True, weibo_at=True, stop_terms=("转发微博",), emoji=True, weibo_topic=False, 
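# A minimal sketch of cut_sentences above: rule-based splitting on Chinese
# end-of-sentence punctuation; deduplicate=True first collapses runs of
# repeated end punctuation. The paragraph is an invented example.
from harvesttext import HarvestText

ht = HarvestText()
para = "你好!!!我是一个句子。这是第二句……最后一句到这里结束了吗?"
for sent in ht.cut_sentences(para, deduplicate=True):
    print(sent)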
deduplicate_space=True, norm_url=False, norm_html=False, to_url=False, @@ -869,564 +782,12 @@ def clean_text(self, text, remove_url=True, email=True, weibo_at=True, stop_term text = text.replace(x, "") if remove_puncts: allpuncs = re.compile( - r"[,\_《。》、?;:‘’"“”【「】」、·!@¥…()—\,\<\.\>\/\?\;\:\'\"\[\]\{\}\~\`\!\@\#\$\%\^\&\*\(\)\-\=\+]") + r"[,\_《。》、?;:‘’"“”【「】」·!@¥…()—\,\<\.\>\/\?\;\:\'\"\[\]\{\}\~\`\!\@\#\$\%\^\&\*\(\)\-\=\+]") text = re.sub(allpuncs, "", text) return text.strip() - def named_entity_recognition(self, sent, standard_name=False, return_posseg=False): - '''利用pyhanlp的命名实体识别,找到句子中的(人名,地名,机构名,其他专名)实体。harvesttext会预先链接已知实体 - - :param sent: string, 文本 - :param standard_name: bool, 是否把连接到的已登录转化为标准名 - :param return_posseg: bool, 是否返回包括命名实体识别的,带词性分词结果 - :param book: bool, 预先识别 - :return: entity_type_dict: 发现的命名实体信息,字典 {实体名: 实体类型} - (return_posseg=True时) possegs: list of (单词, 词性) - ''' - from pyhanlp import HanLP, JClass - if not self.hanlp_prepared: - self.hanlp_prepare() - self.standard_name = standard_name - entities_info = self.entity_linking(sent) - sent2 = self.decoref(sent, entities_info) - StandardTokenizer = JClass("com.hankcs.hanlp.tokenizer.StandardTokenizer") - StandardTokenizer.SEGMENT.enableAllNamedEntityRecognize(True) - entity_type_dict = {} - try: - possegs = [] - for x in StandardTokenizer.segment(sent2): - # 三种前缀代表:人名(nr),地名(ns),机构名(nt) - tag0 = str(x.nature) - if tag0.startswith("nr"): - entity_type_dict[x.word] = "人名" - elif tag0.startswith("ns"): - entity_type_dict[x.word] = "地名" - elif tag0.startswith("nt"): - entity_type_dict[x.word] = "机构名" - elif tag0.startswith("nz"): - entity_type_dict[x.word] = "其他专名" - possegs.append((x.word, tag0)) - except: - pass - if return_posseg: - return entity_type_dict, possegs - else: - return entity_type_dict - def dependency_parse(self, sent, standard_name=False, stopwords=None): - '''依存句法分析,调用pyhanlp的接口,并且融入了harvesttext的实体识别机制。不保证高准确率。 - - :param sent: - :param standard_name: - :param stopwords: - :return: arcs:依存弧,列表中的列表。 - [[词语id,词语字面值或实体名(standard_name控制),词性,依存关系,依存子词语id] for 每个词语] - ''' - from pyhanlp import HanLP, JClass - if not self.hanlp_prepared: - self.hanlp_prepare() - self.standard_name = standard_name - entities_info = self.entity_linking(sent) - sent2 = self.decoref(sent, entities_info) - # [word.ID-1, word.LEMMA, word.POSTAG, word.DEPREL ,word.HEAD.ID-1] - arcs = [] - i = 0 - sentence = HanLP.parseDependency(sent2) - for word in sentence.iterator(): - word0, tag0 = word.LEMMA, word.POSTAG - if stopwords and word0 in stopwords: - continue - if word0 in self.entity_types: - if self.standard_name: - word0 = entities_info[i][1][0] # 使用链接的实体 - else: - l, r = entities_info[i][0] # 或使用原文 - word0 = sent[l:r] - tag0 = entities_info[i][1][1][1:-1] - i += 1 - arcs.append([word.ID-1, word0, tag0, word.DEPREL, word.HEAD.ID-1]) - return arcs - - def triple_extraction(self, sent, standard_name=False, stopwords=None, expand = "all"): - '''利用主谓宾等依存句法关系,找到句子中有意义的三元组。 - 很多代码参考:https://github.com/liuhuanyong/EventTriplesExtraction - 不保证高准确率。 - - :param sent: - :param standard_name: - :param stopwords: - :param expand: 默认"all":扩展所有主谓词,"exclude_entity":不扩展已知实体,可以保留标准的实体名,用于链接。"None":不扩展 - :return: - ''' - arcs = self.dependency_parse(sent, standard_name, stopwords) - - '''对找出的主语或者宾语进行扩展''' - def complete_e(words, postags, child_dict_list, word_index): - if expand == "all" or (expand == "exclude_entity" and "#"+postags[word_index]+"#" not in self.entity_types): - child_dict = child_dict_list[word_index] - prefix = '' - if '定中关系' in 
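# A small sketch of clean_text above for microblog-style text; the sample post
# is invented, and only parameters from the signature above are used.
from harvesttext import HarvestText

ht = HarvestText()
raw = "@小明 推荐一个开源库 https://github.com/blmoistawinde/HarvestText 转发微博"
print(ht.clean_text(raw))                                    # defaults drop the URL, the @mention and "转发微博"
print(ht.clean_text(raw, remove_url=False, weibo_at=False))  # keep the URL and the @mention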
child_dict: - for i in range(len(child_dict['定中关系'])): - prefix += complete_e(words, postags, child_dict_list, child_dict['定中关系'][i]) - postfix = '' - if postags[word_index] == 'v': - if '动宾关系' in child_dict: - postfix += complete_e(words, postags, child_dict_list, child_dict['动宾关系'][0]) - if '主谓关系' in child_dict: - prefix = complete_e(words, postags, child_dict_list, child_dict['主谓关系'][0]) + prefix - - return prefix + words[word_index] + postfix - elif expand == "None": - return words[word_index] - else: # (expand == "exclude_entity" and "#"+postags[word_index]+"#" in self.entity_types) - return words[word_index] - - - words, postags = ["" for i in range(len(arcs))], ["" for i in range(len(arcs))] - child_dict_list = [defaultdict(list) for i in range(len(arcs))] - for i, format_parse in enumerate(arcs): - id0, words[i], postags[i], rel, headID = format_parse - child_dict_list[headID][rel].append(i) - svos = [] - for index in range(len(postags)): - # 使用依存句法进行抽取 - if postags[index]: - # 抽取以谓词为中心的事实三元组 - child_dict = child_dict_list[index] - # 主谓宾 - if '主谓关系' in child_dict and '动宾关系' in child_dict: - r = words[index] - e1 = complete_e(words, postags, child_dict_list, child_dict['主谓关系'][0]) - e2 = complete_e(words, postags, child_dict_list, child_dict['动宾关系'][0]) - svos.append([e1, r, e2]) - - # 定语后置,动宾关系 - relation = arcs[index][-2] - head = arcs[index][-1] - if relation == '定中关系': - if '动宾关系' in child_dict: - e1 = complete_e(words, postags, child_dict_list, head) - r = words[index] - e2 = complete_e(words, postags, child_dict_list, child_dict['动宾关系'][0]) - temp_string = r + e2 - if temp_string == e1[:len(temp_string)]: - e1 = e1[len(temp_string):] - if temp_string not in e1: - svos.append([e1, r, e2]) - # 含有介宾关系的主谓动补关系 - if '主谓关系' in child_dict and '动补结构' in child_dict: - e1 = complete_e(words, postags, child_dict_list, child_dict['主谓关系'][0]) - CMP_index = child_dict['动补结构'][0] - r = words[index] + words[CMP_index] - if '介宾关系' in child_dict_list[CMP_index]: - e2 = complete_e(words, postags, child_dict_list, child_dict_list[CMP_index]['介宾关系'][0]) - svos.append([e1, r, e2]) - return svos - def clear(self): self.deprepare() self.__init__() - # - # 新词发现模块 - # - def word_discover(self, doc, threshold_seeds=[], auto_param=True, - excluding_types=[], excluding_words='baidu_stopwords', # 可以排除已经登录的某些种类的实体,或者某些指定词 - max_word_len=5, min_freq=0.00005, min_entropy=1.4, min_aggregation=50, - ent_threshold="both", mem_saving=None, sort_by='freq'): - '''新词发现,基于 http://www.matrix67.com/blog/archives/5044 实现及微调 - - :param doc: (string or list) 待进行新词发现的语料,如果是列表的话,就会自动用换行符拼接 - :param threshold_seeds: list of string, 设定能接受的“质量”最差的种子词,更差的词语将会在新词发现中被过滤 - :param auto_param: bool, 使用默认的算法参数 - :param excluding_types: list of str, 设定要过滤掉的特定词性或已经登录到ht的实体类别 - :param excluding_words: list of str, 设定要过滤掉的特定词 - :param max_word_len: 允许被发现的最长的新词长度 - :param min_freq: 被发现的新词,在给定文本中需要达到的最低频率 - :param min_entropy: 被发现的新词,在给定文本中需要达到的最低左右交叉熵 - :param min_aggregation: 被发现的新词,在给定文本中需要达到的最低凝聚度 - :param ent_threshold: "both": (默认)在使用左右交叉熵进行筛选时,两侧都必须超过阈值; "avg": 两侧的平均值达到阈值即可 - :param mem_saving: bool or None, 采用一些过滤手段来减少内存使用,但可能影响速度。如果不指定,对长文本自动打开,而对短文本不使用 - :param sort_by: 以下string之一: {'freq': 词频, 'score': 综合分数, 'agg':凝聚度} 按照特定指标对得到的词语信息排序,默认使用词频 - :return: info: 包含新词作为index, 以及对应各项指标的DataFrame - ''' - if type(doc) != str: - doc = "\n".join(doc) - # 采用经验参数,此时后面的参数设置都无效 - if auto_param: # 根据自己的几个实验确定的参数估计值,没什么科学性,但是应该能得到还行的结果 - length = len(doc) - min_entropy = np.log(length) / 10 - min_freq = min(0.00005, 20.0 / length) - 
min_aggregation = np.sqrt(length) / 15 - mem_saving = bool(length > 300000) if mem_saving is None else mem_saving - # ent_threshold: 确定左右熵的阈值对双侧都要求"both",或者只要左右平均值达到"avg" - # 对于每句话都很极短的情况(如长度<8),经常出现在左右边界的词语可能难以被确定,这时ent_threshold建议设为"avg" - mem_saving = False if mem_saving is None else mem_saving - - try: - ws = WordDiscoverer(doc, max_word_len, min_freq, min_entropy, min_aggregation, ent_threshold, mem_saving) - except Exception as e: - logging.log(logging.ERROR, str(e)) - info = {"text": [], "freq": [], "left_ent": [], "right_ent": [], "agg": []} - info = pd.DataFrame(info) - info = info.set_index("text") - return info - - if len(excluding_types) > 0: - if "#" in list(excluding_types)[0]: # 化为无‘#’标签 - excluding_types = [x[1:-1] for x in excluding_types] - ex_mentions = set(x for enty in self.entity_mention_dict - if enty in self.entity_type_dict and - self.entity_type_dict[enty] in excluding_types - for x in self.entity_mention_dict[enty]) - else: - ex_mentions = set() - assert excluding_words == 'baidu_stopwords' or (hasattr(excluding_words, '__iter__') and type(excluding_words) != str) - if excluding_words == 'baidu_stopwords': - ex_mentions |= get_baidu_stopwords() - else: - ex_mentions |= set(excluding_words) - - info = ws.get_df_info(ex_mentions) - - # 利用种子词来确定筛选优质新词的标准,种子词中最低质量的词语将被保留(如果一开始就被找到的话) - if len(threshold_seeds) > 0: - min_score = 100000 - for seed in threshold_seeds: - if seed in info.index: - min_score = min(min_score, info.loc[seed, "score"]) - if (min_score >= 100000): - min_score = 0 - else: - min_score *= 0.9 # 留一些宽松的区间 - info = info[info["score"] > min_score] - if sort_by: - info.sort_values(by=sort_by, ascending=False, inplace=True) - - return info - - def add_new_words(self, new_words): - for word in new_words: - self.build_trie(word, word, "新词") - self.entity_mention_dict[word] = set([word]) - self.entity_type_dict[word] = "新词" - if word not in self.type_entity_mention_dict["新词"]: - self.type_entity_mention_dict["新词"][word] = set([word]) - else: - self.type_entity_mention_dict["新词"][word].add(word) - self.check_prepared() - - def add_new_mentions(self, entity_mention_dict): # 添加链接到已有实体的新别称,一般在新词发现的基础上筛选得到 - for entity0 in entity_mention_dict: - type0 = self.entity_type_dict[entity0] - for mention0 in entity_mention_dict[entity0]: - self.entity_mention_dict[entity0].add(mention0) - self.build_trie(mention0, entity0, type0) - self.type_entity_mention_dict[type0][entity0] = self.entity_mention_dict[entity0] - self.check_prepared() - - def add_new_entity(self, entity0, mention0=None, type0="添加词"): - if mention0 is None: - mention0 = entity0 - self.entity_type_dict[entity0] = type0 - if entity0 in self.entity_mention_dict: - self.entity_mention_dict[entity0].add(mention0) - else: - self.entity_mention_dict[entity0] = set([mention0]) - self.build_trie(mention0, entity0, type0) - if entity0 not in self.type_entity_mention_dict[type0]: - self.type_entity_mention_dict[type0][entity0] = set([mention0]) - else: - self.type_entity_mention_dict[type0][entity0].add(mention0) - self.check_prepared() - - def find_entity_with_rule(self, text, rulesets=[], add_to_dict=True, type0="添加词"): - '''利用规则从分词结果中的词语找到实体,并可以赋予相应的类型再加入实体库 - - :param text: string, 一段文本 - :param rulesets: list of (tuple of rules or single rule) from match_patterns, - list中包含多个规则,满足其中一种规则的词就认为属于这个type - 而每种规则由tuple或单个条件(pattern)表示,一个词必须满足其中的一个或多个条件。 - :param add_to_dict: 是否把找到的结果直接加入词典 - :param type0: 赋予满足条件的词语的实体类型, 仅当add_to_dict时才有意义 - :return: found_entities - - ''' - found_entities = set() - for word in 
self.seg(text): - for ruleset in rulesets: # 每个ruleset是或关系,只要满足一个就添加并跳过其他 - toAdd = True - if type(ruleset) == type((1, 2)): # tuple - for pattern0 in ruleset: - if not pattern0(word): - toAdd = False - break - else: # single rule - pattern0 = ruleset - if not pattern0(word): - toAdd = False - if toAdd: - found_entities.add(word) - break - if add_to_dict: - for entity0 in found_entities: - self.add_new_entity(entity0, entity0, type0) - self.prepare() - return found_entities - - # - # 情感分析模块 - # - def build_sent_dict(self, sents, method="PMI", min_times=5, scale="None", - pos_seeds=None, neg_seeds=None, stopwords=None): - '''利用种子词,构建情感词典 - - :param sents: list of string, 一般建议为句子,是计算共现PMI的基本单元 - :param method: "PMI", 使用的算法,目前仅支持PMI - :param min_times: int, 默认为5, 在所有句子中出现次数少于这个次数的词语将被过滤 - :param scale: {"None","0-1","+-1"}, 默认为"None",否则将对情感值进行变换 - 若为"0-1",按照最大为1,最小为0进行线性伸缩,0.5未必是中性 - 若为"+-1", 在正负区间内分别伸缩,保留0作为中性的语义 - :param pos_seeds: list of string, 积极种子词,如不填写将默认采用清华情感词典 - :param neg_seeds: list of string, 消极种子词,如不填写将默认采用清华情感词典 - :param stopwords: list of string, stopwords词,如不填写将不使用 - :return: sent_dict: dict,可以查询单个词语的情感值 - ''' - if pos_seeds is None and neg_seeds is None: - sdict = get_qh_sent_dict() - pos_seeds, neg_seeds = sdict["pos"], sdict["neg"] - docs = [set(self.seg(sent)) for sent in sents] - if not stopwords is None: - stopwords = set(stopwords) - for i in range(len(docs)): - docs[i] = docs[i] - stopwords - docs = list(filter(lambda x: len(x) > 0, docs)) - self.sent_dict = SentDict(docs, method, min_times, scale, pos_seeds, neg_seeds) - return self.sent_dict.sent_dict - - def analyse_sent(self, sent, avg=True): - """输入句子,输出其情感值,默认使用句子中,在情感词典中的词语的情感值的平均来计算 - - :param sent: string, 句子 - :param avg: (default True) 是否使用平均值计算句子情感值 - :return: float情感值(if avg == True), 否则为词语情感值列表 - """ - return self.sent_dict.analyse_sent(self.seg(sent), avg) - - # - # 实体检索模块 - # - def build_index(self, docs, with_entity=True, with_type=True): - inv_index = defaultdict(set) - for i, sent in enumerate(docs): - entities_info = self.entity_linking(sent) - for span, (entity, type0) in entities_info: - if with_entity: - inv_index[entity].add(i) - if with_type: - inv_index[type0].add(i) - return inv_index - - def get_entity_counts(self, docs, inv_index, used_type=[]): - if len(used_type) > 0: - entities = iter(x for x in self.entity_type_dict - if self.entity_type_dict[x] in used_type) - else: - entities = self.entity_type_dict.keys() - cnt = {enty: len(inv_index[enty]) for enty in entities if enty in inv_index} - return cnt - - def search_entity(self, query, docs, inv_index): - words = query.split() - if len(words) > 0: - ids = inv_index[words[0]] - for word in words[1:]: - ids = ids & inv_index[word] - np_docs = np.array(docs)[list(ids)] - return np_docs.tolist() - else: - return [] - - # - # 文本摘要模块 - # - def get_summary(self, docs, topK=5, stopwords=None, with_importance=False, standard_name=True, - maxlen=None, avoid_repeat=False): - '''使用Textrank算法得到文本中的关键句 - - :param docs: str句子列表 - :param topK: 选取几个句子, 如果设置了maxlen,则优先考虑长度 - :param stopwords: 在算法中采用的停用词 - :param with_importance: 返回时是否包括算法得到的句子重要性 - :param standard_name: 如果有entity_mention_list的话,在算法中正规化实体名,一般有助于提升算法效果 - :param maxlen: 设置得到的摘要最长不超过多少字数,如果已经达到长度限制但未达到topK句也会停止 - :param avoid_repeat: 使用MMR principle惩罚与已经抽取的摘要重复的句子,避免重复 - :return: 句子列表,或者with_importance=True时,(句子,分数)列表 - ''' - assert topK > 0 - import networkx as nx - maxlen = float('inf') if maxlen is None else maxlen - # 使用standard_name,相似度可以基于实体链接的结果计算而更加准确 - sents = [self.seg(doc.strip(), 
standard_name=standard_name, stopwords=stopwords) for doc in docs] - sents = [sent for sent in sents if len(sent) > 0] - G = nx.Graph() - for u, v in combinations(range(len(sents)), 2): - G.add_edge(u, v, weight=sent_sim_textrank(sents[u], sents[v])) - - pr = nx.pagerank_scipy(G) - pr_sorted = sorted(pr.items(), key=lambda x: x[1], reverse=True) - if not avoid_repeat: - ret = [] - curr_len = 0 - for i, imp in pr_sorted[:topK]: - curr_len += len(docs[i]) - if curr_len > maxlen: break - ret.append((docs[i], imp) if with_importance else docs[i]) - return [ ] - else: - assert topK <= len(sents) - ret = [] - curr_len = 0 - curr_sumy_words = [] - candidate_ids = list(range(len(sents))) - i, imp = pr_sorted[0] - curr_len += len(docs[i]) - if curr_len > maxlen: - return ret - ret.append((docs[i], imp) if with_importance else docs[i]) - curr_sumy_words.extend(sents[i]) - candidate_ids.remove(i) - for iter in range(topK-1): - importance = [pr[i] for i in candidate_ids] - norm_importance = scipy.special.softmax(importance) - redundancy = np.array([sent_sim_cos(curr_sumy_words, sents[i]) for i in candidate_ids]) - scores = 0.6*norm_importance - 0.4*redundancy - id_in_cands = np.argmax(scores) - i, imp = candidate_ids[id_in_cands], importance[id_in_cands] - curr_len += len(docs[i]) - if curr_len > maxlen: - return ret - ret.append((docs[i], imp) if with_importance else docs[i]) - curr_sumy_words.extend(sents[i]) - del candidate_ids[id_in_cands] - return ret - - # - # 实体网络模块 - # - def build_entity_graph(self, docs, min_freq=0, inv_index={}, used_types=[]): - import networkx as nx - G = nx.Graph() - links = {} - if len(inv_index) == 0: - for i, sent in enumerate(docs): - entities_info = self.entity_linking(sent) - if len(used_types) == 0: - entities = set(entity for span, (entity, type0) in entities_info) - else: - entities = set(entity for span, (entity, type0) in entities_info if type0[1:-1] in used_types) - for u, v in combinations(entities, 2): - pair0 = tuple(sorted((u, v))) - if pair0 not in links: - links[pair0] = 1 - else: - links[pair0] += 1 - else: # 已经有倒排文档,可以更快速检索 - if len(used_types) == 0: - entities = self.entity_type_dict.keys() - else: - entities = iter(entity for (entity, type0) in self.entity_type_dict.items() if type0 in used_types) - for u, v in combinations(entities, 2): - pair0 = tuple(sorted((u, v))) - ids = inv_index[u] & inv_index[v] - if len(ids) > 0: - links[pair0] = len(ids) - for (u, v) in links: - if links[(u, v)] >= min_freq: - G.add_edge(u, v, weight=links[(u, v)]) - self.entity_graph = G - return G - - def build_word_ego_graph(self, docs, word, standard_name=True, min_freq=0, other_min_freq=-1, stopwords=None): - '''根据文本和指定限定词,获得以限定词为中心的各词语的关系。 - 限定词可以是一个特定的方面(衣食住行这类文档),这样就可以从词语中心图中获得关于这个方面的简要信息 - - :param docs: 文本的列表 - :param word: 限定词 - :param standard_name: 把所有实体的指称化为标准实体名 - :param stopwords: 需要过滤的停用词 - :param min_freq: 作为边加入到图中的与中心词最小共现次数,用于筛掉可能过多的边 - :param other_min_freq: 中心词以外词语关系的最小共现次数 - :return: G(networxX中的Graph) - - ''' - import networkx as nx - G = nx.Graph() - links = {} - if other_min_freq == -1: - other_min_freq = min_freq - for doc in docs: - if stopwords: - words = set(x for x in self.seg(doc, standard_name=standard_name) if x not in stopwords) - else: - words = self.seg(doc, standard_name=standard_name) - if word in words: - for u, v in combinations(words, 2): - pair0 = tuple(sorted((u, v))) - if pair0 not in links: - links[pair0] = 1 - else: - links[pair0] += 1 - - used_nodes = set([word]) # 关系对中涉及的词语必须与实体有关(>= min_freq) - for (u, v) in links: - w = 
links[(u, v)] - if word in (u, v) and w >= min_freq: - used_nodes.add(v if word == u else u) - G.add_edge(u, v, weight=w) - elif w >= other_min_freq: - G.add_edge(u, v, weight=w) - G = G.subgraph(used_nodes).copy() - return G - - def build_entity_ego_graph(self, docs, word, min_freq=0, other_min_freq=-1, inv_index={}, used_types=[]): - '''Entity only version of build_word_ego_graph() - ''' - import networkx as nx - G = nx.Graph() - links = {} - if other_min_freq == -1: - other_min_freq = min_freq - if len(inv_index) != 0: - related_docs = self.search_entity(word, docs, inv_index) - else: - related_docs = [] - for doc in docs: - entities_info = self.entity_linking(doc) - entities = [entity0 for [[l,r], (entity0,type0)] in entities_info] - if word in entities: - related_docs.append(doc) - - for i, sent in enumerate(related_docs): - entities_info = self.entity_linking(sent) - if len(used_types) == 0: - entities = set(entity for span, (entity, type0) in entities_info) - else: - entities = set(entity for span, (entity, type0) in entities_info if type0[1:-1] in used_types) - for u, v in combinations(entities, 2): - pair0 = tuple(sorted((u, v))) - if pair0 not in links: - links[pair0] = 1 - else: - links[pair0] += 1 - - used_nodes = set([word]) # 关系对中涉及的词语必须与实体有关(>= min_freq) - for (u, v) in links: - w = links[(u, v)] - if word in (u, v) and w >= min_freq: - used_nodes.add(v if word == u else u) - G.add_edge(u, v, weight=w) - elif w >= other_min_freq: - G.add_edge(u, v, weight=w) - G = G.subgraph(used_nodes).copy() - return G diff --git a/harvesttext/parsing.py b/harvesttext/parsing.py new file mode 100644 index 0000000..63c7fce --- /dev/null +++ b/harvesttext/parsing.py @@ -0,0 +1,189 @@ +import re +from .resources import get_baidu_stopwords +from collections import defaultdict +from .algorithms.texttile import TextTile + +class ParsingMixin: + """ + 文本解析模块: + - 依存句法分析 + - 基于依存句法分析的三元组抽取 + - 基于Texttile的文本自动分段算法 + """ + def dependency_parse(self, sent, standard_name=False, stopwords=None): + '''依存句法分析,调用pyhanlp的接口,并且融入了harvesttext的实体识别机制。不保证高准确率。 + + :param sent: + :param standard_name: + :param stopwords: + :return: arcs:依存弧,列表中的列表。 + [[词语id,词语字面值或实体名(standard_name控制),词性,依存关系,依存子词语id] for 每个词语] + ''' + from pyhanlp import HanLP, JClass + if not self.hanlp_prepared: + self.hanlp_prepare() + self.standard_name = standard_name + entities_info = self.entity_linking(sent) + sent2 = self.decoref(sent, entities_info) + # [word.ID-1, word.LEMMA, word.POSTAG, word.DEPREL ,word.HEAD.ID-1] + arcs = [] + i = 0 + sentence = HanLP.parseDependency(sent2) + for word in sentence.iterator(): + word0, tag0 = word.LEMMA, word.POSTAG + if stopwords and word0 in stopwords: + continue + if word0 in self.entity_types: + if self.standard_name: + word0 = entities_info[i][1][0] # 使用链接的实体 + else: + l, r = entities_info[i][0] # 或使用原文 + word0 = sent[l:r] + tag0 = entities_info[i][1][1][1:-1] + i += 1 + arcs.append([word.ID-1, word0, tag0, word.DEPREL, word.HEAD.ID-1]) + return arcs + + def triple_extraction(self, sent, standard_name=False, stopwords=None, expand = "all"): + '''利用主谓宾等依存句法关系,找到句子中有意义的三元组。 + 很多代码参考:https://github.com/liuhuanyong/EventTriplesExtraction + 不保证高准确率。 + + :param sent: + :param standard_name: + :param stopwords: + :param expand: 默认"all":扩展所有主谓词,"exclude_entity":不扩展已知实体,可以保留标准的实体名,用于链接。"None":不扩展 + :return: + ''' + arcs = self.dependency_parse(sent, standard_name, stopwords) + + '''对找出的主语或者宾语进行扩展''' + def complete_e(words, postags, child_dict_list, word_index): + if expand == "all" or (expand == 
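# A brief sketch of the ParsingMixin methods above; both rely on pyhanlp
# (and a Java runtime) being installed. The sentence is a made-up example.
from harvesttext import HarvestText

ht = HarvestText()
sent = "剧中孙悟空打败了妖怪"
for arc in ht.dependency_parse(sent):
    print(arc)                        # [word id, word, POS tag, dependency relation, head word id]
print(ht.triple_extraction(sent))     # list of [subject, predicate, object] candidates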
"exclude_entity" and "#"+postags[word_index]+"#" not in self.entity_types): + child_dict = child_dict_list[word_index] + prefix = '' + if '定中关系' in child_dict: + for i in range(len(child_dict['定中关系'])): + prefix += complete_e(words, postags, child_dict_list, child_dict['定中关系'][i]) + postfix = '' + if postags[word_index] == 'v': + if '动宾关系' in child_dict: + postfix += complete_e(words, postags, child_dict_list, child_dict['动宾关系'][0]) + if '主谓关系' in child_dict: + prefix = complete_e(words, postags, child_dict_list, child_dict['主谓关系'][0]) + prefix + + return prefix + words[word_index] + postfix + elif expand == "None": + return words[word_index] + else: # (expand == "exclude_entity" and "#"+postags[word_index]+"#" in self.entity_types) + return words[word_index] + + + words, postags = ["" for i in range(len(arcs))], ["" for i in range(len(arcs))] + child_dict_list = [defaultdict(list) for i in range(len(arcs))] + for i, format_parse in enumerate(arcs): + id0, words[i], postags[i], rel, headID = format_parse + child_dict_list[headID][rel].append(i) + svos = [] + for index in range(len(postags)): + # 使用依存句法进行抽取 + if postags[index]: + # 抽取以谓词为中心的事实三元组 + child_dict = child_dict_list[index] + # 主谓宾 + if '主谓关系' in child_dict and '动宾关系' in child_dict: + r = words[index] + e1 = complete_e(words, postags, child_dict_list, child_dict['主谓关系'][0]) + e2 = complete_e(words, postags, child_dict_list, child_dict['动宾关系'][0]) + svos.append([e1, r, e2]) + + # 定语后置,动宾关系 + relation = arcs[index][-2] + head = arcs[index][-1] + if relation == '定中关系': + if '动宾关系' in child_dict: + e1 = complete_e(words, postags, child_dict_list, head) + r = words[index] + e2 = complete_e(words, postags, child_dict_list, child_dict['动宾关系'][0]) + temp_string = r + e2 + if temp_string == e1[:len(temp_string)]: + e1 = e1[len(temp_string):] + if temp_string not in e1: + svos.append([e1, r, e2]) + # 含有介宾关系的主谓动补关系 + if '主谓关系' in child_dict and '动补结构' in child_dict: + e1 = complete_e(words, postags, child_dict_list, child_dict['主谓关系'][0]) + CMP_index = child_dict['动补结构'][0] + r = words[index] + words[CMP_index] + if '介宾关系' in child_dict_list[CMP_index]: + e2 = complete_e(words, postags, child_dict_list, child_dict_list[CMP_index]['介宾关系'][0]) + svos.append([e1, r, e2]) + return svos + + def cut_paragraphs(self, text, num_paras=None, block_sents=3, std_weight=0.5, + align_boundary=True, stopwords='baidu', remove_puncts=True, + seq_chars=-1, **kwargs): + ''' + + :param text: + :param num_paras: (默认为None)可以手动设置想要划分的段落数,也可以保留默认值None,让算法自动确定 + :param block_sents: 算法的参数,将几句句子分为一个block。一般越大,算法自动划分的段落越少 + :param std_weight: 算法的参数。一般越大,算法自动划分的段落越多 + :param align_boundary: 新划分的段落是否要与原有的换行处对齐 + :param stopwords: 字符串列表/元组/集合,或者'baidu'为默认百度停用词,在算法中引入的停用词,一般能够提升准确度 + :param remove_puncts: (默认为True)是否在算法中去除标点符号,一般能够提升准确度 + :param seq_chars: (默认为-1)如果设置为>=1的值,则以包含这个数量的字符为基本单元,代替默认的句子。 + :param **kwargs: passed to ht.cut_sentences, like deduplicate + :return: + ''' + if num_paras is not None: + assert num_paras > 0, "Should give a positive number of num_paras" + assert stopwords == 'baidu' or (hasattr(stopwords, '__iter__') and type(stopwords) != str) + stopwords = get_baidu_stopwords() if stopwords == 'baidu' else stopwords + if seq_chars < 1: + cut_seqs = lambda x: self.cut_sentences(x, **kwargs) + else: + seq_chars = int(seq_chars) + + def _cut_seqs(text, len0, strip=True, deduplicate=False): + if deduplicate: + text = re.sub(r"([。!?\!\?])\1+", r"\1", text) + if strip: + text = text.strip() + seqs = [text[i:i + len0] for i in range(0, len(text), len0)] + 
return seqs + + cut_seqs = lambda x: _cut_seqs(x, seq_chars, **kwargs) + + if align_boundary: + paras = [para.strip() for para in text.split("\n") if len(para.strip()) > 0] + if num_paras is not None: + # assert num_paras <= len(paras), "The new segmented paragraphs must be no less than the original ones" + if num_paras >= len(paras): + return paras + original_boundary_ids = [] + sentences = [] + for para in paras: + sentences.extend(cut_seqs(para)) + original_boundary_ids.append(len(sentences)) + else: + original_boundary_ids = None + sentences = cut_seqs(text, **kwargs) + # with entity resolution, can better decide similarity + if remove_puncts: + allpuncs = re.compile( + r"[,\_《。》、?;:‘’"“”【「】」、·!@¥…()—\,\<\.\>\/\?\;\:\'\"\[\]\{\}\~\`\!\@\#\$\%\^\&\*\(\)\-\=\+]") + sent_words = [re.sub(allpuncs, "", + self.seg(sent, standard_name=True, stopwords=stopwords, return_sent=True) + ).split() + for sent in sentences] + else: + sent_words = [self.seg(sent, standard_name=True, stopwords=stopwords) + for sent in sentences] + texttiler = TextTile() + predicted_boundary_ids = texttiler.cut_paragraphs(sent_words, num_paras, block_sents, std_weight, + align_boundary, original_boundary_ids) + jointer = " " if (self.language == 'en' and seq_chars > 1) else "" + predicted_paras = [jointer.join(sentences[l:r]) for l, r in + zip([0] + predicted_boundary_ids[:-1], predicted_boundary_ids)] + return predicted_paras diff --git a/harvesttext/resources.py b/harvesttext/resources.py index fcd0853..623329b 100644 --- a/harvesttext/resources.py +++ b/harvesttext/resources.py @@ -19,6 +19,7 @@ def get_qh_sent_dict(): 此资源被用于以下论文中: Jun Li and Maosong Sun, Experimental Study on Sentiment Classification of Chinese Review using Machine Learning Techniques, in Proceding of IEEE NLPKE 2007 李军 中文评论的褒贬义分类实验研究 硕士论文 清华大学 2008 + :return: qh_sent_dict = {"pos":[words],"neg":[words]} """ @@ -30,22 +31,38 @@ def get_qh_sent_dict(): def get_baidu_stopwords(): """ - 获得百度停用词列表 - 来源,网上流传的版本:https://wenku.baidu.com/view/98c46383e53a580216fcfed9.html - 包含了中英文常见词及部分标点符号 - :return: stopwords: set of string + 获得百度停用词列表 + 来源,网上流传的版本:https://wenku.baidu.com/view/98c46383e53a580216fcfed9.html + 包含了中英文常见词及部分标点符号 + + :return: stopwords: set of string - """ + """ pwd = os.path.abspath(os.path.dirname(__file__)) with open(pwd + "/resources/bd_stopwords.json", "r", encoding="utf-8") as f: stopwords = json.load(f) return set(stopwords) +def get_nltk_en_stopwords(): + """ + 来自nltk的英语停用词 + + :return: stopwords: set of string + """ + import nltk + try: + nltk.data.find('corpora/stopwords') + except: + nltk.download('stopwords') + from nltk.corpus import stopwords + return set(stopwords.words('english')) + def get_qh_typed_words(used_types = ['IT', '动物', '医药', '历史人名', '地名', '成语', '法律', '财经', '食物']): """ THUOCL:清华大学开放中文词库 http://thuocl.thunlp.org/ IT 财经 成语 地名 历史名人 诗词 医学 饮食 法律 汽车 动物 + :param used_types: :return: typed_words: 字典,键为类型,值为该类的词语组成的set @@ -62,6 +79,7 @@ def get_qh_typed_words(used_types = ['IT', '动物', '医药', '历史人名', ' def get_sanguo(): """ 获得三国演义原文 + :return: ["章节1文本","章节2文本",...] 
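# A tentative sketch of cut_paragraphs above (TextTile-based re-segmentation);
# the text is invented and deliberately switches topic halfway through.
from harvesttext import HarvestText

ht = HarvestText()
text = "小明喜欢踢足球。他每周都去球场训练。\n教练夸他进步很快。\n下午股市大涨。股民们非常兴奋。\n分析师认为行情还会持续。"
paras = ht.cut_paragraphs(text, num_paras=2, block_sents=2)  # merge the original 4 paragraphs into 2
print(paras)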
""" @@ -74,6 +92,7 @@ def get_sanguo_entity_dict(): """ 获得三国演义中的人名、地名、势力名的知识库。 自行搭建的简单版,一定有遗漏和错误,仅供参考使用 + :return: entity_mention_dict,entity_type_dict """ diff --git a/harvesttext/sentiment.py b/harvesttext/sentiment.py new file mode 100644 index 0000000..8bd2240 --- /dev/null +++ b/harvesttext/sentiment.py @@ -0,0 +1,44 @@ +from .resources import get_qh_sent_dict +from .algorithms.sent_dict import SentDict + +class SentimentMixin: + """ + 情感分析模块: + - 基于SO-PMI的情感词典挖掘和情感分析算法 + """ + def build_sent_dict(self, sents, method="PMI", min_times=5, scale="None", + pos_seeds=None, neg_seeds=None, stopwords=None): + '''利用种子词,构建情感词典 + + :param sents: list of string, 一般建议为句子,是计算共现PMI的基本单元 + :param method: "PMI", 使用的算法,目前仅支持PMI + :param min_times: int, 默认为5, 在所有句子中出现次数少于这个次数的词语将被过滤 + :param scale: {"None","0-1","+-1"}, 默认为"None",否则将对情感值进行变换 + 若为"0-1",按照最大为1,最小为0进行线性伸缩,0.5未必是中性 + 若为"+-1", 在正负区间内分别伸缩,保留0作为中性的语义 + :param pos_seeds: list of string, 积极种子词,如不填写将默认采用清华情感词典 + :param neg_seeds: list of string, 消极种子词,如不填写将默认采用清华情感词典 + :param stopwords: list of string, stopwords词,如不填写将不使用 + :return: sent_dict: dict,可以查询单个词语的情感值 + ''' + if pos_seeds is None and neg_seeds is None: + sdict = get_qh_sent_dict() + pos_seeds, neg_seeds = sdict["pos"], sdict["neg"] + docs = [set(self.seg(sent)) for sent in sents] + if not stopwords is None: + stopwords = set(stopwords) + for i in range(len(docs)): + docs[i] = docs[i] - stopwords + docs = list(filter(lambda x: len(x) > 0, docs)) + self.sent_dict = SentDict(docs, method, min_times, scale, pos_seeds, neg_seeds) + return self.sent_dict.sent_dict + + def analyse_sent(self, sent, avg=True): + """输入句子,输出其情感值,默认使用句子中,在情感词典中的词语的情感值的平均来计算 + + :param sent: string, 句子 + :param avg: (default True) 是否使用平均值计算句子情感值 + :return: float情感值(if avg == True), 否则为词语情感值列表 + """ + return self.sent_dict.analyse_sent(self.seg(sent), avg) + diff --git a/harvesttext/summary.py b/harvesttext/summary.py new file mode 100644 index 0000000..8d25e34 --- /dev/null +++ b/harvesttext/summary.py @@ -0,0 +1,72 @@ +import numpy as np +import scipy.special +from itertools import combinations +from .algorithms.utils import sent_sim_textrank, sent_sim_cos + +class SummaryMixin: + """ + 文本摘要模块: + - 基于textrank+MMR的无监督抽取式摘要方法 + """ + def get_summary(self, docs, topK=5, stopwords=None, with_importance=False, standard_name=True, + maxlen=None, avoid_repeat=False): + '''使用Textrank算法得到文本中的关键句 + + :param docs: str句子列表 + :param topK: 选取几个句子, 如果设置了maxlen,则优先考虑长度 + :param stopwords: 在算法中采用的停用词 + :param with_importance: 返回时是否包括算法得到的句子重要性 + :param standard_name: 如果有entity_mention_list的话,在算法中正规化实体名,一般有助于提升算法效果 + :param maxlen: 设置得到的摘要最长不超过多少字数,如果已经达到长度限制但未达到topK句也会停止 + :param avoid_repeat: 使用MMR principle惩罚与已经抽取的摘要重复的句子,避免重复 + :return: 句子列表,或者with_importance=True时,(句子,分数)列表 + ''' + assert topK > 0 + import networkx as nx + maxlen = float('inf') if maxlen is None else maxlen + # 使用standard_name,相似度可以基于实体链接的结果计算而更加准确 + sents = [self.seg(doc.strip(), standard_name=standard_name, stopwords=stopwords) for doc in docs] + sents = [sent for sent in sents if len(sent) > 0] + G = nx.Graph() + for u, v in combinations(range(len(sents)), 2): + G.add_edge(u, v, weight=sent_sim_textrank(sents[u], sents[v])) + + pr = nx.pagerank_scipy(G) + pr_sorted = sorted(pr.items(), key=lambda x: x[1], reverse=True) + if not avoid_repeat: + ret = [] + curr_len = 0 + for i, imp in pr_sorted[:topK]: + curr_len += len(docs[i]) + if curr_len > maxlen: break + ret.append((docs[i], imp) if with_importance else docs[i]) + return [ ] + else: + assert topK <= 
len(sents) + ret = [] + curr_len = 0 + curr_sumy_words = [] + candidate_ids = list(range(len(sents))) + i, imp = pr_sorted[0] + curr_len += len(docs[i]) + if curr_len > maxlen: + return ret + ret.append((docs[i], imp) if with_importance else docs[i]) + curr_sumy_words.extend(sents[i]) + candidate_ids.remove(i) + for iter in range(topK-1): + importance = [pr[i] for i in candidate_ids] + norm_importance = scipy.special.softmax(importance) + redundancy = np.array([sent_sim_cos(curr_sumy_words, sents[i]) for i in candidate_ids]) + scores = 0.6*norm_importance - 0.4*redundancy + id_in_cands = np.argmax(scores) + i, imp = candidate_ids[id_in_cands], importance[id_in_cands] + curr_len += len(docs[i]) + if curr_len > maxlen: + return ret + ret.append((docs[i], imp) if with_importance else docs[i]) + curr_sumy_words.extend(sents[i]) + del candidate_ids[id_in_cands] + return ret + + diff --git a/harvesttext/word_discover.py b/harvesttext/word_discover.py new file mode 100644 index 0000000..b302ec4 --- /dev/null +++ b/harvesttext/word_discover.py @@ -0,0 +1,237 @@ +import logging +import numpy as np +import pandas as pd +from collections import defaultdict +from tqdm import tqdm +from .resources import get_baidu_stopwords +from .algorithms.word_discoverer import WordDiscoverer +from .algorithms.entity_discoverer import NFLEntityDiscoverer, NERPEntityDiscover + +class WordDiscoverMixin: + """ + 新词、关键词发现模块: + - 基于凝聚度和左右熵的新词发现 + - 基于模式的专有名词发现 + - 命名实体识别 + - 实验性质的实体别名发现算法 + """ + def word_discover(self, doc, threshold_seeds=[], auto_param=True, + excluding_types=[], excluding_words='baidu_stopwords', # 可以排除已经登录的某些种类的实体,或者某些指定词 + max_word_len=5, min_freq=0.00005, min_entropy=1.4, min_aggregation=50, + ent_threshold="both", mem_saving=None, sort_by='freq'): + '''新词发现,基于 http://www.matrix67.com/blog/archives/5044 实现及微调 + + :param doc: (string or list) 待进行新词发现的语料,如果是列表的话,就会自动用换行符拼接 + :param threshold_seeds: list of string, 设定能接受的“质量”最差的种子词,更差的词语将会在新词发现中被过滤 + :param auto_param: bool, 使用默认的算法参数 + :param excluding_types: list of str, 设定要过滤掉的特定词性或已经登录到ht的实体类别 + :param excluding_words: list of str, 设定要过滤掉的特定词 + :param max_word_len: 允许被发现的最长的新词长度 + :param min_freq: 被发现的新词,在给定文本中需要达到的最低频率 + :param min_entropy: 被发现的新词,在给定文本中需要达到的最低左右交叉熵 + :param min_aggregation: 被发现的新词,在给定文本中需要达到的最低凝聚度 + :param ent_threshold: "both": (默认)在使用左右交叉熵进行筛选时,两侧都必须超过阈值; "avg": 两侧的平均值达到阈值即可 + :param mem_saving: bool or None, 采用一些过滤手段来减少内存使用,但可能影响速度。如果不指定,对长文本自动打开,而对短文本不使用 + :param sort_by: 以下string之一: {'freq': 词频, 'score': 综合分数, 'agg':凝聚度} 按照特定指标对得到的词语信息排序,默认使用词频 + :return: info: 包含新词作为index, 以及对应各项指标的DataFrame + ''' + if type(doc) != str: + doc = "\n".join(doc) + # 采用经验参数,此时后面的参数设置都无效 + if auto_param: # 根据自己的几个实验确定的参数估计值,没什么科学性,但是应该能得到还行的结果 + length = len(doc) + min_entropy = np.log(length) / 10 + min_freq = min(0.00005, 20.0 / length) + min_aggregation = np.sqrt(length) / 15 + mem_saving = bool(length > 300000) if mem_saving is None else mem_saving + # ent_threshold: 确定左右熵的阈值对双侧都要求"both",或者只要左右平均值达到"avg" + # 对于每句话都很极短的情况(如长度<8),经常出现在左右边界的词语可能难以被确定,这时ent_threshold建议设为"avg" + mem_saving = False if mem_saving is None else mem_saving + + try: + ws = WordDiscoverer(doc, max_word_len, min_freq, min_entropy, min_aggregation, ent_threshold, mem_saving) + except Exception as e: + logging.log(logging.ERROR, str(e)) + info = {"text": [], "freq": [], "left_ent": [], "right_ent": [], "agg": []} + info = pd.DataFrame(info) + info = info.set_index("text") + return info + + if len(excluding_types) > 0: + if "#" in list(excluding_types)[0]: # 化为无‘#’标签 + 
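# A minimal sketch of the SentimentMixin and SummaryMixin methods above; the
# sentences are invented and the tiny corpus only illustrates the call pattern.
from harvesttext import HarvestText

ht = HarvestText()
reviews = ["物流很快,包装也好,非常满意", "质量太差了,用了一天就坏,很失望",
           "客服态度好,回复及时", "发货太慢,体验很差"]
ht.build_sent_dict(reviews, min_times=1, scale="+-1")    # SO-PMI lexicon from the default seed words
print(ht.analyse_sent("质量好,很满意"))                  # average sentiment of the in-dictionary words
docs = ["市政府召开会议部署防汛工作", "会议要求各区提前检查排水设施",
        "气象台预计本周将有持续强降雨", "防汛物资已经提前调配到位"]
print(ht.get_summary(docs, topK=2, avoid_repeat=True))   # TextRank + MMR-style redundancy penalty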
excluding_types = [x[1:-1] for x in excluding_types] + ex_mentions = set(x for enty in self.entity_mention_dict + if enty in self.entity_type_dict and + self.entity_type_dict[enty] in excluding_types + for x in self.entity_mention_dict[enty]) + else: + ex_mentions = set() + assert excluding_words == 'baidu_stopwords' or (hasattr(excluding_words, '__iter__') and type(excluding_words) != str) + if excluding_words == 'baidu_stopwords': + ex_mentions |= get_baidu_stopwords() + else: + ex_mentions |= set(excluding_words) + + info = ws.get_df_info(ex_mentions) + + # 利用种子词来确定筛选优质新词的标准,种子词中最低质量的词语将被保留(如果一开始就被找到的话) + if len(threshold_seeds) > 0: + min_score = 100000 + for seed in threshold_seeds: + if seed in info.index: + min_score = min(min_score, info.loc[seed, "score"]) + if (min_score >= 100000): + min_score = 0 + else: + min_score *= 0.9 # 留一些宽松的区间 + info = info[info["score"] > min_score] + if sort_by: + info.sort_values(by=sort_by, ascending=False, inplace=True) + + return info + + def find_entity_with_rule(self, text, rulesets=[], add_to_dict=True, type0="添加词"): + '''利用规则从分词结果中的词语找到实体,并可以赋予相应的类型再加入实体库 + + :param text: string, 一段文本 + :param rulesets: list of (tuple of rules or single rule) from match_patterns, + list中包含多个规则,满足其中一种规则的词就认为属于这个type + 而每种规则由tuple或单个条件(pattern)表示,一个词必须满足其中的一个或多个条件。 + :param add_to_dict: 是否把找到的结果直接加入词典 + :param type0: 赋予满足条件的词语的实体类型, 仅当add_to_dict时才有意义 + :return: found_entities + + ''' + found_entities = set() + for word in self.seg(text): + for ruleset in rulesets: # 每个ruleset是或关系,只要满足一个就添加并跳过其他 + toAdd = True + if type(ruleset) == type((1, 2)): # tuple + for pattern0 in ruleset: + if not pattern0(word): + toAdd = False + break + else: # single rule + pattern0 = ruleset + if not pattern0(word): + toAdd = False + if toAdd: + found_entities.add(word) + break + if add_to_dict: + for entity0 in found_entities: + self.add_new_entity(entity0, entity0, type0) + self.prepare() + return found_entities + + def named_entity_recognition(self, sent, standard_name=False, return_posseg=False): + '''利用pyhanlp的命名实体识别,找到句子中的(人名,地名,机构名,其他专名)实体。harvesttext会预先链接已知实体 + + :param sent: string, 文本 + :param standard_name: bool, 是否把连接到的已登录转化为标准名 + :param return_posseg: bool, 是否返回包括命名实体识别的,带词性分词结果 + :param book: bool, 预先识别 + :return: entity_type_dict: 发现的命名实体信息,字典 {实体名: 实体类型} + (return_posseg=True时) possegs: list of (单词, 词性) + ''' + from pyhanlp import HanLP, JClass + if not self.hanlp_prepared: + self.hanlp_prepare() + self.standard_name = standard_name + entities_info = self.entity_linking(sent) + sent2 = self.decoref(sent, entities_info) + StandardTokenizer = JClass("com.hankcs.hanlp.tokenizer.StandardTokenizer") + StandardTokenizer.SEGMENT.enableAllNamedEntityRecognize(True) + entity_type_dict = {} + try: + possegs = [] + for x in StandardTokenizer.segment(sent2): + # 三种前缀代表:人名(nr),地名(ns),机构名(nt) + tag0 = str(x.nature) + if tag0.startswith("nr"): + entity_type_dict[x.word] = "人名" + elif tag0.startswith("ns"): + entity_type_dict[x.word] = "地名" + elif tag0.startswith("nt"): + entity_type_dict[x.word] = "机构名" + elif tag0.startswith("nz"): + entity_type_dict[x.word] = "其他专名" + possegs.append((x.word, tag0)) + except: + pass + if return_posseg: + return entity_type_dict, possegs + else: + return entity_type_dict + def entity_discover(self, text, return_count=False, method="NFL", min_count=5, pinyin_tolerance=0, **kwargs): + """无监督地从较大量文本中发现实体的类别和多个同义mention。建议对千句以上的文本来挖掘,并且文本的主题比较集中。 + 效率:在测试环境下处理一个约10000句的时间大约是20秒。另一个约200000句的语料耗时2分半 + 精度:算法准确率不高,但是可以初步聚类,建议先save_entities后, 
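# A compact sketch of word_discover above, run on the bundled 三国演义 corpus;
# keeping the top 50 candidates is an arbitrary choice for illustration.
from harvesttext import HarvestText
from harvesttext.resources import get_sanguo

ht = HarvestText()
corpus = get_sanguo()                        # list of chapter texts shipped with the library
info = ht.word_discover(corpus, sort_by="score")
print(info.head(10))                         # freq / left_ent / right_ent / agg / score per candidate
ht.add_new_words(info.index.tolist()[:50])   # register the top candidates under type "新词"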
再进行手动进行调整,然后load_entities再用于进一步挖掘 + + ref paper: Mining Entity Synonyms with Efficient Neural Set Generation(https://arxiv.org/abs/1811.07032v1) + + :param text: string or list of string + :param return_count: (default False) 是否再返回每个mention的出现次数 + :param method: 使用的算法, 目前可选 "NFL" (NER+Fasttext+Louvain+模式修复,基于语义和规则发现同义实体,但可能聚集过多错误实体), "NERP"(NER+模式修复, 仅基于规则发现同义实体) + :param min_count: (default 5) mininum freq of word to be included + :param pinyin_tolerance: {None, 0, 1} 合并拼音相同(取0时)或者差别只有一个(取1时)的候选词到同一组实体,默认使用(0) + :param kwargs: 根据算法决定的参数,目前, "NERP"不需要额外参数,而"NFL"可接受的额外参数有: + + emb_dim: (default 50) fasttext embedding's dimensions + + threshold: (default 0.98) [比较敏感,调参重点]larger for more entities, threshold for add an edge between 2 entities if cos_dim exceeds + + ft_iters: (default 20) larger for more entities, num of iterations used by fasttext + + use_subword: (default True) whether to use fasttext's subword info + + min_n: (default 1) min length of used subword + + max_n: (default 4) max length of used subword + + :return: entity_mention_dict, entity_type_dict + """ + text = text if type(text) == str else "\n".join(text) + method = method.upper() + assert method in {"NFL", "NERP"} + # discover candidates with NER + print("Doing NER") + sent_words = [] + type_entity_dict = defaultdict(set) + entity_count = defaultdict(int) + wd_count = defaultdict(int) + for sent in tqdm(self.cut_sentences(text)): + NERs0, possegs = self.named_entity_recognition(sent, return_posseg=True) + sent_wds0 = [] + for wd, pos in possegs: + if wd in NERs0: + zh_pos = NERs0[wd] + entity_name = wd.lower() + "_" + zh_pos + type_entity_dict[zh_pos].add(entity_name) + sent_wds0.append(entity_name) + entity_count[entity_name] += 1 + else: + sent_wds0.append(wd) + wd_count[wd] += 1 + sent_words.append(sent_wds0) + + entity_count = pd.Series(entity_count) + entity_count = entity_count[entity_count >= min_count] + pop_words_cnt = {wd:cnt for wd, cnt in wd_count.items() if cnt >= min_count} + id2word = entity_count.index.tolist() + word2id = {wd: i for (i, wd) in enumerate(id2word)} + + type_entity_dict2 = {k: list(v) for k, v in type_entity_dict.items()} + if method == "NFL": + discoverer = NFLEntityDiscoverer(sent_words, type_entity_dict2, entity_count, pop_words_cnt, word2id, id2word, + min_count, pinyin_tolerance, self.pinyin_adjlist, **kwargs) + elif method == "NERP": + discoverer = NERPEntityDiscover(sent_words, type_entity_dict2, entity_count, pop_words_cnt, word2id, id2word, + min_count, pinyin_tolerance, self.pinyin_adjlist, **kwargs) + entity_mention_dict, entity_type_dict = discoverer.entity_mention_dict, discoverer.entity_type_dict + mention_count = discoverer.mention_count # 新添加的mention的count在discoverer里更新 + if return_count: + return entity_mention_dict, entity_type_dict, mention_count + else: + return entity_mention_dict, entity_type_dict + diff --git a/setup.py b/setup.py index 70ee952..dc3c401 100644 --- a/setup.py +++ b/setup.py @@ -2,12 +2,13 @@ # -*- coding: utf-8 -*- from setuptools import setup, find_packages +from harvesttext import __version__ setup( name='harvesttext', author="blmoistawinde", author_email="1840962220@qq.com", - version="0.7.4.2", + version=__version__, license='MIT', keywords='NLP, tokenizing, entity linking, sentiment analysis, text cleaning', url='https://github.com/blmoistawinde/HarvestText',
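# A minimal sketch of the single-sourced version pattern used above: setup.py
# and docs/conf.py both read __version__ from the package instead of hard-coding
# the release string, so a version bump only touches harvesttext/__init__.py.
# (This assumes the package's top-level imports are available at build time,
# which is the trade-off of importing the package inside setup.py.)
from harvesttext import __version__

print(__version__)   # '0.8' after this patch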