From e3eb4638e8dafa6b967daf776640533316679c73 Mon Sep 17 00:00:00 2001 From: Diwank Singh Tomer Date: Sat, 30 Jan 2021 06:59:11 +0000 Subject: [PATCH] fix: Fix dataset, script Signed-off-by: Diwank Singh Tomer --- scripts/benchmark/ChatbotCorpus.json | 40 +--------------------------- scripts/benchmark/main.py | 37 +++++++++++++++---------- scripts/benchmark/requirements.txt | 9 +++++++ 3 files changed, 33 insertions(+), 53 deletions(-) diff --git a/scripts/benchmark/ChatbotCorpus.json b/scripts/benchmark/ChatbotCorpus.json index ba841c9..ff2164e 100644 --- a/scripts/benchmark/ChatbotCorpus.json +++ b/scripts/benchmark/ChatbotCorpus.json @@ -295,7 +295,6 @@ "templates": [ "how i can get from ENTITYGPE to ENTITYGPE", "can you give me a connection from ENTITYGPE to ENTITYGPE?", - "depart in ENTITYGPE, i assume", "when is the next ENTITYGPE leaving from ENTITYGPE?", "when does the ENTITYNOUN come at ENTITYGPE", "when is the ENTITYPRODUCT from ENTITYGPE?" @@ -1522,7 +1521,6 @@ "match": "ENTITYNOUN in ENTITYGPE", "templates": [ "ENTITYNOUN in ENTITYGPE", - "depart in ENTITYGPE, i assume", "how do i get from ENTITYGPE to ENTITYGPE?", "how can i get from ENTITYGPE to ENTITYGPE as ENTITYNOUN as possible?", "can you give me a connection from ENTITYGPE to ENTITYGPE?", @@ -2587,36 +2585,6 @@ "how can i get from ENTITYGPE to ENTITYGPE?" ] }, - { - "entities": [ - { - "text": "garching", - "entityType": "StationStart", - "holmesIdentifier": "ENTITYGPE" - } - ], - "training": false, - "inputs": [ - "I assume you leave in garching.", - "I assume that you leave in garching.", - "I think you leave in garching.", - "I assume that they leave in garching.", - "I assume they leave in garching.", - "I assume that you depart in garching.", - "I assume you depart in garching.", - "I assume you leave in garching", - "depart in garching, i assume" - ], - "match": "depart in ENTITYGPE, i assume", - "templates": [ - "depart in ENTITYGPE, i assume", - "or depart from ENTITYGPE", - "is there a ENTITYPRODUCT from ENTITYGPE to ENTITYGPE?", - "can you find a ENTITYPRODUCT from ENTITYGPE to ENTITYGPE?", - "ENTITYGPE to ENTITYGPE", - "how do i get from ENTITYGPE to ENTITYGPE?" - ] - }, { "entities": [ { @@ -3434,8 +3402,7 @@ "how can i get from ENTITYGPE to ENTITYGPE?", "when does the ENTITYNOUN starts at ENTITYGPE?", "when will the ENTITYNOUN depart from ENTITYGPE?", - "when does the ENTITYNOUN leaves at ENTITYGPE?", - "depart in ENTITYGPE, i assume" + "when does the ENTITYNOUN leaves at ENTITYGPE?" ] }, { @@ -3754,7 +3721,6 @@ "how can i get from ENTITYGPE to ENTITYGPE?", "when does the ENTITYNOUN departs at ENTITYGPE?", "when is the ENTITYPRODUCT leaving in ENTITYGPE", - "depart in ENTITYGPE, i assume", "when is the ENTITYNOUN in ENTITYGPE?" ] }, @@ -4238,7 +4204,6 @@ "templates": [ "how can i get from ENTITYGPE to ENTITYGPE?", "how can i get to ENTITYGPE?", - "depart in ENTITYGPE, i assume", "when does the ENTITYPRODUCT leaving in ENTITYGPE", "when is the ENTITYNOUN in ENTITYGPE?", "ENTITYNOUN from ENTITYGPE" @@ -5188,7 +5153,6 @@ "match": "when is the ENTITYNOUN leaving from ENTITYGPE?", "templates": [ "when is the ENTITYNOUN leaving from ENTITYGPE?", - "depart in ENTITYGPE, i assume", "how to get from ENTITYGPE to ENTITYGPE", "ENTITYGPE to ENTITYGPE", "what's the ENTITYNOUN way from ENTITYGPE to ENTITYGPE?", @@ -6156,7 +6120,6 @@ "from ENTITYGPE to ENTITYGPE", "when is the ENTITYNOUN from ENTITYGPE to ENTITYGPE", "when does the ENTITYNOUN starts at ENTITYGPE?", - "depart in ENTITYGPE, i assume", "when the ENTITYNOUN in ENTITYGPE is leaving?", "when does the ENTITYNOUN leaves at ENTITYGPE?" ] @@ -6439,7 +6402,6 @@ "from ENTITYGPE to ENTITYGPE", "take me from ENTITYGPE to ENTITYGPE", "ENTITYNOUN from ENTITYGPE", - "depart in ENTITYGPE, i assume", "when does the ENTITYNOUN departes from ENTITYGPE?" ] } diff --git a/scripts/benchmark/main.py b/scripts/benchmark/main.py index 3d49dc3..e555d6d 100644 --- a/scripts/benchmark/main.py +++ b/scripts/benchmark/main.py @@ -10,6 +10,7 @@ import holmes_extractor as holmes import patches.holmes_extractor.semantics as semantics import ChatbotCorpus +from textdistance import cosine logger = logging.getLogger("holmes_extractor") @@ -75,24 +76,31 @@ def match_phrase(self, search_phrases, phrase): return self.manager.match_search_phrases_against(entry=phrase) -def check_result(results, all_matches): +def is_match(phrase1, phrase2, threshold=0.75): + similarity = cosine.normalized_similarity(phrase1, phrase2) + + return (similarity >= threshold) + +def check_result(results, all_matches, threshold=0.75): res = [] for result in results: matches = result.get("word_matches", []) if not matches: continue - + res.append( any( [ - " ".join( - [ - m["search_phrase_word"].lower() - for m in matches - ] - ) - == match.lower() - for match in all_matches + is_match( + " ".join( + [ + m["search_phrase_word"].lower() + for m in matches + ] + ), + match.lower(), + threshold=threshold + ) for match in all_matches ] ) ) @@ -139,9 +147,9 @@ def create_search_phrases(model, phrases): return result -def main(paraphrase: bool = True, large_model: bool = False, sentences: int = 0): - model = Model("tuner007/pegasus_paraphrase") - model.prepare() +def main(paraphrase: bool = True, large_model: bool = False, sentences: int = 0, threshold: float = 0.75): + model = paraphrase and Model("tuner007/pegasus_paraphrase") + paraphrase and model.prepare() logger.info("-" * 120) success, count, errors, non_empty_count = 0, 0, 0, 0 @@ -171,7 +179,8 @@ def main(paraphrase: bool = True, large_model: bool = False, sentences: int = 0) errors += 1 logger.exception(e) continue - if check_result(result, create_search_phrases(model, [sentence.match])): + search_phrases = create_search_phrases(model, [sentence.match]) if paraphrase else [sentence.match] + if check_result(result, search_phrases, threshold=threshold): success += 1 break count += 1 diff --git a/scripts/benchmark/requirements.txt b/scripts/benchmark/requirements.txt index 9502fb4..d6a7b62 100644 --- a/scripts/benchmark/requirements.txt +++ b/scripts/benchmark/requirements.txt @@ -1,5 +1,7 @@ +abydos==0.5.0 backcall==0.2.0 beautifulsoup4==4.9.3 +beautifultable==1.0.1 blis==0.2.4 boto3==1.16.62 botocore==1.19.62 @@ -9,6 +11,9 @@ chardet==4.0.0 click==7.1.2 cymem==2.0.5 decorator==4.4.2 +deprecation==2.1.0 +en-core-web-lg==2.1.0 +en-core-web-md==2.1.0 falcon==2.0.0 filelock==3.0.12 holmes-extractor==2.2.1 @@ -18,6 +23,7 @@ ipython==7.19.0 ipython-genutils==0.2.0 isodate==0.6.0 jedi==0.18.0 +jellyfish==0.8.2 jmespath==0.10.0 joblib==1.0.0 json-sempai==0.4.0 @@ -40,6 +46,8 @@ ptyprocess==0.7.0 Pygments==2.7.4 pyparsing==2.4.7 python-dateutil==2.8.1 +python-Levenshtein==0.12.1 +pyxDamerauLevenshtein==1.6.1 rdflib==5.0.0 regex==2020.11.13 requests==2.25.1 @@ -53,6 +61,7 @@ sklearn==0.0 soupsieve==2.1 spacy==2.1.0 srsly==1.0.5 +textdistance==4.2.1 thinc==7.0.8 threadpoolctl==2.1.0 tokenizers==0.9.4