Merge pull request #197 from megagonlabs/develop

Release v5.0.3
megagonlabs · Oct 15, 2021 · 277b29d
2 parents 1753eac + ad46e8b
commit 277b29d
Showing 7 changed files with 152 additions and 2 deletions.
diff --git a/README.md b/README.md
@@ -221,6 +221,11 @@ Please read the official documents to compile user dictionaries with `sudachipy`
 
 ### version 5.x
 
+#### ginza-5.0.3
+- 2021-10-15
+- Bug fix
+  - `Bunsetu span should not cross the sentence boundary` #195
+
 #### ginza-5.0.2
 - 2021-09-06
 - Bug fix

diff --git a/benchmark/benchmark.py b/benchmark/benchmark.py
@@ -0,0 +1,125 @@
+from datetime import datetime
+import json
+import sys
+
+
+REPEAT = 5
+BATCH_SIZE = 128
+
+# Accept no arguments (CPU run) or exactly one "-g" flag (GPU run); exit with a usage message otherwise.
+args = sys.argv[1:]
+if args not in ([], ["-g"]):
+    sys.exit(f"Usage: python {sys.argv[0]} [-g]")
+require_gpu = args == ["-g"]
+device = "GPU" if require_gpu else "CPU"
+
+sents = [_.rstrip("\n") for _ in sys.stdin]
+
+results = {}
+
+
+print("timestamp                 ", "[msec]", "device", 'procedure description', sep="\t", file=sys.stderr)
+start = datetime.now()
+prev = start
+print(start, 0, f'benchmark started with {len(sents)} sentences', sep="\t", file=sys.stderr)
+
+import spacy
+if require_gpu:
+    spacy.require_gpu()
+lap = datetime.now()
+dur = int((lap - prev).total_seconds() * 1000)
+results[f"import spacy"] = [dur]
+print(lap, dur, device, 'import spacy', sep="\t", file=sys.stderr)
+prev = lap
+
+nlp = spacy.load("ja_ginza")
+lap = datetime.now()
+dur = int((lap - prev).total_seconds() * 1000)
+results[f"spacy.load('ja_ginza')"] = [dur]
+print(lap, dur, device, f'spacy.load("ja_ginza")', sep="\t", file=sys.stderr)
+prev = lap
+
+results[f"ja_ginza->nlp(batch={BATCH_SIZE})"] = []
+for repeat in range(1, REPEAT + 1):
+    for _ in range((len(sents) - 1) // BATCH_SIZE + 1):
+        doc = nlp("\n".join(sents[_ * BATCH_SIZE:(_ + 1) * BATCH_SIZE]))
+    lap = datetime.now()
+    dur = int((lap - prev).total_seconds() * 1000)
+    results[f"ja_ginza->nlp(batch={BATCH_SIZE})"].append(dur / len(sents))
+    print(
+        lap,
+        dur,
+        device,
+        f'#{repeat} ja_ginza->nlp(batch={BATCH_SIZE}): {dur / len(sents):.03f}[msec/sent]',
+        sep="\t", file=sys.stderr,
+    )
+    prev = lap
+
+results[f"ja_ginza->nlp(batch=1)"] = []
+for repeat in range(1, REPEAT + 1):
+    for sent in sents:
+        doc = nlp(sent)
+    lap = datetime.now()
+    dur = int((lap - prev).total_seconds() * 1000)
+    results[f"ja_ginza->nlp(batch=1)"].append(dur / len(sents))
+    print(
+        lap,
+        dur,
+        device,
+        f'#{repeat} ja_ginza->nlp(batch=1):   {dur / len(sents):.03f}[msec/sent]',
+        sep="\t", file=sys.stderr,
+    )
+    prev = lap
+
+nlp = spacy.load("ja_ginza_electra")
+lap = datetime.now()
+dur = int((lap - prev).total_seconds() * 1000)
+results[f"spacy.load('ja_ginza_electra')"] = [dur]
+print(lap, dur, device, f'spacy.load("ja_ginza_electra")', sep="\t", file=sys.stderr)
+prev = lap
+
+results[f"ja_ginza_electra->nlp(batch={BATCH_SIZE})"] = []
+for repeat in range(1, REPEAT + 1):
+    for _ in range((len(sents) - 1) // BATCH_SIZE + 1):
+        doc = nlp("\n".join(sents[_ * BATCH_SIZE:(_ + 1) * BATCH_SIZE]))
+    lap = datetime.now()
+    dur = int((lap - prev).total_seconds() * 1000)
+    results[f"ja_ginza_electra->nlp(batch={BATCH_SIZE})"].append(dur / len(sents))
+    print(
+        lap,
+        dur,
+        device,
+        f'#{repeat} ja_ginza_electra->nlp(batch={BATCH_SIZE}): {dur / len(sents):.03f}[msec/sent]',
+        sep="\t", file=sys.stderr,
+    )
+    prev = lap
+
+results[f"ja_ginza_electra->nlp(batch=1)"] = []
+for repeat in range(1, REPEAT + 1):
+    for sent in sents:
+        doc = nlp(sent)
+    lap = datetime.now()
+    dur = int((lap - prev).total_seconds() * 1000)
+    results[f"ja_ginza_electra->nlp(batch=1)"].append(dur / len(sents))
+    print(
+        lap,
+        dur,
+        device,
+        f'#{repeat} ja_ginza_electra->nlp(batch=1):   {dur / len(sents):.03f}[msec/sent]',
+        sep="\t", file=sys.stderr,
+    )
+    prev = lap
+
+dur = int((lap - start).total_seconds() * 1000)
+print(lap, dur, device, 'total', sep="\t", file=sys.stderr)
+
+for k, v in results.items():
+    l = sorted(v)
+    results[k] = l[len(l) // 2]
+
+json.dump(
+    {"device": device, "results": results},
+    sys.stdout,
+    ensure_ascii=False,
+)
+print()
diff --git a/benchmark/run_benchmark.sh b/benchmark/run_benchmark.sh
@@ -0,0 +1,4 @@
+#!/bin/bash
+set -e
+cat gsd/dev.txt gsd/test.txt | python benchmark.py -g
+cat gsd/dev.txt gsd/test.txt | python benchmark.py
diff --git a/benchmark/setup_benchmark.sh b/benchmark/setup_benchmark.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+set -e
+mkdir -p gsd
+for t in train dev test ; do
+  curl "https://raw.githubusercontent.com/megagonlabs/UD_Japanese-GSD/c614040872a74587912a15ef4637eabc0dc29a60/ja_gsd-ud-${t}.ne.conllu?raw=true" | grep "# text = " | sed 's/# text = //' > gsd/${t}.txt
+done
+echo
+echo '=== CUDA Related Installation Steps ==='
+echo 'The pytorch should be installed with cuda support. See https://pytorch.org/get-started/previous-versions/#linux-and-windows-1'
+echo 'Also you need to install spacy with appropriate cuda specifier as `pip install -U spacy[cudaXXX]`. See https://spacy.io/usage#gpu'
+echo 'And then, install GiNZA as `pip install -U ginza ja-ginza ja-ginza-electra`.'
diff --git a/docs/index.md b/docs/index.md
@@ -225,6 +225,11 @@ Contains information from mC4 which is made available under the ODC Attribution
 
 ### version 5.x
 
+#### ginza-5.0.3
+- 2021-10-15
+- Bug fix
+  - `Bunsetu span should not cross the sentence boundary` #195
+
 #### ginza-5.0.2
 - 2021-09-06
 - Bug fix

diff --git a/ginza/bunsetu_recognizer.py b/ginza/bunsetu_recognizer.py
@@ -79,7 +79,7 @@ def bunsetu_span(token: Token) -> Span:
     start = token.i
     end = start + 1
     for idx in range(start, 0, -1):
-        if bunsetu_bi_list[idx] == "B":
+        if bunsetu_bi_list[idx] == "B" or token.doc[idx].is_sent_start:
             start = idx
             break
     else:

diff --git a/setup.py b/setup.py
@@ -26,5 +26,5 @@
     name="ginza",
     packages=find_packages(include=["ginza"]),
     url="https://github.com/megagonlabs/ginza",
-    version='5.0.2',
+    version='5.0.3',
 )