Skip to content

Commit

Permalink
Merge pull request #197 from megagonlabs/develop
Browse files Browse the repository at this point in the history
Release v5.0.3
hiroshi-matsuda-rit authored Oct 15, 2021
2 parents 1753eac + ad46e8b commit 277b29d
Showing 7 changed files with 152 additions and 2 deletions.
5 changes: 5 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -221,6 +221,11 @@ Please read the official documents to compile user dictionaries with `sudachipy`

### version 5.x

#### ginza-5.0.3
- 2021-10-15
- Bug fix
- `Bunsetu span should not cross the sentence boundary` #195

#### ginza-5.0.2
- 2021-09-06
- Bug fix
125 changes: 125 additions & 0 deletions benchmark/benchmark.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
from datetime import datetime
import json
import sys


# Benchmark of spaCy/GiNZA load time and per-sentence analysis time.
#
# Usage: python benchmark.py [-g] < sentences.txt
#   -g  require a GPU (calls spacy.require_gpu()); otherwise runs on the CPU.
# Input : one sentence per line on stdin.
# Output: a tab-separated progress log on stderr, and a single JSON line on
#         stdout holding, for every measured step, the median over its runs.

REPEAT = 5
BATCH_SIZE = 128
MODEL_NAMES = ("ja_ginza", "ja_ginza_electra")


def parse_device(argv):
    """Return "GPU" for ``[prog, "-g"]`` and "CPU" for ``[prog]``.

    Any other argument list terminates the program with a usage message
    (SystemExit prints it to stderr and exits with status 1).
    """
    if len(argv) == 1:
        return "CPU"
    if len(argv) == 2 and argv[1] == "-g":
        return "GPU"
    prog = argv[0] if argv else "benchmark.py"
    raise SystemExit(f"Usage: python {prog} [-g]")


def upper_median(values):
    """Return the middle element of *values*; the upper one for even lengths."""
    ordered = sorted(values)
    return ordered[len(ordered) // 2]


def iter_batches(sents, batch_size):
    """Yield newline-joined chunks of at most *batch_size* sentences each."""
    for begin in range(0, len(sents), batch_size):
        yield "\n".join(sents[begin:begin + batch_size])


def log_row(*columns):
    """Write one tab-separated row of the progress log to stderr."""
    print(*columns, sep="\t", file=sys.stderr)


class LapTimer:
    """Wall-clock stopwatch that reports whole milliseconds between laps."""

    def __init__(self):
        self.start = datetime.now()
        self.prev = self.start

    def lap(self):
        """Return ``(now, msec since the previous lap)`` and start a new lap."""
        now = datetime.now()
        msec = int((now - self.prev).total_seconds() * 1000)
        self.prev = now
        return now, msec

    def total_msec(self):
        """Return the milliseconds between construction and the latest lap."""
        return int((self.prev - self.start).total_seconds() * 1000)


def benchmark_model(spacy, name, sents, device, timer, results):
    """Load the pipeline *name* and time batched and one-by-one analysis.

    Appends to *results* the load time in msec, followed by REPEAT
    per-sentence timings (msec/sentence) for each of the two input modes.
    """
    nlp = spacy.load(name)
    lap, dur = timer.lap()
    results[f"spacy.load('{name}')"] = [dur]
    log_row(lap, dur, device, f'spacy.load("{name}")')

    # Each mode is a factory so that every repeat iterates over fresh input.
    modes = (
        (f"{name}->nlp(batch={BATCH_SIZE})", lambda: iter_batches(sents, BATCH_SIZE)),
        (f"{name}->nlp(batch=1)", lambda: sents),
    )
    for key, make_texts in modes:
        results[key] = []
        for repeat in range(1, REPEAT + 1):
            for text in make_texts():
                nlp(text)
            lap, dur = timer.lap()
            per_sent = dur / len(sents)
            results[key].append(per_sent)
            log_row(lap, dur, device, f"#{repeat} {key}: {per_sent:.03f}[msec/sent]")


def main(argv=None, lines=None):
    """Run the whole benchmark.

    argv  -- command line including the program name; defaults to sys.argv.
    lines -- iterable of input lines; defaults to sys.stdin.
    """
    device = parse_device(sys.argv if argv is None else argv)

    sents = [line.rstrip("\n") for line in (sys.stdin if lines is None else lines)]
    if not sents:
        # Every timing is divided by len(sents); fail before loading any model.
        raise SystemExit("No input sentences: pipe one sentence per line to stdin")

    results = {}

    log_row("timestamp ", "[msec]", "device", "procedure description")
    timer = LapTimer()
    log_row(timer.start, 0, device, f"benchmark started with {len(sents)} sentences")

    # Imported here, not at the top of the file, so the import itself is timed.
    import spacy
    if device == "GPU":
        spacy.require_gpu()
    lap, dur = timer.lap()
    results["import spacy"] = [dur]
    log_row(lap, dur, device, "import spacy")

    for name in MODEL_NAMES:
        benchmark_model(spacy, name, sents, device, timer, results)

    log_row(timer.prev, timer.total_msec(), device, "total")

    medians = {key: upper_median(runs) for key, runs in results.items()}
    json.dump(
        {"device": device, "results": medians},
        sys.stdout,
        ensure_ascii=False,
    )
    print()


if __name__ == "__main__":
    main()
4 changes: 4 additions & 0 deletions benchmark/run_benchmark.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
#!/bin/bash
# Run the GiNZA benchmark on the GSD dev+test sentences, first on GPU and
# then on CPU. Expects gsd/dev.txt and gsd/test.txt in the current directory.
#
# pipefail: without it a failing `cat` (e.g. a missing corpus file) is ignored
# and the benchmark silently runs on partial or empty input.
set -eo pipefail
cat gsd/dev.txt gsd/test.txt | python benchmark.py -g
cat gsd/dev.txt gsd/test.txt | python benchmark.py
11 changes: 11 additions & 0 deletions benchmark/setup_benchmark.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
#!/bin/bash
# Download the UD_Japanese-GSD train/dev/test files at a pinned commit and
# extract the raw sentences (the "# text = ..." comment lines) into
# gsd/{train,dev,test}.txt, then print the remaining manual install steps.
#
# pipefail + curl --fail: without them a failed or 404 download is ignored,
# an empty or garbage file is written, and the script still exits 0.
set -eo pipefail
mkdir -p gsd
for t in train dev test ; do
curl --fail "https://raw.githubusercontent.com/megagonlabs/UD_Japanese-GSD/c614040872a74587912a15ef4637eabc0dc29a60/ja_gsd-ud-${t}.ne.conllu?raw=true" | grep "# text = " | sed 's/# text = //' > "gsd/${t}.txt"
done
echo
echo '=== CUDA Related Installation Steps ==='
echo 'The pytorch should be installed with cuda support. See https://pytorch.org/get-started/previous-versions/#linux-and-windows-1'
echo 'Also you need to install spacy with appropriate cuda specifier as `pip install -U spacy[cudaXXX]`. See https://spacy.io/usage#gpu'
echo 'And then, install GiNZA as `pip install -U ginza ja-ginza ja-ginza-electra`.'
5 changes: 5 additions & 0 deletions docs/index.md
Original file line number Diff line number Diff line change
@@ -225,6 +225,11 @@ Contains information from mC4 which is made available under the ODC Attribution

### version 5.x

#### ginza-5.0.3
- 2021-10-15
- Bug fix
- `Bunsetu span should not cross the sentence boundary` #195

#### ginza-5.0.2
- 2021-09-06
- Bug fix
2 changes: 1 addition & 1 deletion ginza/bunsetu_recognizer.py
Original file line number Diff line number Diff line change
@@ -79,7 +79,7 @@ def bunsetu_span(token: Token) -> Span:
start = token.i
end = start + 1
for idx in range(start, 0, -1):
if bunsetu_bi_list[idx] == "B":
if bunsetu_bi_list[idx] == "B" or token.doc[idx].is_sent_start:
start = idx
break
else:
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
@@ -26,5 +26,5 @@
name="ginza",
packages=find_packages(include=["ginza"]),
url="https://github.com/megagonlabs/ginza",
version='5.0.2',
version='5.0.3',
)

0 comments on commit 277b29d

Please sign in to comment.