Skip to content

Commit

Permalink
🚧 respect threshold in cli and api
Browse files Browse the repository at this point in the history
  • Loading branch information
simonwoerpel committed Jan 3, 2024
1 parent db2cc9b commit cdee962
Show file tree
Hide file tree
Showing 5 changed files with 54 additions and 12 deletions.
16 changes: 11 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,11 @@ The implementation uses a pre-populated redis cache which can fallback to other
juditha lookup "jane doe"
"Jane Doe"

For fuzzier matching, lower the threshold (default 0.97):

juditha lookup "doe, jane" --threshold 0.5
"Jane Doe"

## data import

### from ftm entities
Expand All @@ -49,7 +54,8 @@ Following the [`nomenklatura`](https://github.com/opensanctions/nomenklatura) sp
from juditha import lookup

assert lookup("jane doe") == "Jane Doe"
assert lookup("foo") is None
assert lookup("doe, jane") is None
assert lookup("doe, jane", threshold=0.5) == "Jane Doe"
```

## run as api
Expand All @@ -60,16 +66,16 @@ assert lookup("foo") is None

Send a HEAD request to check whether a name is known:

curl -I "http://localhost:8000/Alice"
curl -I "http://localhost:8000/jane%20doe"
HTTP/1.1 200 OK

curl -I "http://localhost:8000/John"
HTTP/1.1 404 Not Found

Do an actual request to get the canonized name:
A regular GET request returns the canonicalized name:

curl "http://localhost:8000/alice"
Alice
curl "http://localhost:8000/doe,%20jane?threshold=0.5"
Jane Doe


## settings
Expand Down
8 changes: 5 additions & 3 deletions juditha/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,23 +14,25 @@


@app.get("/_classify/{q}")
async def api_classify(q: str) -> str:
async def api_classify(q: str) -> Response:
schema = classify(q)
if schema is None:
return Response("404", status_code=404)
return Response(schema)


@app.get("/{q}")
async def api_lookup(q: str, threshold: float | None = settings.FUZZY_THRESHOLD) -> str:
async def api_lookup(
q: str, threshold: float | None = settings.FUZZY_THRESHOLD
) -> Response:
name = lookup(q, threshold=threshold)
if name is None:
return Response("404", status_code=404)
return Response(name)


@app.head("/{q}")
async def api_head(q: str, threshold: float | None = settings.FUZZY_THRESHOLD) -> None:
async def api_head(q: str, threshold: float | None = settings.FUZZY_THRESHOLD) -> int:
name = lookup(q, threshold=threshold)
if name is None:
raise HTTPException(404)
Expand Down
13 changes: 9 additions & 4 deletions juditha/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ def load_dataset(
with_schema: Annotated[
bool, typer.Option(..., help="Include schemata for classifier")
] = False,
) -> int:
):
try:
res = io.load_dataset(uri, with_schema=with_schema)
success(f"Imported {res} names.")
Expand All @@ -59,7 +59,7 @@ def load_catalog(
with_schema: Annotated[
bool, typer.Option(..., help="Include schemata for classifier")
] = False,
) -> int:
):
try:
res = io.load_catalog(uri, with_schema=with_schema)
success(f"Imported {res} names.")
Expand All @@ -68,9 +68,14 @@ def load_catalog(


@cli.command("lookup")
def cli_lookup(value: str):
def cli_lookup(
value: str,
threshold: Annotated[
float, typer.Option(..., help="Fuzzy threshold")
] = settings.FUZZY_THRESHOLD,
):
try:
result = lookup(value)
result = lookup(value, threshold=threshold)
if result is not None:
print(result)
else:
Expand Down
2 changes: 2 additions & 0 deletions tests/fixtures/names.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
Jane Doe
Alice
27 changes: 27 additions & 0 deletions tests/test_cli.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
from pathlib import Path

from typer.testing import CliRunner

from juditha.cli import cli

runner = CliRunner()


def test_cli(fixtures_path: Path):
    """Exercise the ``load`` and ``lookup`` CLI commands end to end.

    Loads the plain-text names fixture, then checks that:

    * an exact name is found,
    * a reordered name ("doe, jane") is reported as not found without
      ``--threshold``,
    * the same reordered name resolves to "Jane Doe" with ``--threshold 0.5``,
    * loading the FtM entities fixture with ``--from-entities`` succeeds.

    ``fixtures_path`` is presumably a pytest fixture pointing at
    ``tests/fixtures`` (defined outside this file -- confirm in conftest).
    """
    # Command-line arguments are strings; convert the paths explicitly rather
    # than relying on the runner to accept ``Path`` objects.
    names_file = str(fixtures_path / "names.txt")
    entities_file = str(fixtures_path / "eu_authorities.ftm.json")

    # Fail right here if the import breaks, instead of surfacing later as a
    # confusing lookup mismatch.
    res = runner.invoke(cli, ["load", "-i", names_file])
    assert res.exit_code == 0

    res = runner.invoke(cli, ["lookup", "Jane Doe"])
    assert res.exit_code == 0
    assert res.output.strip() == "Jane Doe"

    # Without an explicit threshold the reordered name must not match.
    res = runner.invoke(cli, ["lookup", "doe, jane"])
    assert res.exit_code == 0
    assert "not found" in res.output

    # Lowering the threshold lets the reordered name match.
    res = runner.invoke(cli, ["lookup", "doe, jane", "--threshold", "0.5"])
    assert res.exit_code == 0
    assert res.output.strip() == "Jane Doe"

    res = runner.invoke(cli, ["load", "-i", entities_file, "--from-entities"])
    assert res.exit_code == 0

0 comments on commit cdee962

Please sign in to comment.