Skip to content

Commit

Permalink
Add genson integration for json generation (#1390)
Browse files Browse the repository at this point in the history
This PR aims at integrating support of the `genson` package (in
`generate.json`) to be able to use dynamic json schema generation as
proposed in #1383.
  • Loading branch information
g-prz authored Jan 27, 2025
1 parent ea4904a commit 437ffe4
Show file tree
Hide file tree
Showing 4 changed files with 98 additions and 1 deletion.
73 changes: 73 additions & 0 deletions docs/reference/generation/json.md
Original file line number Diff line number Diff line change
Expand Up @@ -103,3 +103,76 @@ print(add(**result))
```

A great advantage of passing functions directly to specify the structure is that the structure of the LLM will change with the function's definition. No need to change the code at several places!


## From a dynamic JSON schema builder - GenSON

Outlines integrates [GenSON](https://github.com/wolverdude/GenSON) builders so that JSON schemas can be declared dynamically. They can be used as follows:

```python
from genson import SchemaBuilder

from outlines import models
from outlines import generate

builder = SchemaBuilder()
builder.add_schema({"type": "object", "properties": {}})
builder.add_object({"name": "Toto", "age": 5})

model = models.transformers(
"HuggingFaceTB/SmolLM2-135M",
device="auto",
)
generator = generate.json(model, builder)

res = generator("Return a json of a young boy")
print(res)
# {"name": "Ben", "age": 10}
```

Any time you update the schema through the builder, you need to redefine the Outlines generator to include these changes. Continuing the previous example:

```python
from genson import SchemaBuilder

from outlines import models
from outlines import generate

builder = SchemaBuilder()
builder.add_schema({"type": "object", "properties": {}})
builder.add_object({"name": "Toto", "age": 5})

model = models.transformers(
"HuggingFaceTB/SmolLM2-135M",
device="auto",
)
generator = generate.json(model, builder)

res = generator("Return a json of a young boy")
print(res)
# {"name": "Ben", "age": 10}

builder.add_object({"hobby": "sports"})
generator = generate.json(model, builder)

res = generator("Return a json of a young boy whose hobby is coding")
print(res)
# {"name": "Ben", "age": 10, "hobby": "coding"}
```

!!! Note

Beware of [GenSON](https://github.com/wolverdude/GenSON)'s behavior when schemas are amended dynamically through its builder. Here is an example of how you could lose `required` information and generate JSON with missing fields:

```python
builder = SchemaBuilder()
builder.add_schema({"type": "object", "properties": {}})
builder.add_object({"name": "Toto", "age": 5})

print(builder.to_schema())
# {'$schema': 'http://json-schema.org/schema#', 'type': 'object', 'properties': {'name': {'type': 'string'}, 'age': {'type': 'integer'}}, 'required': ['age', 'name']}

builder.add_object({"hobby": "sport"})
print(builder.to_schema())
# {'$schema': 'http://json-schema.org/schema#', 'type': 'object', 'properties': {'name': {'type': 'string'}, 'age': {'type': 'integer'}, 'hobby': {'type': 'string'}}}
```
6 changes: 6 additions & 0 deletions outlines/generate/json.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from functools import singledispatch
from typing import Callable, Optional, Union

from genson import SchemaBuilder
from outlines_core.fsm.json_schema import build_regex_from_schema
from pydantic import BaseModel

Expand Down Expand Up @@ -55,6 +56,11 @@ def json(
regex_str = build_regex_from_schema(schema, whitespace_pattern)
generator = regex(model, regex_str, sampler)
generator.format_sequence = lambda x: pyjson.loads(x)
elif isinstance(schema_object, SchemaBuilder):
schema = schema_object.to_json()
regex_str = build_regex_from_schema(schema, whitespace_pattern)
generator = regex(model, regex_str, sampler)
generator.format_sequence = lambda x: pyjson.loads(x)
elif callable(schema_object):
schema = pyjson.dumps(get_schema_from_signature(schema_object))
regex_str = build_regex_from_schema(schema, whitespace_pattern)
Expand Down
4 changes: 3 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ dependencies = [
"airportsdata",
"torch",
"outlines_core==0.1.26",
"genson",
]
dynamic = ["version"]

Expand Down Expand Up @@ -71,7 +72,7 @@ test = [
"transformers",
"pillow",
"exllamav2",
"jax"
"jax",
]
serve = [
"vllm>=0.3.0",
Expand Down Expand Up @@ -147,6 +148,7 @@ module = [
"pycountry.*",
"airportsdata.*",
"outlines_core.*",
"genson",
]
ignore_missing_imports = true

Expand Down
16 changes: 16 additions & 0 deletions tests/generate/test_generate.py
Original file line number Diff line number Diff line change
Expand Up @@ -276,6 +276,22 @@ def test_generate_json(request, model_fixture, sample_schema):
generator(**get_inputs(model_fixture), max_tokens=100)


def test_integrate_genson_generate_json(request):
    """Check that ``generate.json`` accepts a GenSON ``SchemaBuilder``.

    The builder is seeded with an empty object schema plus one sample
    object, then passed directly to ``generate.json``. The generated
    result must contain the keys inferred from the sample object.
    """
    from genson import SchemaBuilder

    schema_builder = SchemaBuilder()
    schema_builder.add_schema({"type": "object", "properties": {}})
    schema_builder.add_object({"name": "Toto", "age": 5})

    opt_model = request.getfixturevalue("model_transformers_opt125m")
    json_generator = generate.json(opt_model, schema_builder)

    output = json_generator("Return a json of a young boy")

    # Both sample keys must be present in the generated object.
    for expected_key in ("name", "age"):
        assert expected_key in output


@pytest.mark.parametrize("model_fixture", ALL_MODEL_FIXTURES)
@pytest.mark.parametrize("sample_choices", ALL_SAMPLE_CHOICES_FIXTURES)
def test_generate_choice(request, model_fixture, sample_choices):
Expand Down

0 comments on commit 437ffe4

Please sign in to comment.