diff --git a/outlines/fsm/json_schema.py b/outlines/fsm/json_schema.py index 98d2de59c..f4ca55f7e 100644 --- a/outlines/fsm/json_schema.py +++ b/outlines/fsm/json_schema.py @@ -32,12 +32,15 @@ DATE = r'"(?:\d{4})-(?:0[1-9]|1[0-2])-(?:0[1-9]|[1-2][0-9]|3[0-1])"' TIME = r'"(2[0-3]|[01][0-9]):([0-5][0-9]):([0-5][0-9])(\\.[0-9]+)?(Z)?"' UUID = r'"[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}"' +# URI only supports a subset of RFC 3986 (https://datatracker.ietf.org/doc/html/rfc3986): URLs with an optional http:// or https:// scheme and optional auth details +URI = r'"(https?:\/\/)?([-a-zA-Z0-9:%._\+~#=]+@)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}([-a-zA-Z0-9@:%_\+.~#?&//=]*)"' format_to_regex = { "uuid": UUID, "date-time": DATE_TIME, "date": DATE, "time": TIME, + "uri": URI, } @@ -350,14 +353,8 @@ def to_regex( return rf'("{pattern}")' elif "format" in instance: format = instance["format"] - if format == "date-time": - return format_to_regex["date-time"] - elif format == "uuid": - return format_to_regex["uuid"] - elif format == "date": - return format_to_regex["date"] - elif format == "time": - return format_to_regex["time"] + if format in format_to_regex: + return format_to_regex[format] else: raise NotImplementedError( f"Format {format} is not supported by Outlines" diff --git a/tests/fsm/test_json_schema.py b/tests/fsm/test_json_schema.py index 7565ff642..c3f481642 100644 --- a/tests/fsm/test_json_schema.py +++ b/tests/fsm/test_json_schema.py @@ -16,6 +16,7 @@ STRING, STRING_INNER, TIME, + URI, UUID, WHITESPACE, build_regex_from_schema, @@ -746,6 +747,51 @@ def test_match_number(pattern, does_match): ("{ }", True), ], ), + # URI + ( + {"title": "Foo", "type": "string", "format": "uri"}, + URI, + [ + ('"https://www.example.com"', True), + ('"http://example.com"', True), + ('"https://subdomain.example.co.uk/path?query=value#fragment"', True), + ('"https://example.com:8080"', True), # With port + ('"http://123.45.67.89"', True), # IP address + ( + 
'"https://example.com/path/to/resource.html"', + True, + ), # With file extension + ('"https://user:pass@example.com"', True), # With basic auth + ( + '"https://example.com/?q=test&r=123"', + True, + ), # With multiple query parameters + ('"https://example.co.uk"', True), # Different TLD + ('"https://xn--bcher-kva.example"', True), # Punycode domain + ('"https://example.com/path%20with%20spaces"', True), # Encoded spaces + ('"ftp://example.com"', False), # FTP protocol + ('"not a uri"', False), + ('"https://"', False), # Incomplete URI + ('""', False), # Empty string + ("https://www.example.com", False), # Missing quotes + ('"http:/example.com"', False), # Missing slash after protocol + ('"https://example.com:abc"', False), # Invalid port + ('"https://exa mple.com"', False), # Space in domain + ('"https://.example.com"', False), # Domain starting with dot + ('"https://example..com"', False), # Consecutive dots in domain + ( + '"https://exam ple.com/path"', + False, + ), # Space in domain (but valid path) + ('"https://example.com/path "', False), # Space at end of path + ('"https://example.com#frag ment"', False), # Space in fragment + ('"https://example.com/?q=va lue"', False), # Space in query + ('"https://exa\nmple.com"', False), # Newline in domain + ('"https://example.com/pat\nh"', False), # Newline in path + ('"https://example.com#frag\nment"', False), # Newline in fragment + ('"https://example.com/?q=val\nue"', False), # Newline in query + ], + ), ], ) def test_match(schema, regex, examples):