Skip to content

Commit

Permalink
Lexer: Properly support Unicode 15.1.0
Browse files Browse the repository at this point in the history
The previous lexer implementation in `Language.Rust.Parser.Lexer` was broken
for Unicode characters with sufficiently large codepoints, as the previous
implementation incorrectly attempted to port UTF-16–encoded codepoints over to
`alex`, which is UTF-8–encoded. Rather than try to fix the previous
implementation (which was based on old `rustc` code that is no longer used),
this ports the lexer to a new implementation that is based on the Rust
`unicode-xid` crate (which is how modern versions of `rustc` lex Unicode
characters). Specifically:

* This adapts `unicode-xid`'s lexer generation script to generate an
  `alex`-based lexer instead of a Rust-based one.

* The new lexer is generated to support codepoints from Unicode 15.1.0.
  (It is unclear which exact Unicode version the previous lexer targeted, but
  given that it was last updated in 2016, it was likely quite an old version.)

* I have verified that the new lexer can lex exotic Unicode characters such as
  `𝑂` and `𐌝` by adding them as regression tests.

Fixes #3.
  • Loading branch information
RyanGlScott committed Aug 28, 2024
1 parent dfcbae3 commit 34ed94d
Show file tree
Hide file tree
Showing 4 changed files with 1,196 additions and 411 deletions.
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -26,3 +26,7 @@ sample-sources/
!sample-sources/statement-expressions.rs
!sample-sources/statements.rs
!sample-sources/types.rs

# Unicode-related autogenerated files
DerivedCoreProperties.txt
UnicodeLexer.x
167 changes: 167 additions & 0 deletions scripts/unicode.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,167 @@
#!/usr/bin/env python
#
# Copyright 2011-2015 The Rust Project Developers
# 2024 Galois Inc.
#
# This script was originally created by the Rust Project Developers as part of
# the `unicode-xid` crate:
#
# https://github.com/unicode-rs/unicode-xid/blob/b3a2718b062da229c0a50d12281de0e5d8e8cff6/scripts/unicode.py
#
# See the COPYRIGHT file in the `unicode-xid` crate:
#
# https://github.com/unicode-rs/unicode-xid/blob/b3a2718b062da229c0a50d12281de0e5d8e8cff6/COPYRIGHT
#
# Galois Inc. has modified the script to generate an `alex`-based lexer instead
# of a Rust-based lexer.
#
# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
# http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
# <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
# option. This file may not be copied, modified, or distributed
# except according to those terms.

import fileinput, re, os, sys

# Unicode version that the generated lexer will support, as a
# (major, minor, micro) tuple. Also used to build unicode.org download URLs.
unicode_version = (15, 1, 0)

# Comment block emitted at the very top of the generated UnicodeLexer.x,
# explaining how to regenerate the file.
preamble = '''-- NOTE: The following code was generated by "scripts/unicode.py", do not edit directly
--
-- If you need to update this code, perform the following steps:
--
-- 1. (Optional) Update the value of `unicode_version` in "scripts/unicode.py".
-- 2. Run the "scripts/unicode.py" script.
-- 3. Copy the code (including the comments) in the autogenerated `UnicodeLexer.x` file.
-- 4. Replace the existing autogenerated code here.
'''

# Comment block emitted at the very end of the generated file.
postamble = '''-- End of code generated by "scripts/unicode.py".
'''

def unicode_url(f):
    """Return the unicode.org download URL for UCD data file *f* at the
    pinned `unicode_version`."""
    major, minor, micro = unicode_version
    return "http://www.unicode.org/Public/%s.%s.%s/ucd/%s" % (major, minor, micro, f)

def fetch(f):
    """Download UCD data file *f* into the current directory unless a local
    copy already exists; exit with status 1 if it still cannot be found."""
    local = os.path.basename(f)
    if not os.path.exists(local):
        # Shells out to curl; the URL is built from trusted constants.
        os.system("curl -O %s" % unicode_url(f))

    # If the download failed (no curl, no network), bail out loudly.
    if not os.path.exists(local):
        sys.stderr.write("cannot load %s" % f)
        exit(1)

def group_cat(cat):
    """Collapse a collection of codepoints into a sorted list of inclusive
    (lo, hi) ranges, merging consecutive codepoints into a single range."""
    points = sorted(set(cat))
    ranges = []
    start = end = points[0]
    for cp in points[1:]:
        # Sorted + deduplicated input guarantees strict ascent.
        assert cp > end, \
            "cur_end: %s, letter: %s" % (hex(end), hex(cp))
        if cp == end + 1:
            # Extends the current run.
            end = cp
        else:
            # Gap: close the current range and start a new one.
            ranges.append((start, end))
            start = end = cp
    ranges.append((start, end))
    return ranges

def ungroup_cat(cat):
    """Expand a list of inclusive (lo, hi) ranges back into a flat list of
    individual codepoints.

    Inverse of `group_cat` (up to ordering/deduplication of the input).
    """
    cat_out = []
    for (lo, hi) in cat:
        # range() replaces the original manual `while lo <= hi` counter loop;
        # +1 because the (lo, hi) ranges are inclusive on both ends.
        cat_out.extend(range(lo, hi + 1))
    return cat_out

def format_table_content(f, content, indent):
    """Write the '|'-separated alternatives in *content* to *f* as a multi-line
    alex definition body: the first alternative is prefixed '= ', the rest
    '| ', each line indented by *indent* spaces, with a trailing blank line."""
    pad = " " * indent
    pieces = []
    for i, chunk in enumerate(content.split("|")):
        prefix = "= " if i == 0 else "| "
        pieces.append(pad + prefix + chunk + "\n")
    f.write("".join(pieces) + '\n')

def load_properties(f, interestingprops):
    """Fetch and parse UCD property file *f* (e.g. DerivedCoreProperties.txt).

    Returns a dict mapping each property name to a list of inclusive
    (lo, hi) codepoint ranges, with adjacent ranges coalesced. When
    *interestingprops* is non-empty, only those property names are kept.
    """
    fetch(f)
    props = {}
    # Raw strings so that \w and \. reach the regex engine intact — the
    # original non-raw "\w" is an invalid string escape that warns (and will
    # eventually error) on modern Python.
    re1 = re.compile(r"^ *([0-9A-F]+) *; *(\w+)")                  # single codepoint
    re2 = re.compile(r"^ *([0-9A-F]+)\.\.([0-9A-F]+) *; *(\w+)")   # codepoint range

    for line in fileinput.input(os.path.basename(f)):
        prop = None
        d_lo = 0
        d_hi = 0
        m = re1.match(line)
        if m:
            # Single codepoint: lo == hi.
            d_lo = m.group(1)
            d_hi = m.group(1)
            prop = m.group(2)
        else:
            m = re2.match(line)
            if m:
                d_lo = m.group(1)
                d_hi = m.group(2)
                prop = m.group(3)
            else:
                # Comment / blank / unrecognized line.
                continue
        if interestingprops and prop not in interestingprops:
            continue
        d_lo = int(d_lo, 16)
        d_hi = int(d_hi, 16)
        props.setdefault(prop, []).append((d_lo, d_hi))

    # Coalesce adjacent ranges where possible.
    for prop in props:
        props[prop] = group_cat(ungroup_cat(props[prop]))

    return props

def escape_char(c):
    """Render codepoint *c* as an alex hexadecimal escape, zero-padded to at
    least four digits (e.g. 0x41 -> '\\x0041')."""
    return "\\x{:04x}".format(c)

def emit_table(f, name, t_data):
    """Emit an alex character-set macro named @*name* built from the inclusive
    (lo, hi) codepoint ranges in *t_data*."""
    f.write("@%s\n" % name)
    alternatives = []
    for (lo, hi) in t_data:
        if lo == hi:
            # Degenerate range: a single codepoint.
            alternatives.append("%s" % escape_char(lo))
        else:
            alternatives.append("[%s-%s]" % (escape_char(lo), escape_char(hi)))
    format_table_content(f, "|".join(alternatives), 2)

def emit_property_module(f, mod, tbl, emit):
    """Emit one alex table per property name in *emit*, reading the ranges
    from *tbl*. (*mod* is currently unused; kept for interface stability.)"""
    for category in emit:
        emit_table(f, category, tbl[category])

if __name__ == "__main__":
    r = "UnicodeLexer.x"
    # Start from a clean slate so stale output is never left behind.
    if os.path.exists(r):
        os.remove(r)
    with open(r, "w") as rf:
        # write the file's preamble
        rf.write(preamble)

        # download and parse all the data
        # (Fixed typo: was "DerviedCoreProperties.txt", which put a broken
        # URL into the generated file's header comment.)
        rf.write('''
-- Based on Unicode %s.%s.%s, using the following Unicode table:
-- %s
''' % (unicode_version + (unicode_url("DerivedCoreProperties.txt"),)))

        # Extract only the two properties Rust identifier lexing needs.
        want_derived = ["XID_Start", "XID_Continue"]
        derived = load_properties("DerivedCoreProperties.txt", want_derived)
        emit_property_module(rf, "derived_property", derived, want_derived)

        # write the file's postamble
        rf.write(postamble)
Loading

0 comments on commit 34ed94d

Please sign in to comment.