Skip to content

Commit

Permalink
Lexer: Properly support Unicode 15.1.0
Browse files Browse the repository at this point in the history
The previous lexer implementation in `Language.Rust.Parser.Lexer` was broken
for Unicode characters with sufficiently large codepoints, as the previous
implementation incorrectly attempted to port UTF-16–encoded codepoints over to
`alex`, which is UTF-8–encoded. Rather than try to fix the previous
implementation (which was based on old `rustc` code that is no longer used),
this ports the lexer to a new implementation that is based on the Rust
`unicode-xid` crate (which is how modern versions of `rustc` lex Unicode
characters). Specifically:

* This adapts `unicode-xid`'s lexer generation script to generate an
  `alex`-based lexer instead of a Rust-based one.

* The new lexer is generated to support codepoints from Unicode 15.1.0.
  (It is unclear which exact Unicode version the previous lexer targeted, but
  given that it was last updated in 2016, it was likely quite an old version.)

* I have verified that the new lexer can lex exotic Unicode characters such as
  `𝑂` and `𐌝` by adding them as regression tests.

Fixes #3.
  • Loading branch information
RyanGlScott committed Aug 28, 2024
1 parent dfcbae3 commit 34ed94d
Show file tree
Hide file tree
Showing 4 changed files with 1,196 additions and 411 deletions.
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -26,3 +26,7 @@ sample-sources/
!sample-sources/statement-expressions.rs
!sample-sources/statements.rs
!sample-sources/types.rs

# Unicode-related autogenerated files
DerivedCoreProperties.txt
UnicodeLexer.x
167 changes: 167 additions & 0 deletions scripts/unicode.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,167 @@
#!/usr/bin/env python
#
# Copyright 2011-2015 The Rust Project Developers
# 2024 Galois Inc.
#
# This script was originally created by the Rust Project Developers as part of
# the `unicode-xid` crate:
#
# https://github.com/unicode-rs/unicode-xid/blob/b3a2718b062da229c0a50d12281de0e5d8e8cff6/scripts/unicode.py
#
# See the COPYRIGHT file in the `unicode-xid` crate:
#
# https://github.com/unicode-rs/unicode-xid/blob/b3a2718b062da229c0a50d12281de0e5d8e8cff6/COPYRIGHT
#
# Galois Inc. has modified the script to generate an `alex`-based lexer instead
# of a Rust-based lexer.
#
# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
# http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
# <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
# option. This file may not be copied, modified, or distributed
# except according to those terms.

import fileinput, re, os, sys

# Unicode version that the generated lexer will support, as a
# (major, minor, micro) tuple. Also used to build unicode.org download URLs.
unicode_version = (15, 1, 0)

# Comment block emitted at the very top of the generated UnicodeLexer.x,
# explaining how to regenerate the file.
preamble = '''-- NOTE: The following code was generated by "scripts/unicode.py", do not edit directly
--
-- If you need to update this code, perform the following steps:
--
-- 1. (Optional) Update the value of `unicode_version` in "scripts/unicode.py".
-- 2. Run the "scripts/unicode.py" script.
-- 3. Copy the code (including the comments) in the autogenerated `UnicodeLexer.x` file.
-- 4. Replace the existing autogenerated code here.
'''

# Comment block emitted at the very end of the generated file.
postamble = '''-- End of code generated by "scripts/unicode.py".
'''

def unicode_url(f):
    """Return the unicode.org download URL for UCD data file *f* at the
    pinned `unicode_version`."""
    major, minor, micro = unicode_version
    return "http://www.unicode.org/Public/%s.%s.%s/ucd/%s" % (major, minor, micro, f)

def fetch(f):
    """Download UCD data file *f* into the current directory unless a local
    copy already exists; exit with status 1 if it still cannot be found."""
    local = os.path.basename(f)
    if not os.path.exists(local):
        # Shells out to curl; the URL is built from trusted constants.
        os.system("curl -O %s" % unicode_url(f))

    # If the download failed (no curl, no network), bail out loudly.
    if not os.path.exists(local):
        sys.stderr.write("cannot load %s" % f)
        exit(1)

def group_cat(cat):
    """Collapse a collection of codepoints into a sorted list of inclusive
    (lo, hi) ranges, merging consecutive codepoints into a single range."""
    points = sorted(set(cat))
    ranges = []
    start = end = points[0]
    for cp in points[1:]:
        # Sorted + deduplicated input guarantees strict ascent.
        assert cp > end, \
            "cur_end: %s, letter: %s" % (hex(end), hex(cp))
        if cp == end + 1:
            # Extends the current run.
            end = cp
        else:
            # Gap: close the current range and start a new one.
            ranges.append((start, end))
            start = end = cp
    ranges.append((start, end))
    return ranges

def ungroup_cat(cat):
    """Expand a list of inclusive (lo, hi) ranges back into a flat list of
    individual codepoints.

    Inverse of `group_cat` (up to ordering/deduplication of the input).
    """
    cat_out = []
    for (lo, hi) in cat:
        # range() replaces the original manual `while lo <= hi` counter loop;
        # +1 because the (lo, hi) ranges are inclusive on both ends.
        cat_out.extend(range(lo, hi + 1))
    return cat_out

def format_table_content(f, content, indent):
    """Write the '|'-separated alternatives in *content* to *f* as a multi-line
    alex definition body: the first alternative is prefixed '= ', the rest
    '| ', each line indented by *indent* spaces, with a trailing blank line."""
    pad = " " * indent
    pieces = []
    for i, chunk in enumerate(content.split("|")):
        prefix = "= " if i == 0 else "| "
        pieces.append(pad + prefix + chunk + "\n")
    f.write("".join(pieces) + '\n')

def load_properties(f, interestingprops):
    """Fetch and parse UCD property file *f* (e.g. DerivedCoreProperties.txt).

    Returns a dict mapping each property name to a list of inclusive
    (lo, hi) codepoint ranges, with adjacent ranges coalesced. When
    *interestingprops* is non-empty, only those property names are kept.
    """
    fetch(f)
    props = {}
    # Raw strings so that \w and \. reach the regex engine intact — the
    # original non-raw "\w" is an invalid string escape that warns (and will
    # eventually error) on modern Python.
    re1 = re.compile(r"^ *([0-9A-F]+) *; *(\w+)")                  # single codepoint
    re2 = re.compile(r"^ *([0-9A-F]+)\.\.([0-9A-F]+) *; *(\w+)")   # codepoint range

    for line in fileinput.input(os.path.basename(f)):
        prop = None
        d_lo = 0
        d_hi = 0
        m = re1.match(line)
        if m:
            # Single codepoint: lo == hi.
            d_lo = m.group(1)
            d_hi = m.group(1)
            prop = m.group(2)
        else:
            m = re2.match(line)
            if m:
                d_lo = m.group(1)
                d_hi = m.group(2)
                prop = m.group(3)
            else:
                # Comment / blank / unrecognized line.
                continue
        if interestingprops and prop not in interestingprops:
            continue
        d_lo = int(d_lo, 16)
        d_hi = int(d_hi, 16)
        props.setdefault(prop, []).append((d_lo, d_hi))

    # Coalesce adjacent ranges where possible.
    for prop in props:
        props[prop] = group_cat(ungroup_cat(props[prop]))

    return props

def escape_char(c):
    """Render codepoint *c* as an alex hexadecimal escape, zero-padded to at
    least four digits (e.g. 0x41 -> '\\x0041')."""
    return "\\x{:04x}".format(c)

def emit_table(f, name, t_data):
    """Emit an alex character-set macro named @*name* built from the inclusive
    (lo, hi) codepoint ranges in *t_data*."""
    f.write("@%s\n" % name)
    alternatives = []
    for (lo, hi) in t_data:
        if lo == hi:
            # Degenerate range: a single codepoint.
            alternatives.append("%s" % escape_char(lo))
        else:
            alternatives.append("[%s-%s]" % (escape_char(lo), escape_char(hi)))
    format_table_content(f, "|".join(alternatives), 2)

def emit_property_module(f, mod, tbl, emit):
    """Emit one alex table per property name in *emit*, reading the ranges
    from *tbl*. (*mod* is currently unused; kept for interface stability.)"""
    for category in emit:
        emit_table(f, category, tbl[category])

if __name__ == "__main__":
    r = "UnicodeLexer.x"
    # Start from a clean slate so stale output is never left behind.
    if os.path.exists(r):
        os.remove(r)
    with open(r, "w") as rf:
        # write the file's preamble
        rf.write(preamble)

        # download and parse all the data
        # (Fixed typo: was "DerviedCoreProperties.txt", which put a broken
        # URL into the generated file's header comment.)
        rf.write('''
-- Based on Unicode %s.%s.%s, using the following Unicode table:
-- %s
''' % (unicode_version + (unicode_url("DerivedCoreProperties.txt"),)))

        # Extract only the two properties Rust identifier lexing needs.
        want_derived = ["XID_Start", "XID_Continue"]
        derived = load_properties("DerivedCoreProperties.txt", want_derived)
        emit_property_module(rf, "derived_property", derived, want_derived)

        # write the file's postamble
        rf.write(postamble)
Loading

0 comments on commit 34ed94d

Please sign in to comment.