Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Lexer: Properly support Unicode 15.1.0 #4

Merged
merged 2 commits into from
Sep 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -26,3 +26,7 @@ sample-sources/
!sample-sources/statement-expressions.rs
!sample-sources/statements.rs
!sample-sources/types.rs

# Unicode-related autogenerated files
DerivedCoreProperties.txt
UnicodeLexer.x
167 changes: 167 additions & 0 deletions scripts/unicode.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,167 @@
#!/usr/bin/env python
#
# Copyright 2011-2015 The Rust Project Developers
# 2024 Galois Inc.
#
# This script was originally created by the Rust Project Developers as part of
# the `unicode-xid` crate:
#
# https://github.com/unicode-rs/unicode-xid/blob/b3a2718b062da229c0a50d12281de0e5d8e8cff6/scripts/unicode.py
#
# See the COPYRIGHT file in the `unicode-xid` crate:
#
# https://github.com/unicode-rs/unicode-xid/blob/b3a2718b062da229c0a50d12281de0e5d8e8cff6/COPYRIGHT
#
# Galois Inc. has modified the script to generate an `alex`-based lexer instead
# of a Rust-based lexer.
#
# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
# http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
# <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
# option. This file may not be copied, modified, or distributed
# except according to those terms.

import fileinput, re, os, sys

# Unicode version as (major, minor, patch). It is interpolated into the
# download URL built by `unicode_url` and into the generated file's header.
unicode_version = (15, 1, 0)

# Comment block written at the very start of the generated `UnicodeLexer.x`.
preamble = '''-- NOTE: The following code was generated by "scripts/unicode.py", do not edit directly
--
-- If you need to update this code, perform the following steps:
--
-- 1. (Optional) Update the value of `unicode_version` in "scripts/unicode.py".
-- 2. Run the "scripts/unicode.py" script.
-- 3. Copy the code (including the comments) in the autogenerated `UnicodeLexer.x` file.
-- 4. Replace the existing autogenerated code here.
'''

# Comment line written at the very end of the generated `UnicodeLexer.x`.
postamble = '''-- End of code generated by "scripts/unicode.py".
'''

def unicode_url(f):
    """Return the download URL of UCD data file `f` for `unicode_version`."""
    template = "http://www.unicode.org/Public/%s.%s.%s/ucd/%s"
    # The three version components are followed by the file name.
    fields = unicode_version + (f,)
    return template % fields

def fetch(f):
    """Ensure the Unicode data file `f` is present in the current directory.

    If no file named like the basename of `f` exists yet, it is downloaded
    with `curl` from the URL given by `unicode_url(f)`. If the file is still
    missing afterwards, an error is written to stderr and the script exits
    with status 1.
    """
    local_name = os.path.basename(f)
    if os.path.exists(local_name):
        return

    # Local import: only needed when a download actually happens.
    import subprocess

    try:
        # An argument list (no shell) keeps the URL from being interpreted by
        # a shell. -f stops curl from saving an HTTP error page as if it were
        # the data file, -L follows redirects, -O keeps the remote file name.
        subprocess.call(["curl", "-f", "-L", "-O", unicode_url(f)])
    except OSError:
        # curl could not be started at all; the check below reports it.
        pass

    if not os.path.exists(local_name):
        sys.stderr.write("cannot load %s\n" % f)
        sys.exit(1)

def group_cat(cat):
    """Collapse code points into sorted, inclusive `(lo, hi)` ranges.

    `cat` is an iterable of integer code points, in any order and possibly
    containing duplicates. Runs of consecutive code points are merged into a
    single range. An empty `cat` yields an empty list.
    """
    # sorted(set(...)) yields strictly increasing values, so every later
    # element is greater than the end of the current run.
    letters = sorted(set(cat))
    if not letters:
        # No first element to seed a run with.
        return []

    cat_out = []
    cur_start = cur_end = letters[0]
    for letter in letters[1:]:
        if letter == cur_end + 1:
            # Extends the current run by one.
            cur_end = letter
        else:
            # Gap: close the current run and start a new one.
            cat_out.append((cur_start, cur_end))
            cur_start = cur_end = letter
    cat_out.append((cur_start, cur_end))
    return cat_out

def ungroup_cat(cat):
    """Expand inclusive `(lo, hi)` ranges into a flat list of code points.

    Ranges are expanded in the order given; a range whose `lo` exceeds its
    `hi` contributes nothing.
    """
    return [
        point
        for (first, last) in cat
        for point in range(first, last + 1)
    ]

def format_table_content(f, content, indent):
    """Write `content` to `f` with one `|`-separated alternative per line.

    Each alternative is indented by `indent` spaces. The first one is
    introduced with "= " and every following one with "| ". A blank line is
    written after the last alternative. Everything goes out in one `write`.
    """
    pad = " " * indent
    chunks = content.split("|")
    rows = [pad + "= " + chunks[0] + "\n"]
    rows.extend(pad + "| " + chunk + "\n" for chunk in chunks[1:])
    f.write("".join(rows) + "\n")

def load_properties(f, interestingprops):
    """Parse a Unicode property data file into per-property code point ranges.

    The file is made available via `fetch(f)` and then read from the current
    directory under the basename of `f`. A data line has the form
    `XXXX ; Prop` or `XXXX..YYYY ; Prop` with uppercase hex code points;
    lines matching neither form are skipped.

    If `interestingprops` is non-empty, only properties listed in it are kept.

    Returns a dict mapping each property name to a list of inclusive
    `(lo, hi)` integer ranges, sorted and with adjacent ranges merged.
    """
    # Local import: `io.open` accepts `encoding` on every Python version.
    import io

    fetch(f)
    props = {}
    # Raw strings: `\w` and `\.` are regex escapes, not string escapes.
    single_re = re.compile(r"^ *([0-9A-F]+) *; *(\w+)")
    range_re = re.compile(r"^ *([0-9A-F]+)\.\.([0-9A-F]+) *; *(\w+)")

    # `with` closes the data file even if parsing fails part-way. The
    # encoding is explicit so reading does not depend on the locale.
    with io.open(os.path.basename(f), encoding="utf-8") as data:
        for line in data:
            m = single_re.match(line)
            if m:
                # A single code point is a range of length one.
                d_lo = d_hi = m.group(1)
                prop = m.group(2)
            else:
                m = range_re.match(line)
                if not m:
                    continue
                d_lo, d_hi, prop = m.groups()
            if interestingprops and prop not in interestingprops:
                continue
            props.setdefault(prop, []).append((int(d_lo, 16), int(d_hi, 16)))

    # optimize if possible: expanding and regrouping merges adjacent ranges
    for prop in props:
        props[prop] = group_cat(ungroup_cat(props[prop]))

    return props

def escape_char(c):
    """Return code point `c` as a backslash-x escape with lowercase hex digits.

    The hex digits are zero-padded to at least four places.
    """
    template = "\\x%04x"
    return template % c

def emit_table(f, name, t_data):
    """Write an `@name` header line and the table of ranges to `f`.

    Each entry of `t_data` is indexed as `(lo, hi)`. An entry with equal
    bounds is emitted as a single escaped character, any other as a
    bracketed `[lo-hi]` set. The entries are joined with `|` and laid out by
    `format_table_content` with an indent of 2.
    """
    f.write("@%s\n" % name)
    alternatives = []
    for entry in t_data:
        lo, hi = entry[0], entry[1]
        if lo == hi:
            alternatives.append("%s" % escape_char(lo))
        else:
            alternatives.append("[%s-%s]" % (escape_char(lo), escape_char(hi)))
    format_table_content(f, "|".join(alternatives), 2)

def emit_property_module(f, mod, tbl, emit):
    """Emit one table to `f` per property named in `emit`, in that order.

    `tbl` maps property names to their range lists. `mod` is accepted but
    not used by this function.
    """
    for prop_name in emit:
        ranges = tbl[prop_name]
        emit_table(f, prop_name, ranges)

if __name__ == "__main__":
    r = "UnicodeLexer.x"
    # One name for the data file, so the URL recorded in the generated header
    # always names the file that is actually downloaded and parsed.
    table_file = "DerivedCoreProperties.txt"
    if os.path.exists(r):
        os.remove(r)
    with open(r, "w") as rf:
        # write the file's preamble
        rf.write(preamble)

        # record which Unicode version and table the output is based on
        rf.write('''
-- Based on Unicode %s.%s.%s, using the following Unicode table:
-- %s

''' % (unicode_version + (unicode_url(table_file),)))

        # download and parse all the data
        want_derived = ["XID_Start", "XID_Continue"]
        derived = load_properties(table_file, want_derived)
        emit_property_module(rf, "derived_property", derived, want_derived)

        # write the file's postamble
        rf.write(postamble)
Loading
Loading