forked from harpocrates/language-rust
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #4 from GaloisInc/T3-fix-unicode-lexing
Lexer: Properly support Unicode 15.1.0
- Loading branch information
Showing
4 changed files
with
1,234 additions
and
449 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,167 @@ | ||
#!/usr/bin/env python | ||
# | ||
# Copyright 2011-2015 The Rust Project Developers | ||
# 2024 Galois Inc. | ||
# | ||
# This script was originally created by the Rust Project Developers as part of | ||
# the `unicode-xid` crate: | ||
# | ||
# https://github.com/unicode-rs/unicode-xid/blob/b3a2718b062da229c0a50d12281de0e5d8e8cff6/scripts/unicode.py | ||
# | ||
# See the COPYRIGHT file in the `unicode-xid` crate: | ||
# | ||
# https://github.com/unicode-rs/unicode-xid/blob/b3a2718b062da229c0a50d12281de0e5d8e8cff6/COPYRIGHT | ||
# | ||
# Galois Inc. has modified the script to generate an `alex`-based lexer instead | ||
# of a Rust-based lexer. | ||
# | ||
# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or | ||
# http://www.apache.org/licenses/LICENSE-2.0> or the MIT license | ||
# <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your | ||
# option. This file may not be copied, modified, or distributed | ||
# except according to those terms. | ||
|
||
import fileinput, re, os, sys | ||
|
||
# Unicode version as a (major, minor, update) tuple; it is interpolated into | ||
# the download URL built by `unicode_url` and into the generated header. | ||
unicode_version = (15, 1, 0) | ||
|
||
# Text written at the very start of the generated `UnicodeLexer.x` file. | ||
preamble = '''-- NOTE: The following code was generated by "scripts/unicode.py", do not edit directly | ||
-- | ||
-- If you need to update this code, perform the following steps: | ||
-- | ||
-- 1. (Optional) Update the value of `unicode_version` in "scripts/unicode.py". | ||
-- 2. Run the "scripts/unicode.py" script. | ||
-- 3. Copy the code (including the comments) in the autogenerated `UnicodeLexer.x` file. | ||
-- 4. Replace the existing autogenerated code here. | ||
''' | ||
|
||
# Text written at the very end of the generated `UnicodeLexer.x` file. | ||
postamble = '''-- End of code generated by "scripts/unicode.py". | ||
''' | ||
|
||
def unicode_url(f):
    """Return the unicode.org URL of UCD data file *f*.

    The version path component comes from the module-level
    `unicode_version` tuple.
    """
    url_fields = unicode_version + (f,)
    return "http://www.unicode.org/Public/%s.%s.%s/ucd/%s" % url_fields
|
||
def fetch(f):
    """Make sure the UCD data file *f* exists in the current directory.

    The file is looked up by its base name. When it is missing it is
    downloaded with ``curl`` from ``unicode_url(f)``. If the file still does
    not exist afterwards, an error is written to stderr and the process
    exits with status 1.
    """
    import subprocess

    local_name = os.path.basename(f)
    if not os.path.exists(local_name):
        # An argument list (no shell) keeps the URL from being re-parsed by
        # a shell. --fail makes curl treat HTTP errors as failures instead of
        # saving the server's error page, and --location follows redirects.
        command = ["curl", "--fail", "--location", "-O", unicode_url(f)]
        try:
            subprocess.call(command)
        except OSError as err:
            # curl itself could not be started (e.g. it is not installed).
            sys.stderr.write("cannot run curl: %s\n" % err)

    if not os.path.exists(local_name):
        sys.stderr.write("cannot load %s\n" % f)
        # sys.exit is always available; the bare `exit` builtin is only
        # installed by the `site` module.
        sys.exit(1)
|
||
def group_cat(cat):
    """Collapse code points into sorted, inclusive ``(start, end)`` ranges.

    *cat* is an iterable of integers; duplicates and ordering are ignored.
    Consecutive values are merged into a single range. An empty input
    yields an empty list.
    """
    letters = sorted(set(cat))
    if not letters:
        return []

    cat_out = []
    cur_start = cur_end = letters[0]
    # `letters` is strictly increasing (sorted and de-duplicated), so each
    # value either extends the current range by one or starts a new range.
    for letter in letters[1:]:
        if letter == cur_end + 1:
            cur_end = letter
        else:
            cat_out.append((cur_start, cur_end))
            cur_start = cur_end = letter
    cat_out.append((cur_start, cur_end))
    return cat_out
|
||
def ungroup_cat(cat):
    """Expand inclusive ``(lo, hi)`` ranges into a flat list of values.

    Ranges are expanded in the order given; a range whose ``lo`` exceeds
    its ``hi`` contributes nothing.
    """
    expanded = []
    for first, last in cat:
        expanded.extend(range(first, last + 1))
    return expanded
|
||
def format_table_content(f, content, indent):
    """Write *content* to *f* as one alternative per line.

    *content* is split on ``"|"``. Each piece goes on its own line, indented
    by *indent* spaces; the first line is introduced by ``"= "`` and every
    later one by ``"| "``. A blank line terminates the table, and the whole
    text is written with a single ``f.write`` call.
    """
    padding = " " * indent
    rows = []
    for position, chunk in enumerate(content.split("|")):
        marker = "| " if position else "= "
        rows.append(padding + marker + chunk + "\n")
    f.write("".join(rows) + '\n')
|
||
def load_properties(f, interestingprops):
    """Parse a UCD property file into ``{property: [(lo, hi), ...]}``.

    *f* is fetched first (see `fetch`) and then read from the current
    directory by its base name. Lines of the form ``HEX ; Name`` or
    ``HEX..HEX ; Name`` are recognised; every other line is skipped. When
    *interestingprops* is non-empty, only the properties it contains are
    kept. Each property's ranges are normalised into sorted, merged,
    inclusive ``(lo, hi)`` integer pairs before returning.
    """
    fetch(f)
    props = {}
    # Raw strings: "\w" and "\." are not valid escapes in a plain literal.
    single_re = re.compile(r"^ *([0-9A-F]+) *; *(\w+)")
    range_re = re.compile(r"^ *([0-9A-F]+)\.\.([0-9A-F]+) *; *(\w+)")

    # A plain `with open` closes the file deterministically and avoids the
    # module-global state kept by `fileinput.input`.
    with open(os.path.basename(f)) as data:
        for line in data:
            m = single_re.match(line)
            if m:
                d_lo = d_hi = m.group(1)
                prop = m.group(2)
            else:
                m = range_re.match(line)
                if not m:
                    continue
                d_lo, d_hi, prop = m.groups()
            if interestingprops and prop not in interestingprops:
                continue
            props.setdefault(prop, []).append((int(d_lo, 16), int(d_hi, 16)))

    # optimize if possible: merge adjacent or overlapping ranges
    for prop in props:
        props[prop] = group_cat(ungroup_cat(props[prop]))

    return props
|
||
def escape_char(c):
    """Render code point *c* as ``\\x`` followed by at least four hex digits."""
    template = "\\x%04x"
    return template % c
|
||
def emit_table(f, name, t_data):
    """Write the table ``@name`` for the ranges in *t_data* to *f*.

    A ``@name`` header line is written first. Each entry of *t_data* is a
    ``(lo, hi)`` pair: a pair with equal ends becomes a single escaped
    character, any other pair becomes a ``[lo-hi]`` class. The pieces are
    joined with ``"|"`` and laid out by `format_table_content` with an
    indent of 2.
    """
    f.write("@%s\n" % name)
    alternatives = []
    for entry in t_data:
        lo, hi = entry[0], entry[1]
        if lo == hi:
            alternatives.append("%s" % escape_char(lo))
        else:
            alternatives.append("[%s-%s]" % (escape_char(lo), escape_char(hi)))
    format_table_content(f, "|".join(alternatives), 2)
|
||
def emit_property_module(f, mod, tbl, emit):
    """Write one table to *f* for each property name listed in *emit*.

    The ranges for each name are looked up in *tbl*. *mod* is accepted but
    not used.
    """
    for prop_name in emit:
        ranges = tbl[prop_name]
        emit_table(f, prop_name, ranges)
|
||
if __name__ == "__main__":
    # Regenerate "UnicodeLexer.x" in the current directory from the
    # XID_Start / XID_Continue properties of the configured Unicode version.
    r = "UnicodeLexer.x"
    data_file = "DerivedCoreProperties.txt"

    # download and parse all the data before touching the output file, so a
    # failed download does not leave a truncated "UnicodeLexer.x" behind
    want_derived = ["XID_Start", "XID_Continue"]
    derived = load_properties(data_file, want_derived)

    if os.path.exists(r):
        os.remove(r)
    with open(r, "w") as rf:
        # write the file's preamble
        rf.write(preamble)

        # record the Unicode version and the table the data came from; the
        # same `data_file` name is used for the URL and for loading, so the
        # recorded URL cannot drift from the file actually parsed
        rf.write('''
-- Based on Unicode %s.%s.%s, using the following Unicode table:
-- %s
''' % (unicode_version + (unicode_url(data_file),)))

        emit_property_module(rf, "derived_property", derived, want_derived)

        # write the file's postamble
        rf.write(postamble)
Oops, something went wrong.