From 6201458f3b685ddc93d1729893e292ded7c9c609 Mon Sep 17 00:00:00 2001 From: yhirose Date: Fri, 26 Jan 2024 22:00:18 -0500 Subject: [PATCH] Fix #286 --- README.md | 7 ++++++ peglib.h | 67 +++++++++++++++++++++++++++++++++++---------------- test/test1.cc | 22 +++++++++++++++++ 3 files changed, 75 insertions(+), 21 deletions(-) diff --git a/README.md b/README.md index 80a36d2..6c59069 100644 --- a/README.md +++ b/README.md @@ -347,6 +347,13 @@ START <- 'This month is ' MONTH '.' MONTH <- 'Jan' | 'January' | 'Feb' | 'February' | '...' ``` +It supports the case insensitive mode. + +```peg +START <- 'This month is ' MONTH '.' +MONTH <- 'Jan'i | 'January'i | 'Feb'i | 'February'i | '...'i +``` + Cut operator ------------ diff --git a/peglib.h b/peglib.h index b6c764a..4f320e8 100644 --- a/peglib.h +++ b/peglib.h @@ -377,14 +377,13 @@ template T token_to_number_(std::string_view sv) { class Trie { public: - Trie() = default; - Trie(const Trie &) = default; - - Trie(const std::vector &items) { + Trie(const std::vector &items, bool ignore_case) + : ignore_case_(ignore_case) { for (const auto &item : items) { + const auto &s = ignore_case ? to_lower(item) : item; /* fold case once per item instead of once per prefix length */ for (size_t len = 1; len <= item.size(); len++) { auto last = len == item.size(); - std::string_view sv(item.data(), len); + std::string_view sv(s.data(), len); auto it = dic_.find(sv); if (it == dic_.end()) { dic_.emplace(sv, Info{last, last}); @@ -402,7 +401,8 @@ class Trie { auto done = false; size_t len = 1; while (!done && len <= text_len) { - std::string_view sv(text, len); + auto s = ignore_case_ ? 
to_lower(std::string(text, len)) : std::string(); /* fold only the len-byte prefix under test: nothing past text_len is read and no NUL terminator is required */ + std::string_view sv(ignore_case_ ? s.data() : text, len); auto it = dic_.find(sv); if (it == dic_.end()) { done = true; @@ -416,6 +416,13 @@ class Trie { } private: + std::string to_lower(std::string s) const { + for (char &c : s) { + c = static_cast<char>(std::tolower(static_cast<unsigned char>(c))); /* std::tolower needs a value representable as unsigned char */ + } + return s; + } + struct Info { bool done; bool match; @@ -424,6 +431,8 @@ class Trie { // TODO: Use unordered_map when heterogeneous lookup is supported in C++20 // std::unordered_map dic_; std::map> dic_; + + bool ignore_case_; }; /*----------------------------------------------------------------------------- @@ -1159,7 +1168,8 @@ class NotPredicate : public Ope { class Dictionary : public Ope, public std::enable_shared_from_this { public: - Dictionary(const std::vector &v) : trie_(v) {} + Dictionary(const std::vector &v, bool ignore_case) + : trie_(v, ignore_case) {} size_t parse_core(const char *s, size_t n, SemanticValues &vs, Context &c, std::any &dt) const override; @@ -1568,8 +1578,9 @@ inline std::shared_ptr npd(const std::shared_ptr &ope) { return std::make_shared(ope); } -inline std::shared_ptr dic(const std::vector &v) { - return std::make_shared(v); +inline std::shared_ptr dic(const std::vector &v, + bool ignore_case) { + return std::make_shared(v, ignore_case); } inline std::shared_ptr lit(std::string &&s) { @@ -3335,16 +3346,17 @@ class ParserGenerator { seq(g["Suffix"], opt(seq(g["LABEL"], g["Identifier"]))); g["Suffix"] <= seq(g["Primary"], opt(g["Loop"])); g["Loop"] <= cho(g["QUESTION"], g["STAR"], g["PLUS"], g["Repetition"]); - g["Primary"] <= - cho(seq(g["Ignore"], g["IdentCont"], g["Arguments"], - npd(g["LEFTARROW"])), - seq(g["Ignore"], g["Identifier"], - npd(seq(opt(g["Parameters"]), g["LEFTARROW"]))), - seq(g["OPEN"], g["Expression"], g["CLOSE"]), - seq(g["BeginTok"], g["Expression"], g["EndTok"]), g["CapScope"], - seq(g["BeginCap"], g["Expression"], g["EndCap"]), g["BackRef"], - g["LiteralI"], g["Dictionary"], g["Literal"], g["NegatedClassI"], - g["NegatedClass"], 
g["ClassI"], g["Class"], g["DOT"]); + g["Primary"] <= cho(seq(g["Ignore"], g["IdentCont"], g["Arguments"], + npd(g["LEFTARROW"])), + seq(g["Ignore"], g["Identifier"], + npd(seq(opt(g["Parameters"]), g["LEFTARROW"]))), + seq(g["OPEN"], g["Expression"], g["CLOSE"]), + seq(g["BeginTok"], g["Expression"], g["EndTok"]), + g["CapScope"], + seq(g["BeginCap"], g["Expression"], g["EndCap"]), + g["BackRef"], g["DictionaryI"], g["LiteralI"], /* DictionaryI is listed ahead of LiteralI, mirroring Dictionary ahead of Literal */ + g["Dictionary"], g["Literal"], g["NegatedClassI"], + g["NegatedClass"], g["ClassI"], g["Class"], g["DOT"]); g["Identifier"] <= seq(g["IdentCont"], g["Spacing"]); g["IdentCont"] <= tok(seq(g["IdentStart"], zom(g["IdentRest"]))); @@ -3358,6 +3370,9 @@ class ParserGenerator { g["Dictionary"] <= seq(g["LiteralD"], oom(seq(g["PIPE"], g["LiteralD"]))); + g["DictionaryI"] <= + seq(g["LiteralID"], oom(seq(g["PIPE"], g["LiteralID"]))); /* same shape as Dictionary, built from LiteralID instead of LiteralD */ + auto lit_ope = cho(seq(cls("'"), tok(zom(seq(npd(cls("'")), g["Char"]))), cls("'"), g["Spacing"]), seq(cls("\""), tok(zom(seq(npd(cls("\"")), g["Char"]))), @@ -3365,11 +3380,13 @@ class ParserGenerator { g["Literal"] <= lit_ope; g["LiteralD"] <= lit_ope; - g["LiteralI"] <= + auto lit_case_ignore_ope = cho(seq(cls("'"), tok(zom(seq(npd(cls("'")), g["Char"]))), lit("'i"), g["Spacing"]), seq(cls("\""), tok(zom(seq(npd(cls("\"")), g["Char"]))), lit("\"i"), g["Spacing"])); + g["LiteralI"] <= lit_case_ignore_ope; + g["LiteralID"] <= lit_case_ignore_ope; /* LiteralI and LiteralID are bound to the same operator object, as Literal and LiteralD are to lit_ope */ // NOTE: The original Brian Ford's paper uses 'zom' instead of 'oom'. 
g["Class"] <= seq(chr('['), npd(chr('^')), @@ -3720,7 +3737,11 @@ class ParserGenerator { g["Dictionary"] = [](const SemanticValues &vs) { auto items = vs.transform(); - return dic(items); + return dic(items, false); + }; + g["DictionaryI"] = [](const SemanticValues &vs) { + auto items = vs.transform(); + return dic(items, true); /* second argument is the ignore_case flag of dic */ }; g["Literal"] = [](const SemanticValues &vs) { @@ -3735,6 +3756,10 @@ class ParserGenerator { auto &tok = vs.tokens.front(); return resolve_escape_sequence(tok.data(), tok.size()); }; + g["LiteralID"] = [](const SemanticValues &vs) { + auto &tok = vs.tokens.front(); + return resolve_escape_sequence(tok.data(), tok.size()); + }; g["Class"] = [](const SemanticValues &vs) { auto ranges = vs.transform>(); diff --git a/test/test1.cc b/test/test1.cc index 146b161..f63ad96 100644 --- a/test/test1.cc +++ b/test/test1.cc @@ -374,6 +374,28 @@ TEST(GeneralTest, Word_expression_test_Dictionary) { EXPECT_TRUE(parser.parse("toa")); } +TEST(GeneralTest, Word_expression_case_ignore_test_Dictionary) { + parser parser(R"( + Identifier ← < !Keyword [a-z][a-z]* > + Keyword ← 'def'i | 'to'i + %whitespace ← [ \t\r\n]* + %word ← [a-z]+ + )"); + + EXPECT_TRUE(parser.parse("toa")); /* toa merely begins with the keyword to, so it is expected to parse as an Identifier */ +} + +TEST(GeneralTest, Word_expression_syntax_error_test_Dictionary) { + parser parser(R"( + Identifier ← < !Keyword [a-z][a-z]* > + Keyword ← 'def' | 'to'i + %whitespace ← [ \t\r\n]* + %word ← [a-z]+ + )"); + + EXPECT_FALSE(parser); /* a grammar mixing a plain literal with an i-suffixed one is expected to leave the parser invalid */ +} + TEST(GeneralTest, Skip_token_test) { parser parser(" ROOT <- _ ITEM (',' _ ITEM _)* " " ITEM <- ([a-z0-9])+ "