diff --git a/examples/intersect-test.cpp b/examples/intersect-test.cpp index 4c3bff8..40ffb9e 100644 --- a/examples/intersect-test.cpp +++ b/examples/intersect-test.cpp @@ -21,7 +21,7 @@ using ByteLexicalRule = log_surgeon::LexicalRule; auto get_intersect_for_query( std::map& m_id_symbol, - std::unique_ptr>& dfa1, + Dfa const& dfa1, std::string const& search_string ) -> void { std::string processed_search_string; @@ -41,8 +41,8 @@ auto get_intersect_for_query( rules.emplace_back(0, std::move(schema_var_ast->m_regex_ptr)); } Nfa nfa(std::move(rules)); - auto dfa2 = ByteLexer::nfa_to_dfa(nfa); - auto schema_types = dfa1->get_intersect(dfa2.get()); + Dfa dfa2(std::move(nfa)); + auto schema_types = dfa1.get_intersect(&dfa2); std::cout << search_string << ":"; for (auto const& schema_type : schema_types) { std::cout << m_id_symbol[schema_type] << ","; @@ -79,7 +79,7 @@ auto main() -> int { m_id_symbol[m_id_symbol.size()] = var_ast->m_name; } Nfa nfa(std::move(rules)); - auto dfa = ByteLexer::nfa_to_dfa(nfa); + Dfa dfa(std::move(nfa)); get_intersect_for_query(m_id_symbol, dfa, "*1*"); get_intersect_for_query(m_id_symbol, dfa, "*a*"); get_intersect_for_query(m_id_symbol, dfa, "*a1*"); diff --git a/src/log_surgeon/Lexer.hpp b/src/log_surgeon/Lexer.hpp index a392502..4f68a16 100644 --- a/src/log_surgeon/Lexer.hpp +++ b/src/log_surgeon/Lexer.hpp @@ -27,14 +27,6 @@ class Lexer { static inline std::vector const cTokenUncaughtStringTypes = {(uint32_t)SymbolId::TokenUncaughtString}; - /** - * Generate a DFA from an NFA - * @param finite_automata::Nfa nfa - * @return std::unique_ptr> - */ - static auto nfa_to_dfa(finite_automata::Nfa& nfa - ) -> std::unique_ptr>; - /** * Add a delimiters line from the schema to the lexer * @param delimiters @@ -134,12 +126,6 @@ class Lexer { std::unordered_map m_id_symbol; private: - /** - * Return epsilon_closure over m_epsilon_transitions - * @return - */ - static auto epsilon_closure(TypedNfaState const* state_ptr) -> std::set; - /** * Get next character from the input buffer * @return unsigned char diff --git a/src/log_surgeon/Lexer.tpp b/src/log_surgeon/Lexer.tpp index dded278..a4e36f5 100644 --- a/src/log_surgeon/Lexer.tpp +++ b/src/log_surgeon/Lexer.tpp @@ -2,6 +2,7 @@ #define LOG_SURGEON_LEXER_TPP #include +#include #include #include #include @@ -378,7 +379,7 @@ template void Lexer::generate() { finite_automata::Nfa nfa{std::move(m_rules)}; // TODO: DFA ignores tags. E.g., treats "capture:user=(?\d+)" as "capture:user=\d+" - m_dfa = nfa_to_dfa(nfa); + m_dfa = std::make_unique>(std::move(nfa)); auto const* state = m_dfa->get_root(); for (uint32_t i = 0; i < cSizeOfByte; i++) { if (state->next(i) != nullptr) { @@ -388,91 +389,6 @@ void Lexer::generate() { } } } - -template -auto Lexer::epsilon_closure(TypedNfaState const* state_ptr -) -> std::set { - std::set closure_set; - std::stack stack; - stack.push(state_ptr); - while (!stack.empty()) { - auto const* current_state = stack.top(); - stack.pop(); - if (false == closure_set.insert(current_state).second) { - continue; - } - for (auto const* dest_state : current_state->get_epsilon_transitions()) { - stack.push(dest_state); - } - - // TODO: currently treat tagged transitions as epsilon transitions - for (auto const& positive_tagged_start_transition : - current_state->get_positive_tagged_start_transitions()) - { - stack.push(positive_tagged_start_transition.get_dest_state()); - } - auto const& optional_positive_tagged_end_transition - = current_state->get_positive_tagged_end_transition(); - if (optional_positive_tagged_end_transition.has_value()) { - stack.push(optional_positive_tagged_end_transition.value().get_dest_state()); - } - - auto const& optional_negative_tagged_transition - = current_state->get_negative_tagged_transition(); - if (optional_negative_tagged_transition.has_value()) { - stack.push(optional_negative_tagged_transition.value().get_dest_state()); - } - } - return closure_set; -} - -template -auto Lexer::nfa_to_dfa(finite_automata::Nfa& nfa -) -> std::unique_ptr> { - typedef std::set StateSet; - auto dfa = std::make_unique>(); - std::map dfa_states; - std::stack unmarked_sets; - auto create_dfa_state - = [&dfa, &dfa_states, &unmarked_sets](StateSet const& set) -> TypedDfaState* { - auto* state = dfa->new_state(set); - dfa_states[set] = state; - unmarked_sets.push(set); - return state; - }; - auto start_set = epsilon_closure(nfa.get_root()); - create_dfa_state(start_set); - while (!unmarked_sets.empty()) { - auto set = unmarked_sets.top(); - unmarked_sets.pop(); - auto* dfa_state = dfa_states.at(set); - std::map ascii_transitions_map; - for (TypedNfaState const* s0 : set) { - for (uint32_t i = 0; i < cSizeOfByte; i++) { - for (TypedNfaState* const s1 : s0->get_byte_transitions(i)) { - StateSet closure = epsilon_closure(s1); - ascii_transitions_map[i].insert(closure.begin(), closure.end()); - } - } - } - auto next_dfa_state - = [&dfa_states, &create_dfa_state](StateSet const& set) -> TypedDfaState* { - TypedDfaState* state{nullptr}; - auto it = dfa_states.find(set); - if (it == dfa_states.end()) { - state = create_dfa_state(set); - } else { - state = it->second; - } - return state; - }; - for (typename std::map::value_type const& kv : ascii_transitions_map) { - auto* dest_state = next_dfa_state(kv.second); - dfa_state->add_byte_transition(kv.first, dest_state); - } - } - return dfa; -} } // namespace log_surgeon #endif // LOG_SURGEON_LEXER_TPP diff --git a/src/log_surgeon/finite_automata/Dfa.hpp b/src/log_surgeon/finite_automata/Dfa.hpp index 5d425dd..baceaec 100644 --- a/src/log_surgeon/finite_automata/Dfa.hpp +++ b/src/log_surgeon/finite_automata/Dfa.hpp @@ -7,11 +7,15 @@ #include #include +#include namespace log_surgeon::finite_automata { template class Dfa { public: + template + explicit Dfa(Nfa nfa); + /** * Creates a new DFA state based on a set of NFA states and adds it to `m_states`. * @param nfa_state_set The set of NFA states represented by this DFA state. @@ -36,6 +40,57 @@ class Dfa { std::vector> m_states; }; +template +template +Dfa::Dfa(Nfa nfa) { + using StateSet = std::set; + + std::map dfa_states; + std::stack unmarked_sets; + auto create_dfa_state + = [this, &dfa_states, &unmarked_sets](StateSet const& set) -> TypedDfaState* { + auto* state = new_state(set); + dfa_states[set] = state; + unmarked_sets.push(set); + return state; + }; + + auto start_set = nfa.get_root()->epsilon_closure(); + create_dfa_state(start_set); + while (false == unmarked_sets.empty()) { + auto set = unmarked_sets.top(); + unmarked_sets.pop(); + auto* dfa_state = dfa_states.at(set); + std::map ascii_transitions_map; + // map transitions_map; + for (auto const* s0 : set) { + for (uint32_t i = 0; i < cSizeOfByte; i++) { + for (auto* const s1 : s0->get_byte_transitions(i)) { + StateSet closure = s1->epsilon_closure(); + ascii_transitions_map[i].insert(closure.begin(), closure.end()); + } + } + // TODO: add this for the utf8 case + } + auto next_dfa_state + = [&dfa_states, &create_dfa_state](StateSet const& set) -> TypedDfaState* { + TypedDfaState* state; + auto it = dfa_states.find(set); + if (it == dfa_states.end()) { + state = create_dfa_state(set); + } else { + state = it->second; + } + return state; + }; + for (auto const& kv : ascii_transitions_map) { + auto* dest_state = next_dfa_state(kv.second); + dfa_state->add_byte_transition(kv.first, dest_state); + } + // TODO: add this for the utf8 case + } +} + template template auto Dfa::new_state(std::set const& nfa_state_set diff --git a/src/log_surgeon/finite_automata/NfaState.hpp b/src/log_surgeon/finite_automata/NfaState.hpp index 09676c9..590c160 100644 --- a/src/log_surgeon/finite_automata/NfaState.hpp +++ b/src/log_surgeon/finite_automata/NfaState.hpp @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include @@ -93,6 +94,11 @@ class NfaState { */ auto add_interval(Interval interval, NfaState* dest_state) -> void; + /** + * @return The set of all states reachable from the current state via epsilon transitions. + */ + auto epsilon_closure() -> std::set; + /** * @param state_ids A map of states to their unique identifiers. * @return A string representation of the NFA state on success. @@ -165,6 +171,41 @@ auto NfaState::add_interval(Interval interval, NfaState* dest_state) } } +template +auto NfaState::epsilon_closure() -> std::set { + std::set closure_set; + std::stack stack; + stack.push(this); + while (false == stack.empty()) { + auto const* current_state = stack.top(); + stack.pop(); + if (false == closure_set.insert(current_state).second) { + continue; + } + for (auto const* dest_state : current_state->get_epsilon_transitions()) { + stack.push(dest_state); + } + + // TODO: currently treat tagged transitions as epsilon transitions + for (auto const& positive_tagged_start_transition : + current_state->get_positive_tagged_start_transitions()) + { + stack.push(positive_tagged_start_transition.get_dest_state()); + } + auto const& optional_positive_tagged_end_transition + = current_state->get_positive_tagged_end_transition(); + if (optional_positive_tagged_end_transition.has_value()) { + stack.push(optional_positive_tagged_end_transition.value().get_dest_state()); + } + auto const& optional_negative_tagged_transition + = current_state->get_negative_tagged_transition(); + if (optional_negative_tagged_transition.has_value()) { + stack.push(optional_negative_tagged_transition.value().get_dest_state()); + } + } + return closure_set; +} + template auto NfaState::serialize(std::unordered_map const& state_ids ) const -> std::optional {