Skip to content

Commit

Permalink
refactor: Move epsilon_closure from Lexer to NfaState; Refactor…
Browse files Browse the repository at this point in the history
… `nfa_to_dfa` as `Dfa`'s constructor. (#71)

Co-authored-by: Lin Zhihao <[email protected]>
  • Loading branch information
SharafMohamed and LinZhihao-723 authored Jan 11, 2025
1 parent 44c5578 commit e08f728
Show file tree
Hide file tree
Showing 5 changed files with 102 additions and 104 deletions.
8 changes: 4 additions & 4 deletions examples/intersect-test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ using ByteLexicalRule = log_surgeon::LexicalRule<ByteNfaState>;

auto get_intersect_for_query(
std::map<uint32_t, std::string>& m_id_symbol,
std::unique_ptr<Dfa<ByteDfaState>>& dfa1,
Dfa<ByteDfaState> const& dfa1,
std::string const& search_string
) -> void {
std::string processed_search_string;
Expand All @@ -41,8 +41,8 @@ auto get_intersect_for_query(
rules.emplace_back(0, std::move(schema_var_ast->m_regex_ptr));
}
Nfa<ByteNfaState> nfa(std::move(rules));
auto dfa2 = ByteLexer::nfa_to_dfa(nfa);
auto schema_types = dfa1->get_intersect(dfa2.get());
Dfa<ByteDfaState> dfa2(std::move(nfa));
auto schema_types = dfa1.get_intersect(&dfa2);
std::cout << search_string << ":";
for (auto const& schema_type : schema_types) {
std::cout << m_id_symbol[schema_type] << ",";
Expand Down Expand Up @@ -79,7 +79,7 @@ auto main() -> int {
m_id_symbol[m_id_symbol.size()] = var_ast->m_name;
}
Nfa<ByteNfaState> nfa(std::move(rules));
auto dfa = ByteLexer::nfa_to_dfa(nfa);
Dfa<ByteDfaState> dfa(std::move(nfa));
get_intersect_for_query(m_id_symbol, dfa, "*1*");
get_intersect_for_query(m_id_symbol, dfa, "*a*");
get_intersect_for_query(m_id_symbol, dfa, "*a1*");
Expand Down
14 changes: 0 additions & 14 deletions src/log_surgeon/Lexer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,14 +27,6 @@ class Lexer {
static inline std::vector<uint32_t> const cTokenUncaughtStringTypes
= {(uint32_t)SymbolId::TokenUncaughtString};

/**
 * Generates a DFA from an NFA. Starting from the epsilon closure of the NFA's root, one DFA state
 * is created for every distinct set of NFA states reachable through byte transitions.
 * @param nfa The NFA to convert.
 * @return A newly allocated DFA built from `nfa`.
 */
static auto nfa_to_dfa(finite_automata::Nfa<TypedNfaState>& nfa
) -> std::unique_ptr<finite_automata::Dfa<TypedDfaState>>;

/**
* Add a delimiters line from the schema to the lexer
* @param delimiters
Expand Down Expand Up @@ -134,12 +126,6 @@ class Lexer {
std::unordered_map<uint32_t, std::string> m_id_symbol;

private:
/**
 * Computes the epsilon closure of an NFA state. Tagged transitions are currently followed as if
 * they were epsilon transitions.
 * @param state_ptr The NFA state to start from.
 * @return The set containing `state_ptr` and every state reachable from it via epsilon (and
 * tagged) transitions.
 */
static auto epsilon_closure(TypedNfaState const* state_ptr) -> std::set<TypedNfaState const*>;

/**
* Get next character from the input buffer
* @return unsigned char
Expand Down
88 changes: 2 additions & 86 deletions src/log_surgeon/Lexer.tpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
#define LOG_SURGEON_LEXER_TPP

#include <cassert>
#include <memory>
#include <stack>
#include <string>
#include <vector>
Expand Down Expand Up @@ -378,7 +379,7 @@ template <typename TypedNfaState, typename TypedDfaState>
void Lexer<TypedNfaState, TypedDfaState>::generate() {
finite_automata::Nfa<TypedNfaState> nfa{std::move(m_rules)};
// TODO: DFA ignores tags. E.g., treats "capture:user=(?<user_id>\d+)" as "capture:user=\d+"
m_dfa = nfa_to_dfa(nfa);
m_dfa = std::make_unique<finite_automata::Dfa<TypedDfaState>>(std::move(nfa));
auto const* state = m_dfa->get_root();
for (uint32_t i = 0; i < cSizeOfByte; i++) {
if (state->next(i) != nullptr) {
Expand All @@ -388,91 +389,6 @@ void Lexer<TypedNfaState, TypedDfaState>::generate() {
}
}
}

/**
 * Collects `state_ptr` together with every NFA state reachable from it without consuming input,
 * using an explicit stack instead of recursion.
 * @param state_ptr The NFA state the traversal starts from.
 * @return The set of reached states, including `state_ptr` itself.
 */
template <typename TypedNfaState, typename TypedDfaState>
auto Lexer<TypedNfaState, TypedDfaState>::epsilon_closure(TypedNfaState const* state_ptr
) -> std::set<TypedNfaState const*> {
    std::set<TypedNfaState const*> visited;
    std::stack<TypedNfaState const*> pending;
    pending.push(state_ptr);
    while (false == pending.empty()) {
        auto const* state = pending.top();
        pending.pop();
        // Expand a state only the first time it is reached; this also terminates cycles.
        if (visited.insert(state).second) {
            for (auto const* epsilon_dest : state->get_epsilon_transitions()) {
                pending.push(epsilon_dest);
            }

            // TODO: currently treat tagged transitions as epsilon transitions
            for (auto const& start_transition : state->get_positive_tagged_start_transitions()) {
                pending.push(start_transition.get_dest_state());
            }
            auto const& end_transition = state->get_positive_tagged_end_transition();
            if (end_transition.has_value()) {
                pending.push(end_transition->get_dest_state());
            }
            auto const& negative_transition = state->get_negative_tagged_transition();
            if (negative_transition.has_value()) {
                pending.push(negative_transition->get_dest_state());
            }
        }
    }
    return visited;
}

/**
 * Builds a DFA from `nfa`. Every distinct set of NFA states that is discovered gets exactly one
 * DFA state; sets are processed from a stack until no unprocessed set remains.
 * @param nfa The NFA to convert.
 * @return The newly allocated DFA.
 */
template <typename TypedNfaState, typename TypedDfaState>
auto Lexer<TypedNfaState, TypedDfaState>::nfa_to_dfa(finite_automata::Nfa<TypedNfaState>& nfa
) -> std::unique_ptr<finite_automata::Dfa<TypedDfaState>> {
    using StateSet = std::set<TypedNfaState const*>;

    auto dfa = std::make_unique<finite_automata::Dfa<TypedDfaState>>();
    // Maps each discovered set of NFA states to the DFA state representing it.
    std::map<StateSet, TypedDfaState*> set_to_dfa_state;
    // Sets whose outgoing transitions still have to be computed.
    std::stack<StateSet> pending_sets;

    // Creates the DFA state for a set that has not been seen before and queues it for processing.
    auto register_set = [&dfa, &set_to_dfa_state, &pending_sets](StateSet const& nfa_states
                        ) -> TypedDfaState* {
        auto* created = dfa->new_state(nfa_states);
        set_to_dfa_state[nfa_states] = created;
        pending_sets.push(nfa_states);
        return created;
    };
    // Returns the DFA state for a set, creating it on first sight.
    auto lookup_or_register = [&set_to_dfa_state, &register_set](StateSet const& nfa_states
                              ) -> TypedDfaState* {
        auto const it = set_to_dfa_state.find(nfa_states);
        return set_to_dfa_state.end() == it ? register_set(nfa_states) : it->second;
    };

    register_set(epsilon_closure(nfa.get_root()));
    while (false == pending_sets.empty()) {
        StateSet const current_set = pending_sets.top();
        pending_sets.pop();
        auto* src_dfa_state = set_to_dfa_state.at(current_set);

        // For every byte, the union of the closures of all NFA states reachable on that byte.
        std::map<uint32_t, StateSet> byte_to_dest_set;
        for (TypedNfaState const* nfa_state : current_set) {
            for (uint32_t byte = 0; byte < cSizeOfByte; byte++) {
                for (TypedNfaState* const dest : nfa_state->get_byte_transitions(byte)) {
                    StateSet const dest_closure = epsilon_closure(dest);
                    byte_to_dest_set[byte].insert(dest_closure.begin(), dest_closure.end());
                }
            }
        }

        for (auto const& [byte, dest_set] : byte_to_dest_set) {
            src_dfa_state->add_byte_transition(byte, lookup_or_register(dest_set));
        }
    }
    return dfa;
}
} // namespace log_surgeon

#endif // LOG_SURGEON_LEXER_TPP
55 changes: 55 additions & 0 deletions src/log_surgeon/finite_automata/Dfa.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,15 @@
#include <vector>

#include <log_surgeon/finite_automata/DfaStatePair.hpp>
#include <log_surgeon/finite_automata/Nfa.hpp>

namespace log_surgeon::finite_automata {
template <typename TypedDfaState>
class Dfa {
public:
/**
 * Constructs the DFA from an NFA. Starting from the epsilon closure of the NFA's root, one DFA
 * state is created for every distinct set of NFA states reachable through byte transitions.
 * @tparam NfaStateType The type of the states of `nfa`.
 * @param nfa The NFA to convert.
 */
template <typename NfaStateType>
explicit Dfa(Nfa<NfaStateType> nfa);

/**
* Creates a new DFA state based on a set of NFA states and adds it to `m_states`.
* @param nfa_state_set The set of NFA states represented by this DFA state.
Expand All @@ -36,6 +40,57 @@ class Dfa {
std::vector<std::unique_ptr<TypedDfaState>> m_states;
};

/**
 * Builds this DFA from `nfa`. Every distinct set of NFA states that is discovered gets exactly one
 * DFA state; sets are processed from a stack until no unprocessed set remains.
 * @param nfa The NFA to convert.
 */
template <typename TypedDfaState>
template <typename TypedNfaState>
Dfa<TypedDfaState>::Dfa(Nfa<TypedNfaState> nfa) {
    using StateSet = std::set<TypedNfaState const*>;

    // Maps each discovered set of NFA states to the DFA state representing it.
    std::map<StateSet, TypedDfaState*> set_to_dfa_state;
    // Sets whose outgoing transitions still have to be computed.
    std::stack<StateSet> pending_sets;

    // Creates the DFA state for a set that has not been seen before and queues it for processing.
    auto register_set = [this, &set_to_dfa_state, &pending_sets](StateSet const& nfa_states
                        ) -> TypedDfaState* {
        auto* created = new_state(nfa_states);
        set_to_dfa_state[nfa_states] = created;
        pending_sets.push(nfa_states);
        return created;
    };
    // Returns the DFA state for a set, creating it on first sight.
    auto lookup_or_register = [&set_to_dfa_state, &register_set](StateSet const& nfa_states
                              ) -> TypedDfaState* {
        auto const it = set_to_dfa_state.find(nfa_states);
        return set_to_dfa_state.end() == it ? register_set(nfa_states) : it->second;
    };

    register_set(nfa.get_root()->epsilon_closure());
    while (false == pending_sets.empty()) {
        StateSet const current_set = pending_sets.top();
        pending_sets.pop();
        auto* src_dfa_state = set_to_dfa_state.at(current_set);

        // For every byte, the union of the closures of all NFA states reachable on that byte.
        std::map<uint32_t, StateSet> byte_to_dest_set;
        // map<Interval, StateSet> transitions_map;
        for (auto const* nfa_state : current_set) {
            for (uint32_t byte = 0; byte < cSizeOfByte; byte++) {
                for (auto* const dest : nfa_state->get_byte_transitions(byte)) {
                    StateSet const dest_closure = dest->epsilon_closure();
                    byte_to_dest_set[byte].insert(dest_closure.begin(), dest_closure.end());
                }
            }
            // TODO: add this for the utf8 case
        }

        for (auto const& [byte, dest_set] : byte_to_dest_set) {
            src_dfa_state->add_byte_transition(byte, lookup_or_register(dest_set));
        }
        // TODO: add this for the utf8 case
    }
}

template <typename TypedDfaState>
template <typename TypedNfaState>
auto Dfa<TypedDfaState>::new_state(std::set<TypedNfaState*> const& nfa_state_set
Expand Down
41 changes: 41 additions & 0 deletions src/log_surgeon/finite_automata/NfaState.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
#include <cstdint>
#include <memory>
#include <optional>
#include <stack>
#include <string>
#include <tuple>
#include <unordered_map>
Expand Down Expand Up @@ -93,6 +94,11 @@ class NfaState {
*/
auto add_interval(Interval interval, NfaState* dest_state) -> void;

/**
 * Tagged transitions are currently followed as if they were epsilon transitions.
 * @return The set containing the current state and all states reachable from it via epsilon
 * (and tagged) transitions.
 */
auto epsilon_closure() -> std::set<NfaState const*>;

/**
* @param state_ids A map of states to their unique identifiers.
* @return A string representation of the NFA state on success.
Expand Down Expand Up @@ -165,6 +171,41 @@ auto NfaState<state_type>::add_interval(Interval interval, NfaState* dest_state)
}
}

/**
 * Collects this state together with every state reachable from it without consuming input, using
 * an explicit stack instead of recursion.
 */
template <StateType state_type>
auto NfaState<state_type>::epsilon_closure() -> std::set<NfaState const*> {
    std::set<NfaState const*> visited;
    std::stack<NfaState const*> pending;
    pending.push(this);
    while (false == pending.empty()) {
        auto const* state = pending.top();
        pending.pop();
        // Expand a state only the first time it is reached; this also terminates cycles.
        if (visited.insert(state).second) {
            for (auto const* epsilon_dest : state->get_epsilon_transitions()) {
                pending.push(epsilon_dest);
            }

            // TODO: currently treat tagged transitions as epsilon transitions
            for (auto const& start_transition : state->get_positive_tagged_start_transitions()) {
                pending.push(start_transition.get_dest_state());
            }
            auto const& end_transition = state->get_positive_tagged_end_transition();
            if (end_transition.has_value()) {
                pending.push(end_transition->get_dest_state());
            }
            auto const& negative_transition = state->get_negative_tagged_transition();
            if (negative_transition.has_value()) {
                pending.push(negative_transition->get_dest_state());
            }
        }
    }
    return visited;
}

template <StateType state_type>
auto NfaState<state_type>::serialize(std::unordered_map<NfaState const*, uint32_t> const& state_ids
) const -> std::optional<std::string> {
Expand Down

0 comments on commit e08f728

Please sign in to comment.