Skip to content

Commit

Permalink
Optimize escape_json_string and add some missing docstrings
Browse files Browse the repository at this point in the history
  • Loading branch information
gibber9809 committed Dec 3, 2024
1 parent ec5b208 commit c4ad984
Show file tree
Hide file tree
Showing 2 changed files with 72 additions and 8 deletions.
26 changes: 22 additions & 4 deletions components/core/src/clp_s/Utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -485,39 +485,57 @@ bool StringUtils::tokenize_column_descriptor(
}

/**
 * Appends `source` to `destination`, escaping it per JSON string escaping rules: '"', '\' and the
 * control characters 0x00 through 0x1f are escaped; every other byte is copied through unchanged.
 * @param destination the buffer that the escaped string is appended to
 * @param source the string to escape
 */
void StringUtils::escape_json_string(std::string& destination, std::string_view const source) {
    // Escaping is implemented using this `append_unescaped_slice` approach to offer a fast path
    // when strings contain few or no characters that need escaping. Benchmarking shows that this
    // offers a net decompression speedup of ~30% compared to adding every character to the
    // destination one character at a time.
    //
    // `slice_begin` is the index of the first character that has not been written to
    // `destination` yet. Characters that don't need escaping are never written individually; they
    // are flushed in bulk by `append_unescaped_slice`.
    size_t slice_begin{0ULL};
    // Flushes the pending run of unescaped characters [slice_begin, i) and then skips over the
    // character at index `i`, which the caller is responsible for writing in its escaped form.
    auto append_unescaped_slice = [&](size_t i) {
        if (slice_begin < i) {
            destination.append(source.substr(slice_begin, i - slice_begin));
        }
        slice_begin = i + 1;
    };
    for (size_t i = 0; i < source.size(); ++i) {
        char const c = source[i];
        switch (c) {
            case '"':
                append_unescaped_slice(i);
                destination.append("\\\"");
                break;
            case '\\':
                append_unescaped_slice(i);
                destination.append("\\\\");
                break;
            case '\t':
                append_unescaped_slice(i);
                destination.append("\\t");
                break;
            case '\r':
                append_unescaped_slice(i);
                destination.append("\\r");
                break;
            case '\n':
                append_unescaped_slice(i);
                destination.append("\\n");
                break;
            case '\b':
                append_unescaped_slice(i);
                destination.append("\\b");
                break;
            case '\f':
                append_unescaped_slice(i);
                destination.append("\\f");
                break;
            default:
                // Remaining control characters have no short escape sequence, so they are written
                // as unicode escape sequences. Anything else stays in the pending slice; pushing
                // it here as well would emit it twice.
                if ('\x00' <= c && c <= '\x1f') {
                    append_unescaped_slice(i);
                    char_to_escaped_four_char_hex(destination, c);
                }
                break;
        }
    }
    // Flush whatever trails the last escaped character (or the entire string if nothing needed
    // escaping).
    append_unescaped_slice(source.size());
}

namespace {
Expand Down
54 changes: 50 additions & 4 deletions components/core/src/clp_s/Utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -210,16 +210,43 @@ class StringUtils {
static bool convert_string_to_double(std::string const& raw, double& converted);

/**
 * Converts a KQL string column descriptor delimited by '.' into a list of tokens. The
 * descriptor is tokenized and unescaped per the escaping rules for KQL columns.
 * @param descriptor the column descriptor to tokenize
 * @param tokens output list that receives the tokens
 * @return true if the descriptor was tokenized successfully, false otherwise
 */
[[nodiscard]] static bool
tokenize_column_descriptor(std::string const& descriptor, std::vector<std::string>& tokens);

/**
 * Escapes a string according to JSON string escaping rules and appends the escaped string to
 * a buffer. The input string can be either ASCII or UTF-8.
 *
 * According to the JSON spec, JSON strings must escape control characters (characters 0x00
 * through 0x1f) as well as the '"' and '\' characters.
 *
 * This function escapes common control characters like newline with short escape sequences
 * (e.g. \n) and less common control characters with unicode escape sequences (e.g. \u001f). The
 * '"' and '\' characters are escaped with a backslash.
 *
 * @param destination the buffer that the escaped string is appended to
 * @param source the string to escape
 */
static void escape_json_string(std::string& destination, std::string_view const source);

/**
 * Unescapes a KQL value string according to the escaping rules for KQL value strings and
 * converts it into a valid CLP search string.
 *
 * Specifically this means that the string is unescaped, but the escape sequences '\\', '\*',
 * and '\?' are preserved so that the resulting string can be interpreted correctly by CLP
 * search.
 *
 * @param value the KQL value string to unescape
 * @param unescaped output buffer for the resulting CLP search string
 * @return true if the value was unescaped successfully and false otherwise.
 */
static bool unescape_kql_value(std::string const& value, std::string& unescaped);

private:
Expand All @@ -242,6 +269,11 @@ class StringUtils {
char const*& wild_bookmark
);

/**
* Converts a character into its two byte hexadecimal representation.
* @param c
* @return the two byte hexadecimal representation of c as an array of two characters.
*/
static std::array<char, 2> char_to_hex(char c) {
std::array<char, 2> ret;
auto nibble_to_hex = [](char nibble) -> char {
Expand All @@ -255,12 +287,26 @@ class StringUtils {
return std::array<char, 2>{nibble_to_hex(0x0F & (c >> 4)), nibble_to_hex(0x0f & c)};
}

static void char_to_escaped_four_char_hex(std::string& dest, char c) {
dest.append("\\u00");
/**
* Converts a character into a unicode escape sequence (e.g. \u0000) and appends the escape
* sequences to the `destination` buffer.
* @param destination
* @param c
*/
static void char_to_escaped_four_char_hex(std::string& destination, char c) {
destination.append("\\u00");
auto hex = char_to_hex(c);
dest.append(hex.data(), hex.size());
destination.append(hex.data(), hex.size());
}

/**
 * Unescapes a KQL key or value with special handling for each case and appends the unescaped
 * value to the `unescaped` buffer.
 * @param value the KQL key or value string to unescape
 * @param unescaped the buffer that the unescaped string is appended to
 * @param is_value whether `value` should be handled as a KQL value (presumably false means it
 * is handled as a key -- confirm against the implementation)
 * @return true if the value was unescaped successfully and false otherwise.
 */
static bool
unescape_kql_internal(std::string const& value, std::string& unescaped, bool is_value);
};
Expand Down

0 comments on commit c4ad984

Please sign in to comment.