Simplifying JSON read code (#1449)

* Simplifying JSON code. Added escape_unicode.hpp
* Fix variant reset logic
* variant handling of tags
* Removing now unused key_stats
* Update string_literal.hpp
* explicit constexpr on lambda to perhaps fix GCC error
* Remove now unused read_json_visitor
* make_static for GCC 12
stephenberry · Nov 22, 2024 · 04b70c2 · 04b70c2
1 parent bbc11fa
commit 04b70c2
Show file tree

Hide file tree

Showing 7 changed files with 322 additions and 384 deletions.
diff --git a/include/glaze/core/common.hpp b/include/glaze/core/common.hpp
@@ -600,7 +600,7 @@ struct glz::meta<glz::error_code>
                                     "invalid_distribution_elements",
                                     "hostname_failure",
                                     "includer_error"};
-   static constexpr auto value = std::array{none, //
+   static constexpr std::array value{none, //
                                             version_mismatch, //
                                             invalid_header, //
                                             invalid_query, //

diff --git a/include/glaze/core/opts.hpp b/include/glaze/core/opts.hpp
@@ -80,11 +80,6 @@ namespace glz
 
       bool_t bools_as_numbers = false; // Read and write booleans with 1's and 0's
 
-      bool_t escaped_unicode_key_conversion =
-         false; // JSON does not require escaped unicode keys to match with unescaped UTF-8
-      // This enables automatic escaped unicode unescaping and matching for keys in glz::object, but it comes at a
-      // performance cost.
-
       bool_t quoted_num = false; // treat numbers as quoted or array-like types as having quoted numbers
       bool_t number = false; // read numbers as strings and write these string as numbers
       bool_t raw = false; // write out string like values without quotes

diff --git a/include/glaze/json.hpp b/include/glaze/json.hpp
@@ -3,6 +3,7 @@
 
 #pragma once
 
+#include "glaze/json/escape_unicode.hpp"
 #include "glaze/json/invoke.hpp"
 #include "glaze/json/json_concepts.hpp"
 #include "glaze/json/json_ptr.hpp"

diff --git a/include/glaze/json/escape_unicode.hpp b/include/glaze/json/escape_unicode.hpp
@@ -0,0 +1,222 @@
+// Glaze Library
+// For the license information refer to glaze.hpp
+
+#pragma once
+
+#include <array>
+#include <cstddef>
+#include <cstdint>
+#include <string>
+#include <string_view>
+
+#include "glaze/util/string_literal.hpp"
+
+// JSON does not require escaped unicode keys to match unescaped UTF-8 keys.
+// To match against escaped unicode, register your fields with the escaped
+// unicode value.
+// glz::escape_unicode<"😀"> generates a compile-time escaped unicode version
+// of your key; for this example the result is the twelve characters
+// "\uD83D\uDE00" (a UTF-16 surrogate pair).
+
+namespace glz::detail
+{
+   // Appends `code_unit` to `output` as a six-character JSON escape: a backslash, the
+   // letter 'u', and four uppercase hexadecimal digits, most significant nibble first.
+   inline constexpr void append_unicode_escape(std::string& output, uint16_t code_unit) {
+      constexpr char hex_digits[] = "0123456789ABCDEF";
+      output += "\\u";
+      for (int nibble = 3; nibble >= 0; --nibble) {
+         output += hex_digits[(code_unit >> (nibble * 4)) & 0xF];
+      }
+   }
+
+   // Decodes the rest of a multibyte UTF-8 sequence on behalf of escaped_length and
+   // escape_json_string, so both always agree on how many characters a sequence needs.
+   //
+   // lead:  the first byte of the sequence (>= 0x80), already consumed by the caller.
+   // input: the text being scanned.
+   // i:     index of the byte after `lead`; advanced past each continuation byte accepted.
+   //
+   // Returns the decoded code point, or U+FFFD (REPLACEMENT CHARACTER) when the sequence is
+   // not well-formed UTF-8: an invalid lead byte, a missing or malformed continuation byte,
+   // an overlong encoding, an encoded surrogate (U+D800..U+DFFF), or a value above U+10FFFF.
+   // A byte that fails the continuation check is not consumed, so the caller examines it
+   // again as the start of the next character.
+   inline constexpr uint32_t decode_utf8_multibyte(const unsigned char lead, const std::string_view input,
+                                                   size_t& i) {
+      constexpr uint32_t replacement = 0xFFFD;
+
+      uint32_t codepoint = 0;
+      uint32_t smallest_valid = 0; // smallest code point that genuinely needs this many bytes
+      int continuation_bytes = 0;
+
+      if ((lead & 0xE0) == 0xC0) {
+         // 2-byte sequence
+         codepoint = static_cast<uint32_t>(lead & 0x1F);
+         smallest_valid = 0x80;
+         continuation_bytes = 1;
+      } else if ((lead & 0xF0) == 0xE0) {
+         // 3-byte sequence
+         codepoint = static_cast<uint32_t>(lead & 0x0F);
+         smallest_valid = 0x800;
+         continuation_bytes = 2;
+      } else if ((lead & 0xF8) == 0xF0) {
+         // 4-byte sequence
+         codepoint = static_cast<uint32_t>(lead & 0x07);
+         smallest_valid = 0x10000;
+         continuation_bytes = 3;
+      } else {
+         // Stray continuation byte or a byte that can never start a sequence
+         return replacement;
+      }
+
+      for (int j = 0; j < continuation_bytes; ++j) {
+         if (i == input.size()) {
+            return replacement; // input ends in the middle of the sequence
+         }
+         const auto next = static_cast<unsigned char>(input[i]);
+         if ((next & 0xC0) != 0x80) {
+            return replacement; // not a continuation byte; leave it for the caller
+         }
+         codepoint = (codepoint << 6) | static_cast<uint32_t>(next & 0x3F);
+         ++i;
+      }
+
+      const bool overlong = codepoint < smallest_valid;
+      const bool surrogate = codepoint >= 0xD800 && codepoint <= 0xDFFF;
+      const bool out_of_range = codepoint > 0x10FFFF;
+      // Without these checks an out-of-range value would be split into a bogus surrogate
+      // pair and an encoded surrogate would be emitted as a lone \uD8xx escape.
+      if (overlong || surrogate || out_of_range) {
+         return replacement;
+      }
+      return codepoint;
+   }
+
+   // Returns the number of characters escape_json_string produces for `input`:
+   //   - 2 for the short escapes (quote, backslash, \b, \f, \n, \r, \t)
+   //   - 6 for other control characters (<= 0x1F) and for every non-ASCII code point in
+   //     the Basic Multilingual Plane (a single \uXXXX escape)
+   //   - 12 for code points above U+FFFF (a \uXXXX\uXXXX surrogate pair)
+   //   - 1 for every other ASCII character
+   // Ill-formed UTF-8 is counted as U+FFFD, i.e. 6 characters.
+   inline constexpr size_t escaped_length(const std::string_view input) {
+      size_t length = 0;
+      size_t i = 0;
+      const size_t len = input.size();
+
+      while (i < len) {
+         const auto c = static_cast<unsigned char>(input[i++]);
+
+         if (c <= 0x7F) {
+            // ASCII character
+            switch (c) {
+               case '\"': case '\\': case '\b': case '\f': case '\n': case '\r': case '\t':
+                  length += 2; // Escaped as two characters
+                  break;
+               default:
+                  length += (c <= 0x1F) ? 6 : 1; // \u00XX for control characters, else verbatim
+                  break;
+            }
+         } else {
+            // Multibyte UTF-8 character
+            const uint32_t codepoint = decode_utf8_multibyte(c, input, i);
+            length += (codepoint <= 0xFFFF) ? 6 : 12; // \uXXXX or a surrogate pair
+         }
+      }
+
+      return length;
+   }
+
+   // Escapes `input` (UTF-8) into the body of a JSON string containing only ASCII.
+   //
+   // Quote, backslash and the common control characters use their two-character escapes,
+   // remaining control characters and all non-ASCII code points become \uXXXX escapes with
+   // uppercase hex digits, and code points above U+FFFF become a UTF-16 surrogate pair.
+   // Ill-formed UTF-8 is replaced with \uFFFD. No surrounding quotes are added.
+   //
+   // output_length: capacity to reserve for the result; it does not limit or pad the output.
+   //                Pass escaped_length(input) to reserve exactly the space needed.
+   inline constexpr std::string escape_json_string(const std::string_view input, const size_t output_length) {
+
+      std::string output;
+      output.reserve(output_length);
+
+      size_t i = 0;
+      const size_t len = input.size();
+
+      while (i < len) {
+         const auto c = static_cast<unsigned char>(input[i++]);
+
+         if (c <= 0x7F) {
+            // ASCII character
+            switch (c) {
+               case '\"': output += "\\\""; break;
+               case '\\': output += "\\\\"; break;
+               case '\b': output += "\\b";  break;
+               case '\f': output += "\\f";  break;
+               case '\n': output += "\\n";  break;
+               case '\r': output += "\\r";  break;
+               case '\t': output += "\\t";  break;
+               default:
+                  if (c <= 0x1F) {
+                     // Control character, escaped as \u00XX
+                     append_unicode_escape(output, c);
+                  } else {
+                     output += static_cast<char>(c);
+                  }
+                  break;
+            }
+         } else {
+            // Multibyte UTF-8 character
+            const uint32_t codepoint = decode_utf8_multibyte(c, input, i);
+
+            if (codepoint <= 0xFFFF) {
+               // BMP character
+               append_unicode_escape(output, static_cast<uint16_t>(codepoint));
+            } else {
+               // Supplementary character (needs surrogate pair)
+               const uint32_t offset = codepoint - 0x10000;
+               append_unicode_escape(output, static_cast<uint16_t>(0xD800 + (offset >> 10)));
+               append_unicode_escape(output, static_cast<uint16_t>(0xDC00 + (offset & 0x3FF)));
+            }
+         }
+      }
+
+      return output;
+   }
+}
+
+namespace glz
+{
+   // Compile-time JSON-escaped form of the string literal `Str`, exposed as a
+   // std::string_view. The view covers the escaped characters only; the backing array
+   // holds one additional trailing '\0' that is excluded from the view.
+   template <string_literal Str>
+   inline constexpr auto escape_unicode = []() constexpr -> std::string_view {
+      // Build the escaped text inside a fixed-size array so it can be a constant expression.
+      constexpr auto storage = []() constexpr {
+         constexpr auto escaped_size = detail::escaped_length(Str.sv());
+         const auto text = detail::escape_json_string(Str.sv(), escaped_size);
+         std::array<char, escaped_size + 1> buffer{}; // + 1 for null character
+         for (size_t pos = 0; pos < escaped_size; ++pos) {
+            buffer[pos] = text[pos];
+         }
+         return buffer;
+      }();
+
+      // make_static here required for GCC 12, in the future just make storage static
+      auto& stored = detail::make_static<storage>::value;
+      return std::string_view{stored.data(), stored.size() - 1};
+   }();
+}