Merge branch 'topic/bbannier/issue-1788'

zeek · Jan 6, 2025 · 77649cd · 77649cd
2 parents d300532 + 9a773ce
commit 77649cd
Show file tree

Hide file tree

Showing 37 changed files with 862 additions and 361 deletions.
diff --git a/.gitmodules b/.gitmodules
@@ -37,3 +37,6 @@
 [submodule "3rdparty/any"]
 	path = 3rdparty/any
 	url = https://github.com/thelink2012/any.git
+[submodule "3rdparty/utfcpp"]
+	path = 3rdparty/utfcpp
+	url = https://github.com/nemtrif/utfcpp.git
diff --git a/3rdparty/LICENSE.3rdparty b/3rdparty/LICENSE.3rdparty
@@ -352,6 +352,34 @@ permanent authorization for you to choose that version for the
 Library.
 
 
+================================================================================
+utfcpp
+================================================================================
+
+Boost Software License - Version 1.0 - August 17th, 2003
+
+Permission is hereby granted, free of charge, to any person or organization
+obtaining a copy of the software and accompanying documentation covered by
+this license (the "Software") to use, reproduce, display, distribute,
+execute, and transmit the Software, and to prepare derivative works of the
+Software, and to permit third-parties to whom the Software is furnished to
+do so, all subject to the following:
+
+The copyright notices in the Software and this entire statement, including
+the above license grant, this restriction and the following disclaimer,
+must be included in all copies of the Software, in whole or in part, and
+all derivative works of the Software, unless such copies or derivative
+works are solely in the form of machine-executable object code generated by
+a source language processor.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
+SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
+FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
+ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.
+
 ================================================================================
 utf8proc
 ================================================================================

diff --git a/3rdparty/utfcpp b/3rdparty/utfcpp
diff --git a/CHANGES b/CHANGES
@@ -1,3 +1,53 @@
+1.13.0-dev.14 | 2025-01-06 16:29:53 +0100
+
+  * Do not require owning strings for `startsWith`. (Benjamin Bannier, Corelight)
+
+    By passing in a `const std::string&` we would have incurred construction
+    costs of an owning string when all we needed was a non-owning view.
+
+    We also clean up the implementation of `startsWith` to use standard
+    library functionality.
+
+  * Move implementation of `Bytes::[lower|upper]` to cc file. (Benjamin Bannier, Corelight)
+
+  * GH-1788: Implement decoding and encoding UTF16 `Bytes`. (Benjamin Bannier, Corelight)
+
+    This adds two new charsets `UTF16LE` and `UTF16BE` for little and big
+    endian UTF16 respectively.
+
+    We also clean up use of the Unicode replacement character to make it
+    work consistently between UTF16 and UTF8.
+
+    Closes #1788.
+
+  * Add utfcpp submodule. (Benjamin Bannier, Corelight)
+
+  * Remove unneeded explicit conversion to HILTI enum in test. (Benjamin Bannier, Corelight)
+
+  * Add optional `errors` parameter to `strings::encode`. (Benjamin Bannier, Corelight)
+
+    This parameter defaults to `DecodeErrorStrategy::REPLACE` like the
+    previous implicit parameter used in the implementation.
+
+  * Move `Bytes` encoding into `string`. (Benjamin Bannier, Corelight)
+
+    This allows a cleaner separation between `Bytes` as "bags of bytes" and
+    `string` as "valid UTF8". Having such a clean separation will make
+    adding support for more encodings less duplicative.
+
+  * Use unified `DecodeErrorStrategy` for both `bytes` and `string`. (Benjamin Bannier, Corelight)
+
+    This was already exposed as a single type in Spicy and HILTI anyway. We
+    also move `Charset` into the `unicode` namespace.
+
+  * Update for spicy-1.12.0 release. (Benjamin Bannier, Corelight)
+
+  * Keep test `spicy.rt.time` working for the next 200 years. (Benjamin Bannier, Corelight)
+
+    This test still compares the current time against hardcoded times to
+    check ordering. Extend the upper value so this test passes for the next
+    ~200 years.
+
 1.13.0-dev.2 | 2024-12-19 17:08:09 +0100
 
   * GH-1949: Fix codegen for string literals containing null bytes. (Benjamin Bannier, Corelight)

diff --git a/VERSION b/VERSION
@@ -1 +1 @@
-1.13.0-dev.2
+1.13.0-dev.14
diff --git a/doc/autogen/spicy-types.spicy b/doc/autogen/spicy-types.spicy
@@ -54,8 +54,10 @@ Specifies the character set for bytes encoding/decoding.
 .. spicy-code::
 
     type Charset = enum {
-        ASCII,
-        UTF8
+        ASCII,    # ASCII encoding
+        UTF8,     # UTF8 encoding
+        UTF16LE,  # UTF16 little endian encoding
+        UTF16BE,  # UTF16 big endian encoding
     };
 
 .. _spicy_decodeerrorstrategy:
@@ -67,9 +69,9 @@ Specifies how data is handled that's not representable in a specified character
 .. spicy-code::
 
     type DecodeErrorStrategy = enum {
-        IGNORE,  # data is skipped but processing continues
-        REPLACE, # data is replaced with a valid place-holder and processing continues
-        STRICT   # runtime error is triggered
+        IGNORE,   # data is skipped but processing continues
+        REPLACE,  # data is replaced with a valid place-holder and processing continues
+        STRICT    # runtime error is triggered
     };
 
 .. _spicy_matchstate:

diff --git a/doc/autogen/types/string.rst b/doc/autogen/types/string.rst
@@ -1,6 +1,6 @@
 .. rubric:: Methods
 
-.. spicy:method:: string::encode string encode False bytes ([ charset: spicy::Charset = spicy::Charset::UTF8 ])
+.. spicy:method:: string::encode string encode False bytes ([ charset: spicy::Charset = spicy::Charset::UTF8 ], [ errors: spicy::DecodeErrorStrategy = spicy::DecodeErrorStrategy::REPLACE ])
 
     Converts the string into a binary representation encoded with the
     given character set.

diff --git a/hilti/lib/hilti.hlt b/hilti/lib/hilti.hlt
@@ -11,8 +11,8 @@ public type Side = enum { Left, Right, Both } &cxxname="hilti::rt::bytes::Side";
 public type AddressFamily = enum { IPv4, IPv6 } &cxxname="hilti::rt::AddressFamily";
 public type RealType = enum { IEEE754_Single, IEEE754_Double } &cxxname="hilti::rt::real::Type";
 public type Protocol = enum { TCP, UDP, ICMP } &cxxname="hilti::rt::Protocol";
-public type Charset = enum { ASCII, UTF8 } &cxxname="hilti::rt::bytes::Charset";
-public type DecodeErrorStrategy = enum { IGNORE, REPLACE, STRICT } &cxxname="hilti::rt::bytes::DecodeErrorStrategy";
+public type Charset = enum { ASCII, UTF8, UTF16LE, UTF16BE } &cxxname="hilti::rt::unicode::Charset";
+public type DecodeErrorStrategy = enum { IGNORE, REPLACE, STRICT } &cxxname="hilti::rt::unicode::DecodeErrorStrategy";
 public type Captures = vector<bytes>;
 public type Profiler = __library_type("hilti::rt::Profiler");
 

diff --git a/hilti/runtime/CMakeLists.txt b/hilti/runtime/CMakeLists.txt
@@ -46,6 +46,7 @@ set(SOURCES
     src/types/stream.cc
     src/types/string.cc
     src/types/time.cc
+    src/unicode.cc
     src/util.cc
     src/version.cc
     ${PROJECT_SOURCE_DIR}/3rdparty/utf8proc/utf8proc.c)
@@ -67,8 +68,10 @@ foreach (lib hilti-rt hilti-rt-debug)
 
     add_dependencies(${lib}-objects fiber)
     target_include_directories(
-        ${lib}-objects PRIVATE ${PROJECT_SOURCE_DIR}/3rdparty/fiber/include
-                               ${PROJECT_SOURCE_DIR}/3rdparty/fiber/deps/cxx-header-utils/include)
+        ${lib}-objects
+        PRIVATE ${PROJECT_SOURCE_DIR}/3rdparty/fiber/include
+                ${PROJECT_SOURCE_DIR}/3rdparty/fiber/deps/cxx-header-utils/include
+                ${PROJECT_SOURCE_DIR}/3rdparty/utfcpp/source)
 
     add_library(${lib} STATIC)
     target_link_libraries(${lib} ${lib}-objects)

diff --git a/hilti/runtime/include/types/bytes.h b/hilti/runtime/include/types/bytes.h
@@ -15,9 +15,9 @@
 #include <hilti/rt/json-fwd.h>
 #include <hilti/rt/result.h>
 #include <hilti/rt/safe-int.h>
-#include <hilti/rt/types/string.h>
 #include <hilti/rt/types/time.h>
 #include <hilti/rt/types/vector.h>
+#include <hilti/rt/unicode.h>
 #include <hilti/rt/util.h>
 
 namespace hilti::rt {
@@ -38,12 +38,6 @@ HILTI_RT_ENUM_WITH_DEFAULT(Side, Left,
                            Both   // left and right side
 );
 
-/** For bytes decoding, which character set to use. */
-HILTI_RT_ENUM(Charset, Undef, UTF8, ASCII);
-
-/** For bytes decoding, how to handle decoding errors. */
-using DecodeErrorStrategy = string::DecodeErrorStrategy;
-
 /**
  * Safe bytes iterator traversing the content of an instance.
  *
@@ -257,17 +251,10 @@ class Bytes : protected std::string {
     using C = std::shared_ptr<const Base*>;
 
     /**
-     * Creates a bytes instance from a raw string representation
-     * encoded in a specified character set.
-     *
-     * @param s raw data
-     * @param cs character set the raw data is assumed to be encoded in
-     * @param errors how to handle errors when decoding the data
-     * @return bytes instances encoding *s* in character set *cs*
+     * Creates a bytes instance from a raw string representation.
      */
-    Bytes(std::string s, bytes::Charset cs, bytes::DecodeErrorStrategy errors = bytes::DecodeErrorStrategy::REPLACE);
+    Bytes(Base s) : Base(std::move(s)) {}
 
-    Bytes(Base&& str) : Base(std::move(str)) {}
     Bytes(const Bytes& xs) : Base(xs) {}
     Bytes(Bytes&& xs) noexcept : Base(std::move(xs)) {}
 
@@ -442,8 +429,8 @@ class Bytes : protected std::string {
      * @param errors how to handle errors when decoding the data
      * @return UTF8 string
      */
-    std::string decode(bytes::Charset cs,
-                       bytes::DecodeErrorStrategy errors = bytes::DecodeErrorStrategy::REPLACE) const;
+    std::string decode(unicode::Charset cs,
+                       unicode::DecodeErrorStrategy errors = unicode::DecodeErrorStrategy::REPLACE) const;
 
     /** Returns true if the data begins with a given, other bytes instance. */
     bool startsWith(const Bytes& b) const { return hilti::rt::startsWith(*this, b); }
@@ -457,9 +444,7 @@ class Bytes : protected std::string {
      * @param errors how to handle errors when decoding/encoding the data
      * @return an upper case version of the instance
      */
-    Bytes upper(bytes::Charset cs, bytes::DecodeErrorStrategy errors = bytes::DecodeErrorStrategy::REPLACE) const {
-        return Bytes(hilti::rt::string::upper(decode(cs, errors), errors), cs, errors);
-    }
+    Bytes upper(unicode::Charset cs, unicode::DecodeErrorStrategy errors = unicode::DecodeErrorStrategy::REPLACE) const;
 
     /**
      * Returns a lower-case version of the instance.
@@ -468,9 +453,7 @@ class Bytes : protected std::string {
      * @param errors how to handle errors when decoding/encoding the data
      * @return a lower case version of the instance
      */
-    Bytes lower(bytes::Charset cs, bytes::DecodeErrorStrategy errors = bytes::DecodeErrorStrategy::REPLACE) const {
-        return Bytes(hilti::rt::string::lower(decode(cs, errors), errors), cs, errors);
-    }
+    Bytes lower(unicode::Charset cs, unicode::DecodeErrorStrategy errors = unicode::DecodeErrorStrategy::REPLACE) const;
 
     /**
      * Removes leading and/or trailing sequences of all characters of a set
@@ -685,8 +668,6 @@ inline std::string detail::to_string_for_print<Bytes>(const Bytes& x) {
 namespace detail::adl {
 std::string to_string(const Bytes& x, adl::tag /*unused*/);
 std::string to_string(const bytes::Side& x, adl::tag /*unused*/);
-std::string to_string(const bytes::Charset& x, adl::tag /*unused*/);
-std::string to_string(const bytes::DecodeErrorStrategy& x, adl::tag /*unused*/);
 } // namespace detail::adl
 
 } // namespace hilti::rt

diff --git a/hilti/runtime/include/types/string.h b/hilti/runtime/include/types/string.h
@@ -9,18 +9,14 @@
 #include <hilti/rt/extension-points.h>
 #include <hilti/rt/safe-int.h>
 #include <hilti/rt/types/vector.h>
+#include <hilti/rt/unicode.h>
 #include <hilti/rt/util.h>
 
 namespace hilti::rt {
 
-namespace string {
+class Bytes;
 
-/* When processing UTF8, how to handle invalid data not representing UTF8 codepoints. */
-HILTI_RT_ENUM_WITH_DEFAULT(DecodeErrorStrategy, IGNORE,
-                           IGNORE,  // skip data
-                           REPLACE, // replace with a place-holder
-                           STRICT   // throw a runtime error
-);
+namespace string {
 
 /**
  * Computes the length of a UTF8 string in number of codepoints.
@@ -30,7 +26,8 @@ HILTI_RT_ENUM_WITH_DEFAULT(DecodeErrorStrategy, IGNORE,
  * @return the length of the input string
  * @throws RuntimeError if the input is not a valid UTF8 string
  */
-integer::safe<uint64_t> size(const std::string& s, DecodeErrorStrategy errors = DecodeErrorStrategy::REPLACE);
+integer::safe<uint64_t> size(const std::string& s,
+                             unicode::DecodeErrorStrategy errors = unicode::DecodeErrorStrategy::REPLACE);
 
 /**
  * Computes a lower-case version of a UTF8 string.
@@ -40,7 +37,7 @@ integer::safe<uint64_t> size(const std::string& s, DecodeErrorStrategy errors =
  * @return a lower-case version of the input string
  * @throws RuntimeError if the input is not a valid UTF8 string
  */
-std::string lower(std::string_view s, DecodeErrorStrategy errors = DecodeErrorStrategy::REPLACE);
+std::string lower(std::string_view s, unicode::DecodeErrorStrategy errors = unicode::DecodeErrorStrategy::REPLACE);
 
 /**
  * Computes an upper-case version of a UTF8 string.
@@ -50,7 +47,7 @@ std::string lower(std::string_view s, DecodeErrorStrategy errors = DecodeErrorSt
  * @return a upper-case version of the input string
  * @throws RuntimeError if the input is not a valid UTF8 string
  */
-std::string upper(std::string_view s, DecodeErrorStrategy errors = DecodeErrorStrategy::REPLACE);
+std::string upper(std::string_view s, unicode::DecodeErrorStrategy errors = unicode::DecodeErrorStrategy::REPLACE);
 
 /**
  * Splits the string at sequences of whitespace.
@@ -88,6 +85,18 @@ std::tuple<std::string, std::string> split1(const std::string& s);
  */
 std::tuple<std::string, std::string> split1(const std::string& s, const std::string& sep);
 
+/**
+ * Creates a bytes instance from a raw string representation
+ * encoded in a specified character set.
+ *
+ * @param s raw data
+ * @param cs character set to encode the string into
+ * @param errors how to handle errors when encoding the data
+ * @return bytes instance encoding *s* in character set *cs*
+ */
+rt::Bytes encode(std::string s, unicode::Charset cs,
+                 unicode::DecodeErrorStrategy errors = unicode::DecodeErrorStrategy::REPLACE);
+
 } // namespace string
 
 namespace detail::adl {

diff --git a/hilti/runtime/include/unicode.h b/hilti/runtime/include/unicode.h
@@ -0,0 +1,33 @@
+// Copyright (c) 2020-2023 by the Zeek Project. See LICENSE for details.
+
+#pragma once
+
+#include <cstdint>
+
+#include <hilti/rt/extension-points.h>
+#include <hilti/rt/util.h>
+
+namespace hilti::rt {
+
+namespace unicode {
+
+/* When processing Unicode, how to handle invalid data not representing Unicode codepoints. */
+HILTI_RT_ENUM_WITH_DEFAULT(DecodeErrorStrategy, IGNORE,
+                           IGNORE,  // skip data
+                           REPLACE, // replace with a place-holder
+                           STRICT   // throw a runtime error
+);
+
+/** For bytes decoding and string encoding, which character set to use. */
+HILTI_RT_ENUM(Charset, Undef, UTF8, UTF16LE, UTF16BE, ASCII);
+
+constexpr uint32_t REPLACEMENT_CHARACTER = 0x0000FFFD;
+
+} // namespace unicode
+
+namespace detail::adl {
+std::string to_string(const unicode::DecodeErrorStrategy& x, adl::tag /*unused*/);
+std::string to_string(const unicode::Charset& x, adl::tag /*unused*/);
+} // namespace detail::adl
+
+} // namespace hilti::rt
diff --git a/hilti/runtime/include/util.h b/hilti/runtime/include/util.h
@@ -240,7 +240,7 @@ std::string replace(std::string s, std::string_view o, std::string_view n);
  *
  * \note This function is not UTF8-aware.
  */
-bool startsWith(const std::string& s, const std::string& prefix);
+bool startsWith(std::string_view s, std::string_view prefix);
 
 /**
  * Python-style enumerate() that returns an iterable yielding pairs `(index,