openvinotoolkit · praasz · Jan 10, 2025 · Jan 10, 2025 · Jan 10, 2025 · Jan 10, 2025
@@ -118,21 +118,35 @@ const TensorIndexMap cast_to_tensor_index_map(const py::dict& inputs) {
 
 namespace string_helpers {
 
+namespace {
+auto find_last_not_null(const char* str, size_t length) {
+    return std::find_if(std::make_reverse_iterator(str + length),
+                        std::make_reverse_iterator(str),
+                        [](const auto& c) {
+                            return c != '\0';
+                        })
+        .base();
+}
+}  // namespace
+
 py::array bytes_array_from_tensor(ov::Tensor&& t) {
     if (t.get_element_type() != ov::element::string) {
         OPENVINO_THROW("Tensor's type must be a string!");
     }
     auto data = t.data<std::string>();
+
+    // numpy array stores all bytes of strings but when encode it remove trailing null characters
+    // find max stride as max length of string but without trailing null characters
     auto max_element = std::max_element(data, data + t.get_size(), [](const std::string& x, const std::string& y) {
         return x.length() < y.length();
     });
     auto max_stride = max_element->length();
     auto dtype = py::dtype("|S" + std::to_string(max_stride));
     // Adjusting strides to follow the numpy convention:
     py::array array;
-    auto new_strides = t.get_strides();
-    if (new_strides.size() == 0) {
-        array = py::array(dtype, t.get_shape(), {});
+
+    if (auto new_strides = t.get_strides(); new_strides.empty()) {
+        array = py::array(dtype, t.get_shape(), new_strides);
     } else {
         auto element_stride = new_strides[new_strides.size() - 1];
         for (size_t i = 0; i < new_strides.size(); ++i) {
@@ -141,12 +155,11 @@ py::array bytes_array_from_tensor(ov::Tensor&& t) {
         array = py::array(dtype, t.get_shape(), new_strides);
     }
     // Create an empty array and populate it with utf-8 encoded strings:
-    auto ptr = array.data();
+    auto ptr = reinterpret_cast<char*>(array.mutable_data());
     for (size_t i = 0; i < t.get_size(); ++i) {
-        auto start = &data[i][0];
-        auto length = data[i].length();
-        auto end = std::copy(start, start + length, (char*)ptr + i * max_stride);
-        std::fill_n(end, max_stride - length, 0);
+        const auto length = data[i].length();
+        ptr = std::copy_n(data[i].begin(), length, ptr);
+        ptr = std::fill_n(ptr, max_stride - length, '\0');
     }
     return array;
 }
@@ -185,18 +198,20 @@ void fill_tensor_from_strings(ov::Tensor& tensor, py::array& array) {
     if (tensor.get_size() != static_cast<size_t>(array.size())) {
         OPENVINO_THROW("Passed array must have the same size (number of elements) as the Tensor!");
     }
-    py::buffer_info buf = array.request();
+
+    const auto buf = array.request();
     auto data = tensor.data<std::string>();
-    for (size_t i = 0; i < tensor.get_size(); ++i) {
-        char* ptr = reinterpret_cast<char*>(buf.ptr) + (i * buf.itemsize);
+
+    for (auto a_first = reinterpret_cast<const uint8_t*>(buf.ptr), a_last = a_first + array.nbytes(); a_first < a_last;
+         a_first += array.itemsize(), ++data) {
         // TODO: check other unicode kinds? 2BYTE and 1BYTE?
-        PyObject* _unicode_obj =
-            PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, reinterpret_cast<void*>(ptr), buf.itemsize / 4);
-        PyObject* _utf8_obj = PyUnicode_AsUTF8String(_unicode_obj);
-        const char* _tmp_str = PyBytes_AsString(_utf8_obj);
-        data[i] = std::string(_tmp_str);
+        auto _unicode_obj = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, a_first, array.itemsize() / 4);
+
+        Py_ssize_t utf8_size = 0;
+        const auto utf8_str = PyUnicode_AsUTF8AndSize(_unicode_obj, &utf8_size);
+
+        *data = std::string(utf8_str, find_last_not_null(utf8_str, utf8_size));
         Py_XDECREF(_unicode_obj);
-        Py_XDECREF(_utf8_obj);
     }
 }
 

@@ -74,9 +74,15 @@ def test_empty_string_tensor(init_type):
         (["text", "abc", "openvino"]),
         (["text", "больше текста", "jeszcze więcej słów", "효과가 있었어"]),
         ([["text"], ["abc"], ["openvino"]]),
-        ([["jeszcze więcej słów", "효과가 있었어"]]),
-    ],
-)
+        ([["text"]]),
+        (["tex\u0000t\u0000tt"]),
+        ([["abĆ"]]),
+        ([["tex\u0000tttt"], ["abĆ"]]),
+        ([["jeszcze więcej słówe"], [u"효#과가 있었어"]]),
+        ([["jeszcze\u0000 więcej słówekó"]]),
+        ([["효과가 있었어"]]),
+        (["ab\u0000Ć"]),
+    ])
 def test_init_with_list(string_data):
     tensor = ov.Tensor(string_data)
     assert tensor.element_type == ov.Type.string
@@ -89,6 +95,25 @@ def test_init_with_list(string_data):
     check_string_based(tensor, _string_data)
 
 
+def test_init_with_list_rare_real_scenario():
+    input_data = ["tex\u0000\u0000ttt\u0000\u0000", "ab\u0000Ć"]
+    tensor = ov.Tensor(input_data)
+    assert tensor.element_type == ov.Type.string
+    # Convert to numpy to perform all checks. Memory is not shared,
+    np_string_data = np.array(input_data)
+    # Encoded:
+    check_bytes_based(tensor, np_string_data)
+    # Decoded:
+    str_tensor_data = tensor.str_data
+    assert str_tensor_data.shape == np_string_data.shape
+    # case when OV is not aligned with numpy format
+    # strides are different as trailing null characters are not stored in the tensor
+    # is rare to have any use of trailing null character in the string
+    assert str_tensor_data.strides != np_string_data.strides
+    assert np.array_equal(str_tensor_data, np_string_data)
+    assert not (np.shares_memory(str_tensor_data, np_string_data))
+
+
 @pytest.mark.parametrize(
     ("string_data"),
     [