diff --git a/src/bindings/python/src/pyopenvino/core/common.cpp b/src/bindings/python/src/pyopenvino/core/common.cpp index 10ae0ed0ea6042..91699776ea0e27 100644 --- a/src/bindings/python/src/pyopenvino/core/common.cpp +++ b/src/bindings/python/src/pyopenvino/core/common.cpp @@ -118,11 +118,25 @@ const TensorIndexMap cast_to_tensor_index_map(const py::dict& inputs) { namespace string_helpers { +namespace { +auto find_last_not_null(const char* str, size_t length) { + return std::find_if(std::make_reverse_iterator(str + length), + std::make_reverse_iterator(str), + [](const auto& c) { + return c != '\0'; + }) + .base(); +} +} // namespace + py::array bytes_array_from_tensor(ov::Tensor&& t) { if (t.get_element_type() != ov::element::string) { OPENVINO_THROW("Tensor's type must be a string!"); } auto data = t.data(); + + // numpy array stores all bytes of strings but when encode it remove trailing null characters + // find max stride as max length of string but without trailing null characters auto max_element = std::max_element(data, data + t.get_size(), [](const std::string& x, const std::string& y) { return x.length() < y.length(); }); @@ -130,9 +144,9 @@ py::array bytes_array_from_tensor(ov::Tensor&& t) { auto dtype = py::dtype("|S" + std::to_string(max_stride)); // Adjusting strides to follow the numpy convention: py::array array; - auto new_strides = t.get_strides(); - if (new_strides.size() == 0) { - array = py::array(dtype, t.get_shape(), {}); + + if (auto new_strides = t.get_strides(); new_strides.empty()) { + array = py::array(dtype, t.get_shape(), new_strides); } else { auto element_stride = new_strides[new_strides.size() - 1]; for (size_t i = 0; i < new_strides.size(); ++i) { @@ -141,12 +155,11 @@ py::array bytes_array_from_tensor(ov::Tensor&& t) { array = py::array(dtype, t.get_shape(), new_strides); } // Create an empty array and populate it with utf-8 encoded strings: - auto ptr = array.data(); + auto ptr = reinterpret_cast(array.mutable_data()); for (size_t i = 0; i < t.get_size(); ++i) { - auto start = &data[i][0]; - auto length = data[i].length(); - auto end = std::copy(start, start + length, (char*)ptr + i * max_stride); - std::fill_n(end, max_stride - length, 0); + const auto length = data[i].length(); + ptr = std::copy_n(data[i].begin(), length, ptr); + ptr = std::fill_n(ptr, max_stride - length, '\0'); } return array; } @@ -185,18 +198,20 @@ void fill_tensor_from_strings(ov::Tensor& tensor, py::array& array) { if (tensor.get_size() != static_cast(array.size())) { OPENVINO_THROW("Passed array must have the same size (number of elements) as the Tensor!"); } - py::buffer_info buf = array.request(); + + const auto buf = array.request(); auto data = tensor.data(); - for (size_t i = 0; i < tensor.get_size(); ++i) { - char* ptr = reinterpret_cast(buf.ptr) + (i * buf.itemsize); + + for (auto a_first = reinterpret_cast(buf.ptr), a_last = a_first + array.nbytes(); a_first < a_last; + a_first += array.itemsize(), ++data) { // TODO: check other unicode kinds? 2BYTE and 1BYTE? - PyObject* _unicode_obj = - PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, reinterpret_cast(ptr), buf.itemsize / 4); - PyObject* _utf8_obj = PyUnicode_AsUTF8String(_unicode_obj); - const char* _tmp_str = PyBytes_AsString(_utf8_obj); - data[i] = std::string(_tmp_str); + auto _unicode_obj = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, a_first, array.itemsize() / 4); + + Py_ssize_t utf8_size = 0; + const auto utf8_str = PyUnicode_AsUTF8AndSize(_unicode_obj, &utf8_size); + + *data = std::string(utf8_str, find_last_not_null(utf8_str, utf8_size)); Py_XDECREF(_unicode_obj); - Py_XDECREF(_utf8_obj); } } diff --git a/src/bindings/python/tests/test_runtime/test_tensor_string.py b/src/bindings/python/tests/test_runtime/test_tensor_string.py index ff9b40504c7064..8d44ce4d34f2a3 100644 --- a/src/bindings/python/tests/test_runtime/test_tensor_string.py +++ b/src/bindings/python/tests/test_runtime/test_tensor_string.py @@ -74,9 +74,15 @@ def test_empty_string_tensor(init_type): (["text", "abc", "openvino"]), (["text", "больше текста", "jeszcze więcej słów", "효과가 있었어"]), ([["text"], ["abc"], ["openvino"]]), - ([["jeszcze więcej słów", "효과가 있었어"]]), - ], -) + ([["text"]]), + (["tex\u0000t\u0000tt"]), + ([["abĆ"]]), + ([["tex\u0000tttt"], ["abĆ"]]), + ([["jeszcze więcej słówe"], [u"효#과가 있었어"]]), + ([["jeszcze\u0000 więcej słówekó"]]), + ([["효과가 있었어"]]), + (["ab\u0000Ć"]), + ]) def test_init_with_list(string_data): tensor = ov.Tensor(string_data) assert tensor.element_type == ov.Type.string @@ -89,6 +95,25 @@ def test_init_with_list(string_data): check_string_based(tensor, _string_data) +def test_init_with_list_rare_real_scenario(): + input_data = ["tex\u0000\u0000ttt\u0000\u0000", "ab\u0000Ć"] + tensor = ov.Tensor(input_data) + assert tensor.element_type == ov.Type.string + # Convert to numpy to perform all checks. Memory is not shared, + np_string_data = np.array(input_data) + # Encoded: + check_bytes_based(tensor, np_string_data) + # Decoded: + str_tensor_data = tensor.str_data + assert str_tensor_data.shape == np_string_data.shape + # case when OV is not aligned with numpy format + # strides are different as trailing null characters are not stored in the tensor + # is rare to have any use of trailing null character in the string + assert str_tensor_data.strides != np_string_data.strides + assert np.array_equal(str_tensor_data, np_string_data) + assert not (np.shares_memory(str_tensor_data, np_string_data)) + + @pytest.mark.parametrize( ("string_data"), [