Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[PyAPI] Fix string truncation when special \u0000 is inside string #28373

Open
wants to merge 7 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 32 additions & 17 deletions src/bindings/python/src/pyopenvino/core/common.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -118,21 +118,35 @@ const TensorIndexMap cast_to_tensor_index_map(const py::dict& inputs) {

namespace string_helpers {

namespace {
auto find_last_not_null(const char* str, size_t length) {
return std::find_if(std::make_reverse_iterator(str + length),
std::make_reverse_iterator(str),
[](const auto& c) {
return c != '\0';
})
.base();
}
} // namespace

py::array bytes_array_from_tensor(ov::Tensor&& t) {
if (t.get_element_type() != ov::element::string) {
OPENVINO_THROW("Tensor's type must be a string!");
}
auto data = t.data<std::string>();

// numpy array stores all bytes of strings but when encode it remove trailing null characters
// find max stride as max length of string but without trailing null characters
auto max_element = std::max_element(data, data + t.get_size(), [](const std::string& x, const std::string& y) {
return x.length() < y.length();
});
auto max_stride = max_element->length();
auto dtype = py::dtype("|S" + std::to_string(max_stride));
// Adjusting strides to follow the numpy convention:
py::array array;
auto new_strides = t.get_strides();
if (new_strides.size() == 0) {
array = py::array(dtype, t.get_shape(), {});

if (auto new_strides = t.get_strides(); new_strides.empty()) {
array = py::array(dtype, t.get_shape(), new_strides);
} else {
auto element_stride = new_strides[new_strides.size() - 1];
for (size_t i = 0; i < new_strides.size(); ++i) {
Expand All @@ -141,12 +155,11 @@ py::array bytes_array_from_tensor(ov::Tensor&& t) {
array = py::array(dtype, t.get_shape(), new_strides);
}
// Create an empty array and populate it with utf-8 encoded strings:
auto ptr = array.data();
auto ptr = reinterpret_cast<char*>(array.mutable_data());
for (size_t i = 0; i < t.get_size(); ++i) {
auto start = &data[i][0];
auto length = data[i].length();
auto end = std::copy(start, start + length, (char*)ptr + i * max_stride);
std::fill_n(end, max_stride - length, 0);
const auto length = data[i].length();
ptr = std::copy_n(data[i].begin(), length, ptr);
ptr = std::fill_n(ptr, max_stride - length, '\0');
}
return array;
}
Expand Down Expand Up @@ -185,18 +198,20 @@ void fill_tensor_from_strings(ov::Tensor& tensor, py::array& array) {
if (tensor.get_size() != static_cast<size_t>(array.size())) {
OPENVINO_THROW("Passed array must have the same size (number of elements) as the Tensor!");
}
py::buffer_info buf = array.request();

const auto buf = array.request();
auto data = tensor.data<std::string>();
for (size_t i = 0; i < tensor.get_size(); ++i) {
char* ptr = reinterpret_cast<char*>(buf.ptr) + (i * buf.itemsize);

for (auto a_first = reinterpret_cast<const uint8_t*>(buf.ptr), a_last = a_first + array.nbytes(); a_first < a_last;
a_first += array.itemsize(), ++data) {
// TODO: check other unicode kinds? 2BYTE and 1BYTE?
PyObject* _unicode_obj =
PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, reinterpret_cast<void*>(ptr), buf.itemsize / 4);
PyObject* _utf8_obj = PyUnicode_AsUTF8String(_unicode_obj);
const char* _tmp_str = PyBytes_AsString(_utf8_obj);
data[i] = std::string(_tmp_str);
auto _unicode_obj = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, a_first, array.itemsize() / 4);

Py_ssize_t utf8_size = 0;
const auto utf8_str = PyUnicode_AsUTF8AndSize(_unicode_obj, &utf8_size);

*data = std::string(utf8_str, find_last_not_null(utf8_str, utf8_size));
Py_XDECREF(_unicode_obj);
Py_XDECREF(_utf8_obj);
}
}

Expand Down
31 changes: 28 additions & 3 deletions src/bindings/python/tests/test_runtime/test_tensor_string.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,9 +74,15 @@ def test_empty_string_tensor(init_type):
(["text", "abc", "openvino"]),
(["text", "больше текста", "jeszcze więcej słów", "효과가 있었어"]),
([["text"], ["abc"], ["openvino"]]),
([["jeszcze więcej słów", "효과가 있었어"]]),
],
)
([["text"]]),
(["tex\u0000t\u0000tt"]),
([["abĆ"]]),
([["tex\u0000tttt"], ["abĆ"]]),
([["jeszcze więcej słówe"], [u"효#과가 있었어"]]),
([["jeszcze\u0000 więcej słówekó"]]),
([["효과가 있었어"]]),
(["ab\u0000Ć"]),
])
def test_init_with_list(string_data):
tensor = ov.Tensor(string_data)
assert tensor.element_type == ov.Type.string
Expand All @@ -89,6 +95,25 @@ def test_init_with_list(string_data):
check_string_based(tensor, _string_data)


def test_init_with_list_rare_real_scenario():
input_data = ["tex\u0000\u0000ttt\u0000\u0000", "ab\u0000Ć"]
tensor = ov.Tensor(input_data)
assert tensor.element_type == ov.Type.string
# Convert to numpy to perform all checks. Memory is not shared,
np_string_data = np.array(input_data)
# Encoded:
check_bytes_based(tensor, np_string_data)
# Decoded:
str_tensor_data = tensor.str_data
assert str_tensor_data.shape == np_string_data.shape
# case when OV is not aligned with numpy format
# strides are different as trailing null characters are not stored in the tensor
# is rare to have any use of trailing null character in the string
assert str_tensor_data.strides != np_string_data.strides
assert np.array_equal(str_tensor_data, np_string_data)
assert not (np.shares_memory(str_tensor_data, np_string_data))


@pytest.mark.parametrize(
("string_data"),
[
Expand Down
Loading