Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

GH-35289: [Python] Support large variable width types in numpy conversion #36701

Open
wants to merge 7 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
77 changes: 61 additions & 16 deletions python/pyarrow/src/arrow/python/numpy_to_arrow.cc
Original file line number Diff line number Diff line change
Expand Up @@ -224,9 +224,11 @@ class NumPyConverter {

// NumPy ascii string arrays
Status Visit(const BinaryType& type);
Status Visit(const LargeBinaryType& type);

// NumPy unicode arrays
Status Visit(const StringType& type);
Status Visit(const LargeStringType& type);

Status Visit(const StructType& type);

Expand Down Expand Up @@ -284,6 +286,12 @@ class NumPyConverter {
return PushArray(arr_data);
}

template <typename T>
Status VisitBinary(T* builder);

template <typename T>
Status VisitString(T* builder);

Status TypeNotImplemented(std::string type_name) {
return Status::NotImplemented("NumPyConverter doesn't implement <", type_name,
"> conversion. ");
Expand Down Expand Up @@ -553,24 +561,23 @@ inline Status NumPyConverter::ConvertData<Date64Type>(std::shared_ptr<Buffer>* d
// Create 16MB chunks for binary data
constexpr int32_t kBinaryChunksize = 1 << 24;

Status NumPyConverter::Visit(const BinaryType& type) {
::arrow::internal::ChunkedBinaryBuilder builder(kBinaryChunksize, pool_);

template <typename T>
Status NumPyConverter::VisitBinary(T* builder) {
auto data = reinterpret_cast<const uint8_t*>(PyArray_DATA(arr_));

auto AppendNotNull = [&builder, this](const uint8_t* data) {
auto AppendNotNull = [builder, this](const uint8_t* data) {
// This is annoying. NumPy allows strings to have nul-terminators, so
// we must check for them here
const size_t item_size =
strnlen(reinterpret_cast<const char*>(data), static_cast<size_t>(itemsize_));
return builder.Append(data, static_cast<int32_t>(item_size));
return builder->Append(data, static_cast<int32_t>(item_size));
};

if (mask_ != nullptr) {
Ndarray1DIndexer<uint8_t> mask_values(mask_);
for (int64_t i = 0; i < length_; ++i) {
if (mask_values[i]) {
RETURN_NOT_OK(builder.AppendNull());
RETURN_NOT_OK(builder->AppendNull());
} else {
RETURN_NOT_OK(AppendNotNull(data));
}
Expand All @@ -583,6 +590,14 @@ Status NumPyConverter::Visit(const BinaryType& type) {
}
}

return Status::OK();
}

Status NumPyConverter::Visit(const BinaryType& type) {
::arrow::internal::ChunkedBinaryBuilder builder(kBinaryChunksize, pool_);

RETURN_NOT_OK(VisitBinary(&builder));

ArrayVector result;
RETURN_NOT_OK(builder.Finish(&result));
for (auto arr : result) {
Expand All @@ -591,6 +606,16 @@ Status NumPyConverter::Visit(const BinaryType& type) {
return Status::OK();
}

Status NumPyConverter::Visit(const LargeBinaryType& type) {
  // Large binary uses 64-bit offsets, so the entire ndarray fits into a
  // single array and no chunked builder is needed.
  ::arrow::LargeBinaryBuilder builder(pool_);
  RETURN_NOT_OK(VisitBinary(&builder));

  std::shared_ptr<Array> out;
  RETURN_NOT_OK(builder.Finish(&out));
  return PushArray(out->data());
}

Status NumPyConverter::Visit(const FixedSizeBinaryType& type) {
auto byte_width = type.byte_width();

Expand Down Expand Up @@ -630,8 +655,8 @@ namespace {
// NumPy unicode is UCS4/UTF32 always
constexpr int kNumPyUnicodeSize = 4;

Status AppendUTF32(const char* data, int64_t itemsize, int byteorder,
::arrow::internal::ChunkedStringBuilder* builder) {
template <typename T>
Status AppendUTF32(const char* data, int64_t itemsize, int byteorder, T* builder) {
// The binary \x00\x00\x00\x00 indicates a nul terminator in NumPy unicode,
// so we need to detect that here to truncate if necessary. Yep.
Py_ssize_t actual_length = 0;
Expand Down Expand Up @@ -659,11 +684,8 @@ Status AppendUTF32(const char* data, int64_t itemsize, int byteorder,

} // namespace

Status NumPyConverter::Visit(const StringType& type) {
util::InitializeUTF8();

::arrow::internal::ChunkedStringBuilder builder(kBinaryChunksize, pool_);

template <typename T>
Status NumPyConverter::VisitString(T* builder) {
auto data = reinterpret_cast<const uint8_t*>(PyArray_DATA(arr_));

char numpy_byteorder = dtype_->byteorder;
Expand Down Expand Up @@ -707,23 +729,23 @@ Status NumPyConverter::Visit(const StringType& type) {
auto AppendNonNullValue = [&](const uint8_t* data) {
if (is_binary_type) {
if (ARROW_PREDICT_TRUE(util::ValidateUTF8(data, itemsize_))) {
return builder.Append(data, static_cast<int32_t>(itemsize_));
return builder->Append(data, static_cast<int32_t>(itemsize_));
} else {
return Status::Invalid("Encountered non-UTF8 binary value: ",
HexEncode(data, itemsize_));
}
} else {
// is_unicode_type case
return AppendUTF32(reinterpret_cast<const char*>(data), itemsize_, byteorder,
&builder);
builder);
}
};

if (mask_ != nullptr) {
Ndarray1DIndexer<uint8_t> mask_values(mask_);
for (int64_t i = 0; i < length_; ++i) {
if (mask_values[i]) {
RETURN_NOT_OK(builder.AppendNull());
RETURN_NOT_OK(builder->AppendNull());
} else {
RETURN_NOT_OK(AppendNonNullValue(data));
}
Expand All @@ -736,6 +758,16 @@ Status NumPyConverter::Visit(const StringType& type) {
}
}

return Status::OK();
}

Status NumPyConverter::Visit(const StringType& type) {
util::InitializeUTF8();

::arrow::internal::ChunkedStringBuilder builder(kBinaryChunksize, pool_);

RETURN_NOT_OK(VisitString(&builder));

ArrayVector result;
RETURN_NOT_OK(builder.Finish(&result));
for (auto arr : result) {
Expand All @@ -744,6 +776,19 @@ Status NumPyConverter::Visit(const StringType& type) {
return Status::OK();
}

Status NumPyConverter::Visit(const LargeStringType& type) {
  // UTF8 validation of the input happens during conversion, so make sure
  // the validation tables are initialized first.
  util::InitializeUTF8();

  // Large string uses 64-bit offsets, so a single (non-chunked) builder
  // can hold the whole ndarray.
  ::arrow::LargeStringBuilder builder(pool_);

  RETURN_NOT_OK(VisitString(&builder));

  std::shared_ptr<Array> result;
  RETURN_NOT_OK(builder.Finish(&result));
  // Return PushArray's status directly, matching Visit(const LargeBinaryType&).
  return PushArray(result->data());
}

Status NumPyConverter::Visit(const StructType& type) {
std::vector<NumPyConverter> sub_converters;
std::vector<OwnedRefNoGIL> sub_arrays;
Expand Down
81 changes: 47 additions & 34 deletions python/pyarrow/tests/test_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -2355,32 +2355,33 @@ def test_array_from_numpy_timedelta_incorrect_unit():
pa.array(data)


def test_array_from_numpy_ascii():
@pytest.mark.parametrize('binary_type', [pa.binary(), pa.large_binary()])
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK, one last comment (promised! ;)): I think we want to keep testing the below also in the case of not specifying the type (in which case we infer the small types), and I am not entirely sure this is explicitly covered elsewhere (implicitly for sure). But so could parametrize this slightly differently with:

Suggested change
@pytest.mark.parametrize('binary_type', [pa.binary(), pa.large_binary()])
@pytest.mark.parametrize('typ, expected_type', [(None, pa.binary()), (pa.binary(), pa.binary()), (pa.large_binary(), pa.large_binary())])

And then the same for string test below.

Or, maybe simpler, just duplicate the first case to have a version without a type specified:

    # without specified type, always binary
    arrow_arr = pa.array(arr)
    assert arrow_arr.type == 'binary'
    expected = ..

    arrow_arr = pa.array(arr, binary_type)
    assert arrow_arr.type == binary_type
    expected = ..

(I assume that for the inference it shouldn't matter if there are strides or not)

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, I thought about that while making the update — I'll add the type-inference case back.

Copy link
Author

@Kimahriman Kimahriman May 23, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added a third parameter and just set the expected type in the func

@pytest.mark.parametrize('string_type', [None, pa.utf8(), pa.large_utf8()])
def test_array_from_numpy_unicode(string_type):
    # Default when no type is specified should be utf8
    expected_type = string_type or pa.utf8()

def test_array_from_numpy_ascii(binary_type):
arr = np.array(['abcde', 'abc', ''], dtype='|S5')

arrow_arr = pa.array(arr)
assert arrow_arr.type == 'binary'
expected = pa.array(['abcde', 'abc', ''], type='binary')
arrow_arr = pa.array(arr, binary_type)
assert arrow_arr.type == binary_type
expected = pa.array(['abcde', 'abc', ''], type=binary_type)
assert arrow_arr.equals(expected)

mask = np.array([False, True, False])
arrow_arr = pa.array(arr, mask=mask)
expected = pa.array(['abcde', None, ''], type='binary')
arrow_arr = pa.array(arr, binary_type, mask=mask)
expected = pa.array(['abcde', None, ''], type=binary_type)
assert arrow_arr.equals(expected)

# Strided variant
arr = np.array(['abcde', 'abc', ''] * 5, dtype='|S5')[::2]
mask = np.array([False, True, False] * 5)[::2]
arrow_arr = pa.array(arr, mask=mask)
arrow_arr = pa.array(arr, binary_type, mask=mask)

expected = pa.array(['abcde', '', None, 'abcde', '', None, 'abcde', ''],
type='binary')
type=binary_type)
assert arrow_arr.equals(expected)

# 0 itemsize
arr = np.array(['', '', ''], dtype='|S0')
arrow_arr = pa.array(arr)
expected = pa.array(['', '', ''], type='binary')
arrow_arr = pa.array(arr, binary_type)
expected = pa.array(['', '', ''], type=binary_type)
assert arrow_arr.equals(expected)


Expand Down Expand Up @@ -2499,35 +2500,36 @@ def test_interval_array_from_dateoffset():
assert list(actual_list[0]) == expected_from_pandas


def test_array_from_numpy_unicode():
@pytest.mark.parametrize('string_type', [pa.utf8(), pa.large_utf8()])
def test_array_from_numpy_unicode(string_type):
dtypes = ['<U5', '>U5']

for dtype in dtypes:
arr = np.array(['abcde', 'abc', ''], dtype=dtype)

arrow_arr = pa.array(arr)
assert arrow_arr.type == 'utf8'
expected = pa.array(['abcde', 'abc', ''], type='utf8')
arrow_arr = pa.array(arr, string_type)
assert arrow_arr.type == string_type
expected = pa.array(['abcde', 'abc', ''], type=string_type)
assert arrow_arr.equals(expected)

mask = np.array([False, True, False])
arrow_arr = pa.array(arr, mask=mask)
expected = pa.array(['abcde', None, ''], type='utf8')
arrow_arr = pa.array(arr, string_type, mask=mask)
expected = pa.array(['abcde', None, ''], type=string_type)
assert arrow_arr.equals(expected)

# Strided variant
arr = np.array(['abcde', 'abc', ''] * 5, dtype=dtype)[::2]
mask = np.array([False, True, False] * 5)[::2]
arrow_arr = pa.array(arr, mask=mask)
arrow_arr = pa.array(arr, string_type, mask=mask)

expected = pa.array(['abcde', '', None, 'abcde', '', None,
'abcde', ''], type='utf8')
'abcde', ''], type=string_type)
assert arrow_arr.equals(expected)

# 0 itemsize
arr = np.array(['', '', ''], dtype='<U0')
arrow_arr = pa.array(arr)
expected = pa.array(['', '', ''], type='utf8')
arrow_arr = pa.array(arr, string_type)
expected = pa.array(['', '', ''], type=string_type)
assert arrow_arr.equals(expected)


Expand Down Expand Up @@ -3113,8 +3115,9 @@ def test_array_from_numpy_str_utf8():

@pytest.mark.slow
@pytest.mark.large_memory
def test_numpy_binary_overflow_to_chunked():
# ARROW-3762, ARROW-5966
@pytest.mark.parametrize('large_types', [False, True])
def test_numpy_binary_overflow_to_chunked(large_types):
# ARROW-3762, ARROW-5966, GH-35289

# 2^31 + 1 bytes
values = [b'x']
Expand All @@ -3131,24 +3134,34 @@ def test_numpy_binary_overflow_to_chunked():
unicode_values += [unicode_unique_strings[i % 10]
for i in range(1 << 11)]

for case, ex_type in [(values, pa.binary()),
(unicode_values, pa.utf8())]:
binary_type = pa.large_binary() if large_types else pa.binary()
string_type = pa.large_utf8() if large_types else pa.utf8()
for case, ex_type in [(values, binary_type),
(unicode_values, string_type)]:
arr = np.array(case)
arrow_arr = pa.array(arr)
arrow_arr = pa.array(arr, ex_type)
arr = None

assert isinstance(arrow_arr, pa.ChunkedArray)
assert arrow_arr.type == ex_type
if large_types:
# Large types shouldn't be chunked
assert isinstance(arrow_arr, pa.Array)

# Split up into 16MB chunks. 128 * 16 = 2048, so 129
assert arrow_arr.num_chunks == 129
for i in range(len(arrow_arr)):
val = arrow_arr[i]
assert val.as_py() == case[i]
else:
assert isinstance(arrow_arr, pa.ChunkedArray)

# Split up into 16MB chunks. 128 * 16 = 2048, so 129
assert arrow_arr.num_chunks == 129

value_index = 0
for i in range(arrow_arr.num_chunks):
chunk = arrow_arr.chunk(i)
for val in chunk:
assert val.as_py() == case[value_index]
value_index += 1
value_index = 0
for i in range(arrow_arr.num_chunks):
chunk = arrow_arr.chunk(i)
for val in chunk:
assert val.as_py() == case[value_index]
value_index += 1


@pytest.mark.large_memory
Expand Down
2 changes: 2 additions & 0 deletions python/pyarrow/tests/test_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,8 +72,10 @@ def test_type_to_pandas_dtype():
(pa.date64(), M8),
(pa.timestamp('ms'), M8),
(pa.binary(), np.object_),
(pa.large_binary(), np.object_),
(pa.binary(12), np.object_),
(pa.string(), np.object_),
(pa.large_string(), np.object_),
(pa.list_(pa.int8()), np.object_),
# (pa.list_(pa.int8(), 2), np.object_), # TODO needs pandas conversion
(pa.map_(pa.int64(), pa.float64()), np.object_),
Expand Down
2 changes: 2 additions & 0 deletions python/pyarrow/types.pxi
Original file line number Diff line number Diff line change
Expand Up @@ -63,8 +63,10 @@ cdef dict _pandas_type_map = {
'ns': np.dtype('timedelta64[ns]'),
},
_Type_BINARY: np.object_,
_Type_LARGE_BINARY: np.object_,
_Type_FIXED_SIZE_BINARY: np.object_,
_Type_STRING: np.object_,
_Type_LARGE_STRING: np.object_,
_Type_LIST: np.object_,
_Type_MAP: np.object_,
_Type_DECIMAL128: np.object_,
Expand Down
Loading