Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

GH-35289: [Python] Support large variable width types in numpy conversion #36701

Open
wants to merge 7 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
77 changes: 61 additions & 16 deletions python/pyarrow/src/arrow/python/numpy_to_arrow.cc
Original file line number Diff line number Diff line change
Expand Up @@ -224,9 +224,11 @@ class NumPyConverter {

// NumPy ascii string arrays
Status Visit(const BinaryType& type);
Status Visit(const LargeBinaryType& type);

// NumPy unicode arrays
Status Visit(const StringType& type);
Status Visit(const LargeStringType& type);

Status Visit(const StructType& type);

Expand Down Expand Up @@ -284,6 +286,12 @@ class NumPyConverter {
return PushArray(arr_data);
}

template <typename T>
Status VisitBinary(T* builder);

template <typename T>
Status VisitString(T* builder);

Status TypeNotImplemented(std::string type_name) {
return Status::NotImplemented("NumPyConverter doesn't implement <", type_name,
"> conversion. ");
Expand Down Expand Up @@ -553,24 +561,23 @@ inline Status NumPyConverter::ConvertData<Date64Type>(std::shared_ptr<Buffer>* d
// Create 16MB chunks for binary data
constexpr int32_t kBinaryChunksize = 1 << 24;

Status NumPyConverter::Visit(const BinaryType& type) {
::arrow::internal::ChunkedBinaryBuilder builder(kBinaryChunksize, pool_);

template <typename T>
Status NumPyConverter::VisitBinary(T* builder) {
auto data = reinterpret_cast<const uint8_t*>(PyArray_DATA(arr_));

auto AppendNotNull = [&builder, this](const uint8_t* data) {
auto AppendNotNull = [builder, this](const uint8_t* data) {
// This is annoying. NumPy allows strings to have nul-terminators, so
// we must check for them here
const size_t item_size =
strnlen(reinterpret_cast<const char*>(data), static_cast<size_t>(itemsize_));
return builder.Append(data, static_cast<int32_t>(item_size));
return builder->Append(data, static_cast<int32_t>(item_size));
};

if (mask_ != nullptr) {
Ndarray1DIndexer<uint8_t> mask_values(mask_);
for (int64_t i = 0; i < length_; ++i) {
if (mask_values[i]) {
RETURN_NOT_OK(builder.AppendNull());
RETURN_NOT_OK(builder->AppendNull());
} else {
RETURN_NOT_OK(AppendNotNull(data));
}
Expand All @@ -583,6 +590,14 @@ Status NumPyConverter::Visit(const BinaryType& type) {
}
}

return Status::OK();
}

Status NumPyConverter::Visit(const BinaryType& type) {
::arrow::internal::ChunkedBinaryBuilder builder(kBinaryChunksize, pool_);

RETURN_NOT_OK(VisitBinary(&builder));

ArrayVector result;
RETURN_NOT_OK(builder.Finish(&result));
for (auto arr : result) {
Expand All @@ -591,6 +606,16 @@ Status NumPyConverter::Visit(const BinaryType& type) {
return Status::OK();
}

Status NumPyConverter::Visit(const LargeBinaryType& type) {
  // Large binary uses 64-bit offsets, so the entire ndarray fits into a
  // single array and no chunked builder is needed.
  ::arrow::LargeBinaryBuilder builder(pool_);
  RETURN_NOT_OK(VisitBinary(&builder));

  std::shared_ptr<Array> out;
  RETURN_NOT_OK(builder.Finish(&out));
  return PushArray(out->data());
}

Status NumPyConverter::Visit(const FixedSizeBinaryType& type) {
auto byte_width = type.byte_width();

Expand Down Expand Up @@ -630,8 +655,8 @@ namespace {
// NumPy unicode is UCS4/UTF32 always
constexpr int kNumPyUnicodeSize = 4;

Status AppendUTF32(const char* data, int64_t itemsize, int byteorder,
::arrow::internal::ChunkedStringBuilder* builder) {
template <typename T>
Status AppendUTF32(const char* data, int64_t itemsize, int byteorder, T* builder) {
// The binary \x00\x00\x00\x00 indicates a nul terminator in NumPy unicode,
// so we need to detect that here to truncate if necessary. Yep.
Py_ssize_t actual_length = 0;
Expand Down Expand Up @@ -659,11 +684,8 @@ Status AppendUTF32(const char* data, int64_t itemsize, int byteorder,

} // namespace

Status NumPyConverter::Visit(const StringType& type) {
util::InitializeUTF8();

::arrow::internal::ChunkedStringBuilder builder(kBinaryChunksize, pool_);

template <typename T>
Status NumPyConverter::VisitString(T* builder) {
auto data = reinterpret_cast<const uint8_t*>(PyArray_DATA(arr_));

char numpy_byteorder = dtype_->byteorder;
Expand Down Expand Up @@ -707,23 +729,23 @@ Status NumPyConverter::Visit(const StringType& type) {
auto AppendNonNullValue = [&](const uint8_t* data) {
if (is_binary_type) {
if (ARROW_PREDICT_TRUE(util::ValidateUTF8(data, itemsize_))) {
return builder.Append(data, static_cast<int32_t>(itemsize_));
return builder->Append(data, static_cast<int32_t>(itemsize_));
} else {
return Status::Invalid("Encountered non-UTF8 binary value: ",
HexEncode(data, itemsize_));
}
} else {
// is_unicode_type case
return AppendUTF32(reinterpret_cast<const char*>(data), itemsize_, byteorder,
&builder);
builder);
}
};

if (mask_ != nullptr) {
Ndarray1DIndexer<uint8_t> mask_values(mask_);
for (int64_t i = 0; i < length_; ++i) {
if (mask_values[i]) {
RETURN_NOT_OK(builder.AppendNull());
RETURN_NOT_OK(builder->AppendNull());
} else {
RETURN_NOT_OK(AppendNonNullValue(data));
}
Expand All @@ -736,6 +758,16 @@ Status NumPyConverter::Visit(const StringType& type) {
}
}

return Status::OK();
}

Status NumPyConverter::Visit(const StringType& type) {
util::InitializeUTF8();

::arrow::internal::ChunkedStringBuilder builder(kBinaryChunksize, pool_);

RETURN_NOT_OK(VisitString(&builder));

ArrayVector result;
RETURN_NOT_OK(builder.Finish(&result));
for (auto arr : result) {
Expand All @@ -744,6 +776,19 @@ Status NumPyConverter::Visit(const StringType& type) {
return Status::OK();
}

Status NumPyConverter::Visit(const LargeStringType& type) {
  // UTF8 validation of the input happens during conversion, so make sure
  // the validation tables are initialized first.
  util::InitializeUTF8();

  // Large string uses 64-bit offsets, so a single (non-chunked) builder
  // can hold the whole ndarray.
  ::arrow::LargeStringBuilder builder(pool_);

  RETURN_NOT_OK(VisitString(&builder));

  std::shared_ptr<Array> result;
  RETURN_NOT_OK(builder.Finish(&result));
  // Return PushArray's status directly, matching Visit(const LargeBinaryType&).
  return PushArray(result->data());
}

Status NumPyConverter::Visit(const StructType& type) {
std::vector<NumPyConverter> sub_converters;
std::vector<OwnedRefNoGIL> sub_arrays;
Expand Down
81 changes: 47 additions & 34 deletions python/pyarrow/tests/test_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -2355,32 +2355,33 @@ def test_array_from_numpy_timedelta_incorrect_unit():
pa.array(data)


def test_array_from_numpy_ascii():
@pytest.mark.parametrize('binary_type', [pa.binary(), pa.large_binary()])
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK, one last comment (promised! ;)): I think we want to keep testing the below also in the case of not specifying the type (in which case we infer the small types), and I am not entirely sure this is explicitly covered elsewhere (implicitly for sure). But so could parametrize this slightly differently with:

Suggested change
@pytest.mark.parametrize('binary_type', [pa.binary(), pa.large_binary()])
@pytest.mark.parametrize('typ, expected_type', [(None, pa.binary()), (pa.binary(), pa.binary()), (pa.large_binary(), pa.large_binary())])

And then the same for string test below.

Or, maybe simpler, just duplicate the first case to have a version without a type specified:

    # without specified type, always binary
    arrow_arr = pa.array(arr)
    assert arrow_arr.type == 'binary'
    expected = ..

    arrow_arr = pa.array(arr, binary_type)
    assert arrow_arr.type == binary_type
    expected = ..

(I assume that for the inference it shouldn't matter if there are strides or not)

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, I thought about that while making the update — I'll add the type-inference case back.

Copy link
Author

@Kimahriman Kimahriman May 23, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added a third parameter and just set the expected type in the func

@pytest.mark.parametrize('string_type', [None, pa.utf8(), pa.large_utf8()])
def test_array_from_numpy_unicode(string_type):
    # Default when no type is specified should be utf8
    expected_type = string_type or pa.utf8()

def test_array_from_numpy_ascii(binary_type):
arr = np.array(['abcde', 'abc', ''], dtype='|S5')

arrow_arr = pa.array(arr)
assert arrow_arr.type == 'binary'
expected = pa.array(['abcde', 'abc', ''], type='binary')
arrow_arr = pa.array(arr, binary_type)
assert arrow_arr.type == binary_type
expected = pa.array(['abcde', 'abc', ''], type=binary_type)
assert arrow_arr.equals(expected)

mask = np.array([False, True, False])
arrow_arr = pa.array(arr, mask=mask)
expected = pa.array(['abcde', None, ''], type='binary')
arrow_arr = pa.array(arr, binary_type, mask=mask)
expected = pa.array(['abcde', None, ''], type=binary_type)
assert arrow_arr.equals(expected)

# Strided variant
arr = np.array(['abcde', 'abc', ''] * 5, dtype='|S5')[::2]
mask = np.array([False, True, False] * 5)[::2]
arrow_arr = pa.array(arr, mask=mask)
arrow_arr = pa.array(arr, binary_type, mask=mask)

expected = pa.array(['abcde', '', None, 'abcde', '', None, 'abcde', ''],
type='binary')
type=binary_type)
assert arrow_arr.equals(expected)

# 0 itemsize
arr = np.array(['', '', ''], dtype='|S0')
arrow_arr = pa.array(arr)
expected = pa.array(['', '', ''], type='binary')
arrow_arr = pa.array(arr, binary_type)
expected = pa.array(['', '', ''], type=binary_type)
assert arrow_arr.equals(expected)


Expand Down Expand Up @@ -2499,35 +2500,36 @@ def test_interval_array_from_dateoffset():
assert list(actual_list[0]) == expected_from_pandas


def test_array_from_numpy_unicode():
@pytest.mark.parametrize('string_type', [pa.utf8(), pa.large_utf8()])
def test_array_from_numpy_unicode(string_type):
dtypes = ['<U5', '>U5']

for dtype in dtypes:
arr = np.array(['abcde', 'abc', ''], dtype=dtype)

arrow_arr = pa.array(arr)
assert arrow_arr.type == 'utf8'
expected = pa.array(['abcde', 'abc', ''], type='utf8')
arrow_arr = pa.array(arr, string_type)
assert arrow_arr.type == string_type
expected = pa.array(['abcde', 'abc', ''], type=string_type)
assert arrow_arr.equals(expected)

mask = np.array([False, True, False])
arrow_arr = pa.array(arr, mask=mask)
expected = pa.array(['abcde', None, ''], type='utf8')
arrow_arr = pa.array(arr, string_type, mask=mask)
expected = pa.array(['abcde', None, ''], type=string_type)
assert arrow_arr.equals(expected)

# Strided variant
arr = np.array(['abcde', 'abc', ''] * 5, dtype=dtype)[::2]
mask = np.array([False, True, False] * 5)[::2]
arrow_arr = pa.array(arr, mask=mask)
arrow_arr = pa.array(arr, string_type, mask=mask)

expected = pa.array(['abcde', '', None, 'abcde', '', None,
'abcde', ''], type='utf8')
'abcde', ''], type=string_type)
assert arrow_arr.equals(expected)

# 0 itemsize
arr = np.array(['', '', ''], dtype='<U0')
arrow_arr = pa.array(arr)
expected = pa.array(['', '', ''], type='utf8')
arrow_arr = pa.array(arr, string_type)
expected = pa.array(['', '', ''], type=string_type)
assert arrow_arr.equals(expected)


Expand Down Expand Up @@ -3113,8 +3115,9 @@ def test_array_from_numpy_str_utf8():

@pytest.mark.slow
@pytest.mark.large_memory
def test_numpy_binary_overflow_to_chunked():
# ARROW-3762, ARROW-5966
@pytest.mark.parametrize('large_types', [False, True])
def test_numpy_binary_overflow_to_chunked(large_types):
# ARROW-3762, ARROW-5966, GH-35289

# 2^31 + 1 bytes
values = [b'x']
Expand All @@ -3131,24 +3134,34 @@ def test_numpy_binary_overflow_to_chunked():
unicode_values += [unicode_unique_strings[i % 10]
for i in range(1 << 11)]

for case, ex_type in [(values, pa.binary()),
(unicode_values, pa.utf8())]:
binary_type = pa.large_binary() if large_types else pa.binary()
string_type = pa.large_utf8() if large_types else pa.utf8()
for case, ex_type in [(values, binary_type),
(unicode_values, string_type)]:
arr = np.array(case)
arrow_arr = pa.array(arr)
arrow_arr = pa.array(arr, ex_type)
arr = None

assert isinstance(arrow_arr, pa.ChunkedArray)
assert arrow_arr.type == ex_type
if large_types:
# Large types shouldn't be chunked
assert isinstance(arrow_arr, pa.Array)

# Split up into 16MB chunks. 128 * 16 = 2048, so 129
assert arrow_arr.num_chunks == 129
for i in range(len(arrow_arr)):
val = arrow_arr[i]
assert val.as_py() == case[i]
else:
assert isinstance(arrow_arr, pa.ChunkedArray)

# Split up into 16MB chunks. 128 * 16 = 2048, so 129
assert arrow_arr.num_chunks == 129

value_index = 0
for i in range(arrow_arr.num_chunks):
chunk = arrow_arr.chunk(i)
for val in chunk:
assert val.as_py() == case[value_index]
value_index += 1
value_index = 0
for i in range(arrow_arr.num_chunks):
chunk = arrow_arr.chunk(i)
for val in chunk:
assert val.as_py() == case[value_index]
value_index += 1


@pytest.mark.large_memory
Expand Down
2 changes: 2 additions & 0 deletions python/pyarrow/tests/test_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,8 +72,10 @@ def test_type_to_pandas_dtype():
(pa.date64(), M8),
(pa.timestamp('ms'), M8),
(pa.binary(), np.object_),
(pa.large_binary(), np.object_),
(pa.binary(12), np.object_),
(pa.string(), np.object_),
(pa.large_string(), np.object_),
(pa.list_(pa.int8()), np.object_),
# (pa.list_(pa.int8(), 2), np.object_), # TODO needs pandas conversion
(pa.map_(pa.int64(), pa.float64()), np.object_),
Expand Down
2 changes: 2 additions & 0 deletions python/pyarrow/types.pxi
Original file line number Diff line number Diff line change
Expand Up @@ -63,8 +63,10 @@ cdef dict _pandas_type_map = {
'ns': np.dtype('timedelta64[ns]'),
},
_Type_BINARY: np.object_,
_Type_LARGE_BINARY: np.object_,
_Type_FIXED_SIZE_BINARY: np.object_,
_Type_STRING: np.object_,
_Type_LARGE_STRING: np.object_,
_Type_LIST: np.object_,
_Type_MAP: np.object_,
_Type_DECIMAL128: np.object_,
Expand Down
Loading