Skip to content

Commit

Permalink
Merge branch 'apache:main' into emscripten_python_changes
Browse files Browse the repository at this point in the history
  • Loading branch information
joemarshall authored Jul 5, 2024
2 parents fa0e497 + 5b5c164 commit 71a2f6a
Show file tree
Hide file tree
Showing 85 changed files with 3,011 additions and 894 deletions.
1 change: 1 addition & 0 deletions .asf.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ github:
- davisusanibar
- jbonofre
- js8544
- laurentgo
- vibhatha
- zanmato1984
- ZhangHuiGui
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/dev.yml
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,7 @@ jobs:
shell: bash
run: |
gem install test-unit
pip install "cython>=0.29.31" setuptools six pytest jira setuptools-scm
pip install "cython>=0.29.31" setuptools pytest jira setuptools-scm
- name: Run Release Test
env:
ARROW_GITHUB_API_TOKEN: ${{ secrets.GITHUB_TOKEN }}
Expand Down
14 changes: 14 additions & 0 deletions ci/docker/centos-7-cpp.dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,25 @@

FROM centos:centos7

# Update mirrors to use vault.centos.org because CentOS 7
# has been EOL since 2024-06-30
RUN sed -i \
-e 's/^mirrorlist/#mirrorlist/' \
-e 's/^#baseurl/baseurl/' \
-e 's/mirror\.centos\.org/vault.centos.org/' \
/etc/yum.repos.d/*.repo

# devtoolset is required for C++17
RUN \
yum install -y \
centos-release-scl \
epel-release && \
sed -i \
-e 's/^mirrorlist/#mirrorlist/' \
-e 's/^#baseurl/baseurl/' \
-e 's/^# baseurl/baseurl/' \
-e 's/mirror\.centos\.org/vault.centos.org/' \
/etc/yum.repos.d/CentOS-SCLo-scl*.repo && \
yum install -y \
cmake3 \
curl \
Expand Down
10 changes: 9 additions & 1 deletion ci/docker/conda-python-substrait.dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -24,11 +24,19 @@ FROM ${repo}:${arch}-conda-python-${python}
COPY ci/conda_env_python.txt \
ci/conda_env_sphinx.txt \
/arrow/ci/

# Note: openjdk is pinned to 17 because the
# substrait repo currently pins to jdk 17.
# Newer jdk versions are currently failing
# due to the recent upgrade to Gradle 8 via
# install_substrait_consumer.sh.
# https://github.com/substrait-io/substrait-java/issues/274
RUN mamba install -q -y \
--file arrow/ci/conda_env_python.txt \
--file arrow/ci/conda_env_sphinx.txt \
$([ "$python" == "3.9" ] && echo "pickle5") \
python=${python} openjdk \
python=${python} \
openjdk=17 \
nomkl && \
mamba clean --all

Expand Down
12 changes: 12 additions & 0 deletions ci/docker/python-wheel-manylinux.dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,18 @@ ARG manylinux
ENV MANYLINUX_VERSION=${manylinux}

# Ensure dnf is installed, especially for the manylinux2014 base
RUN if [ "${MANYLINUX_VERSION}" = "2014" ]; then \
sed -i \
-e 's/^mirrorlist/#mirrorlist/' \
-e 's/^#baseurl/baseurl/' \
-e 's/mirror\.centos\.org/vault.centos.org/' \
/etc/yum.repos.d/*.repo; \
if [ "${arch}" != "amd64" ]; then \
sed -i \
-e 's,vault\.centos\.org/centos,vault.centos.org/altarch,' \
/etc/yum.repos.d/CentOS-SCLo-scl-rh.repo; \
fi; \
fi
RUN yum install -y dnf

# Install basic dependencies
Expand Down
6 changes: 6 additions & 0 deletions ci/scripts/PKGBUILD
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,12 @@ build() {
mkdir -p ${cpp_build_dir}
pushd ${cpp_build_dir}

# We use static cURL in google-cloud-cpp. If we can use cURL's CMake
# package, we don't need to specify CURL_STATICLIB explicitly. But
# we don't have cURL's CMake package. We need to use CXXFLAGS
# instead of ARROW_CXXFLAGS because ARROW_CXXFLAGS aren't passed to
# ExternalProjects.
export CXXFLAGS="${CXXFLAGS} -DCURL_STATICLIB"
# The Rtools libutf8proc is a static lib, but Findutf8proc.cmake doesn't
# set the appropriate compiler definition.
export CPPFLAGS="-DUTF8PROC_STATIC"
Expand Down
7 changes: 6 additions & 1 deletion cpp/cmake_modules/ThirdpartyToolchain.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -4225,7 +4225,8 @@ macro(build_nlohmann_json)
set(NLOHMANN_JSON_INCLUDE_DIR "${NLOHMANN_JSON_PREFIX}/include")
set(NLOHMANN_JSON_CMAKE_ARGS
${EP_COMMON_CMAKE_ARGS} "-DCMAKE_INSTALL_PREFIX=<INSTALL_DIR>"
-DJSON_BuildTests=OFF)
# google-cloud-cpp requires JSON_MultipleHeaders=ON
-DJSON_BuildTests=OFF -DJSON_MultipleHeaders=ON)

set(NLOHMANN_JSON_BUILD_BYPRODUCTS ${NLOHMANN_JSON_PREFIX}/include/nlohmann/json.hpp)

Expand Down Expand Up @@ -4294,6 +4295,7 @@ macro(build_google_cloud_cpp_storage)
# We need this to build with OpenSSL 3.0.
# See also: https://github.com/googleapis/google-cloud-cpp/issues/8544
-DGOOGLE_CLOUD_CPP_ENABLE_WERROR=OFF
-DGOOGLE_CLOUD_CPP_WITH_MOCKS=OFF
-DOPENSSL_CRYPTO_LIBRARY=${OPENSSL_CRYPTO_LIBRARY}
-DOPENSSL_INCLUDE_DIR=${OPENSSL_INCLUDE_DIR}
-DOPENSSL_SSL_LIBRARY=${OPENSSL_SSL_LIBRARY})
Expand Down Expand Up @@ -4380,6 +4382,9 @@ macro(build_google_cloud_cpp_storage)
nlohmann_json::nlohmann_json
OpenSSL::SSL
OpenSSL::Crypto)
if(WIN32)
target_link_libraries(google-cloud-cpp::rest-internal INTERFACE ws2_32)
endif()

add_library(google-cloud-cpp::storage STATIC IMPORTED)
set_target_properties(google-cloud-cpp::storage
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/arrow/acero/hash_join_node_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -3220,7 +3220,7 @@ TEST(HashJoin, ManyJoins) {
// stack), which is essentially the recursive usage of the temp vector stack.

// A fair number of joins to guarantee temp vector stack overflow before GH-41335.
const int num_joins = 64;
const int num_joins = 16;

// `ExecBatchBuilder::num_rows_max()` is the number of rows for swiss join to accumulate
// before outputting.
Expand Down
8 changes: 5 additions & 3 deletions cpp/src/arrow/compute/api_scalar.cc
Original file line number Diff line number Diff line change
Expand Up @@ -341,7 +341,8 @@ static auto kMatchSubstringOptionsType = GetFunctionOptionsType<MatchSubstringOp
static auto kNullOptionsType = GetFunctionOptionsType<NullOptions>(
DataMember("nan_is_null", &NullOptions::nan_is_null));
static auto kPadOptionsType = GetFunctionOptionsType<PadOptions>(
DataMember("width", &PadOptions::width), DataMember("padding", &PadOptions::padding));
DataMember("width", &PadOptions::width), DataMember("padding", &PadOptions::padding),
DataMember("lean_left_on_odd_padding", &PadOptions::lean_left_on_odd_padding));
static auto kReplaceSliceOptionsType = GetFunctionOptionsType<ReplaceSliceOptions>(
DataMember("start", &ReplaceSliceOptions::start),
DataMember("stop", &ReplaceSliceOptions::stop),
Expand Down Expand Up @@ -480,10 +481,11 @@ NullOptions::NullOptions(bool nan_is_null)
: FunctionOptions(internal::kNullOptionsType), nan_is_null(nan_is_null) {}
constexpr char NullOptions::kTypeName[];

PadOptions::PadOptions(int64_t width, std::string padding)
PadOptions::PadOptions(int64_t width, std::string padding, bool lean_left_on_odd_padding)
: FunctionOptions(internal::kPadOptionsType),
width(width),
padding(std::move(padding)) {}
padding(std::move(padding)),
lean_left_on_odd_padding(lean_left_on_odd_padding) {}
PadOptions::PadOptions() : PadOptions(0, " ") {}
constexpr char PadOptions::kTypeName[];

Expand Down
7 changes: 6 additions & 1 deletion cpp/src/arrow/compute/api_scalar.h
Original file line number Diff line number Diff line change
Expand Up @@ -358,14 +358,19 @@ class ARROW_EXPORT StrftimeOptions : public FunctionOptions {

class ARROW_EXPORT PadOptions : public FunctionOptions {
public:
explicit PadOptions(int64_t width, std::string padding = " ");
explicit PadOptions(int64_t width, std::string padding = " ",
bool lean_left_on_odd_padding = true);
PadOptions();
static constexpr char const kTypeName[] = "PadOptions";

/// The desired string length.
int64_t width;
/// What to pad the string with. Should be one codepoint (Unicode)/byte (ASCII).
std::string padding;
/// What to do if there is an odd number of padding characters (in case of centered
/// padding). Defaults to aligning on the left (i.e. adding the extra padding character
/// on the right)
bool lean_left_on_odd_padding = true;
};

class ARROW_EXPORT TrimOptions : public FunctionOptions {
Expand Down
1 change: 1 addition & 0 deletions cpp/src/arrow/compute/function_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,7 @@ TEST(FunctionOptions, Equality) {
#endif
options.emplace_back(new PadOptions(5, " "));
options.emplace_back(new PadOptions(10, "A"));
options.emplace_back(new PadOptions(10, "A", false));
options.emplace_back(new TrimOptions(" "));
options.emplace_back(new TrimOptions("abc"));
options.emplace_back(new SliceOptions(/*start=*/1));
Expand Down
10 changes: 7 additions & 3 deletions cpp/src/arrow/compute/kernels/scalar_string_ascii.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1142,9 +1142,13 @@ struct AsciiPadTransform : public StringTransformBase {
int64_t left = 0;
int64_t right = 0;
if (PadLeft && PadRight) {
// If odd number of spaces, put the extra space on the right
left = spaces / 2;
right = spaces - left;
if (options_.lean_left_on_odd_padding) {
left = spaces / 2;
right = spaces - left;
} else {
right = spaces / 2;
left = spaces - right;
}
} else if (PadLeft) {
left = spaces;
} else if (PadRight) {
Expand Down
10 changes: 10 additions & 0 deletions cpp/src/arrow/compute/kernels/scalar_string_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2117,6 +2117,12 @@ TYPED_TEST(TestStringKernels, PadUTF8) {
R"([null, "a\u2008\u2008\u2008\u2008", "bb\u2008\u2008\u2008", "b\u00E1r\u2008\u2008", "foobar"])",
&options);

PadOptions options2{/*width=*/5, "\xe2\x80\x88", /*lean_left_on_odd_padding=*/false};
this->CheckUnary(
"utf8_center", R"([null, "a", "bb", "b\u00E1r", "foobar"])", this->type(),
R"([null, "\u2008\u2008a\u2008\u2008", "\u2008\u2008bb\u2008", "\u2008b\u00E1r\u2008", "foobar"])",
&options2);

PadOptions options_bad{/*width=*/3, /*padding=*/"spam"};
auto input = ArrayFromJSON(this->type(), R"(["foo"])");
EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid,
Expand Down Expand Up @@ -2459,6 +2465,10 @@ TYPED_TEST(TestStringKernels, PadAscii) {
this->CheckUnary("ascii_rpad", R"([null, "a", "bb", "bar", "foobar"])", this->type(),
R"([null, "a ", "bb ", "bar ", "foobar"])", &options);

PadOptions options2{/*width=*/5, " ", /*lean_left_on_odd_padding=*/false};
this->CheckUnary("ascii_center", R"([null, "a", "bb", "bar", "foobar"])", this->type(),
R"([null, " a ", " bb ", " bar ", "foobar"])", &options2);

PadOptions options_bad{/*width=*/3, /*padding=*/"spam"};
auto input = ArrayFromJSON(this->type(), R"(["foo"])");
EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid,
Expand Down
10 changes: 7 additions & 3 deletions cpp/src/arrow/compute/kernels/scalar_string_utf8.cc
Original file line number Diff line number Diff line change
Expand Up @@ -930,9 +930,13 @@ struct Utf8PadTransform : public StringTransformBase {
int64_t left = 0;
int64_t right = 0;
if (PadLeft && PadRight) {
// If odd number of spaces, put the extra space on the right
left = spaces / 2;
right = spaces - left;
if (options_.lean_left_on_odd_padding) {
left = spaces / 2;
right = spaces - left;
} else {
right = spaces / 2;
left = spaces - right;
}
} else if (PadLeft) {
left = spaces;
} else if (PadRight) {
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/arrow/compute/kernels/scalar_temporal_unary.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1741,7 +1741,7 @@ const FunctionDoc millisecond_doc{

const FunctionDoc microsecond_doc{
"Extract microsecond values",
("Millisecond returns number of microseconds since the last full millisecond.\n"
("Microsecond returns number of microseconds since the last full millisecond.\n"
"Null values emit null.\n"
"An error is returned if the values have a defined timezone but it\n"
"cannot be found in the timezone database."),
Expand Down
67 changes: 66 additions & 1 deletion cpp/src/arrow/compute/kernels/vector_nested.cc
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,16 @@
#include "arrow/compute/api_vector.h"
#include "arrow/compute/kernels/common_internal.h"
#include "arrow/result.h"
#include "arrow/util/bit_run_reader.h"
#include "arrow/util/bit_util.h"
#include "arrow/util/list_util.h"
#include "arrow/visit_type_inline.h"

namespace arrow {

using internal::CountSetBits;
using list_util::internal::RangeOfValuesUsed;

namespace compute {
namespace internal {
namespace {
Expand Down Expand Up @@ -76,6 +83,63 @@ struct ListParentIndicesArray {

Status Visit(const LargeListType& type) { return VisitList(type); }

// Computes parent indices for a list-view (or large list-view) input.
//
// The output is an int64 array with one slot per value in the range returned
// by RangeOfValuesUsed(). Each slot holds the index of the list-view element
// that references that value, shifted by `base_output_offset`. Slots not
// claimed by any valid list-view element are reported as null.
//
// Returns Status::Invalid if any value is referenced by more than one
// list-view element, since a single parent index could not describe it.
template <typename Type, typename offset_type = typename Type::offset_type>
Status VisitListView(const Type&) {
ArraySpan list_view{*input};

// Buffer 1 holds the per-element offsets, buffer 2 the per-element sizes.
const offset_type* offsets = list_view.GetValues<offset_type>(1);
const offset_type* sizes = list_view.GetValues<offset_type>(2);
int64_t values_offset;
int64_t values_length;
// NOTE(review): presumably the contiguous range of child values referenced
// by this list-view array, as (first offset, length) -- confirm against
// RangeOfValuesUsed().
ARROW_ASSIGN_OR_RAISE(std::tie(values_offset, values_length),
RangeOfValuesUsed(list_view));

// Validity bitmap of the output; a set bit means "this value already has a
// parent". NOTE(review): the overlap check below relies on
// AllocateEmptyBitmap() returning an all-zero bitmap -- confirm.
ARROW_ASSIGN_OR_RAISE(auto indices_validity,
AllocateEmptyBitmap(values_length, ctx->memory_pool()));
auto* out_indices_validity = indices_validity->mutable_data();
// Running count of values that have been assigned a parent so far.
int64_t total_pop_count = 0;

ARROW_ASSIGN_OR_RAISE(auto indices, ctx->Allocate(values_length * sizeof(int64_t)));
auto* out_indices = indices->template mutable_data_as<int64_t>();
// Setting every byte to 0xFF makes every int64 slot read as -1 until a
// parent index overwrites it.
memset(out_indices, -1, values_length * sizeof(int64_t));

// NOTE(review): the second argument (0) presumably requests the bitmap
// without applying the array offset, since the bit offset is handed to
// VisitSetBitRuns() separately below -- confirm.
const auto* validity = list_view.GetValues<uint8_t>(0, 0);
RETURN_NOT_OK(arrow::internal::VisitSetBitRuns(
validity, list_view.offset, list_view.length,
[this, offsets, sizes, out_indices, out_indices_validity, values_offset,
&total_pop_count](int64_t run_start, int64_t run_length) {
// NOTE(review): `i` indexes `offsets`/`sizes` directly, which assumes run
// positions are relative to list_view.offset -- confirm.
for (int64_t i = run_start; i < run_start + run_length; ++i) {
// Position of this element's first value within the output arrays.
auto validity_offset = offsets[i] - values_offset;
// Any bit already set in this element's range means an earlier element
// claimed at least one of the same values.
const int64_t pop_count =
CountSetBits(out_indices_validity, validity_offset, sizes[i]);
if (ARROW_PREDICT_FALSE(pop_count > 0)) {
return Status::Invalid(
"Function 'list_parent_indices' cannot produce parent indices for "
"values used by more than one list-view array element.");
}
// Claim the range for element `i`.
bit_util::SetBitmap(out_indices_validity, validity_offset, sizes[i]);
total_pop_count += sizes[i];
// Record the parent index for every value in the element's range.
for (auto j = static_cast<int64_t>(offsets[i]);
j < static_cast<int64_t>(offsets[i]) + sizes[i]; ++j) {
out_indices[j - values_offset] = i + base_output_offset;
}
}
return Status::OK();
}));

DCHECK_LE(total_pop_count, values_length);
// Values never claimed by a valid element remain null in the output.
const int64_t null_count = values_length - total_pop_count;
// The validity buffer is omitted entirely when there are no nulls.
BufferVector buffers{null_count > 0 ? std::move(indices_validity) : nullptr,
std::move(indices)};
out = std::make_shared<ArrayData>(int64(), values_length, std::move(buffers),
null_count);
return Status::OK();
}

// ListViewType inputs are handled by the shared VisitListView() implementation.
Status Visit(const ListViewType& type) { return VisitListView(type); }

// LargeListViewType inputs are handled by the shared VisitListView() implementation.
Status Visit(const LargeListViewType& type) { return VisitListView(type); }

Status Visit(const FixedSizeListType& type) {
using offset_type = typename FixedSizeListType::offset_type;
const offset_type slot_length = type.list_size();
Expand Down Expand Up @@ -125,7 +189,7 @@ const FunctionDoc list_flatten_doc(

const FunctionDoc list_parent_indices_doc(
"Compute parent indices of nested list values",
("`lists` must have a list-like type.\n"
("`lists` must have a list-like or list-view type.\n"
"For each value in each list of `lists`, the top-level list index\n"
"is emitted."),
{"lists"});
Expand All @@ -147,6 +211,7 @@ class ListParentIndicesFunction : public MetaFunction {

int64_t base_output_offset = 0;
ArrayVector out_chunks;
out_chunks.reserve(input->num_chunks());
for (const auto& chunk : input->chunks()) {
ARROW_ASSIGN_OR_RAISE(auto out_chunk,
ListParentIndicesArray::Exec(&kernel_ctx, chunk->data(),
Expand Down
Loading

0 comments on commit 71a2f6a

Please sign in to comment.