Commit 5865fef

Merge branch 'GH-36760-Go]-Adding-avro-ocf-reader---reader' of https://github.com/loicalleyne/arrow into GH-36760-Go]-Adding-avro-ocf-reader---reader
loicalleyne committed Aug 10, 2023
2 parents b730534 + 4505b43 commit 5865fef
Showing 266 changed files with 7,717 additions and 1,833 deletions.
2 changes: 1 addition & 1 deletion .env
@@ -98,7 +98,7 @@ VCPKG="501db0f17ef6df184fcdbfbe0f87cde2313b6ab1" # 2023.04.15 Release
 # ci/docker/python-wheel-windows-vs2017.dockerfile.
 # This is a workaround for our CI problem that "archery docker build" doesn't
 # use pulled built images in dev/tasks/python-wheels/github.windows.yml.
-PYTHON_WHEEL_WINDOWS_IMAGE_REVISION=2022-06-12
+PYTHON_WHEEL_WINDOWS_IMAGE_REVISION=2023-08-02
 
 # Use conanio/${CONAN} for "docker-compose run --rm conan". See
 # https://github.com/conan-io/conan-docker-tools#readme for available

6 changes: 6 additions & 0 deletions .github/dependabot.yml
@@ -23,6 +23,12 @@ updates:
       interval: "weekly"
     commit-message:
       prefix: "MINOR: [CI] "
+  - package-ecosystem: "npm"
+    directory: "/js/"
+    schedule:
+      interval: "monthly"
+    commit-message:
+      prefix: "MINOR: [JS] "
   - package-ecosystem: "nuget"
     directory: "/csharp/"
     schedule:

4 changes: 4 additions & 0 deletions .github/workflows/cpp.yml
@@ -23,6 +23,7 @@ on:
       - '.github/workflows/cpp.yml'
       - 'ci/docker/**'
       - 'ci/scripts/cpp_*'
+      - 'ci/scripts/install_azurite.sh'
       - 'ci/scripts/install_gcs_testbench.sh'
       - 'ci/scripts/install_minio.sh'
       - 'ci/scripts/msys2_*'
@@ -34,6 +35,7 @@ on:
       - '.github/workflows/cpp.yml'
       - 'ci/docker/**'
       - 'ci/scripts/cpp_*'
+      - 'ci/scripts/install_azurite.sh'
       - 'ci/scripts/install_gcs_testbench.sh'
       - 'ci/scripts/install_minio.sh'
       - 'ci/scripts/msys2_*'
@@ -201,6 +203,8 @@ jobs:
           ci/scripts/install_minio.sh latest /usr/local
       - name: Install Google Cloud Storage Testbench
         run: ci/scripts/install_gcs_testbench.sh default
+      - name: Install Azurite Storage Emulator
+        run: ci/scripts/install_azurite.sh
       - name: Setup ccache
         run: |
           ci/scripts/ccache_setup.sh

3 changes: 2 additions & 1 deletion ci/docker/python-wheel-manylinux.dockerfile
@@ -84,7 +84,8 @@ RUN vcpkg install \
         --x-feature=flight \
         --x-feature=gcs \
         --x-feature=json \
-        --x-feature=parquet
+        --x-feature=parquet \
+        --x-feature=s3
 
 # Configure Python for applications running in the bash shell of this Dockerfile
 ARG python=3.8

3 changes: 2 additions & 1 deletion ci/docker/python-wheel-windows-vs2017.dockerfile
@@ -66,7 +66,8 @@ RUN vcpkg install \
         --x-feature=flight \
         --x-feature=gcs \
         --x-feature=json \
-        --x-feature=parquet
+        --x-feature=parquet \
+        --x-feature=s3
 
 # Remove previous installations of python from the base image
 # NOTE: a more recent base image (tried with 2.12.1) comes with python 3.9.7

4 changes: 4 additions & 0 deletions ci/docker/ubuntu-20.04-cpp.dockerfile
@@ -103,6 +103,7 @@ RUN apt-get update -y -q && \
         make \
         ninja-build \
         nlohmann-json3-dev \
+        npm \
         pkg-config \
         protobuf-compiler \
         python3-dev \
@@ -123,6 +124,9 @@ RUN /arrow/ci/scripts/install_minio.sh latest /usr/local
 COPY ci/scripts/install_gcs_testbench.sh /arrow/ci/scripts/
 RUN /arrow/ci/scripts/install_gcs_testbench.sh default
 
+COPY ci/scripts/install_azurite.sh /arrow/ci/scripts/
+RUN /arrow/ci/scripts/install_azurite.sh
+
 COPY ci/scripts/install_ceph.sh /arrow/ci/scripts/
 RUN /arrow/ci/scripts/install_ceph.sh
 

4 changes: 4 additions & 0 deletions ci/docker/ubuntu-22.04-cpp.dockerfile
@@ -102,6 +102,7 @@ RUN apt-get update -y -q && \
         make \
         ninja-build \
         nlohmann-json3-dev \
+        npm \
         pkg-config \
         protobuf-compiler \
         protobuf-compiler-grpc \
@@ -153,6 +154,9 @@ RUN /arrow/ci/scripts/install_minio.sh latest /usr/local
 COPY ci/scripts/install_gcs_testbench.sh /arrow/ci/scripts/
 RUN /arrow/ci/scripts/install_gcs_testbench.sh default
 
+COPY ci/scripts/install_azurite.sh /arrow/ci/scripts/
+RUN /arrow/ci/scripts/install_azurite.sh
+
 COPY ci/scripts/install_sccache.sh /arrow/ci/scripts/
 RUN /arrow/ci/scripts/install_sccache.sh unknown-linux-musl /usr/local/bin
 

1 change: 1 addition & 0 deletions ci/scripts/cpp_build.sh
@@ -106,6 +106,7 @@ cmake \
   -DARROW_C_FLAGS_RELWITHDEBINFO="${ARROW_C_FLAGS_RELWITHDEBINFO:-}" \
   -DARROW_DATASET=${ARROW_DATASET:-ON} \
   -DARROW_DEPENDENCY_SOURCE=${ARROW_DEPENDENCY_SOURCE:-AUTO} \
+  -DARROW_ENABLE_THREADING=${ARROW_ENABLE_THREADING:-ON} \
   -DARROW_ENABLE_TIMING_TESTS=${ARROW_ENABLE_TIMING_TESTS:-ON} \
   -DARROW_EXTRA_ERROR_CONTEXT=${ARROW_EXTRA_ERROR_CONTEXT:-OFF} \
   -DARROW_FILESYSTEM=${ARROW_FILESYSTEM:-ON} \

37 changes: 37 additions & 0 deletions ci/scripts/install_azurite.sh
@@ -0,0 +1,37 @@
#!/usr/bin/env bash
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

set -e

case "$(uname)" in
Darwin)
npm install -g azurite
which azurite
;;
MINGW*)
choco install nodejs.install
npm install -g azurite
;;
Linux)
npm install -g azurite
which azurite
;;
esac
echo "node version = $(node --version)"
echo "azurite version = $(azurite --version)"
1 change: 0 additions & 1 deletion ci/scripts/java_jni_macos_build.sh
@@ -81,7 +81,6 @@ cmake \
   -DARROW_PARQUET=${ARROW_PARQUET} \
   -DARROW_S3=${ARROW_S3} \
   -DARROW_USE_CCACHE=${ARROW_USE_CCACHE} \
-  -DAWSSDK_SOURCE=BUNDLED \
   -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} \
   -DCMAKE_INSTALL_LIBDIR=lib \
   -DCMAKE_INSTALL_PREFIX=${install_dir} \

4 changes: 0 additions & 4 deletions ci/scripts/python_wheel_manylinux_build.sh
@@ -85,9 +85,6 @@ fi
 mkdir /tmp/arrow-build
 pushd /tmp/arrow-build
 
-# ARROW-17501: We can remove -DAWSSDK_SOURCE=BUNDLED once
-# https://github.com/aws/aws-sdk-cpp/issues/1809 is fixed and vcpkg
-# ships the fix.
 cmake \
   -DARROW_ACERO=${ARROW_ACERO} \
   -DARROW_BUILD_SHARED=ON \
@@ -120,7 +117,6 @@ cmake \
   -DARROW_WITH_SNAPPY=${ARROW_WITH_SNAPPY} \
   -DARROW_WITH_ZLIB=${ARROW_WITH_ZLIB} \
   -DARROW_WITH_ZSTD=${ARROW_WITH_ZSTD} \
-  -DAWSSDK_SOURCE=BUNDLED \
   -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} \
   -DCMAKE_INSTALL_LIBDIR=lib \
   -DCMAKE_INSTALL_PREFIX=/tmp/arrow-dist \

4 changes: 0 additions & 4 deletions ci/scripts/python_wheel_windows_build.bat
@@ -49,9 +49,6 @@ set ARROW_WITH_LZ4=ON
 set ARROW_WITH_SNAPPY=ON
 set ARROW_WITH_ZLIB=ON
 set ARROW_WITH_ZSTD=ON
-@rem Workaround for https://github.com/aws/aws-sdk-cpp/issues/1809 .
-@rem Use (old) bundled AWS SDK C++ instead of (newer) AWS SDK C++.
-set AWSSDK_SOURCE=BUNDLED
 set CMAKE_UNITY_BUILD=ON
 set CMAKE_GENERATOR=Visual Studio 15 2017 Win64
 set VCPKG_ROOT=C:\vcpkg
@@ -90,7 +87,6 @@ cmake ^
     -DARROW_WITH_SNAPPY=%ARROW_WITH_SNAPPY% ^
     -DARROW_WITH_ZLIB=%ARROW_WITH_ZLIB% ^
     -DARROW_WITH_ZSTD=%ARROW_WITH_ZSTD% ^
-    -DAWSSDK_SOURCE=%AWSSDK_SOURCE% ^
     -DCMAKE_BUILD_TYPE=%CMAKE_BUILD_TYPE% ^
     -DCMAKE_CXX_COMPILER=clcache ^
     -DCMAKE_INSTALL_PREFIX=C:\arrow-dist ^

1 change: 1 addition & 0 deletions cpp/Brewfile
@@ -30,6 +30,7 @@ brew "grpc"
 brew "llvm@14"
 brew "lz4"
 brew "ninja"
+brew "node"
 brew "openssl@3"
 brew "protobuf"
 brew "python"

2 changes: 2 additions & 0 deletions cpp/cmake_modules/DefineOptions.cmake
@@ -196,6 +196,8 @@ takes precedence over ccache if a storage backend is configured" ON)
 
 define_option(ARROW_WITH_MUSL "Whether the system libc is musl or not" OFF)
 
+define_option(ARROW_ENABLE_THREADING "Enable threading in Arrow core" ON)
+
 #----------------------------------------------------------------------
 set_option_category("Test and benchmark")
 
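
Downstream, the new option behaves like any other CMake cache variable. A sketch of a configure step with threading disabled, using placeholder paths (`cpp_build.sh` above reads the same setting from the `ARROW_ENABLE_THREADING` environment variable):

# Hypothetical out-of-source build with threading turned off; CPU work
# then runs on the calling thread rather than on a thread pool.
cmake -S cpp -B build -DARROW_ENABLE_THREADING=OFF
cmake --build build
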
21 changes: 17 additions & 4 deletions cpp/src/arrow/acero/CMakeLists.txt
@@ -173,7 +173,14 @@ add_arrow_acero_test(hash_join_node_test SOURCES hash_join_node_test.cc
                      bloom_filter_test.cc)
 add_arrow_acero_test(pivot_longer_node_test SOURCES pivot_longer_node_test.cc
                      test_nodes.cc)
-add_arrow_acero_test(asof_join_node_test SOURCES asof_join_node_test.cc test_nodes.cc)
+
+# asof_join_node uses std::thread internally
+# and doesn't use ThreadPool so it will
+# be broken if threading is turned off
+if(ARROW_ENABLE_THREADING)
+  add_arrow_acero_test(asof_join_node_test SOURCES asof_join_node_test.cc test_nodes.cc)
+endif()
+
 add_arrow_acero_test(tpch_node_test SOURCES tpch_node_test.cc)
 add_arrow_acero_test(union_node_test SOURCES union_node_test.cc)
 add_arrow_acero_test(aggregate_node_test SOURCES aggregate_node_test.cc)
@@ -221,7 +228,9 @@ if(ARROW_BUILD_BENCHMARKS)
   add_arrow_acero_benchmark(project_benchmark SOURCES benchmark_util.cc
                             project_benchmark.cc)
 
-  add_arrow_acero_benchmark(asof_join_benchmark SOURCES asof_join_benchmark.cc)
+  if(ARROW_ENABLE_THREADING)
+    add_arrow_acero_benchmark(asof_join_benchmark SOURCES asof_join_benchmark.cc)
+  endif()
 
   add_arrow_acero_benchmark(tpch_benchmark SOURCES tpch_benchmark.cc)
 
@@ -244,7 +253,9 @@ if(ARROW_BUILD_BENCHMARKS)
     target_link_libraries(arrow-acero-expression-benchmark PUBLIC arrow_acero_static)
     target_link_libraries(arrow-acero-filter-benchmark PUBLIC arrow_acero_static)
     target_link_libraries(arrow-acero-project-benchmark PUBLIC arrow_acero_static)
-    target_link_libraries(arrow-acero-asof-join-benchmark PUBLIC arrow_acero_static)
+    if(ARROW_ENABLE_THREADING)
+      target_link_libraries(arrow-acero-asof-join-benchmark PUBLIC arrow_acero_static)
+    endif()
     target_link_libraries(arrow-acero-tpch-benchmark PUBLIC arrow_acero_static)
     if(ARROW_BUILD_OPENMP_BENCHMARKS)
       target_link_libraries(arrow-acero-hash-join-benchmark PUBLIC arrow_acero_static)
@@ -253,7 +264,9 @@ if(ARROW_BUILD_BENCHMARKS)
     target_link_libraries(arrow-acero-expression-benchmark PUBLIC arrow_acero_shared)
     target_link_libraries(arrow-acero-filter-benchmark PUBLIC arrow_acero_shared)
     target_link_libraries(arrow-acero-project-benchmark PUBLIC arrow_acero_shared)
-    target_link_libraries(arrow-acero-asof-join-benchmark PUBLIC arrow_acero_shared)
+    if(ARROW_ENABLE_THREADING)
+      target_link_libraries(arrow-acero-asof-join-benchmark PUBLIC arrow_acero_shared)
+    endif()
     target_link_libraries(arrow-acero-tpch-benchmark PUBLIC arrow_acero_shared)
     if(ARROW_BUILD_OPENMP_BENCHMARKS)
       target_link_libraries(arrow-acero-hash-join-benchmark PUBLIC arrow_acero_shared)

4 changes: 2 additions & 2 deletions cpp/src/arrow/acero/aggregate_internal.h
@@ -52,8 +52,8 @@
 // segment-keys is used to refine the partitioning. However, segment-keys are different in
 // that they partition only consecutive rows into a single group. Such a partition of
 // consecutive rows is called a segment group. For example, consider a column X with
-// values [A, A, B, A] at row-indices [0, 1, 2]. A regular group-by aggregation with keys
-// [X] yields a row-index partitioning [[0, 1, 3], [2]] whereas a segmented-group-by
+// values [A, A, B, A] at row-indices [0, 1, 2, 3]. A regular group-by aggregation with
+// keys [X] yields a row-index partitioning [[0, 1, 3], [2]] whereas a segmented-group-by
 // aggregation with segment-keys [X] yields [[0, 1], [2], [3]].
 //
 // The implementation first segments the input using the segment-keys, then groups by the
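
The corrected comment is easy to reproduce in isolation: group-by keys merge all rows with equal values, while segment-keys merge only consecutive runs. A standalone sketch in plain C++ (not the Arrow API) that reproduces the [A, A, B, A] example:

#include <iostream>
#include <string>
#include <vector>

// Partition row indices into segment groups: maximal runs of consecutive
// rows whose segment-key values are equal.
std::vector<std::vector<int>> SegmentGroups(const std::vector<std::string>& x) {
  std::vector<std::vector<int>> groups;
  for (int i = 0; i < static_cast<int>(x.size()); ++i) {
    if (i == 0 || x[i] != x[i - 1]) groups.emplace_back();
    groups.back().push_back(i);
  }
  return groups;
}

int main() {
  // X = [A, A, B, A] -> prints [ 0 1 ] [ 2 ] [ 3 ], matching the comment;
  // a regular group-by on X would instead produce [[0, 1, 3], [2]].
  for (const auto& group : SegmentGroups({"A", "A", "B", "A"})) {
    std::cout << "[";
    for (int row : group) std::cout << " " << row;
    std::cout << " ] ";
  }
  std::cout << std::endl;
  return 0;
}
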
5 changes: 5 additions & 0 deletions cpp/src/arrow/acero/asof_join_node.cc
@@ -49,6 +49,7 @@
 #include "arrow/type_traits.h"
 #include "arrow/util/bit_util.h"
 #include "arrow/util/checked_cast.h"
+#include "arrow/util/config.h"
 #include "arrow/util/future.h"
 #include "arrow/util/string.h"
 
@@ -1707,6 +1708,10 @@ class AsofJoinNode : public ExecNode {
   }
 
   Status StartProducing() override {
+#ifndef ARROW_ENABLE_THREADING
+    return Status::NotImplemented("ASOF join requires threading enabled");
+#endif
+
     ARROW_ASSIGN_OR_RAISE(process_task_, plan_->query_context()->BeginExternalTask(
                               "AsofJoinNode::ProcessThread"));
     if (!process_task_.is_valid()) {

4 changes: 4 additions & 0 deletions cpp/src/arrow/acero/bloom_filter.cc
@@ -20,6 +20,7 @@
 #include "arrow/acero/util.h"       // PREFETCH
 #include "arrow/util/bit_util.h"    // Log2
 #include "arrow/util/bitmap_ops.h"  // CountSetBits
+#include "arrow/util/config.h"
 
 namespace arrow {
 namespace acero {
@@ -426,6 +427,9 @@ void BloomFilterBuilder_Parallel::CleanUp() {
 
 std::unique_ptr<BloomFilterBuilder> BloomFilterBuilder::Make(
     BloomFilterBuildStrategy strategy) {
+#ifndef ARROW_ENABLE_THREADING
+  strategy = BloomFilterBuildStrategy::SINGLE_THREADED;
+#endif
   switch (strategy) {
     case BloomFilterBuildStrategy::SINGLE_THREADED: {
       std::unique_ptr<BloomFilterBuilder> impl{new BloomFilterBuilder_SingleThreaded()};

7 changes: 6 additions & 1 deletion cpp/src/arrow/acero/bloom_filter_test.cc
@@ -29,6 +29,8 @@
 #include "arrow/acero/util.h"
 #include "arrow/compute/key_hash.h"
 #include "arrow/util/bitmap_ops.h"
+#include "arrow/util/config.h"
+#include "arrow/util/cpu_info.h"
 
 namespace arrow {
 
@@ -468,7 +470,7 @@ TEST(BloomFilter, Basic) {
 
   std::vector<BloomFilterBuildStrategy> strategies;
   strategies.push_back(BloomFilterBuildStrategy::SINGLE_THREADED);
-#ifndef ARROW_VALGRIND
+#if defined(ARROW_ENABLE_THREADING) && !defined(ARROW_VALGRIND)
   strategies.push_back(BloomFilterBuildStrategy::PARALLEL);
 #endif
 
@@ -501,7 +503,10 @@ TEST(BloomFilter, Scaling) {
   num_build.push_back(4000000);
 
   std::vector<BloomFilterBuildStrategy> strategies;
+#ifdef ARROW_ENABLE_THREADING
   strategies.push_back(BloomFilterBuildStrategy::PARALLEL);
+#endif
+  strategies.push_back(BloomFilterBuildStrategy::SINGLE_THREADED);
 
   for (const auto hardware_flags : HardwareFlagsForTesting()) {
     for (const auto& strategy : strategies) {

2 changes: 1 addition & 1 deletion cpp/src/arrow/acero/exec_plan.h
@@ -739,7 +739,7 @@ DeclarationToBatchesAsync(Declaration declaration, ExecContext exec_context);
/// \brief Utility method to run a declaration and return results as a RecordBatchReader
///
/// If an exec context is not provided then a default exec context will be used based
-/// on the value of `use_threads`. If `use_threads` is false then the CPU exeuctor will
+/// on the value of `use_threads`. If `use_threads` is false then the CPU executor will
/// on the value of `use_threads`. If `use_threads` is false then the CPU executor will
/// be a serial executor and all CPU work will be done on the calling thread. I/O tasks
/// will still happen on the I/O executor and may be multi-threaded.
///
Expand Down
10 changes: 5 additions & 5 deletions cpp/src/arrow/acero/options.h
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ class ARROW_ACERO_EXPORT ExecNodeOptions {
///
/// For each batch received a new task will be created to push that batch downstream.
/// This task will slice smaller units of size `ExecPlan::kMaxBatchSize` from the
-/// parent batch and call InputRecieved. Thus, if the `generator` yields a large
+/// parent batch and call InputReceived. Thus, if the `generator` yields a large
/// parent batch and call InputReceived. Thus, if the `generator` yields a large
/// batch it may result in several calls to InputReceived.
///
/// The SourceNode will, by default, assign an implicit ordering to outgoing batches.
Expand Down Expand Up @@ -115,7 +115,7 @@ class ARROW_ACERO_EXPORT TableSourceNodeOptions : public ExecNodeOptions {
/// Create an instance from values
TableSourceNodeOptions(std::shared_ptr<Table> table,
int64_t max_batch_size = kDefaultMaxBatchSize)
-      : table(table), max_batch_size(max_batch_size) {}
+      : table(std::move(table)), max_batch_size(max_batch_size) {}

/// \brief a table which acts as the data source
std::shared_ptr<Table> table;
@@ -135,7 +135,7 @@ class ARROW_ACERO_EXPORT NamedTableNodeOptions : public ExecNodeOptions {
public:
/// Create an instance from values
NamedTableNodeOptions(std::vector<std::string> names, std::shared_ptr<Schema> schema)
-      : names(std::move(names)), schema(schema) {}
+      : names(std::move(names)), schema(std::move(schema)) {}

/// \brief the names to put in the serialized plan
std::vector<std::string> names;
@@ -156,7 +156,7 @@ class ARROW_ACERO_EXPORT SchemaSourceNodeOptions : public ExecNodeOptions {
/// Create an instance that will create a new task on io_executor for each iteration
SchemaSourceNodeOptions(std::shared_ptr<Schema> schema, ItMaker it_maker,
arrow::internal::Executor* io_executor)
-      : schema(schema),
+      : schema(std::move(schema)),
it_maker(std::move(it_maker)),
io_executor(io_executor),
requires_io(true) {}
@@ -165,7 +165,7 @@ class ARROW_ACERO_EXPORT SchemaSourceNodeOptions : public ExecNodeOptions {
/// executor
SchemaSourceNodeOptions(std::shared_ptr<Schema> schema, ItMaker it_maker,
bool requires_io = false)
-      : schema(schema),
+      : schema(std::move(schema)),
it_maker(std::move(it_maker)),
io_executor(NULLPTR),
requires_io(requires_io) {}
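
The two acero headers above meet in one small workflow: `TableSourceNodeOptions` feeds a table into a plan, and `DeclarationToReader` runs it with the `use_threads` semantics the corrected doc comment describes. A minimal sketch, assuming an `arrow::Table` is already in scope; the function name and the trivial one-node plan are illustrative, not from this diff:

#include "arrow/acero/exec_plan.h"
#include "arrow/acero/options.h"
#include "arrow/record_batch.h"
#include "arrow/result.h"
#include "arrow/table.h"

// Run a single-node plan serially: with use_threads=false all CPU work
// happens on the calling thread; I/O may still be multi-threaded.
arrow::Result<std::shared_ptr<arrow::Table>> RoundTripSerially(
    std::shared_ptr<arrow::Table> table) {
  arrow::acero::Declaration plan{
      "table_source", arrow::acero::TableSourceNodeOptions{std::move(table)}};
  ARROW_ASSIGN_OR_RAISE(
      auto reader, arrow::acero::DeclarationToReader(std::move(plan),
                                                     /*use_threads=*/false));
  return arrow::Table::FromRecordBatchReader(reader.get());
}
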
(Diff truncated: the remaining changed files are not shown.)