From 137606199a4af88aae8c669c3d8d40b99569937b Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Wed, 16 Oct 2024 10:56:17 -0700 Subject: [PATCH 01/58] Implement `castStringsToBooleans` Signed-off-by: Nghia Truong --- src/main/cpp/src/JSONUtilsJni.cpp | 14 +++ src/main/cpp/src/json_utils.cu | 94 +++++++++++++++++++ src/main/cpp/src/json_utils.hpp | 17 ++++ .../nvidia/spark/rapids/jni/JSONUtils.java | 7 ++ 4 files changed, 132 insertions(+) diff --git a/src/main/cpp/src/JSONUtilsJni.cpp b/src/main/cpp/src/JSONUtilsJni.cpp index 67758e859..239b8cda6 100644 --- a/src/main/cpp/src/JSONUtilsJni.cpp +++ b/src/main/cpp/src/JSONUtilsJni.cpp @@ -198,4 +198,18 @@ JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_JSONUtils_makeStructs( } CATCH_STD(env, 0); } + +JNIEXPORT jlong JNICALL +Java_com_nvidia_spark_rapids_jni_JSONUtils_castStringsToBooleans(JNIEnv* env, jclass, jlong j_input) +{ + JNI_NULL_CHECK(env, j_input, "j_input is null", 0); + + try { + cudf::jni::auto_set_device(env); + auto const input = *reinterpret_cast(j_input); + return cudf::jni::ptr_as_jlong(spark_rapids_jni::cast_strings_to_booleans(input).release()); + } + CATCH_STD(env, 0); } + +} // extern "C" diff --git a/src/main/cpp/src/json_utils.cu b/src/main/cpp/src/json_utils.cu index 85b2dc930..c5ec9c352 100644 --- a/src/main/cpp/src/json_utils.cu +++ b/src/main/cpp/src/json_utils.cu @@ -14,7 +14,10 @@ * limitations under the License. 
*/ +#include "json_utils.hpp" + #include +#include #include #include #include @@ -30,6 +33,7 @@ #include #include #include +#include #include #include #include @@ -252,6 +256,69 @@ std::unique_ptr make_structs(std::vector const& return std::make_unique(structs, stream, mr); } +namespace { + +std::pair, rmm::device_uvector> cast_strings_to_booleans( + cudf::column_view const& input, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) +{ + auto output = cudf::make_fixed_width_column( + cudf::data_type{cudf::type_id::BOOL8}, input.size(), cudf::mask_state::UNALLOCATED, stream, mr); + auto validity = rmm::device_uvector(input.size(), stream); // intentionally not use `mr` + + auto const d_input_ptr = cudf::column_device_view::create(input, stream); + auto const output_it = thrust::make_zip_iterator( + thrust::make_tuple(output->mutable_view().begin(), validity.begin())); + thrust::tabulate(rmm::exec_policy_nosync(stream), + output_it, + output_it + input.size(), + [input = *d_input_ptr] __device__(auto idx) -> thrust::tuple { + if (input.is_valid(idx)) { + auto const d_str = input.element(idx); + if (d_str.size_bytes() == 4 && d_str[0] == 't' && d_str[1] == 'r' && + d_str[2] == 'u' && d_str[3] == 'e') { + return {true, true}; + } + if (d_str.size_bytes() == 5 && d_str[0] == 'f' && d_str[1] == 'a' && + d_str[2] == 'l' && d_str[3] == 's' && d_str[4] == 'e') { + return {false, true}; + } + } + + // Either null input, or the input string is neither `true` nor `false`. 
+ return {false, false}; + }); + + return {std::move(output), std::move(validity)}; +} + +std::unique_ptr convert_column_type(cudf::column_view const& input, + json_schema_element const& schema, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + return nullptr; +} + +} // namespace + +std::unique_ptr convert_types( + cudf::table_view const& input, + std::vector> const& schema, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + auto const num_columns = input.num_columns(); + CUDF_EXPECTS(static_cast(num_columns) == schema.size(), + "Numbers of columns in the input table is different from schema size."); + + std::vector> converted_cols(num_columns); + for (int i = 0; i < num_columns; ++i) { + converted_cols[i] = convert_column_type(input.column(i), schema[i].second, stream, mr); + } + + return nullptr; +} + } // namespace detail std::tuple, std::unique_ptr, char> concat_json( @@ -272,4 +339,31 @@ std::unique_ptr make_structs(std::vector const& return detail::make_structs(children, is_null, stream, mr); } +std::unique_ptr convert_types( + cudf::table_view const& input, + std::vector> const& schema, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_FUNC_RANGE(); + return detail::convert_types(input, schema, stream, mr); +} + +std::unique_ptr cast_strings_to_booleans(cudf::column_view const& input, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_FUNC_RANGE(); + + auto [output, validity] = detail::cast_strings_to_booleans(input, stream, mr); + auto [null_mask, null_count] = cudf::detail::valid_if( + validity.begin(), validity.end(), thrust::identity{}, stream, mr); + if(null_count > 0) { + output->set_null_mask(std::move(null_mask), null_count); + } else { + output->set_null_mask(rmm::device_buffer{}, 0); + } + return std::move(output); +} + } // namespace spark_rapids_jni diff --git a/src/main/cpp/src/json_utils.hpp b/src/main/cpp/src/json_utils.hpp index 
5671a7329..79314fc46 100644 --- a/src/main/cpp/src/json_utils.hpp +++ b/src/main/cpp/src/json_utils.hpp @@ -43,4 +43,21 @@ std::unique_ptr make_structs( rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); +struct json_schema_element { + cudf::data_type type; + + std::vector> child_types; +}; + +std::unique_ptr convert_types( + cudf::table_view const& input, + std::vector> const& schema, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); + +std::unique_ptr cast_strings_to_booleans( + cudf::column_view const& input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); + } // namespace spark_rapids_jni diff --git a/src/main/java/com/nvidia/spark/rapids/jni/JSONUtils.java b/src/main/java/com/nvidia/spark/rapids/jni/JSONUtils.java index 1a41e5861..2f9195127 100644 --- a/src/main/java/com/nvidia/spark/rapids/jni/JSONUtils.java +++ b/src/main/java/com/nvidia/spark/rapids/jni/JSONUtils.java @@ -220,6 +220,11 @@ public static ColumnVector makeStructs(ColumnView[] children, ColumnView isNull) return new ColumnVector(makeStructs(handles, isNull.getNativeView())); } + public static ColumnVector castStringsToBooleans(ColumnVector input) { + assert (input.getType().equals(DType.STRING)) : "Input must be of STRING type"; + return new ColumnVector(castStringsToBooleans(input.getNativeView())); + } + private static native int getMaxJSONPathDepth(); private static native long getJsonObject(long input, @@ -241,4 +246,6 @@ private static native long[] getJsonObjectMultiplePaths(long input, private static native long[] concatenateJsonStrings(long input); private static native long makeStructs(long[] children, long isNull); + + private static native long castStringsToBooleans(long input); } From 
c3fa10dad07fb0a25ec13fa277916fb517edadcf Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Wed, 16 Oct 2024 14:55:28 -0700 Subject: [PATCH 02/58] Implement `removeQuotes` Signed-off-by: Nghia Truong --- src/main/cpp/src/JSONUtilsJni.cpp | 14 +++ src/main/cpp/src/json_utils.cu | 117 +++++++++++++++++- src/main/cpp/src/json_utils.hpp | 5 + .../nvidia/spark/rapids/jni/JSONUtils.java | 7 ++ 4 files changed, 140 insertions(+), 3 deletions(-) diff --git a/src/main/cpp/src/JSONUtilsJni.cpp b/src/main/cpp/src/JSONUtilsJni.cpp index 239b8cda6..e82113c07 100644 --- a/src/main/cpp/src/JSONUtilsJni.cpp +++ b/src/main/cpp/src/JSONUtilsJni.cpp @@ -212,4 +212,18 @@ Java_com_nvidia_spark_rapids_jni_JSONUtils_castStringsToBooleans(JNIEnv* env, jc CATCH_STD(env, 0); } +JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_JSONUtils_removeQuotes(JNIEnv* env, + jclass, + jlong j_input) +{ + JNI_NULL_CHECK(env, j_input, "j_input is null", 0); + + try { + cudf::jni::auto_set_device(env); + auto const input = *reinterpret_cast(j_input); + return cudf::jni::ptr_as_jlong(spark_rapids_jni::remove_quotes(input).release()); + } + CATCH_STD(env, 0); +} + } // extern "C" diff --git a/src/main/cpp/src/json_utils.cu b/src/main/cpp/src/json_utils.cu index c5ec9c352..6d04450d7 100644 --- a/src/main/cpp/src/json_utils.cu +++ b/src/main/cpp/src/json_utils.cu @@ -18,9 +18,12 @@ #include #include +#include +#include #include #include #include +#include #include #include @@ -29,6 +32,8 @@ #include #include +#include +#include #include #include #include @@ -291,6 +296,102 @@ std::pair, rmm::device_uvector> cast_strings return {std::move(output), std::move(validity)}; } +// TODO: remove this. 
+template +rmm::device_uvector make_chars_buffer(cudf::column_view const& offsets, + int64_t chars_size, + IndexPairIterator begin, + cudf::size_type string_count, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + auto chars_data = rmm::device_uvector(chars_size, stream, mr); + auto const d_offsets = cudf::detail::offsetalator_factory::make_input_iterator(offsets); + + auto const src_ptrs = cudf::detail::make_counting_transform_iterator( + 0u, cuda::proclaim_return_type([begin] __device__(uint32_t idx) { + // Due to a bug in cub (https://github.com/NVIDIA/cccl/issues/586), + // we have to use `const_cast` to remove `const` qualifier from the source pointer. + // This should be fine as long as we only read but not write anything to the source. + return reinterpret_cast(const_cast(begin[idx].first)); + })); + auto const src_sizes = cudf::detail::make_counting_transform_iterator( + 0u, cuda::proclaim_return_type([begin] __device__(uint32_t idx) { + return begin[idx].second; + })); + auto const dst_ptrs = cudf::detail::make_counting_transform_iterator( + 0u, + cuda::proclaim_return_type([offsets = d_offsets, output = chars_data.data()] __device__( + uint32_t idx) { return output + offsets[idx]; })); + + size_t temp_storage_bytes = 0; + CUDF_CUDA_TRY(cub::DeviceMemcpy::Batched( + nullptr, temp_storage_bytes, src_ptrs, dst_ptrs, src_sizes, string_count, stream.value())); + rmm::device_buffer d_temp_storage(temp_storage_bytes, stream); + CUDF_CUDA_TRY(cub::DeviceMemcpy::Batched(d_temp_storage.data(), + temp_storage_bytes, + src_ptrs, + dst_ptrs, + src_sizes, + string_count, + stream.value())); + + return chars_data; +} + +std::pair, rmm::device_uvector> remove_quotes( + cudf::column_view const& input, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) +{ + auto const d_input_ptr = cudf::column_device_view::create(input, stream); + auto const string_count = input.size(); + + // Materialize the output string sizes to avoid repeated 
computation when being used multiple + // times later on. + auto output_sizes = rmm::device_uvector(string_count, stream); + thrust::tabulate(rmm::exec_policy_nosync(stream), + output_sizes.begin(), + output_sizes.end(), + [input = *d_input_ptr] __device__(cudf::size_type idx) -> cudf::size_type { + if (input.is_null(idx)) { return 0; } + + auto const d_str = input.element(idx); + auto const size = d_str.size_bytes(); + + // Need to check for size, since the input string may contain just a single + // character `"`. Such input should not be considered as quoted. + auto const is_quoted = size > 1 && d_str[0] == '"' && d_str[size - 1] == '"'; + return is_quoted ? size - 2 : size; + }); + + auto [offsets_column, bytes] = cudf::strings::detail::make_offsets_child_column( + output_sizes.begin(), output_sizes.end(), stream, mr); + + auto const input_sv = cudf::strings_column_view{input}; + auto const d_input_offsets = + cudf::detail::offsetalator_factory::make_input_iterator(input_sv.offsets()); + auto const index_pair_fn = cuda::proclaim_return_type>( + [chars = input_sv.chars_begin(stream), + input_offsets = d_input_offsets, + output_sizes = output_sizes.begin()] __device__(cudf::size_type idx) { + auto const start_offset = input_offsets[idx]; + auto const end_offset = input_offsets[idx + 1]; + auto const input_size = end_offset - start_offset; + auto const output_size = output_sizes[idx]; + + return thrust::pair{chars + start_offset + (input_size == output_size ? 
0 : 1), output_size}; + }); + auto const index_pair_it = cudf::detail::make_counting_transform_iterator(0, index_pair_fn); + auto chars_data = /*cudf::strings::detail::*/ make_chars_buffer( + offsets_column->view(), bytes, index_pair_it, string_count, stream, mr); + + auto output = cudf::make_strings_column(string_count, + std::move(offsets_column), + chars_data.release(), + input.null_count(), + cudf::detail::copy_bitmask(input, stream, mr)); + return {std::move(output), rmm::device_uvector{0, stream}}; +} + std::unique_ptr convert_column_type(cudf::column_view const& input, json_schema_element const& schema, rmm::cuda_stream_view stream, @@ -356,9 +457,9 @@ std::unique_ptr cast_strings_to_booleans(cudf::column_view const& CUDF_FUNC_RANGE(); auto [output, validity] = detail::cast_strings_to_booleans(input, stream, mr); - auto [null_mask, null_count] = cudf::detail::valid_if( - validity.begin(), validity.end(), thrust::identity{}, stream, mr); - if(null_count > 0) { + auto [null_mask, null_count] = + cudf::detail::valid_if(validity.begin(), validity.end(), thrust::identity{}, stream, mr); + if (null_count > 0) { output->set_null_mask(std::move(null_mask), null_count); } else { output->set_null_mask(rmm::device_buffer{}, 0); @@ -366,4 +467,14 @@ std::unique_ptr cast_strings_to_booleans(cudf::column_view const& return std::move(output); } +std::unique_ptr remove_quotes(cudf::column_view const& input, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_FUNC_RANGE(); + + auto [output, validity] = detail::remove_quotes(input, stream, mr); + return std::move(output); +} + } // namespace spark_rapids_jni diff --git a/src/main/cpp/src/json_utils.hpp b/src/main/cpp/src/json_utils.hpp index 79314fc46..198487b73 100644 --- a/src/main/cpp/src/json_utils.hpp +++ b/src/main/cpp/src/json_utils.hpp @@ -60,4 +60,9 @@ std::unique_ptr cast_strings_to_booleans( rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = 
rmm::mr::get_current_device_resource()); +std::unique_ptr remove_quotes( + cudf::column_view const& input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); + } // namespace spark_rapids_jni diff --git a/src/main/java/com/nvidia/spark/rapids/jni/JSONUtils.java b/src/main/java/com/nvidia/spark/rapids/jni/JSONUtils.java index 68e2d1b4e..60380909d 100644 --- a/src/main/java/com/nvidia/spark/rapids/jni/JSONUtils.java +++ b/src/main/java/com/nvidia/spark/rapids/jni/JSONUtils.java @@ -225,6 +225,11 @@ public static ColumnVector castStringsToBooleans(ColumnView input) { return new ColumnVector(castStringsToBooleans(input.getNativeView())); } + public static ColumnVector removeQuotes(ColumnView input) { + assert (input.getType().equals(DType.STRING)) : "Input must be of STRING type"; + return new ColumnVector(removeQuotes(input.getNativeView())); + } + private static native int getMaxJSONPathDepth(); private static native long getJsonObject(long input, @@ -248,4 +253,6 @@ private static native long[] getJsonObjectMultiplePaths(long input, private static native long makeStructs(long[] children, long isNull); private static native long castStringsToBooleans(long input); + + private static native long removeQuotes(long input); } From ae2b41fe1c58f1b59748b8b84ca86e999472d93e Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Wed, 16 Oct 2024 16:28:07 -0700 Subject: [PATCH 03/58] Rewrite using offsets and chars Signed-off-by: Nghia Truong --- src/main/cpp/src/json_utils.cu | 121 ++++++++++++++++++--------------- 1 file changed, 65 insertions(+), 56 deletions(-) diff --git a/src/main/cpp/src/json_utils.cu b/src/main/cpp/src/json_utils.cu index 6d04450d7..e2a47fd67 100644 --- a/src/main/cpp/src/json_utils.cu +++ b/src/main/cpp/src/json_utils.cu @@ -263,35 +263,47 @@ std::unique_ptr make_structs(std::vector const& namespace { +using string_index_pair = thrust::pair; + std::pair, 
rmm::device_uvector> cast_strings_to_booleans( cudf::column_view const& input, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - auto output = cudf::make_fixed_width_column( - cudf::data_type{cudf::type_id::BOOL8}, input.size(), cudf::mask_state::UNALLOCATED, stream, mr); - auto validity = rmm::device_uvector(input.size(), stream); // intentionally not use `mr` + auto const string_count = input.size(); + auto output = cudf::make_fixed_width_column( + cudf::data_type{cudf::type_id::BOOL8}, string_count, cudf::mask_state::UNALLOCATED, stream, mr); + auto validity = rmm::device_uvector(string_count, stream); // intentionally not use `mr` + auto const input_sv = cudf::strings_column_view{input}; + auto const offsets_it = + cudf::detail::offsetalator_factory::make_input_iterator(input_sv.offsets()); auto const d_input_ptr = cudf::column_device_view::create(input, stream); + auto const is_valid_it = cudf::detail::make_validity_iterator(*d_input_ptr); auto const output_it = thrust::make_zip_iterator( thrust::make_tuple(output->mutable_view().begin(), validity.begin())); - thrust::tabulate(rmm::exec_policy_nosync(stream), - output_it, - output_it + input.size(), - [input = *d_input_ptr] __device__(auto idx) -> thrust::tuple { - if (input.is_valid(idx)) { - auto const d_str = input.element(idx); - if (d_str.size_bytes() == 4 && d_str[0] == 't' && d_str[1] == 'r' && - d_str[2] == 'u' && d_str[3] == 'e') { - return {true, true}; - } - if (d_str.size_bytes() == 5 && d_str[0] == 'f' && d_str[1] == 'a' && - d_str[2] == 'l' && d_str[3] == 's' && d_str[4] == 'e') { - return {false, true}; - } - } - - // Either null input, or the input string is neither `true` nor `false`. 
- return {false, false}; - }); + thrust::tabulate( + rmm::exec_policy_nosync(stream), + output_it, + output_it + string_count, + [chars = input_sv.chars_begin(stream), offsets = offsets_it, is_valid = is_valid_it] __device__( + auto idx) -> thrust::tuple { + if (is_valid[idx]) { + auto const start_offset = offsets[idx]; + auto const end_offset = offsets[idx + 1]; + auto const size = end_offset - start_offset; + auto const str = chars + start_offset; + + if (size == 4 && str[0] == 't' && str[1] == 'r' && str[2] == 'u' && str[3] == 'e') { + return {true, true}; + } + if (size == 5 && str[0] == 'f' && str[1] == 'a' && str[2] == 'l' && str[3] == 's' && + str[4] == 'e') { + return {false, true}; + } + } + + // Either null input, or the input string is neither `true` nor `false`. + return {false, false}; + }); return {std::move(output), std::move(validity)}; } @@ -342,47 +354,44 @@ rmm::device_uvector make_chars_buffer(cudf::column_view const& offsets, std::pair, rmm::device_uvector> remove_quotes( cudf::column_view const& input, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - auto const d_input_ptr = cudf::column_device_view::create(input, stream); - auto const string_count = input.size(); + auto const input_sv = cudf::strings_column_view{input}; + auto const input_offsets_it = + cudf::detail::offsetalator_factory::make_input_iterator(input_sv.offsets()); + auto const d_input_ptr = cudf::column_device_view::create(input, stream); + auto const is_valid_it = cudf::detail::make_validity_iterator(*d_input_ptr); - // Materialize the output string sizes to avoid repeated computation when being used multiple - // times later on. 
- auto output_sizes = rmm::device_uvector(string_count, stream); + auto const string_count = input.size(); + auto string_pairs = rmm::device_uvector(string_count, stream); thrust::tabulate(rmm::exec_policy_nosync(stream), - output_sizes.begin(), - output_sizes.end(), - [input = *d_input_ptr] __device__(cudf::size_type idx) -> cudf::size_type { - if (input.is_null(idx)) { return 0; } - - auto const d_str = input.element(idx); - auto const size = d_str.size_bytes(); + string_pairs.begin(), + string_pairs.end(), + [chars = input_sv.chars_begin(stream), + offsets = input_offsets_it, + is_valid = is_valid_it] __device__(cudf::size_type idx) -> string_index_pair { + if (!is_valid[idx]) { return {nullptr, 0}; } + + auto const start_offset = offsets[idx]; + auto const end_offset = offsets[idx + 1]; + auto const size = end_offset - start_offset; + auto const str = chars + start_offset; // Need to check for size, since the input string may contain just a single // character `"`. Such input should not be considered as quoted. - auto const is_quoted = size > 1 && d_str[0] == '"' && d_str[size - 1] == '"'; - return is_quoted ? size - 2 : size; + auto const is_quoted = size > 1 && str[0] == '"' && str[size - 1] == '"'; + auto const output_size = is_quoted ? size - 2 : size; + return {chars + start_offset + (is_quoted ? 
1 : 0), output_size}; }); - auto [offsets_column, bytes] = cudf::strings::detail::make_offsets_child_column( - output_sizes.begin(), output_sizes.end(), stream, mr); - - auto const input_sv = cudf::strings_column_view{input}; - auto const d_input_offsets = - cudf::detail::offsetalator_factory::make_input_iterator(input_sv.offsets()); - auto const index_pair_fn = cuda::proclaim_return_type>( - [chars = input_sv.chars_begin(stream), - input_offsets = d_input_offsets, - output_sizes = output_sizes.begin()] __device__(cudf::size_type idx) { - auto const start_offset = input_offsets[idx]; - auto const end_offset = input_offsets[idx + 1]; - auto const input_size = end_offset - start_offset; - auto const output_size = output_sizes[idx]; - - return thrust::pair{chars + start_offset + (input_size == output_size ? 0 : 1), output_size}; - }); - auto const index_pair_it = cudf::detail::make_counting_transform_iterator(0, index_pair_fn); - auto chars_data = /*cudf::strings::detail::*/ make_chars_buffer( - offsets_column->view(), bytes, index_pair_it, string_count, stream, mr); + auto const size_it = cudf::detail::make_counting_transform_iterator( + 0, + cuda::proclaim_return_type( + [string_pairs = string_pairs.begin()] __device__(cudf::size_type idx) -> cudf::size_type { + return string_pairs[idx].second; + })); + auto [offsets_column, bytes] = + cudf::strings::detail::make_offsets_child_column(size_it, size_it + string_count, stream, mr); + auto chars_data = /*cudf::strings::detail::*/ make_chars_buffer( + offsets_column->view(), bytes, string_pairs.begin(), string_count, stream, mr); auto output = cudf::make_strings_column(string_count, std::move(offsets_column), From 8d7ad2ed7f3122ed42ec30c091bad123d628938a Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Wed, 16 Oct 2024 20:46:01 -0700 Subject: [PATCH 04/58] Fix empty input Signed-off-by: Nghia Truong --- src/main/cpp/src/json_utils.cu | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git 
a/src/main/cpp/src/json_utils.cu b/src/main/cpp/src/json_utils.cu index e2a47fd67..7498ddf74 100644 --- a/src/main/cpp/src/json_utils.cu +++ b/src/main/cpp/src/json_utils.cu @@ -269,7 +269,12 @@ std::pair, rmm::device_uvector> cast_strings cudf::column_view const& input, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { auto const string_count = input.size(); - auto output = cudf::make_fixed_width_column( + if (string_count == 0) { + return {cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING}), + rmm::device_uvector(0, stream)}; + } + + auto output = cudf::make_fixed_width_column( cudf::data_type{cudf::type_id::BOOL8}, string_count, cudf::mask_state::UNALLOCATED, stream, mr); auto validity = rmm::device_uvector(string_count, stream); // intentionally not use `mr` @@ -354,14 +359,19 @@ rmm::device_uvector make_chars_buffer(cudf::column_view const& offsets, std::pair, rmm::device_uvector> remove_quotes( cudf::column_view const& input, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { + auto const string_count = input.size(); + if (string_count == 0) { + return {cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING}), + rmm::device_uvector(0, stream)}; + } + auto const input_sv = cudf::strings_column_view{input}; auto const input_offsets_it = cudf::detail::offsetalator_factory::make_input_iterator(input_sv.offsets()); auto const d_input_ptr = cudf::column_device_view::create(input, stream); auto const is_valid_it = cudf::detail::make_validity_iterator(*d_input_ptr); - auto const string_count = input.size(); - auto string_pairs = rmm::device_uvector(string_count, stream); + auto string_pairs = rmm::device_uvector(string_count, stream); thrust::tabulate(rmm::exec_policy_nosync(stream), string_pairs.begin(), string_pairs.end(), @@ -398,7 +408,9 @@ std::pair, rmm::device_uvector> remove_quote chars_data.release(), input.null_count(), cudf::detail::copy_bitmask(input, stream, mr)); - return {std::move(output), 
rmm::device_uvector{0, stream}}; + + // This function does not return the validity vector. + return {std::move(output), rmm::device_uvector(0, stream)}; } std::unique_ptr convert_column_type(cudf::column_view const& input, From 9e759c41881925442e3e88e63c787660f261a541 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Wed, 16 Oct 2024 20:57:14 -0700 Subject: [PATCH 05/58] Misc Signed-off-by: Nghia Truong --- src/main/cpp/src/json_utils.cu | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/src/main/cpp/src/json_utils.cu b/src/main/cpp/src/json_utils.cu index 7498ddf74..0a9920192 100644 --- a/src/main/cpp/src/json_utils.cu +++ b/src/main/cpp/src/json_utils.cu @@ -310,6 +310,9 @@ std::pair, rmm::device_uvector> cast_strings return {false, false}; }); + // Reset null count, as it is invalidated after calling to `mutable_view()`. + output->set_null_mask(rmm::device_buffer{}, 0); + return {std::move(output), std::move(validity)}; } @@ -480,11 +483,7 @@ std::unique_ptr cast_strings_to_booleans(cudf::column_view const& auto [output, validity] = detail::cast_strings_to_booleans(input, stream, mr); auto [null_mask, null_count] = cudf::detail::valid_if(validity.begin(), validity.end(), thrust::identity{}, stream, mr); - if (null_count > 0) { - output->set_null_mask(std::move(null_mask), null_count); - } else { - output->set_null_mask(rmm::device_buffer{}, 0); - } + if (null_count > 0) { output->set_null_mask(std::move(null_mask), null_count); } return std::move(output); } From 2fff9498de32a003f4da7f6d3c625151591d348b Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Wed, 16 Oct 2024 21:26:12 -0700 Subject: [PATCH 06/58] Add `nullifyIfNotQuoted` option for `removeQuotes` Signed-off-by: Nghia Truong --- src/main/cpp/src/JSONUtilsJni.cpp | 8 +-- src/main/cpp/src/json_utils.cu | 58 ++++++++++++++----- src/main/cpp/src/json_utils.hpp | 1 + .../nvidia/spark/rapids/jni/JSONUtils.java | 6 +- 4 files changed, 52 insertions(+), 21 deletions(-) diff --git 
a/src/main/cpp/src/JSONUtilsJni.cpp b/src/main/cpp/src/JSONUtilsJni.cpp index e82113c07..7d63c7a6e 100644 --- a/src/main/cpp/src/JSONUtilsJni.cpp +++ b/src/main/cpp/src/JSONUtilsJni.cpp @@ -212,16 +212,16 @@ Java_com_nvidia_spark_rapids_jni_JSONUtils_castStringsToBooleans(JNIEnv* env, jc CATCH_STD(env, 0); } -JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_JSONUtils_removeQuotes(JNIEnv* env, - jclass, - jlong j_input) +JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_JSONUtils_removeQuotes( + JNIEnv* env, jclass, jlong j_input, jboolean nullify_if_not_quoted) { JNI_NULL_CHECK(env, j_input, "j_input is null", 0); try { cudf::jni::auto_set_device(env); auto const input = *reinterpret_cast(j_input); - return cudf::jni::ptr_as_jlong(spark_rapids_jni::remove_quotes(input).release()); + return cudf::jni::ptr_as_jlong( + spark_rapids_jni::remove_quotes(input, nullify_if_not_quoted).release()); } CATCH_STD(env, 0); } diff --git a/src/main/cpp/src/json_utils.cu b/src/main/cpp/src/json_utils.cu index 0a9920192..9dcbe019e 100644 --- a/src/main/cpp/src/json_utils.cu +++ b/src/main/cpp/src/json_utils.cu @@ -276,7 +276,7 @@ std::pair, rmm::device_uvector> cast_strings auto output = cudf::make_fixed_width_column( cudf::data_type{cudf::type_id::BOOL8}, string_count, cudf::mask_state::UNALLOCATED, stream, mr); - auto validity = rmm::device_uvector(string_count, stream); // intentionally not use `mr` + auto validity = rmm::device_uvector(string_count, stream); auto const input_sv = cudf::strings_column_view{input}; auto const offsets_it = @@ -311,7 +311,7 @@ std::pair, rmm::device_uvector> cast_strings }); // Reset null count, as it is invalidated after calling to `mutable_view()`. 
- output->set_null_mask(rmm::device_buffer{}, 0); + output->set_null_mask(rmm::device_buffer{0, stream, mr}, 0); return {std::move(output), std::move(validity)}; } @@ -360,7 +360,10 @@ rmm::device_uvector make_chars_buffer(cudf::column_view const& offsets, } std::pair, rmm::device_uvector> remove_quotes( - cudf::column_view const& input, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) + cudf::column_view const& input, + bool nullify_if_not_quoted, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) { auto const string_count = input.size(); if (string_count == 0) { @@ -378,7 +381,8 @@ std::pair, rmm::device_uvector> remove_quote thrust::tabulate(rmm::exec_policy_nosync(stream), string_pairs.begin(), string_pairs.end(), - [chars = input_sv.chars_begin(stream), + [nullify_if_not_quoted, + chars = input_sv.chars_begin(stream), offsets = input_offsets_it, is_valid = is_valid_it] __device__(cudf::size_type idx) -> string_index_pair { if (!is_valid[idx]) { return {nullptr, 0}; } @@ -390,7 +394,9 @@ std::pair, rmm::device_uvector> remove_quote // Need to check for size, since the input string may contain just a single // character `"`. Such input should not be considered as quoted. - auto const is_quoted = size > 1 && str[0] == '"' && str[size - 1] == '"'; + auto const is_quoted = size > 1 && str[0] == '"' && str[size - 1] == '"'; + if (nullify_if_not_quoted && !is_quoted) { return {nullptr, 0}; } + auto const output_size = is_quoted ? size - 2 : size; return {chars + start_offset + (is_quoted ? 
1 : 0), output_size}; }); @@ -406,14 +412,32 @@ std::pair, rmm::device_uvector> remove_quote auto chars_data = /*cudf::strings::detail::*/ make_chars_buffer( offsets_column->view(), bytes, string_pairs.begin(), string_count, stream, mr); - auto output = cudf::make_strings_column(string_count, - std::move(offsets_column), - chars_data.release(), - input.null_count(), - cudf::detail::copy_bitmask(input, stream, mr)); - - // This function does not return the validity vector. - return {std::move(output), rmm::device_uvector(0, stream)}; + if (nullify_if_not_quoted) { + auto validity = rmm::device_uvector(string_count, stream); + thrust::transform( + rmm::exec_policy_nosync(stream), + string_pairs.begin(), + string_pairs.end(), + validity.begin(), + [] __device__(string_index_pair const& pair) { return pair.first != nullptr; }); + + // Null mask and null count will be updated later from the validity vector. + auto output = cudf::make_strings_column(string_count, + std::move(offsets_column), + chars_data.release(), + 0, + rmm::device_buffer{0, stream, mr}); + + return {std::move(output), std::move(validity)}; + } else { + auto output = cudf::make_strings_column(string_count, + std::move(offsets_column), + chars_data.release(), + input.null_count(), + cudf::detail::copy_bitmask(input, stream, mr)); + + return {std::move(output), rmm::device_uvector(0, stream)}; + } } std::unique_ptr convert_column_type(cudf::column_view const& input, @@ -488,12 +512,18 @@ std::unique_ptr cast_strings_to_booleans(cudf::column_view const& } std::unique_ptr remove_quotes(cudf::column_view const& input, + bool nullify_if_not_quoted, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - auto [output, validity] = detail::remove_quotes(input, stream, mr); + auto [output, validity] = detail::remove_quotes(input, nullify_if_not_quoted, stream, mr); + if (validity.size() > 0) { + auto [null_mask, null_count] = + cudf::detail::valid_if(validity.begin(), 
validity.end(), thrust::identity{}, stream, mr); + if (null_count > 0) { output->set_null_mask(std::move(null_mask), null_count); } + } return std::move(output); } diff --git a/src/main/cpp/src/json_utils.hpp b/src/main/cpp/src/json_utils.hpp index 198487b73..e9c1c5360 100644 --- a/src/main/cpp/src/json_utils.hpp +++ b/src/main/cpp/src/json_utils.hpp @@ -62,6 +62,7 @@ std::unique_ptr cast_strings_to_booleans( std::unique_ptr remove_quotes( cudf::column_view const& input, + bool nullify_if_not_quoted, rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); diff --git a/src/main/java/com/nvidia/spark/rapids/jni/JSONUtils.java b/src/main/java/com/nvidia/spark/rapids/jni/JSONUtils.java index 60380909d..81770b8fb 100644 --- a/src/main/java/com/nvidia/spark/rapids/jni/JSONUtils.java +++ b/src/main/java/com/nvidia/spark/rapids/jni/JSONUtils.java @@ -225,9 +225,9 @@ public static ColumnVector castStringsToBooleans(ColumnView input) { return new ColumnVector(castStringsToBooleans(input.getNativeView())); } - public static ColumnVector removeQuotes(ColumnView input) { + public static ColumnVector removeQuotes(ColumnView input, boolean nullifyIfNotQuoted) { assert (input.getType().equals(DType.STRING)) : "Input must be of STRING type"; - return new ColumnVector(removeQuotes(input.getNativeView())); + return new ColumnVector(removeQuotes(input.getNativeView(), nullifyIfNotQuoted)); } private static native int getMaxJSONPathDepth(); @@ -254,5 +254,5 @@ private static native long[] getJsonObjectMultiplePaths(long input, private static native long castStringsToBooleans(long input); - private static native long removeQuotes(long input); + private static native long removeQuotes(long input, boolean nullifyIfNotQuoted); } From d09de416d2450b3efa42510738cd465f7dcb11a5 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Thu, 17 Oct 2024 19:29:30 -0700 Subject: [PATCH 07/58] Implement `castStringsToDecimals` 
Signed-off-by: Nghia Truong --- src/main/cpp/src/JSONUtilsJni.cpp | 15 ++ src/main/cpp/src/json_utils.cu | 166 ++++++++++++++++++ src/main/cpp/src/json_utils.hpp | 8 + src/main/cpp/tests/cast_float_to_string.cpp | 58 +----- .../nvidia/spark/rapids/jni/JSONUtils.java | 8 + 5 files changed, 202 insertions(+), 53 deletions(-) diff --git a/src/main/cpp/src/JSONUtilsJni.cpp b/src/main/cpp/src/JSONUtilsJni.cpp index 7d63c7a6e..e53655718 100644 --- a/src/main/cpp/src/JSONUtilsJni.cpp +++ b/src/main/cpp/src/JSONUtilsJni.cpp @@ -212,6 +212,21 @@ Java_com_nvidia_spark_rapids_jni_JSONUtils_castStringsToBooleans(JNIEnv* env, jc CATCH_STD(env, 0); } +JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_JSONUtils_castStringsToDecimals( + JNIEnv* env, jclass, jlong j_input, jint precision, jint scale, jboolean is_us_locale) +{ + JNI_NULL_CHECK(env, j_input, "j_input is null", 0); + + try { + cudf::jni::auto_set_device(env); + auto const input = *reinterpret_cast(j_input); + + return cudf::jni::ptr_as_jlong( + spark_rapids_jni::cast_strings_to_decimals(input, precision, scale, is_us_locale).release()); + } + CATCH_STD(env, 0); +} + JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_JSONUtils_removeQuotes( JNIEnv* env, jclass, jlong j_input, jboolean nullify_if_not_quoted) { diff --git a/src/main/cpp/src/json_utils.cu b/src/main/cpp/src/json_utils.cu index 9dcbe019e..8220b7b95 100644 --- a/src/main/cpp/src/json_utils.cu +++ b/src/main/cpp/src/json_utils.cu @@ -14,6 +14,7 @@ * limitations under the License. 
*/ +#include "cast_string.hpp" #include "json_utils.hpp" #include @@ -33,6 +34,7 @@ #include #include +#include #include #include #include @@ -43,6 +45,8 @@ #include #include +#include + namespace spark_rapids_jni { namespace detail { @@ -359,6 +363,156 @@ rmm::device_uvector make_chars_buffer(cudf::column_view const& offsets, return chars_data; } +// TODO there is a bug here around 0 https://github.com/NVIDIA/spark-rapids/issues/10898 +std::unique_ptr cast_strings_to_decimals(cudf::column_view const& input, + int precision, + int scale, + bool is_us_locale, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + auto const string_count = input.size(); + if (string_count == 0) { + auto const dtype = [precision, scale]() { + if (precision <= std::numeric_limits::digits10) { + return cudf::data_type(cudf::type_id::DECIMAL32, scale); + } else if (precision <= std::numeric_limits::digits10) { + return cudf::data_type(cudf::type_id::DECIMAL64, scale); + } else if (precision <= std::numeric_limits<__int128_t>::digits10) { + return cudf::data_type(cudf::type_id::DECIMAL128, scale); + } else { + CUDF_FAIL("Unable to support decimal with precision " + std::to_string(precision)); + } + }(); + return cudf::make_empty_column(dtype); + } + + CUDF_EXPECTS(is_us_locale, "String to decimal conversion is only supported in US locale."); + + auto const input_sv = cudf::strings_column_view{input}; + auto const in_offsets = + cudf::detail::offsetalator_factory::make_input_iterator(input_sv.offsets()); + + // Count the number of characters `"`. + rmm::device_uvector quote_counts(string_count, stream); + // Count the number of characters `"` and `,` in each string. 
+ rmm::device_uvector remove_counts(string_count, stream); + + { + using count_type = thrust::tuple; + auto const check_it = cudf::detail::make_counting_transform_iterator( + 0, + cuda::proclaim_return_type( + [chars = input_sv.chars_begin(stream)] __device__(auto idx) { + auto const c = chars[idx]; + auto const is_quote = c == '"'; + auto const should_remove = is_quote || c == ','; + return count_type{static_cast(is_quote), static_cast(should_remove)}; + })); + auto const plus_op = + cuda::proclaim_return_type([] __device__(count_type lhs, count_type rhs) { + return count_type{thrust::get<0>(lhs) + thrust::get<0>(rhs), + thrust::get<1>(lhs) + thrust::get<1>(rhs)}; + }); + + auto const out_count_it = + thrust::make_zip_iterator(quote_counts.begin(), remove_counts.begin()); + + std::size_t temp_storage_bytes = 0; + cub::DeviceSegmentedReduce::Reduce(nullptr, + temp_storage_bytes, + check_it, + out_count_it, + string_count, + in_offsets, + in_offsets + 1, + plus_op, + count_type{0, 0}, + stream.value()); + auto d_temp_storage = rmm::device_buffer{temp_storage_bytes, stream}; + cub::DeviceSegmentedReduce::Reduce(d_temp_storage.data(), + temp_storage_bytes, + check_it, + out_count_it, + string_count, + in_offsets, + in_offsets + 1, + plus_op, + count_type{0, 0}, + stream.value()); + } + + auto const out_size_it = cudf::detail::make_counting_transform_iterator( + 0, + cuda::proclaim_return_type( + [offsets = in_offsets, + quote_counts = quote_counts.begin(), + remove_counts = remove_counts.begin()] __device__(auto idx) { + auto const input_size = offsets[idx + 1] - offsets[idx]; + // If the current row is a non-quoted string, just return the original string. + if (quote_counts[idx] == 0) { return static_cast(input_size); } + // Otherwise, we will modify the string, removing characters '"' and ','. 
+ return static_cast(input_size - remove_counts[idx]); + })); + auto [offsets_column, bytes] = cudf::strings::detail::make_offsets_child_column( + out_size_it, out_size_it + string_count, stream, mr); + + // If the output strings column does not change in its total bytes, we know that it does not have + // any '"' or ',' characters. + if (bytes == input_sv.chars_size(stream)) { + return string_to_decimal(precision, scale, input_sv, false, false, stream, mr); + } + + auto const out_offsets = + cudf::detail::offsetalator_factory::make_input_iterator(offsets_column->view()); + auto chars_data = rmm::device_uvector(bytes, stream, mr); + + // Since the strings store decimal numbers, they should be very short. + // As such, using one thread per string should be good. + thrust::for_each(rmm::exec_policy_nosync(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(string_count), + [in_offsets, + out_offsets, + input = input_sv.chars_begin(stream), + output = chars_data.begin()] __device__(auto idx) { + auto const in_size = in_offsets[idx + 1] - in_offsets[idx]; + auto const out_size = out_offsets[idx + 1] - out_offsets[idx]; + if (in_size == 0) { return; } + + // If the output size is not changed, we are returning the original unquoted + // string. Such string may still contain other alphabet characters, but that + // should be handled in the conversion function later on. + if (in_size == out_size) { + memcpy(output + out_offsets[idx], input + in_offsets[idx], in_size); + } else { // copy byte by byte, ignoring '"' and ',' characters. 
+ auto in_ptr = input + in_offsets[idx]; + auto in_end = input + in_offsets[idx + 1]; + auto out_ptr = output + out_offsets[idx]; + while (in_ptr != in_end) { + if (*in_ptr != '"' && *in_ptr != ',') { + *out_ptr = *in_ptr; + ++out_ptr; + } + ++in_ptr; + } + } + }); + + auto const unquoted_strings = cudf::make_strings_column(string_count, + std::move(offsets_column), + chars_data.release(), + 0, + rmm::device_buffer{0, stream, mr}); + return string_to_decimal(precision, + scale, + cudf::strings_column_view{unquoted_strings->view()}, + false, + false, + stream, + mr); +} + std::pair, rmm::device_uvector> remove_quotes( cudf::column_view const& input, bool nullify_if_not_quoted, @@ -511,6 +665,18 @@ std::unique_ptr cast_strings_to_booleans(cudf::column_view const& return std::move(output); } +std::unique_ptr cast_strings_to_decimals(cudf::column_view const& input, + int precision, + int scale, + bool is_us_locale, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_FUNC_RANGE(); + + return detail::cast_strings_to_decimals(input, precision, scale, is_us_locale, stream, mr); +} + std::unique_ptr remove_quotes(cudf::column_view const& input, bool nullify_if_not_quoted, rmm::cuda_stream_view stream, diff --git a/src/main/cpp/src/json_utils.hpp b/src/main/cpp/src/json_utils.hpp index e9c1c5360..924432a57 100644 --- a/src/main/cpp/src/json_utils.hpp +++ b/src/main/cpp/src/json_utils.hpp @@ -60,6 +60,14 @@ std::unique_ptr cast_strings_to_booleans( rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); +std::unique_ptr cast_strings_to_decimals( + cudf::column_view const& input, + int precision, + int scale, + bool is_us_locale, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); + std::unique_ptr remove_quotes( cudf::column_view const& input, bool nullify_if_not_quoted, diff --git 
a/src/main/cpp/tests/cast_float_to_string.cpp b/src/main/cpp/tests/cast_float_to_string.cpp index a118ec7fe..a9c8a332f 100644 --- a/src/main/cpp/tests/cast_float_to_string.cpp +++ b/src/main/cpp/tests/cast_float_to_string.cpp @@ -16,68 +16,20 @@ #include #include +#include #include -#include - -#include +#include using namespace cudf; -constexpr cudf::test::debug_output_level verbosity{cudf::test::debug_output_level::FIRST_ERROR}; - struct FloatToStringTests : public cudf::test::BaseFixture {}; TEST_F(FloatToStringTests, FromFloats32) { - auto const floats = - cudf::test::fixed_width_column_wrapper{100.0f, - 654321.25f, - -12761.125f, - 0.f, - 5.0f, - -4.0f, - std::numeric_limits::quiet_NaN(), - 123456789012.34f, - -0.0f}; - - auto results = spark_rapids_jni::float_to_string(floats, cudf::get_default_stream()); + auto const input = cudf::test::strings_column_wrapper{R"("26/08/2015")"}; + auto out = spark_rapids_jni::remove_quotes(input, true); - auto const expected = cudf::test::strings_column_wrapper{ - "100.0", "654321.25", "-12761.125", "0.0", "5.0", "-4.0", "NaN", "1.2345679E11", "-0.0"}; - - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected, verbosity); + // cudf::test::print(out->view()); } - -TEST_F(FloatToStringTests, FromFloats64) -{ - auto const floats = - cudf::test::fixed_width_column_wrapper{100.0d, - 654321.25d, - -12761.125d, - 1.123456789123456789d, - 0.000000000000000000123456789123456789d, - 0.0d, - 5.0d, - -4.0d, - std::numeric_limits::quiet_NaN(), - 839542223232.794248339d, - -0.0d}; - - auto results = spark_rapids_jni::float_to_string(floats, cudf::get_default_stream()); - - auto const expected = cudf::test::strings_column_wrapper{"100.0", - "654321.25", - "-12761.125", - "1.1234567891234568", - "1.234567891234568E-19", - "0.0", - "5.0", - "-4.0", - "NaN", - "8.395422232327942E11", - "-0.0"}; - - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected, verbosity); -} \ No newline at end of file diff --git 
a/src/main/java/com/nvidia/spark/rapids/jni/JSONUtils.java b/src/main/java/com/nvidia/spark/rapids/jni/JSONUtils.java index 81770b8fb..9f838cd4b 100644 --- a/src/main/java/com/nvidia/spark/rapids/jni/JSONUtils.java +++ b/src/main/java/com/nvidia/spark/rapids/jni/JSONUtils.java @@ -225,6 +225,12 @@ public static ColumnVector castStringsToBooleans(ColumnView input) { return new ColumnVector(castStringsToBooleans(input.getNativeView())); } + public static ColumnVector castStringsToDecimals(ColumnView input, int precision, int scale, + boolean isUSLocale) { + assert (input.getType().equals(DType.STRING)) : "Input must be of STRING type"; + return new ColumnVector(castStringsToDecimals(input.getNativeView(), precision, scale, isUSLocale)); + } + public static ColumnVector removeQuotes(ColumnView input, boolean nullifyIfNotQuoted) { assert (input.getType().equals(DType.STRING)) : "Input must be of STRING type"; return new ColumnVector(removeQuotes(input.getNativeView(), nullifyIfNotQuoted)); @@ -254,5 +260,7 @@ private static native long[] getJsonObjectMultiplePaths(long input, private static native long castStringsToBooleans(long input); + private static native long castStringsToDecimals(long input, int precision, int scale, boolean isUSLocale); + private static native long removeQuotes(long input, boolean nullifyIfNotQuoted); } From 576b65ca0991e8f95faae6da8807b3ce016569f6 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 18 Oct 2024 09:59:43 -0700 Subject: [PATCH 08/58] Implement `removeQuotesForFloats` Signed-off-by: Nghia Truong --- src/main/cpp/src/JSONUtilsJni.cpp | 13 ++++ src/main/cpp/src/json_utils.cu | 75 +++++++++++++++++++ src/main/cpp/src/json_utils.hpp | 5 ++ .../nvidia/spark/rapids/jni/JSONUtils.java | 7 ++ 4 files changed, 100 insertions(+) diff --git a/src/main/cpp/src/JSONUtilsJni.cpp b/src/main/cpp/src/JSONUtilsJni.cpp index e53655718..171f65cb2 100644 --- a/src/main/cpp/src/JSONUtilsJni.cpp +++ b/src/main/cpp/src/JSONUtilsJni.cpp @@ -241,4 
+241,17 @@ JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_JSONUtils_removeQuotes( CATCH_STD(env, 0); } +JNIEXPORT jlong JNICALL +Java_com_nvidia_spark_rapids_jni_JSONUtils_removeQuotesForFloats(JNIEnv* env, jclass, jlong j_input) +{ + JNI_NULL_CHECK(env, j_input, "j_input is null", 0); + + try { + cudf::jni::auto_set_device(env); + auto const input = *reinterpret_cast(j_input); + return cudf::jni::ptr_as_jlong(spark_rapids_jni::remove_quotes_for_floats(input).release()); + } + CATCH_STD(env, 0); +} + } // extern "C" diff --git a/src/main/cpp/src/json_utils.cu b/src/main/cpp/src/json_utils.cu index 8220b7b95..8862ddc7c 100644 --- a/src/main/cpp/src/json_utils.cu +++ b/src/main/cpp/src/json_utils.cu @@ -594,6 +594,71 @@ std::pair, rmm::device_uvector> remove_quote } } +// TODO: extract commond code for this and `remove_quotes`. +std::pair, rmm::device_uvector> remove_quotes_for_floats( + cudf::column_view const& input, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) +{ + auto const string_count = input.size(); + if (string_count == 0) { + return {cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING}), + rmm::device_uvector(0, stream)}; + } + + auto const input_sv = cudf::strings_column_view{input}; + auto const input_offsets_it = + cudf::detail::offsetalator_factory::make_input_iterator(input_sv.offsets()); + auto const d_input_ptr = cudf::column_device_view::create(input, stream); + auto const is_valid_it = cudf::detail::make_validity_iterator(*d_input_ptr); + + auto string_pairs = rmm::device_uvector(string_count, stream); + thrust::tabulate( + rmm::exec_policy_nosync(stream), + string_pairs.begin(), + string_pairs.end(), + [chars = input_sv.chars_begin(stream), + offsets = input_offsets_it, + is_valid = is_valid_it] __device__(cudf::size_type idx) -> string_index_pair { + if (!is_valid[idx]) { return {nullptr, 0}; } + + auto const start_offset = offsets[idx]; + auto const end_offset = offsets[idx + 1]; + auto const size = 
end_offset - start_offset; + auto const str = chars + start_offset; + + // Need to check for size, since the input string may contain just a single + // character `"`. Such input should not be considered as quoted. + auto const is_quoted = size > 1 && str[0] == '"' && str[size - 1] == '"'; + + // This is a special case, when `"INF"` is not accepted in `from_json`. + // We need to check for such string and nullify it. + if (is_quoted && size == 5 && str[1] == 'I' && str[2] == 'N' && str[3] == 'F') { + return {nullptr, 0}; + } + + auto const output_size = is_quoted ? size - 2 : size; + return {chars + start_offset + (is_quoted ? 1 : 0), output_size}; + }); + + auto const size_it = cudf::detail::make_counting_transform_iterator( + 0, + cuda::proclaim_return_type( + [string_pairs = string_pairs.begin()] __device__(cudf::size_type idx) -> cudf::size_type { + return string_pairs[idx].second; + })); + auto [offsets_column, bytes] = + cudf::strings::detail::make_offsets_child_column(size_it, size_it + string_count, stream, mr); + auto chars_data = /*cudf::strings::detail::*/ make_chars_buffer( + offsets_column->view(), bytes, string_pairs.begin(), string_count, stream, mr); + + auto output = cudf::make_strings_column(string_count, + std::move(offsets_column), + chars_data.release(), + input.null_count(), + cudf::detail::copy_bitmask(input, stream, mr)); + + return {std::move(output), rmm::device_uvector(0, stream)}; +} + std::unique_ptr convert_column_type(cudf::column_view const& input, json_schema_element const& schema, rmm::cuda_stream_view stream, @@ -693,4 +758,14 @@ std::unique_ptr remove_quotes(cudf::column_view const& input, return std::move(output); } +std::unique_ptr remove_quotes_for_floats(cudf::column_view const& input, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_FUNC_RANGE(); + + auto [output, validity] = detail::remove_quotes_for_floats(input, stream, mr); + return std::move(output); +} + } // namespace spark_rapids_jni 
diff --git a/src/main/cpp/src/json_utils.hpp b/src/main/cpp/src/json_utils.hpp index 924432a57..5447da294 100644 --- a/src/main/cpp/src/json_utils.hpp +++ b/src/main/cpp/src/json_utils.hpp @@ -74,4 +74,9 @@ std::unique_ptr remove_quotes( rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); +std::unique_ptr remove_quotes_for_floats( + cudf::column_view const& input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); + } // namespace spark_rapids_jni diff --git a/src/main/java/com/nvidia/spark/rapids/jni/JSONUtils.java b/src/main/java/com/nvidia/spark/rapids/jni/JSONUtils.java index 9f838cd4b..5dfb8a259 100644 --- a/src/main/java/com/nvidia/spark/rapids/jni/JSONUtils.java +++ b/src/main/java/com/nvidia/spark/rapids/jni/JSONUtils.java @@ -236,6 +236,11 @@ public static ColumnVector removeQuotes(ColumnView input, boolean nullifyIfNotQu return new ColumnVector(removeQuotes(input.getNativeView(), nullifyIfNotQuoted)); } + public static ColumnVector removeQuotesForFloats(ColumnView input) { + assert (input.getType().equals(DType.STRING)) : "Input must be of STRING type"; + return new ColumnVector(removeQuotesForFloats(input.getNativeView())); + } + private static native int getMaxJSONPathDepth(); private static native long getJsonObject(long input, @@ -263,4 +268,6 @@ private static native long[] getJsonObjectMultiplePaths(long input, private static native long castStringsToDecimals(long input, int precision, int scale, boolean isUSLocale); private static native long removeQuotes(long input, boolean nullifyIfNotQuoted); + + private static native long removeQuotesForFloats(long input); } From 2bd53353d2a7409f249e4e850ef6bc83de90a5ae Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 18 Oct 2024 10:45:50 -0700 Subject: [PATCH 09/58] Fix `removeQuotesForFloats` Signed-off-by: Nghia Truong --- 
src/main/cpp/src/json_utils.cu | 69 +++++++++++++++++++++------------- 1 file changed, 43 insertions(+), 26 deletions(-) diff --git a/src/main/cpp/src/json_utils.cu b/src/main/cpp/src/json_utils.cu index 8862ddc7c..434f9adca 100644 --- a/src/main/cpp/src/json_utils.cu +++ b/src/main/cpp/src/json_utils.cu @@ -611,33 +611,50 @@ std::pair, rmm::device_uvector> remove_quote auto const is_valid_it = cudf::detail::make_validity_iterator(*d_input_ptr); auto string_pairs = rmm::device_uvector(string_count, stream); - thrust::tabulate( - rmm::exec_policy_nosync(stream), - string_pairs.begin(), - string_pairs.end(), - [chars = input_sv.chars_begin(stream), - offsets = input_offsets_it, - is_valid = is_valid_it] __device__(cudf::size_type idx) -> string_index_pair { - if (!is_valid[idx]) { return {nullptr, 0}; } - - auto const start_offset = offsets[idx]; - auto const end_offset = offsets[idx + 1]; - auto const size = end_offset - start_offset; - auto const str = chars + start_offset; - - // Need to check for size, since the input string may contain just a single - // character `"`. Such input should not be considered as quoted. - auto const is_quoted = size > 1 && str[0] == '"' && str[size - 1] == '"'; - - // This is a special case, when `"INF"` is not accepted in `from_json`. - // We need to check for such string and nullify it. - if (is_quoted && size == 5 && str[1] == 'I' && str[2] == 'N' && str[3] == 'F') { - return {nullptr, 0}; - } + thrust::tabulate(rmm::exec_policy_nosync(stream), + string_pairs.begin(), + string_pairs.end(), + [chars = input_sv.chars_begin(stream), + offsets = input_offsets_it, + is_valid = is_valid_it] __device__(cudf::size_type idx) -> string_index_pair { + if (!is_valid[idx]) { return {nullptr, 0}; } - auto const output_size = is_quoted ? size - 2 : size; - return {chars + start_offset + (is_quoted ? 
1 : 0), output_size}; - }); + auto const start_offset = offsets[idx]; + auto const end_offset = offsets[idx + 1]; + auto const size = end_offset - start_offset; + auto const str = chars + start_offset; + + // Need to check for size, since the input string may contain just a single + // character `"`. Such input should not be considered as quoted. + auto const is_quoted = size > 1 && str[0] == '"' && str[size - 1] == '"'; + + // We check and remove quotes only for the special cases (non-numeric numbers + // wrapped in double quotes) that are accepted in `from_json`. + // They are "NaN", "+INF", "-INF", "+Infinity", "Infinity", "-Infinity". + if (is_quoted) { + // "NaN" + auto accepted = size == 5 && str[1] == 'N' && str[2] == 'a' && str[3] == 'N'; + + // "+INF" and "-INF" + accepted = accepted || (size == 6 && (str[1] == '+' || str[1] == '-') && + str[2] == 'I' && str[3] == 'N' && str[4] == 'F'); + + // "Infinity" + accepted = accepted || (size == 10 && str[1] == 'I' && str[2] == 'n' && + str[3] == 'f' && str[4] == 'i' && str[5] == 'n' && + str[6] == 'i' && str[7] == 't' && str[8] == 'y'); + + // "+Infinity" and "-Infinity" + accepted = accepted || (size == 11 && (str[1] == '+' || str[1] == '-') && + str[2] == 'I' && str[3] == 'n' && str[4] == 'f' && + str[5] == 'i' && str[6] == 'n' && str[7] == 'i' && + str[8] == 't' && str[9] == 'y'); + + if (accepted) { return {str + 1, size - 2}; } + } + + return {str, size}; + }); auto const size_it = cudf::detail::make_counting_transform_iterator( 0, From 21c80a5cb14cdd5d5a7788ec4be73a15e7b2847d Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 18 Oct 2024 11:23:05 -0700 Subject: [PATCH 10/58] Implement `castStringsToIntegers` Signed-off-by: Nghia Truong --- src/main/cpp/src/JSONUtilsJni.cpp | 17 ++ src/main/cpp/src/json_utils.cu | 151 +++++++++++++----- src/main/cpp/src/json_utils.hpp | 6 + .../nvidia/spark/rapids/jni/JSONUtils.java | 8 + 4 files changed, 144 insertions(+), 38 deletions(-) diff --git 
a/src/main/cpp/src/JSONUtilsJni.cpp b/src/main/cpp/src/JSONUtilsJni.cpp index 171f65cb2..795698636 100644 --- a/src/main/cpp/src/JSONUtilsJni.cpp +++ b/src/main/cpp/src/JSONUtilsJni.cpp @@ -227,6 +227,23 @@ JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_JSONUtils_castStringsTo CATCH_STD(env, 0); } +JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_JSONUtils_castStringsToIntegers( + JNIEnv* env, jclass, jlong j_input, jint output_type_id) +{ + JNI_NULL_CHECK(env, j_input, "j_input is null", 0); + + try { + cudf::jni::auto_set_device(env); + auto const input = *reinterpret_cast(j_input); + + return cudf::jni::ptr_as_jlong( + spark_rapids_jni::cast_strings_to_integers( + input, cudf::data_type{static_cast(output_type_id)}) + .release()); + } + CATCH_STD(env, 0); +} + JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_JSONUtils_removeQuotes( JNIEnv* env, jclass, jlong j_input, jboolean nullify_if_not_quoted) { diff --git a/src/main/cpp/src/json_utils.cu b/src/main/cpp/src/json_utils.cu index 434f9adca..c123c302a 100644 --- a/src/main/cpp/src/json_utils.cu +++ b/src/main/cpp/src/json_utils.cu @@ -269,12 +269,55 @@ namespace { using string_index_pair = thrust::pair; +// TODO: remove this. +template +rmm::device_uvector make_chars_buffer(cudf::column_view const& offsets, + int64_t chars_size, + IndexPairIterator begin, + cudf::size_type string_count, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + auto chars_data = rmm::device_uvector(chars_size, stream, mr); + auto const d_offsets = cudf::detail::offsetalator_factory::make_input_iterator(offsets); + + auto const src_ptrs = cudf::detail::make_counting_transform_iterator( + 0u, cuda::proclaim_return_type([begin] __device__(uint32_t idx) { + // Due to a bug in cub (https://github.com/NVIDIA/cccl/issues/586), + // we have to use `const_cast` to remove `const` qualifier from the source pointer. 
+ // This should be fine as long as we only read but not write anything to the source. + return reinterpret_cast(const_cast(begin[idx].first)); + })); + auto const src_sizes = cudf::detail::make_counting_transform_iterator( + 0u, cuda::proclaim_return_type([begin] __device__(uint32_t idx) { + return begin[idx].second; + })); + auto const dst_ptrs = cudf::detail::make_counting_transform_iterator( + 0u, + cuda::proclaim_return_type([offsets = d_offsets, output = chars_data.data()] __device__( + uint32_t idx) { return output + offsets[idx]; })); + + size_t temp_storage_bytes = 0; + CUDF_CUDA_TRY(cub::DeviceMemcpy::Batched( + nullptr, temp_storage_bytes, src_ptrs, dst_ptrs, src_sizes, string_count, stream.value())); + rmm::device_buffer d_temp_storage(temp_storage_bytes, stream); + CUDF_CUDA_TRY(cub::DeviceMemcpy::Batched(d_temp_storage.data(), + temp_storage_bytes, + src_ptrs, + dst_ptrs, + src_sizes, + string_count, + stream.value())); + + return chars_data; +} + std::pair, rmm::device_uvector> cast_strings_to_booleans( cudf::column_view const& input, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { auto const string_count = input.size(); if (string_count == 0) { - return {cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING}), + return {cudf::make_empty_column(cudf::data_type{cudf::type_id::BOOL8}), rmm::device_uvector(0, stream)}; } @@ -320,47 +363,68 @@ std::pair, rmm::device_uvector> cast_strings return {std::move(output), std::move(validity)}; } -// TODO: remove this. 
-template -rmm::device_uvector make_chars_buffer(cudf::column_view const& offsets, - int64_t chars_size, - IndexPairIterator begin, - cudf::size_type string_count, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) +std::pair, rmm::device_uvector> cast_strings_to_integers( + cudf::column_view const& input, + cudf::data_type output_type, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) { - auto chars_data = rmm::device_uvector(chars_size, stream, mr); - auto const d_offsets = cudf::detail::offsetalator_factory::make_input_iterator(offsets); + auto const string_count = input.size(); + if (string_count == 0) { + return {cudf::make_empty_column(output_type), rmm::device_uvector(0, stream)}; + } - auto const src_ptrs = cudf::detail::make_counting_transform_iterator( - 0u, cuda::proclaim_return_type([begin] __device__(uint32_t idx) { - // Due to a bug in cub (https://github.com/NVIDIA/cccl/issues/586), - // we have to use `const_cast` to remove `const` qualifier from the source pointer. - // This should be fine as long as we only read but not write anything to the source. 
- return reinterpret_cast(const_cast(begin[idx].first)); - })); - auto const src_sizes = cudf::detail::make_counting_transform_iterator( - 0u, cuda::proclaim_return_type([begin] __device__(uint32_t idx) { - return begin[idx].second; - })); - auto const dst_ptrs = cudf::detail::make_counting_transform_iterator( - 0u, - cuda::proclaim_return_type([offsets = d_offsets, output = chars_data.data()] __device__( - uint32_t idx) { return output + offsets[idx]; })); + auto const input_sv = cudf::strings_column_view{input}; + auto const input_offsets_it = + cudf::detail::offsetalator_factory::make_input_iterator(input_sv.offsets()); + auto const d_input_ptr = cudf::column_device_view::create(input, stream); + auto const is_valid_it = cudf::detail::make_validity_iterator(*d_input_ptr); - size_t temp_storage_bytes = 0; - CUDF_CUDA_TRY(cub::DeviceMemcpy::Batched( - nullptr, temp_storage_bytes, src_ptrs, dst_ptrs, src_sizes, string_count, stream.value())); - rmm::device_buffer d_temp_storage(temp_storage_bytes, stream); - CUDF_CUDA_TRY(cub::DeviceMemcpy::Batched(d_temp_storage.data(), - temp_storage_bytes, - src_ptrs, - dst_ptrs, - src_sizes, - string_count, - stream.value())); + auto string_pairs = rmm::device_uvector(string_count, stream); + // Since the strings store integer numbers, they should be very short. + // As such, using one thread per string should be good. + thrust::tabulate(rmm::exec_policy_nosync(stream), + string_pairs.begin(), + string_pairs.end(), + [chars = input_sv.chars_begin(stream), + offsets = input_offsets_it, + is_valid = is_valid_it] __device__(cudf::size_type idx) -> string_index_pair { + if (!is_valid[idx]) { return {nullptr, 0}; } - return chars_data; + auto const start_offset = offsets[idx]; + auto const end_offset = offsets[idx + 1]; + + auto in_ptr = chars + start_offset; + auto in_end = chars + end_offset; + while (in_ptr != in_end) { + if (*in_ptr == '.' 
|| *in_ptr == 'e' || *in_ptr == 'E') { + return {nullptr, 0}; + } + ++in_ptr; + } + + return {chars + start_offset, end_offset - start_offset}; + }); + + auto const size_it = cudf::detail::make_counting_transform_iterator( + 0, + cuda::proclaim_return_type( + [string_pairs = string_pairs.begin()] __device__(cudf::size_type idx) -> cudf::size_type { + return string_pairs[idx].second; + })); + auto [offsets_column, bytes] = + cudf::strings::detail::make_offsets_child_column(size_it, size_it + string_count, stream, mr); + auto chars_data = /*cudf::strings::detail::*/ make_chars_buffer( + offsets_column->view(), bytes, string_pairs.begin(), string_count, stream, mr); + + // Don't care about the null mask, as nulls imply empty strings, and will be nullified. + auto const sanitized_input = + cudf::make_strings_column(string_count, std::move(offsets_column), chars_data.release(), 0, {}); + + auto output = string_to_integer( + output_type, cudf::strings_column_view{sanitized_input->view()}, false, false, stream, mr); + + return {std::move(output), rmm::device_uvector(0, stream)}; } // TODO there is a bug here around 0 https://github.com/NVIDIA/spark-rapids/issues/10898 @@ -747,6 +811,17 @@ std::unique_ptr cast_strings_to_booleans(cudf::column_view const& return std::move(output); } +std::unique_ptr cast_strings_to_integers(cudf::column_view const& input, + cudf::data_type output_type, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_FUNC_RANGE(); + + auto [output, validity] = detail::cast_strings_to_integers(input, output_type, stream, mr); + return std::move(output); +} + std::unique_ptr cast_strings_to_decimals(cudf::column_view const& input, int precision, int scale, diff --git a/src/main/cpp/src/json_utils.hpp b/src/main/cpp/src/json_utils.hpp index 5447da294..9bf958adf 100644 --- a/src/main/cpp/src/json_utils.hpp +++ b/src/main/cpp/src/json_utils.hpp @@ -68,6 +68,12 @@ std::unique_ptr cast_strings_to_decimals( rmm::cuda_stream_view stream 
= cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); +std::unique_ptr cast_strings_to_integers( + cudf::column_view const& input, + cudf::data_type output_type, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); + std::unique_ptr remove_quotes( cudf::column_view const& input, bool nullify_if_not_quoted, diff --git a/src/main/java/com/nvidia/spark/rapids/jni/JSONUtils.java b/src/main/java/com/nvidia/spark/rapids/jni/JSONUtils.java index 5dfb8a259..da2d179e0 100644 --- a/src/main/java/com/nvidia/spark/rapids/jni/JSONUtils.java +++ b/src/main/java/com/nvidia/spark/rapids/jni/JSONUtils.java @@ -231,6 +231,12 @@ public static ColumnVector castStringsToDecimals(ColumnView input, int precision return new ColumnVector(castStringsToDecimals(input.getNativeView(), precision, scale, isUSLocale)); } + public static ColumnVector castStringsToIntegers(ColumnView input, DType output_type) { + assert (input.getType().equals(DType.STRING)) : "Input must be of STRING type"; + return new ColumnVector(castStringsToIntegers(input.getNativeView(), + output_type.getTypeId().getNativeId())); + } + public static ColumnVector removeQuotes(ColumnView input, boolean nullifyIfNotQuoted) { assert (input.getType().equals(DType.STRING)) : "Input must be of STRING type"; return new ColumnVector(removeQuotes(input.getNativeView(), nullifyIfNotQuoted)); @@ -267,6 +273,8 @@ private static native long[] getJsonObjectMultiplePaths(long input, private static native long castStringsToDecimals(long input, int precision, int scale, boolean isUSLocale); + private static native long castStringsToIntegers(long input, int outputType); + private static native long removeQuotes(long input, boolean nullifyIfNotQuoted); private static native long removeQuotesForFloats(long input); From 1a7d1920c2c2d46ca8f082cb87a4dbff8ffe78a4 Mon Sep 17 00:00:00 2001 From: Nghia Truong 
Date: Fri, 18 Oct 2024 15:08:32 -0700 Subject: [PATCH 11/58] Implement non-legacy `castStringsToDates` Signed-off-by: Nghia Truong --- src/main/cpp/src/JSONUtilsJni.cpp | 26 ++++ src/main/cpp/src/json_utils.cu | 117 ++++++++++++++++++ src/main/cpp/src/json_utils.hpp | 10 ++ .../nvidia/spark/rapids/jni/JSONUtils.java | 12 ++ 4 files changed, 165 insertions(+) diff --git a/src/main/cpp/src/JSONUtilsJni.cpp b/src/main/cpp/src/JSONUtilsJni.cpp index 795698636..341a1266f 100644 --- a/src/main/cpp/src/JSONUtilsJni.cpp +++ b/src/main/cpp/src/JSONUtilsJni.cpp @@ -244,6 +244,32 @@ JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_JSONUtils_castStringsTo CATCH_STD(env, 0); } +JNIEXPORT jlong JNICALL +Java_com_nvidia_spark_rapids_jni_JSONUtils_castStringsToDates(JNIEnv* env, + jclass, + jlong j_input, + jstring j_date_regex, + jstring j_date_format, + jboolean error_if_invalid) +{ + JNI_NULL_CHECK(env, j_input, "j_input is null", 0); + JNI_NULL_CHECK(env, j_date_regex, "date_regex is null", 0); + JNI_NULL_CHECK(env, j_date_format, "date_format is null", 0); + + try { + cudf::jni::auto_set_device(env); + auto const input = *reinterpret_cast(j_input); + cudf::jni::native_jstring date_regex(env, j_date_regex); + cudf::jni::native_jstring date_format(env, j_date_format); + + auto output = spark_rapids_jni::cast_strings_to_dates( + input, date_regex.get(), date_format.get(), error_if_invalid); + if (output == nullptr) { return 0; } + return cudf::jni::ptr_as_jlong(output.release()); + } + CATCH_STD(env, 0); +} + JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_JSONUtils_removeQuotes( JNIEnv* env, jclass, jlong j_input, jboolean nullify_if_not_quoted) { diff --git a/src/main/cpp/src/json_utils.cu b/src/main/cpp/src/json_utils.cu index c123c302a..4c7ea6993 100644 --- a/src/main/cpp/src/json_utils.cu +++ b/src/main/cpp/src/json_utils.cu @@ -23,8 +23,11 @@ #include #include #include +#include +#include #include #include +#include #include #include @@ -427,6 +430,101 @@ 
std::pair, rmm::device_uvector> cast_strings return {std::move(output), rmm::device_uvector(0, stream)}; } +std::pair, rmm::device_uvector> cast_strings_to_dates( + cudf::column_view const& input, + std::string const& date_regex, + std::string const& date_format, + bool error_if_invalid, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + auto const string_count = input.size(); + if (string_count == 0) { + return {cudf::make_empty_column(cudf::data_type{cudf::type_id::TIMESTAMP_DAYS}), + rmm::device_uvector(0, stream)}; + } + + // TODO: mr + auto const removed_quotes = remove_quotes(input, false, stream, mr); + + auto const input_sv = cudf::strings_column_view{removed_quotes->view()}; + auto const regex_prog = cudf::strings::regex_program::create( + date_regex, cudf::strings::regex_flags::DEFAULT, cudf::strings::capture_groups::NON_CAPTURE); + auto const is_matched = cudf::strings::matches_re(input_sv, *regex_prog, stream); + auto const is_timestamp = cudf::strings::is_timestamp(input_sv, date_format, stream); + auto const d_is_matched = is_matched->view().begin(); + auto const d_is_timestamp = is_timestamp->view().begin(); + + auto const d_input_ptr = cudf::column_device_view::create(removed_quotes->view(), stream); + auto const is_valid_it = cudf::detail::make_validity_iterator(*d_input_ptr); + auto const invalid_count = thrust::count_if( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(string_count), + [is_valid = is_valid_it, is_matched = d_is_matched, is_timestamp = d_is_timestamp] __device__( + auto idx) { return is_valid[idx] && (!is_matched[idx] || !is_timestamp[idx]); }); + + if (invalid_count == 0) { + auto output = cudf::strings::to_timestamps( + input_sv, cudf::data_type{cudf::type_id::TIMESTAMP_DAYS}, date_format, stream, mr); + return {std::move(output), rmm::device_uvector(0, stream)}; + } + + // From here we have invalid_count > 0. 
+ if (error_if_invalid) { return {nullptr, rmm::device_uvector(0, stream)}; } + + auto const input_offsets_it = + cudf::detail::offsetalator_factory::make_input_iterator(input_sv.offsets()); + auto string_pairs = rmm::device_uvector(string_count, stream); + + thrust::tabulate( + rmm::exec_policy_nosync(stream), + string_pairs.begin(), + string_pairs.end(), + [chars = input_sv.chars_begin(stream), + offsets = input_offsets_it, + is_valid = is_valid_it, + is_matched = d_is_matched, + is_timestamp = d_is_timestamp] __device__(cudf::size_type idx) -> string_index_pair { + if (!is_valid[idx] || !is_matched[idx] || !is_timestamp[idx]) { return {nullptr, 0}; } + + auto const start_offset = offsets[idx]; + auto const end_offset = offsets[idx + 1]; + return {chars + start_offset, end_offset - start_offset}; + }); + + auto const size_it = cudf::detail::make_counting_transform_iterator( + 0, + cuda::proclaim_return_type( + [string_pairs = string_pairs.begin()] __device__(cudf::size_type idx) -> cudf::size_type { + return string_pairs[idx].second; + })); + auto [offsets_column, bytes] = + cudf::strings::detail::make_offsets_child_column(size_it, size_it + string_count, stream, mr); + auto chars_data = /*cudf::strings::detail::*/ make_chars_buffer( + offsets_column->view(), bytes, string_pairs.begin(), string_count, stream, mr); + + // Don't care about the null mask, as nulls imply empty strings, and will be nullified. 
+ auto const sanitized_input = + cudf::make_strings_column(string_count, std::move(offsets_column), chars_data.release(), 0, {}); + + auto output = cudf::strings::to_timestamps(cudf::strings_column_view{sanitized_input->view()}, + cudf::data_type{cudf::type_id::TIMESTAMP_DAYS}, + date_format, + stream, + mr); + + auto validity = rmm::device_uvector(string_count, stream); + thrust::transform(rmm::exec_policy_nosync(stream), + string_pairs.begin(), + string_pairs.end(), + validity.begin(), + [] __device__(string_index_pair const& pair) { return pair.first != nullptr; }); + + // Null mask and null count will be updated later from the validity vector. + return {std::move(output), std::move(validity)}; +} + // TODO there is a bug here around 0 https://github.com/NVIDIA/spark-rapids/issues/10898 std::unique_ptr cast_strings_to_decimals(cudf::column_view const& input, int precision, @@ -822,6 +920,25 @@ std::unique_ptr cast_strings_to_integers(cudf::column_view const& return std::move(output); } +std::unique_ptr cast_strings_to_dates(cudf::column_view const& input, + std::string const& date_regex, + std::string const& date_format, + bool error_if_invalid, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_FUNC_RANGE(); + + auto [output, validity] = + detail::cast_strings_to_dates(input, date_regex, date_format, error_if_invalid, stream, mr); + + if (output == nullptr) { return nullptr; } + auto [null_mask, null_count] = + cudf::detail::valid_if(validity.begin(), validity.end(), thrust::identity{}, stream, mr); + if (null_count > 0) { output->set_null_mask(std::move(null_mask), null_count); } + return std::move(output); +} + std::unique_ptr cast_strings_to_decimals(cudf::column_view const& input, int precision, int scale, diff --git a/src/main/cpp/src/json_utils.hpp b/src/main/cpp/src/json_utils.hpp index 9bf958adf..60dca5a38 100644 --- a/src/main/cpp/src/json_utils.hpp +++ b/src/main/cpp/src/json_utils.hpp @@ -27,6 +27,8 @@ namespace 
spark_rapids_jni { +// TODO: replace rmm::mr::get_current_device_resource() by cudf + std::unique_ptr from_json_to_raw_map( cudf::strings_column_view const& input, rmm::cuda_stream_view stream = cudf::get_default_stream(), @@ -74,6 +76,14 @@ std::unique_ptr cast_strings_to_integers( rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); +std::unique_ptr cast_strings_to_dates( + cudf::column_view const& input, + std::string const& date_regex, + std::string const& date_format, + bool error_if_invalid, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); + std::unique_ptr remove_quotes( cudf::column_view const& input, bool nullify_if_not_quoted, diff --git a/src/main/java/com/nvidia/spark/rapids/jni/JSONUtils.java b/src/main/java/com/nvidia/spark/rapids/jni/JSONUtils.java index da2d179e0..66ef289de 100644 --- a/src/main/java/com/nvidia/spark/rapids/jni/JSONUtils.java +++ b/src/main/java/com/nvidia/spark/rapids/jni/JSONUtils.java @@ -237,6 +237,16 @@ public static ColumnVector castStringsToIntegers(ColumnView input, DType output_ output_type.getTypeId().getNativeId())); } + public static ColumnVector castStringsToDates(ColumnView input, String dateRegex, + String dateFormat, boolean failOnInvalid) { + assert (input.getType().equals(DType.STRING)) : "Input must be of STRING type"; + long output = castStringsToDates(input.getNativeView(), dateRegex, dateFormat, failOnInvalid); + if (output == 0) { + return null; + } + return new ColumnVector(output); + } + public static ColumnVector removeQuotes(ColumnView input, boolean nullifyIfNotQuoted) { assert (input.getType().equals(DType.STRING)) : "Input must be of STRING type"; return new ColumnVector(removeQuotes(input.getNativeView(), nullifyIfNotQuoted)); @@ -275,6 +285,8 @@ private static native long[] getJsonObjectMultiplePaths(long input, private static 
native long castStringsToIntegers(long input, int outputType); + private static native long castStringsToDates(long input, String dateRegex, String dateFormat, boolean failOnInvalid); + private static native long removeQuotes(long input, boolean nullifyIfNotQuoted); private static native long removeQuotesForFloats(long input); From dcb463e6f74def2479c97d1dc29082446ba1637b Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Mon, 21 Oct 2024 09:53:42 -0700 Subject: [PATCH 12/58] WIP for `cast_strings_to_dates_legacy` Signed-off-by: Nghia Truong --- src/main/cpp/src/json_utils.cu | 104 +++++++++++++++++++++++++++++--- src/main/cpp/src/json_utils.hpp | 7 +++ 2 files changed, 101 insertions(+), 10 deletions(-) diff --git a/src/main/cpp/src/json_utils.cu b/src/main/cpp/src/json_utils.cu index 4c7ea6993..62d25a7d5 100644 --- a/src/main/cpp/src/json_utils.cu +++ b/src/main/cpp/src/json_utils.cu @@ -104,8 +104,8 @@ std::tuple, std::unique_ptr, c thrust::make_counting_iterator(0L), thrust::make_counting_iterator(input.size() * static_cast(cudf::detail::warp_size)), [input = *d_input_ptr, - output = thrust::make_zip_iterator(thrust::make_tuple( - is_valid_input.begin(), is_null_or_empty.begin()))] __device__(int64_t tidx) { + output = thrust::make_zip_iterator(is_valid_input.begin(), + is_null_or_empty.begin())] __device__(int64_t tidx) { // Execute one warp per row to minimize thread divergence. 
if ((tidx % cudf::detail::warp_size) != 0) { return; } auto const idx = tidx / cudf::detail::warp_size; @@ -333,8 +333,8 @@ std::pair, rmm::device_uvector> cast_strings cudf::detail::offsetalator_factory::make_input_iterator(input_sv.offsets()); auto const d_input_ptr = cudf::column_device_view::create(input, stream); auto const is_valid_it = cudf::detail::make_validity_iterator(*d_input_ptr); - auto const output_it = thrust::make_zip_iterator( - thrust::make_tuple(output->mutable_view().begin(), validity.begin())); + auto const output_it = + thrust::make_zip_iterator(output->mutable_view().begin(), validity.begin()); thrust::tabulate( rmm::exec_policy_nosync(stream), output_it, @@ -445,9 +445,12 @@ std::pair, rmm::device_uvector> cast_strings } // TODO: mr - auto const removed_quotes = remove_quotes(input, false, stream, mr); + auto const removed_quotes = remove_quotes(input, false, stream, mr); + auto const removed_quotes_cv = removed_quotes->view(); + auto const input_sv = cudf::strings_column_view{removed_quotes_cv}; + auto const d_input_ptr = cudf::column_device_view::create(removed_quotes_cv, stream); + auto const is_valid_it = cudf::detail::make_validity_iterator(*d_input_ptr); - auto const input_sv = cudf::strings_column_view{removed_quotes->view()}; auto const regex_prog = cudf::strings::regex_program::create( date_regex, cudf::strings::regex_flags::DEFAULT, cudf::strings::capture_groups::NON_CAPTURE); auto const is_matched = cudf::strings::matches_re(input_sv, *regex_prog, stream); @@ -455,22 +458,23 @@ std::pair, rmm::device_uvector> cast_strings auto const d_is_matched = is_matched->view().begin(); auto const d_is_timestamp = is_timestamp->view().begin(); - auto const d_input_ptr = cudf::column_device_view::create(removed_quotes->view(), stream); - auto const is_valid_it = cudf::detail::make_validity_iterator(*d_input_ptr); auto const invalid_count = thrust::count_if( rmm::exec_policy(stream), thrust::make_counting_iterator(0), 
thrust::make_counting_iterator(string_count), [is_valid = is_valid_it, is_matched = d_is_matched, is_timestamp = d_is_timestamp] __device__( - auto idx) { return is_valid[idx] && (!is_matched[idx] || !is_timestamp[idx]); }); + auto idx) { + // The row is invalid if it is valid (non-null) but failed at least one check. + return is_valid[idx] && (!is_matched[idx] || !is_timestamp[idx]); + }); if (invalid_count == 0) { auto output = cudf::strings::to_timestamps( input_sv, cudf::data_type{cudf::type_id::TIMESTAMP_DAYS}, date_format, stream, mr); return {std::move(output), rmm::device_uvector(0, stream)}; } - // From here we have invalid_count > 0. + if (error_if_invalid) { return {nullptr, rmm::device_uvector(0, stream)}; } auto const input_offsets_it = @@ -525,6 +529,67 @@ std::pair, rmm::device_uvector> cast_strings return {std::move(output), std::move(validity)}; } +std::pair, rmm::device_uvector> cast_strings_to_dates_legacy( + cudf::column_view const& input, + std::vector> const& special_dates, + bool error_if_invalid, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + auto const string_count = input.size(); + if (string_count == 0) { + return {cudf::make_empty_column(cudf::data_type{cudf::type_id::TIMESTAMP_DAYS}), + rmm::device_uvector(0, stream)}; + } + + // TODO: mr + auto const removed_quotes = remove_quotes(input, false, stream, mr); + auto const removed_quotes_cv = removed_quotes->view(); + auto const input_sv = cudf::strings_column_view{removed_quotes_cv}; + auto const d_input_ptr = cudf::column_device_view::create(removed_quotes_cv, stream); + auto const is_valid_it = cudf::detail::make_validity_iterator(*d_input_ptr); + + auto const check_input = [&](std::string const& date_regex, std::string const& date_format) { + auto const regex_prog = cudf::strings::regex_program::create( + date_regex, cudf::strings::regex_flags::DEFAULT, cudf::strings::capture_groups::NON_CAPTURE); + return {cudf::strings::matches_re(input_sv, *regex_prog, 
stream), + cudf::strings::is_timestamp(input_sv, date_format, stream)}; + }; + + auto const [is_matched_ymd, is_timestamp_ymd] = check_input(); + auto const [is_matched_ym, is_timestamp_ym] = check_input(); + auto const [is_matched_y, is_timestamp_y] = check_input(); + + auto const is_valid_format_it = thrust::make_zip_iterator(is_matched_ymd->view().begin(), + is_timestamp_ymd->view().begin(), + is_matched_ym->view().begin(), + is_timestamp_ym->view().begin(), + is_matched_y->view().begin(), + is_timestamp_y->view().begin()); + auto const invalid_count = thrust::count_if( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(string_count), + [is_valid = is_valid_it, is_valid_format = is_valid_format_it] __device__(auto idx) { + if (!is_valid[idx]) { return 0; } + auto const valid_format = is_valid_format[idx]; + // The row is invalid only if it non-null and failed to check for all 3 formats. + return (!thrust::get<0>(valid_format) || !thrust::get<1>(valid_format)) && + (!thrust::get<2>(valid_format) || !thrust::get<3>(valid_format)) && + (!thrust::get<4>(valid_format) || !thrust::get<5>(valid_format)); + }); + + if (invalid_count == 0) { + // TODO + auto output = cudf::strings::to_timestamps( + input_sv, cudf::data_type{cudf::type_id::TIMESTAMP_DAYS}, date_format, stream, mr); + return {std::move(output), rmm::device_uvector(0, stream)}; + } + // From here we have invalid_count > 0. 
+ + if (error_if_invalid) { return {nullptr, rmm::device_uvector(0, stream)}; } +} + // TODO there is a bug here around 0 https://github.com/NVIDIA/spark-rapids/issues/10898 std::unique_ptr cast_strings_to_decimals(cudf::column_view const& input, int precision, @@ -939,6 +1004,25 @@ std::unique_ptr cast_strings_to_dates(cudf::column_view const& inp return std::move(output); } +std::unique_ptr cast_strings_to_dates_legacy( + cudf::column_view const& input, + std::vector> const& special_dates, + bool error_if_invalid, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_FUNC_RANGE(); + + auto [output, validity] = + detail::cast_strings_to_dates_legacy(input, special_dates, error_if_invalid, stream, mr); + + if (output == nullptr) { return nullptr; } + auto [null_mask, null_count] = + cudf::detail::valid_if(validity.begin(), validity.end(), thrust::identity{}, stream, mr); + if (null_count > 0) { output->set_null_mask(std::move(null_mask), null_count); } + return std::move(output); +} + std::unique_ptr cast_strings_to_decimals(cudf::column_view const& input, int precision, int scale, diff --git a/src/main/cpp/src/json_utils.hpp b/src/main/cpp/src/json_utils.hpp index 60dca5a38..66a1accda 100644 --- a/src/main/cpp/src/json_utils.hpp +++ b/src/main/cpp/src/json_utils.hpp @@ -84,6 +84,13 @@ std::unique_ptr cast_strings_to_dates( rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); +std::unique_ptr cast_strings_to_dates_legacy( + cudf::column_view const& input, + std::vector> const& special_dates, + bool error_if_invalid, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); + std::unique_ptr remove_quotes( cudf::column_view const& input, bool nullify_if_not_quoted, From f059c219bcdafbdc97d6dfef99a7108159771a48 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Mon, 21 Oct 2024 
09:54:09 -0700 Subject: [PATCH 13/58] Revert "WIP for `cast_strings_to_dates_legacy`" This reverts commit dcb463e6f74def2479c97d1dc29082446ba1637b. --- src/main/cpp/src/json_utils.cu | 104 +++----------------------------- src/main/cpp/src/json_utils.hpp | 7 --- 2 files changed, 10 insertions(+), 101 deletions(-) diff --git a/src/main/cpp/src/json_utils.cu b/src/main/cpp/src/json_utils.cu index 62d25a7d5..4c7ea6993 100644 --- a/src/main/cpp/src/json_utils.cu +++ b/src/main/cpp/src/json_utils.cu @@ -104,8 +104,8 @@ std::tuple, std::unique_ptr, c thrust::make_counting_iterator(0L), thrust::make_counting_iterator(input.size() * static_cast(cudf::detail::warp_size)), [input = *d_input_ptr, - output = thrust::make_zip_iterator(is_valid_input.begin(), - is_null_or_empty.begin())] __device__(int64_t tidx) { + output = thrust::make_zip_iterator(thrust::make_tuple( + is_valid_input.begin(), is_null_or_empty.begin()))] __device__(int64_t tidx) { // Execute one warp per row to minimize thread divergence. 
if ((tidx % cudf::detail::warp_size) != 0) { return; } auto const idx = tidx / cudf::detail::warp_size; @@ -333,8 +333,8 @@ std::pair, rmm::device_uvector> cast_strings cudf::detail::offsetalator_factory::make_input_iterator(input_sv.offsets()); auto const d_input_ptr = cudf::column_device_view::create(input, stream); auto const is_valid_it = cudf::detail::make_validity_iterator(*d_input_ptr); - auto const output_it = - thrust::make_zip_iterator(output->mutable_view().begin(), validity.begin()); + auto const output_it = thrust::make_zip_iterator( + thrust::make_tuple(output->mutable_view().begin(), validity.begin())); thrust::tabulate( rmm::exec_policy_nosync(stream), output_it, @@ -445,12 +445,9 @@ std::pair, rmm::device_uvector> cast_strings } // TODO: mr - auto const removed_quotes = remove_quotes(input, false, stream, mr); - auto const removed_quotes_cv = removed_quotes->view(); - auto const input_sv = cudf::strings_column_view{removed_quotes_cv}; - auto const d_input_ptr = cudf::column_device_view::create(removed_quotes_cv, stream); - auto const is_valid_it = cudf::detail::make_validity_iterator(*d_input_ptr); + auto const removed_quotes = remove_quotes(input, false, stream, mr); + auto const input_sv = cudf::strings_column_view{removed_quotes->view()}; auto const regex_prog = cudf::strings::regex_program::create( date_regex, cudf::strings::regex_flags::DEFAULT, cudf::strings::capture_groups::NON_CAPTURE); auto const is_matched = cudf::strings::matches_re(input_sv, *regex_prog, stream); @@ -458,23 +455,22 @@ std::pair, rmm::device_uvector> cast_strings auto const d_is_matched = is_matched->view().begin(); auto const d_is_timestamp = is_timestamp->view().begin(); + auto const d_input_ptr = cudf::column_device_view::create(removed_quotes->view(), stream); + auto const is_valid_it = cudf::detail::make_validity_iterator(*d_input_ptr); auto const invalid_count = thrust::count_if( rmm::exec_policy(stream), thrust::make_counting_iterator(0), 
thrust::make_counting_iterator(string_count), [is_valid = is_valid_it, is_matched = d_is_matched, is_timestamp = d_is_timestamp] __device__( - auto idx) { - // The row is invalid if it is valid (non-null) but failed at least one check. - return is_valid[idx] && (!is_matched[idx] || !is_timestamp[idx]); - }); + auto idx) { return is_valid[idx] && (!is_matched[idx] || !is_timestamp[idx]); }); if (invalid_count == 0) { auto output = cudf::strings::to_timestamps( input_sv, cudf::data_type{cudf::type_id::TIMESTAMP_DAYS}, date_format, stream, mr); return {std::move(output), rmm::device_uvector(0, stream)}; } - // From here we have invalid_count > 0. + // From here we have invalid_count > 0. if (error_if_invalid) { return {nullptr, rmm::device_uvector(0, stream)}; } auto const input_offsets_it = @@ -529,67 +525,6 @@ std::pair, rmm::device_uvector> cast_strings return {std::move(output), std::move(validity)}; } -std::pair, rmm::device_uvector> cast_strings_to_dates_legacy( - cudf::column_view const& input, - std::vector> const& special_dates, - bool error_if_invalid, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - auto const string_count = input.size(); - if (string_count == 0) { - return {cudf::make_empty_column(cudf::data_type{cudf::type_id::TIMESTAMP_DAYS}), - rmm::device_uvector(0, stream)}; - } - - // TODO: mr - auto const removed_quotes = remove_quotes(input, false, stream, mr); - auto const removed_quotes_cv = removed_quotes->view(); - auto const input_sv = cudf::strings_column_view{removed_quotes_cv}; - auto const d_input_ptr = cudf::column_device_view::create(removed_quotes_cv, stream); - auto const is_valid_it = cudf::detail::make_validity_iterator(*d_input_ptr); - - auto const check_input = [&](std::string const& date_regex, std::string const& date_format) { - auto const regex_prog = cudf::strings::regex_program::create( - date_regex, cudf::strings::regex_flags::DEFAULT, cudf::strings::capture_groups::NON_CAPTURE); - return 
{cudf::strings::matches_re(input_sv, *regex_prog, stream), - cudf::strings::is_timestamp(input_sv, date_format, stream)}; - }; - - auto const [is_matched_ymd, is_timestamp_ymd] = check_input(); - auto const [is_matched_ym, is_timestamp_ym] = check_input(); - auto const [is_matched_y, is_timestamp_y] = check_input(); - - auto const is_valid_format_it = thrust::make_zip_iterator(is_matched_ymd->view().begin(), - is_timestamp_ymd->view().begin(), - is_matched_ym->view().begin(), - is_timestamp_ym->view().begin(), - is_matched_y->view().begin(), - is_timestamp_y->view().begin()); - auto const invalid_count = thrust::count_if( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(string_count), - [is_valid = is_valid_it, is_valid_format = is_valid_format_it] __device__(auto idx) { - if (!is_valid[idx]) { return 0; } - auto const valid_format = is_valid_format[idx]; - // The row is invalid only if it non-null and failed to check for all 3 formats. - return (!thrust::get<0>(valid_format) || !thrust::get<1>(valid_format)) && - (!thrust::get<2>(valid_format) || !thrust::get<3>(valid_format)) && - (!thrust::get<4>(valid_format) || !thrust::get<5>(valid_format)); - }); - - if (invalid_count == 0) { - // TODO - auto output = cudf::strings::to_timestamps( - input_sv, cudf::data_type{cudf::type_id::TIMESTAMP_DAYS}, date_format, stream, mr); - return {std::move(output), rmm::device_uvector(0, stream)}; - } - // From here we have invalid_count > 0. 
- - if (error_if_invalid) { return {nullptr, rmm::device_uvector(0, stream)}; } -} - // TODO there is a bug here around 0 https://github.com/NVIDIA/spark-rapids/issues/10898 std::unique_ptr cast_strings_to_decimals(cudf::column_view const& input, int precision, @@ -1004,25 +939,6 @@ std::unique_ptr cast_strings_to_dates(cudf::column_view const& inp return std::move(output); } -std::unique_ptr cast_strings_to_dates_legacy( - cudf::column_view const& input, - std::vector> const& special_dates, - bool error_if_invalid, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - CUDF_FUNC_RANGE(); - - auto [output, validity] = - detail::cast_strings_to_dates_legacy(input, special_dates, error_if_invalid, stream, mr); - - if (output == nullptr) { return nullptr; } - auto [null_mask, null_count] = - cudf::detail::valid_if(validity.begin(), validity.end(), thrust::identity{}, stream, mr); - if (null_count > 0) { output->set_null_mask(std::move(null_mask), null_count); } - return std::move(output); -} - std::unique_ptr cast_strings_to_decimals(cudf::column_view const& input, int precision, int scale, diff --git a/src/main/cpp/src/json_utils.hpp b/src/main/cpp/src/json_utils.hpp index 66a1accda..60dca5a38 100644 --- a/src/main/cpp/src/json_utils.hpp +++ b/src/main/cpp/src/json_utils.hpp @@ -84,13 +84,6 @@ std::unique_ptr cast_strings_to_dates( rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); -std::unique_ptr cast_strings_to_dates_legacy( - cudf::column_view const& input, - std::vector> const& special_dates, - bool error_if_invalid, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); - std::unique_ptr remove_quotes( cudf::column_view const& input, bool nullify_if_not_quoted, From 07b23ea93202ba7ce15667f65955af09abc8c9f0 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Wed, 23 Oct 2024 
13:42:03 -0700 Subject: [PATCH 14/58] Fix compile issues Signed-off-by: Nghia Truong --- src/main/cpp/src/json_utils.cu | 51 +++------------------------------- 1 file changed, 4 insertions(+), 47 deletions(-) diff --git a/src/main/cpp/src/json_utils.cu b/src/main/cpp/src/json_utils.cu index 4c7ea6993..5fe7d526a 100644 --- a/src/main/cpp/src/json_utils.cu +++ b/src/main/cpp/src/json_utils.cu @@ -272,49 +272,6 @@ namespace { using string_index_pair = thrust::pair; -// TODO: remove this. -template -rmm::device_uvector make_chars_buffer(cudf::column_view const& offsets, - int64_t chars_size, - IndexPairIterator begin, - cudf::size_type string_count, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - auto chars_data = rmm::device_uvector(chars_size, stream, mr); - auto const d_offsets = cudf::detail::offsetalator_factory::make_input_iterator(offsets); - - auto const src_ptrs = cudf::detail::make_counting_transform_iterator( - 0u, cuda::proclaim_return_type([begin] __device__(uint32_t idx) { - // Due to a bug in cub (https://github.com/NVIDIA/cccl/issues/586), - // we have to use `const_cast` to remove `const` qualifier from the source pointer. - // This should be fine as long as we only read but not write anything to the source. 
- return reinterpret_cast(const_cast(begin[idx].first)); - })); - auto const src_sizes = cudf::detail::make_counting_transform_iterator( - 0u, cuda::proclaim_return_type([begin] __device__(uint32_t idx) { - return begin[idx].second; - })); - auto const dst_ptrs = cudf::detail::make_counting_transform_iterator( - 0u, - cuda::proclaim_return_type([offsets = d_offsets, output = chars_data.data()] __device__( - uint32_t idx) { return output + offsets[idx]; })); - - size_t temp_storage_bytes = 0; - CUDF_CUDA_TRY(cub::DeviceMemcpy::Batched( - nullptr, temp_storage_bytes, src_ptrs, dst_ptrs, src_sizes, string_count, stream.value())); - rmm::device_buffer d_temp_storage(temp_storage_bytes, stream); - CUDF_CUDA_TRY(cub::DeviceMemcpy::Batched(d_temp_storage.data(), - temp_storage_bytes, - src_ptrs, - dst_ptrs, - src_sizes, - string_count, - stream.value())); - - return chars_data; -} - std::pair, rmm::device_uvector> cast_strings_to_booleans( cudf::column_view const& input, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { @@ -417,7 +374,7 @@ std::pair, rmm::device_uvector> cast_strings })); auto [offsets_column, bytes] = cudf::strings::detail::make_offsets_child_column(size_it, size_it + string_count, stream, mr); - auto chars_data = /*cudf::strings::detail::*/ make_chars_buffer( + auto chars_data = cudf::strings::detail::make_chars_buffer( offsets_column->view(), bytes, string_pairs.begin(), string_count, stream, mr); // Don't care about the null mask, as nulls imply empty strings, and will be nullified. 
@@ -501,7 +458,7 @@ std::pair, rmm::device_uvector> cast_strings })); auto [offsets_column, bytes] = cudf::strings::detail::make_offsets_child_column(size_it, size_it + string_count, stream, mr); - auto chars_data = /*cudf::strings::detail::*/ make_chars_buffer( + auto chars_data = cudf::strings::detail::make_chars_buffer( offsets_column->view(), bytes, string_pairs.begin(), string_count, stream, mr); // Don't care about the null mask, as nulls imply empty strings, and will be nullified. @@ -725,7 +682,7 @@ std::pair, rmm::device_uvector> remove_quote })); auto [offsets_column, bytes] = cudf::strings::detail::make_offsets_child_column(size_it, size_it + string_count, stream, mr); - auto chars_data = /*cudf::strings::detail::*/ make_chars_buffer( + auto chars_data = cudf::strings::detail::make_chars_buffer( offsets_column->view(), bytes, string_pairs.begin(), string_count, stream, mr); if (nullify_if_not_quoted) { @@ -826,7 +783,7 @@ std::pair, rmm::device_uvector> remove_quote })); auto [offsets_column, bytes] = cudf::strings::detail::make_offsets_child_column(size_it, size_it + string_count, stream, mr); - auto chars_data = /*cudf::strings::detail::*/ make_chars_buffer( + auto chars_data = cudf::strings::detail::make_chars_buffer( offsets_column->view(), bytes, string_pairs.begin(), string_count, stream, mr); auto output = cudf::make_strings_column(string_count, From de83a25298ddb94fcc7fc63183faacd001d944a7 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Wed, 23 Oct 2024 21:28:56 -0700 Subject: [PATCH 15/58] WIP: Implement `from_json_to_structs` Signed-off-by: Nghia Truong --- src/main/cpp/CMakeLists.txt | 1 + ...{json_utils.cu => from_json_to_structs.cu} | 131 ++++++++++++++++-- src/main/cpp/src/json_utils.hpp | 12 +- 3 files changed, 134 insertions(+), 10 deletions(-) rename src/main/cpp/src/{json_utils.cu => from_json_to_structs.cu} (87%) diff --git a/src/main/cpp/CMakeLists.txt b/src/main/cpp/CMakeLists.txt index 14edae0ec..03a34cbe6 100644 --- 
a/src/main/cpp/CMakeLists.txt +++ b/src/main/cpp/CMakeLists.txt @@ -207,6 +207,7 @@ add_library( src/case_when.cu src/cast_decimal_to_string.cu src/format_float.cu + src/from_json_to_structs.cu src/cast_float_to_string.cu src/cast_string.cu src/cast_string_to_float.cu diff --git a/src/main/cpp/src/json_utils.cu b/src/main/cpp/src/from_json_to_structs.cu similarity index 87% rename from src/main/cpp/src/json_utils.cu rename to src/main/cpp/src/from_json_to_structs.cu index 5fe7d526a..fc0166c5a 100644 --- a/src/main/cpp/src/json_utils.cu +++ b/src/main/cpp/src/from_json_to_structs.cu @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -30,6 +31,7 @@ #include #include #include +#include #include #include @@ -795,32 +797,145 @@ std::pair, rmm::device_uvector> remove_quote return {std::move(output), rmm::device_uvector(0, stream)}; } -std::unique_ptr convert_column_type(cudf::column_view const& input, +std::unique_ptr convert_column_type(std::unique_ptr& input, json_schema_element const& schema, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { + if (schema.type.id() == cudf::type_id::BOOL8) { + CUDF_EXPECTS(input->type().id() == cudf::type_id::STRING, "Input column should be STRING."); + return cast_strings_to_booleans(input->view(), stream, mr); + } + if (cudf::is_integral(schema.type)) { + CUDF_EXPECTS(input->type().id() == cudf::type_id::STRING, "Input column should be STRING."); + return cast_strings_to_integers(input->view(), schema.type, stream, mr); + } + if (cudf::is_floating_point(schema.type)) { + CUDF_EXPECTS(input->type().id() == cudf::type_id::STRING, "Input column should be STRING."); + return cast_strings_to_floats(input->view(), schema.type, stream, mr); + } + if (cudf::is_fixed_point(schema.type)) { + CUDF_EXPECTS(input->type().id() == cudf::type_id::STRING, "Input column should be STRING."); + return cast_strings_to_decimals(input->view(), schema.type, stream, mr); + } + if (schema.type.id() == 
cudf::type_id::STRING) { + CUDF_EXPECTS(input->type().id() == cudf::type_id::STRING, "Input column should be STRING."); + return remove_quotes(input->view(), false, stream, mr); + } + + auto const num_rows = input->size(); + auto const num_count = input->null_count(); + + if (schema.type.id() == cudf::type_id::LIST) { + CUDF_EXPECTS(input->type().id() == cudf::type_id::LIST, "Input column should be LIST."); + + auto input_content = input->release(); + auto new_child = convert_column_type( + std::move(input_content.children[cudf::lists_column_view::child_column_index]), + schema.child_types.front(), + stream, + mr); + + return std::make_lists_column( + num_rows, + std::move(input_content.children[cudf::lists_column_view::offsets_column_index]), + std::move(new_child), + null_count, + std::move(input_content.null_mask), + stream, + mr); + } + + if (schema.type.id() == cudf::type_id::STRUCT) { + CUDF_EXPECTS(input->type().id() == cudf::type_id::STRUCT, "Input column should be LIST."); + + auto const num_children = input->num_children(); + std::vector> new_children; + new_children.reserve(num_children); + auto input_content = input->release(); + for (cudf::size_type i = 0; i < input->num_children(); ++i) { + new_children.emplace_back(convert_column_type( + std::move(input_content.children[i]), schema.child_types[i], stream, mr)); + } + + return std::make_structs_column(num_rows, + std::move(new_children), + null_count, + std::move(input_content.null_mask), + stream, + mr); + } + + CUDF_FAIL("Unexpected column type for conversion."); return nullptr; } +bool check_schema(cudf::io::column_name_info const& read_info, + std::pair const& column_schema) +{ + CUDF_EXPECTS(read_info.name == column_schema.first, "Mismatched column name."); + CUDF_EXPECTS(read_info.children.size() == column_schema.child_types.size(), + "Mismatched number of children."); + for (std::size_t i = 0; i < read_info.children.size(); ++i) { + check_schema(read_info.children[i], 
column_schema.child_types[i]); + } +} + } // namespace -std::unique_ptr convert_types( - cudf::table_view const& input, +std::unique_ptr from_json_to_structs( + cudf::strings_column_view const& input, std::vector> const& schema, + cudf::io::json_reader_options const& json_options, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - auto const num_columns = input.num_columns(); - CUDF_EXPECTS(static_cast(num_columns) == schema.size(), + auto const [is_null_or_empty, concat_input, delimiter] = concat_json(input, stream, mr); + auto opts_builder = + cudf::io::json_reader_options::builder( + cudf::io::source_info{cudf::device_span{concat_input.data(), concat_input.size()}}) + .dayfirst(json_options.is_enabled_dayfirst()) + .lines(json_options.is_enabled_lines()) + .recovery_mode(json_options.recovery_mode()) + .normalize_single_quotes(json_options.is_enabled_normalize_single_quotes()) + .normalize_whitespace(json_options.is_enabled_normalize_whitespace()) + .mixed_types_as_string(json_options.is_enabled_mixed_types_as_string()) + .delimiter(json_options.get_delimiter()) + .strict_validation(json_options.is_strict_validation()) + .keep_quotes(json_options.is_enabled_keep_quotes()) + .prune_columns(json_options.is_enabled_prune_columns()) + .experimental(json_options.is_enabled_experimental()); + if (json_options.is_strict_validation()) { + opts_builder.numeric_leading_zeros(json_options.is_allowed_numeric_leading_zeros()) + .nonnumeric_numbers(json_options.is_allowed_nonnumeric_numbers()) + .unquoted_control_chars(json_options.is_allowed_unquoted_control_chars()); + } + auto const parsed_table_with_meta = cudf::io::read_json(opts_builder.build()); + auto const& parsed_meta = parsed_table_with_meta.metadata; + auto parsed_columns = parsed_table_with_meta.tbl->release(); + + CUDF_EXPECTS(parsed_columns.size() == schema.size(), "Numbers of columns in the input table is different from schema size."); std::vector> converted_cols(num_columns); - for (int i = 
0; i < num_columns; ++i) { - converted_cols[i] = convert_column_type(input.column(i), schema[i].second, stream, mr); + for (std::size_t i = 0; i < parsed_columns.size(); ++i) { + check_schema(parsed_meta.schema_info[i], schema[i]); + converted_cols[i] = convert_column_type(parsed_columns[i], schema[i].second, stream, mr); } - return nullptr; + auto [null_mask, null_count] = cudf::detail::valid_if(is_null_or_empty.begin(), + is_null_or_empty.end(), + thrust::logical_not{}, + stream, + mr); + + return cudf::make_structs_column( + input.size(), + std::move(converted_cols), + null_count, + null_count > 0 ? std::move(null_mask) : rmm::device_buffer{0, stream, mr}, + stream, + mr); } } // namespace detail diff --git a/src/main/cpp/src/json_utils.hpp b/src/main/cpp/src/json_utils.hpp index 60dca5a38..383922d72 100644 --- a/src/main/cpp/src/json_utils.hpp +++ b/src/main/cpp/src/json_utils.hpp @@ -51,12 +51,20 @@ struct json_schema_element { std::vector> child_types; }; -std::unique_ptr convert_types( - cudf::table_view const& input, +std::unique_ptr from_json_to_structs( + cudf::strings_column_view const& input, std::vector> const& schema, + cudf::io::json_reader_options const& json_options, rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); +// +// +// +// +// +// +// std::unique_ptr cast_strings_to_booleans( cudf::column_view const& input, rmm::cuda_stream_view stream = cudf::get_default_stream(), From 6c2bd5e48cb9fde5c95cfb406d91cfd8da2eeaa6 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Thu, 24 Oct 2024 12:58:48 -0700 Subject: [PATCH 16/58] Fix cmake Signed-off-by: Nghia Truong --- src/main/cpp/CMakeLists.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/src/main/cpp/CMakeLists.txt b/src/main/cpp/CMakeLists.txt index 03a34cbe6..14882ab43 100644 --- a/src/main/cpp/CMakeLists.txt +++ b/src/main/cpp/CMakeLists.txt @@ -216,7 +216,6 @@ add_library( src/from_json_to_raw_map.cu 
src/get_json_object.cu src/histogram.cu - src/json_utils.cu src/murmur_hash.cu src/parse_uri.cu src/regex_rewrite_utils.cu From 904d857de51b8dc16c8a739a15c864bc9cdd0f89 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Thu, 24 Oct 2024 13:28:41 -0700 Subject: [PATCH 17/58] Fix compile issues Signed-off-by: Nghia Truong --- src/main/cpp/src/from_json_to_structs.cu | 88 +++++++++++------------- src/main/cpp/src/json_utils.hpp | 1 + 2 files changed, 41 insertions(+), 48 deletions(-) diff --git a/src/main/cpp/src/from_json_to_structs.cu b/src/main/cpp/src/from_json_to_structs.cu index fc0166c5a..a5ae1d30a 100644 --- a/src/main/cpp/src/from_json_to_structs.cu +++ b/src/main/cpp/src/from_json_to_structs.cu @@ -804,44 +804,44 @@ std::unique_ptr convert_column_type(std::unique_ptr& { if (schema.type.id() == cudf::type_id::BOOL8) { CUDF_EXPECTS(input->type().id() == cudf::type_id::STRING, "Input column should be STRING."); - return cast_strings_to_booleans(input->view(), stream, mr); + return ::spark_rapids_jni::cast_strings_to_booleans(input->view(), stream, mr); } if (cudf::is_integral(schema.type)) { CUDF_EXPECTS(input->type().id() == cudf::type_id::STRING, "Input column should be STRING."); - return cast_strings_to_integers(input->view(), schema.type, stream, mr); - } - if (cudf::is_floating_point(schema.type)) { - CUDF_EXPECTS(input->type().id() == cudf::type_id::STRING, "Input column should be STRING."); - return cast_strings_to_floats(input->view(), schema.type, stream, mr); - } - if (cudf::is_fixed_point(schema.type)) { - CUDF_EXPECTS(input->type().id() == cudf::type_id::STRING, "Input column should be STRING."); - return cast_strings_to_decimals(input->view(), schema.type, stream, mr); + return ::spark_rapids_jni::cast_strings_to_integers(input->view(), schema.type, stream, mr); } + // if (cudf::is_floating_point(schema.type)) { + // CUDF_EXPECTS(input->type().id() == cudf::type_id::STRING, "Input column should be STRING."); + // return 
::cast_strings_to_floats(input->view(), schema.type, stream, mr); + // } + // if (cudf::is_fixed_point(schema.type)) { + // CUDF_EXPECTS(input->type().id() == cudf::type_id::STRING, "Input column should be STRING."); + // return ::cast_strings_to_decimals(input->view(), schema.type, stream, mr); + // } if (schema.type.id() == cudf::type_id::STRING) { CUDF_EXPECTS(input->type().id() == cudf::type_id::STRING, "Input column should be STRING."); - return remove_quotes(input->view(), false, stream, mr); + return ::spark_rapids_jni::remove_quotes(input->view(), false, stream, mr); } - auto const num_rows = input->size(); - auto const num_count = input->null_count(); + auto const num_rows = input->size(); + auto const null_count = input->null_count(); if (schema.type.id() == cudf::type_id::LIST) { CUDF_EXPECTS(input->type().id() == cudf::type_id::LIST, "Input column should be LIST."); auto input_content = input->release(); - auto new_child = convert_column_type( - std::move(input_content.children[cudf::lists_column_view::child_column_index]), - schema.child_types.front(), - stream, - mr); + auto new_child = + convert_column_type(input_content.children[cudf::lists_column_view::child_column_index], + schema.child_types.front().second, + stream, + mr); - return std::make_lists_column( + return cudf::make_lists_column( num_rows, std::move(input_content.children[cudf::lists_column_view::offsets_column_index]), std::move(new_child), null_count, - std::move(input_content.null_mask), + std::move(*input_content.null_mask), stream, mr); } @@ -854,30 +854,33 @@ std::unique_ptr convert_column_type(std::unique_ptr& new_children.reserve(num_children); auto input_content = input->release(); for (cudf::size_type i = 0; i < input->num_children(); ++i) { - new_children.emplace_back(convert_column_type( - std::move(input_content.children[i]), schema.child_types[i], stream, mr)); + new_children.emplace_back( + convert_column_type(input_content.children[i], schema.child_types[i].second, 
stream, mr)); } - return std::make_structs_column(num_rows, - std::move(new_children), - null_count, - std::move(input_content.null_mask), - stream, - mr); + return cudf::make_structs_column(num_rows, + std::move(new_children), + null_count, + std::move(*input_content.null_mask), + stream, + mr); } CUDF_FAIL("Unexpected column type for conversion."); return nullptr; } -bool check_schema(cudf::io::column_name_info const& read_info, +void check_schema(cudf::io::column_name_info const& read_info, std::pair const& column_schema) { CUDF_EXPECTS(read_info.name == column_schema.first, "Mismatched column name."); - CUDF_EXPECTS(read_info.children.size() == column_schema.child_types.size(), + CUDF_EXPECTS(read_info.children.size() == column_schema.second.child_types.size(), "Mismatched number of children."); for (std::size_t i = 0; i < read_info.children.size(); ++i) { - check_schema(read_info.children[i], column_schema.child_types[i]); + // auto const find_it = column_schema.second.child_types.find(read_info.children[i].name); + // check_schema(read_info.children[i], + // find_it != column_schema.child_types.end() ? 
*find_it : {"", {}}); + check_schema(read_info.children[i], column_schema.second.child_types[i]); } } @@ -893,7 +896,8 @@ std::unique_ptr from_json_to_structs( auto const [is_null_or_empty, concat_input, delimiter] = concat_json(input, stream, mr); auto opts_builder = cudf::io::json_reader_options::builder( - cudf::io::source_info{cudf::device_span{concat_input.data(), concat_input.size()}}) + cudf::io::source_info{cudf::device_span{ + static_cast(concat_input->data()), concat_input->size()}}) .dayfirst(json_options.is_enabled_dayfirst()) .lines(json_options.is_enabled_lines()) .recovery_mode(json_options.recovery_mode()) @@ -917,17 +921,15 @@ std::unique_ptr from_json_to_structs( CUDF_EXPECTS(parsed_columns.size() == schema.size(), "Numbers of columns in the input table is different from schema size."); - std::vector> converted_cols(num_columns); + std::vector> converted_cols(parsed_columns.size()); for (std::size_t i = 0; i < parsed_columns.size(); ++i) { check_schema(parsed_meta.schema_info[i], schema[i]); converted_cols[i] = convert_column_type(parsed_columns[i], schema[i].second, stream, mr); } - auto [null_mask, null_count] = cudf::detail::valid_if(is_null_or_empty.begin(), - is_null_or_empty.end(), - thrust::logical_not{}, - stream, - mr); + auto const valid_it = is_null_or_empty->view().begin(); + auto [null_mask, null_count] = cudf::detail::valid_if( + valid_it, valid_it + is_null_or_empty->size(), thrust::logical_not{}, stream, mr); return cudf::make_structs_column( input.size(), @@ -958,16 +960,6 @@ std::unique_ptr make_structs(std::vector const& return detail::make_structs(children, is_null, stream, mr); } -std::unique_ptr convert_types( - cudf::table_view const& input, - std::vector> const& schema, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - CUDF_FUNC_RANGE(); - return detail::convert_types(input, schema, stream, mr); -} - std::unique_ptr cast_strings_to_booleans(cudf::column_view const& input, rmm::cuda_stream_view 
stream, rmm::device_async_resource_ref mr) diff --git a/src/main/cpp/src/json_utils.hpp b/src/main/cpp/src/json_utils.hpp index 383922d72..e66892d83 100644 --- a/src/main/cpp/src/json_utils.hpp +++ b/src/main/cpp/src/json_utils.hpp @@ -16,6 +16,7 @@ #pragma once +#include #include #include From d84f1fecb7c0c96e83b1d9d28a6660422876f74c Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Thu, 24 Oct 2024 14:14:40 -0700 Subject: [PATCH 18/58] Implement `castStringsToFloats` Signed-off-by: Nghia Truong --- src/main/cpp/src/JSONUtilsJni.cpp | 10 +++++++--- src/main/cpp/src/from_json_to_structs.cu | 16 +++++++++++----- src/main/cpp/src/json_utils.hpp | 4 +++- .../com/nvidia/spark/rapids/jni/JSONUtils.java | 8 +++++--- 4 files changed, 26 insertions(+), 12 deletions(-) diff --git a/src/main/cpp/src/JSONUtilsJni.cpp b/src/main/cpp/src/JSONUtilsJni.cpp index 341a1266f..988d8d959 100644 --- a/src/main/cpp/src/JSONUtilsJni.cpp +++ b/src/main/cpp/src/JSONUtilsJni.cpp @@ -284,15 +284,19 @@ JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_JSONUtils_removeQuotes( CATCH_STD(env, 0); } -JNIEXPORT jlong JNICALL -Java_com_nvidia_spark_rapids_jni_JSONUtils_removeQuotesForFloats(JNIEnv* env, jclass, jlong j_input) +JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_JSONUtils_castStringsToFloats( + JNIEnv* env, jclass, jlong j_input, jint j_output_type_id, jboolean allow_nonnumeric_numbers) { JNI_NULL_CHECK(env, j_input, "j_input is null", 0); try { cudf::jni::auto_set_device(env); auto const input = *reinterpret_cast(j_input); - return cudf::jni::ptr_as_jlong(spark_rapids_jni::remove_quotes_for_floats(input).release()); + return cudf::jni::ptr_as_jlong(spark_rapids_jni::cast_strings_to_floats( + input, + cudf::data_type{static_cast(j_output_type_id)}, + allow_nonnumeric_numbers) + .release()); } CATCH_STD(env, 0); } diff --git a/src/main/cpp/src/from_json_to_structs.cu b/src/main/cpp/src/from_json_to_structs.cu index a5ae1d30a..b2707c39f 100644 --- 
a/src/main/cpp/src/from_json_to_structs.cu +++ b/src/main/cpp/src/from_json_to_structs.cu @@ -1031,14 +1031,20 @@ std::unique_ptr remove_quotes(cudf::column_view const& input, return std::move(output); } -std::unique_ptr remove_quotes_for_floats(cudf::column_view const& input, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) +std::unique_ptr cast_strings_to_floats(cudf::column_view const& input, + cudf::data_type output_type, + bool allow_nonnumeric_numbers, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - auto [output, validity] = detail::remove_quotes_for_floats(input, stream, mr); - return std::move(output); + if (allow_nonnumeric_numbers) { + auto [removed_quotes, validity] = detail::remove_quotes_for_floats(input, stream, mr); + return string_to_float( + output_type, cudf::strings_column_view{removed_quotes->view()}, false, stream, mr); + } + return string_to_float(output_type, cudf::strings_column_view{input}, false, stream, mr); } } // namespace spark_rapids_jni diff --git a/src/main/cpp/src/json_utils.hpp b/src/main/cpp/src/json_utils.hpp index e66892d83..e0a855194 100644 --- a/src/main/cpp/src/json_utils.hpp +++ b/src/main/cpp/src/json_utils.hpp @@ -99,8 +99,10 @@ std::unique_ptr remove_quotes( rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); -std::unique_ptr remove_quotes_for_floats( +std::unique_ptr cast_strings_to_floats( cudf::column_view const& input, + cudf::data_type output_type, + bool allow_nonnumeric_numbers, rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); diff --git a/src/main/java/com/nvidia/spark/rapids/jni/JSONUtils.java b/src/main/java/com/nvidia/spark/rapids/jni/JSONUtils.java index 66ef289de..d39e7ea99 100644 --- a/src/main/java/com/nvidia/spark/rapids/jni/JSONUtils.java +++ 
b/src/main/java/com/nvidia/spark/rapids/jni/JSONUtils.java @@ -252,9 +252,11 @@ public static ColumnVector removeQuotes(ColumnView input, boolean nullifyIfNotQu return new ColumnVector(removeQuotes(input.getNativeView(), nullifyIfNotQuoted)); } - public static ColumnVector removeQuotesForFloats(ColumnView input) { + public static ColumnVector castStringsToFloats(ColumnView input, DType outputType, + boolean allowNonNumericNumbers) { assert (input.getType().equals(DType.STRING)) : "Input must be of STRING type"; - return new ColumnVector(removeQuotesForFloats(input.getNativeView())); + return new ColumnVector(castStringsToFloats(input.getNativeView(), + outputType.getTypeId().getNativeId(), allowNonNumericNumbers)); } private static native int getMaxJSONPathDepth(); @@ -289,5 +291,5 @@ private static native long[] getJsonObjectMultiplePaths(long input, private static native long removeQuotes(long input, boolean nullifyIfNotQuoted); - private static native long removeQuotesForFloats(long input); + private static native long castStringsToFloats(long input, int outputTypeId, boolean allowNonNumericNumbers); } From 3024583cd9bc722fb6be8434a9bf4f0d742e7591 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Thu, 24 Oct 2024 16:49:06 -0700 Subject: [PATCH 19/58] WIP Signed-off-by: Nghia Truong --- src/main/cpp/src/JSONUtilsJni.cpp | 6 +- src/main/cpp/src/from_json_to_structs.cu | 61 ++++++++++--------- src/main/cpp/src/json_utils.hpp | 4 +- .../nvidia/spark/rapids/jni/JSONUtils.java | 7 ++- 4 files changed, 41 insertions(+), 37 deletions(-) diff --git a/src/main/cpp/src/JSONUtilsJni.cpp b/src/main/cpp/src/JSONUtilsJni.cpp index 988d8d959..964b33bd5 100644 --- a/src/main/cpp/src/JSONUtilsJni.cpp +++ b/src/main/cpp/src/JSONUtilsJni.cpp @@ -213,7 +213,7 @@ Java_com_nvidia_spark_rapids_jni_JSONUtils_castStringsToBooleans(JNIEnv* env, jc } JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_JSONUtils_castStringsToDecimals( - JNIEnv* env, jclass, jlong j_input, jint 
precision, jint scale, jboolean is_us_locale) + JNIEnv* env, jclass, jlong j_input, jint j_output_type_id, jboolean is_us_locale) { JNI_NULL_CHECK(env, j_input, "j_input is null", 0); @@ -222,7 +222,9 @@ JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_JSONUtils_castStringsTo auto const input = *reinterpret_cast(j_input); return cudf::jni::ptr_as_jlong( - spark_rapids_jni::cast_strings_to_decimals(input, precision, scale, is_us_locale).release()); + spark_rapids_jni::cast_strings_to_decimals( + input, cudf::data_type{static_cast(j_output_type_id)}, is_us_locale) + .release()); } CATCH_STD(env, 0); } diff --git a/src/main/cpp/src/from_json_to_structs.cu b/src/main/cpp/src/from_json_to_structs.cu index b2707c39f..9be55190a 100644 --- a/src/main/cpp/src/from_json_to_structs.cu +++ b/src/main/cpp/src/from_json_to_structs.cu @@ -486,27 +486,13 @@ std::pair, rmm::device_uvector> cast_strings // TODO there is a bug here around 0 https://github.com/NVIDIA/spark-rapids/issues/10898 std::unique_ptr cast_strings_to_decimals(cudf::column_view const& input, - int precision, - int scale, + cudf::data_type output_type, bool is_us_locale, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { auto const string_count = input.size(); - if (string_count == 0) { - auto const dtype = [precision, scale]() { - if (precision <= std::numeric_limits::digits10) { - return cudf::data_type(cudf::type_id::DECIMAL32, scale); - } else if (precision <= std::numeric_limits::digits10) { - return cudf::data_type(cudf::type_id::DECIMAL64, scale); - } else if (precision <= std::numeric_limits<__int128_t>::digits10) { - return cudf::data_type(cudf::type_id::DECIMAL128, scale); - } else { - CUDF_FAIL("Unable to support decimal with precision " + std::to_string(precision)); - } - }(); - return cudf::make_empty_column(dtype); - } + if (string_count == 0) { return cudf::make_empty_column(output_type); } CUDF_EXPECTS(is_us_locale, "String to decimal conversion is only supported in US 
locale."); @@ -799,6 +785,8 @@ std::pair, rmm::device_uvector> remove_quote std::unique_ptr convert_column_type(std::unique_ptr& input, json_schema_element const& schema, + bool allow_nonnumeric_numbers, + bool is_us_locale, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { @@ -810,14 +798,16 @@ std::unique_ptr convert_column_type(std::unique_ptr& CUDF_EXPECTS(input->type().id() == cudf::type_id::STRING, "Input column should be STRING."); return ::spark_rapids_jni::cast_strings_to_integers(input->view(), schema.type, stream, mr); } - // if (cudf::is_floating_point(schema.type)) { - // CUDF_EXPECTS(input->type().id() == cudf::type_id::STRING, "Input column should be STRING."); - // return ::cast_strings_to_floats(input->view(), schema.type, stream, mr); - // } - // if (cudf::is_fixed_point(schema.type)) { - // CUDF_EXPECTS(input->type().id() == cudf::type_id::STRING, "Input column should be STRING."); - // return ::cast_strings_to_decimals(input->view(), schema.type, stream, mr); - // } + if (cudf::is_floating_point(schema.type)) { + CUDF_EXPECTS(input->type().id() == cudf::type_id::STRING, "Input column should be STRING."); + return ::spark_rapids_jni::cast_strings_to_floats( + input->view(), schema.type, allow_nonnumeric_numbers, stream, mr); + } + if (cudf::is_fixed_point(schema.type)) { + CUDF_EXPECTS(input->type().id() == cudf::type_id::STRING, "Input column should be STRING."); + return ::spark_rapids_jni::cast_strings_to_decimals( + input->view(), schema.type, is_us_locale, stream, mr); + } if (schema.type.id() == cudf::type_id::STRING) { CUDF_EXPECTS(input->type().id() == cudf::type_id::STRING, "Input column should be STRING."); return ::spark_rapids_jni::remove_quotes(input->view(), false, stream, mr); @@ -833,6 +823,8 @@ std::unique_ptr convert_column_type(std::unique_ptr& auto new_child = convert_column_type(input_content.children[cudf::lists_column_view::child_column_index], schema.child_types.front().second, + allow_nonnumeric_numbers, 
+ is_us_locale, stream, mr); @@ -854,8 +846,12 @@ std::unique_ptr convert_column_type(std::unique_ptr& new_children.reserve(num_children); auto input_content = input->release(); for (cudf::size_type i = 0; i < input->num_children(); ++i) { - new_children.emplace_back( - convert_column_type(input_content.children[i], schema.child_types[i].second, stream, mr)); + new_children.emplace_back(convert_column_type(input_content.children[i], + schema.child_types[i].second, + allow_nonnumeric_numbers, + is_us_locale, + stream, + mr)); } return cudf::make_structs_column(num_rows, @@ -890,6 +886,7 @@ std::unique_ptr from_json_to_structs( cudf::strings_column_view const& input, std::vector> const& schema, cudf::io::json_reader_options const& json_options, + bool is_us_locale, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { @@ -924,7 +921,12 @@ std::unique_ptr from_json_to_structs( std::vector> converted_cols(parsed_columns.size()); for (std::size_t i = 0; i < parsed_columns.size(); ++i) { check_schema(parsed_meta.schema_info[i], schema[i]); - converted_cols[i] = convert_column_type(parsed_columns[i], schema[i].second, stream, mr); + converted_cols[i] = convert_column_type(parsed_columns[i], + schema[i].second, + json_options.is_allowed_nonnumeric_numbers(), + is_us_locale, + stream, + mr); } auto const valid_it = is_null_or_empty->view().begin(); @@ -1004,15 +1006,14 @@ std::unique_ptr cast_strings_to_dates(cudf::column_view const& inp } std::unique_ptr cast_strings_to_decimals(cudf::column_view const& input, - int precision, - int scale, + cudf::data_type output_type, bool is_us_locale, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::cast_strings_to_decimals(input, precision, scale, is_us_locale, stream, mr); + return detail::cast_strings_to_decimals(input, output_type, is_us_locale, stream, mr); } std::unique_ptr remove_quotes(cudf::column_view const& input, diff --git a/src/main/cpp/src/json_utils.hpp 
b/src/main/cpp/src/json_utils.hpp index e0a855194..fb5f34f4d 100644 --- a/src/main/cpp/src/json_utils.hpp +++ b/src/main/cpp/src/json_utils.hpp @@ -56,6 +56,7 @@ std::unique_ptr from_json_to_structs( cudf::strings_column_view const& input, std::vector> const& schema, cudf::io::json_reader_options const& json_options, + bool is_us_locale, rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); @@ -73,8 +74,7 @@ std::unique_ptr cast_strings_to_booleans( std::unique_ptr cast_strings_to_decimals( cudf::column_view const& input, - int precision, - int scale, + cudf::data_type output_type, bool is_us_locale, rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); diff --git a/src/main/java/com/nvidia/spark/rapids/jni/JSONUtils.java b/src/main/java/com/nvidia/spark/rapids/jni/JSONUtils.java index d39e7ea99..48f70d3f9 100644 --- a/src/main/java/com/nvidia/spark/rapids/jni/JSONUtils.java +++ b/src/main/java/com/nvidia/spark/rapids/jni/JSONUtils.java @@ -225,10 +225,11 @@ public static ColumnVector castStringsToBooleans(ColumnView input) { return new ColumnVector(castStringsToBooleans(input.getNativeView())); } - public static ColumnVector castStringsToDecimals(ColumnView input, int precision, int scale, + public static ColumnVector castStringsToDecimals(ColumnView input, DType outputType, boolean isUSLocale) { assert (input.getType().equals(DType.STRING)) : "Input must be of STRING type"; - return new ColumnVector(castStringsToDecimals(input.getNativeView(), precision, scale, isUSLocale)); + return new ColumnVector(castStringsToDecimals(input.getNativeView(), + outputType.getTypeId().getNativeId(), isUSLocale)); } public static ColumnVector castStringsToIntegers(ColumnView input, DType output_type) { @@ -283,7 +284,7 @@ private static native long[] getJsonObjectMultiplePaths(long input, private static native long 
castStringsToBooleans(long input); - private static native long castStringsToDecimals(long input, int precision, int scale, boolean isUSLocale); + private static native long castStringsToDecimals(long input, int outputTypeId, boolean isUSLocale); private static native long castStringsToIntegers(long input, int outputType); From d33d8e24051a4eb2ffa2399a9d15a261ebc92c14 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 25 Oct 2024 15:25:53 -0700 Subject: [PATCH 20/58] WIP: Implementing `fromJSONToStructs` Signed-off-by: Nghia Truong --- src/main/cpp/src/JSONUtilsJni.cpp | 47 +++++++++++++++++++ src/main/cpp/src/from_json_to_structs.cu | 16 +++++++ src/main/cpp/src/json_utils.hpp | 12 ++++- .../nvidia/spark/rapids/jni/JSONUtils.java | 35 ++++++++++++++ 4 files changed, 108 insertions(+), 2 deletions(-) diff --git a/src/main/cpp/src/JSONUtilsJni.cpp b/src/main/cpp/src/JSONUtilsJni.cpp index 964b33bd5..7325600b1 100644 --- a/src/main/cpp/src/JSONUtilsJni.cpp +++ b/src/main/cpp/src/JSONUtilsJni.cpp @@ -18,6 +18,7 @@ #include "get_json_object.hpp" #include "json_utils.hpp" +#include #include #include @@ -303,4 +304,50 @@ JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_JSONUtils_castStringsTo CATCH_STD(env, 0); } +JNIEXPORT jlong JNICALL +Java_com_nvidia_spark_rapids_jni_JSONUtils_fromJSONToStructs(JNIEnv* env, + jclass, + jlong j_input, + jobjectArray j_col_names, + jintArray j_num_children, + jintArray j_types, + jintArray j_scales, + jintArray j_precisions, + jboolean normalize_single_quotes, + jboolean allow_leading_zeros, + jboolean allow_nonnumeric_numbers, + jboolean allow_unquoted_control, + jboolean prune_columns, + jboolean is_us_locale) +{ + JNI_NULL_CHECK(env, j_input, "j_input is null", 0); + + try { + cudf::jni::auto_set_device(env); + + auto const input = *reinterpret_cast(j_input).to_vector(); + auto const col_names = cudf::jni::native_jstringArray(env, j_col_names).as_cpp_vector(); + auto const num_children = cudf::jni::native_jintArray(env, 
j_num_children).to_vector(); + auto const types = cudf::jni::native_jintArray(env, j_types).to_vector(); + auto const scales = cudf::jni::native_jintArray(env, j_scales).to_vector(); + auto const precisions = cudf::jni::native_jintArray(env, j_precisions).to_vector(); + + return cudf::jni::ptr_as_jlong( + spark_rapids_jni::from_json_to_structs(cudf::strings_column_view{input}, + col_names, + num_children, + types, + scales, + precisions, + normalize_single_quotes, + allow_leading_zeros, + allow_nonnumeric_numbers, + allow_unquoted_control, + prune_columns, + is_us_locale) + .release()); + } + CATCH_STD(env, 0); +} + } // extern "C" diff --git a/src/main/cpp/src/from_json_to_structs.cu b/src/main/cpp/src/from_json_to_structs.cu index 9be55190a..7ebd7a03e 100644 --- a/src/main/cpp/src/from_json_to_structs.cu +++ b/src/main/cpp/src/from_json_to_structs.cu @@ -891,6 +891,22 @@ std::unique_ptr from_json_to_structs( rmm::device_async_resource_ref mr) { auto const [is_null_or_empty, concat_input, delimiter] = concat_json(input, stream, mr); + + // cudf::io::json_reader_options_builder builder = + // cudf::io::json_reader_options::builder() + // .lines(true) + // .recovery_mode(cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL) + // .normalize_whitespace(true) + // .mixed_types_as_string(true) + // .keep_quotes(true) + // .experimental(true) + // .normalize_single_quotes(normalize_single_quotes) + // .strict_validation(true) + // .numeric_leading_zeros(allow_leading_zeros) + // .nonnumeric_numbers(allow_nonnumeric_numbers) + // .unquoted_control_chars(allow_unquoted_control) + // .prune_columns(prune_columns); + auto opts_builder = cudf::io::json_reader_options::builder( cudf::io::source_info{cudf::device_span{ diff --git a/src/main/cpp/src/json_utils.hpp b/src/main/cpp/src/json_utils.hpp index fb5f34f4d..8e29475b9 100644 --- a/src/main/cpp/src/json_utils.hpp +++ b/src/main/cpp/src/json_utils.hpp @@ -54,8 +54,16 @@ struct json_schema_element { std::unique_ptr 
from_json_to_structs( cudf::strings_column_view const& input, - std::vector> const& schema, - cudf::io::json_reader_options const& json_options, + std::vector const& col_names, + std::vector const& num_children, + std::vector const& types, + std::vector const& scales, + std::vector const& precisions, + bool normalize_single_quotes, + bool allow_leading_zeros, + bool allow_nonnumeric_numbers, + bool allow_unquoted_control, + bool prune_columns, bool is_us_locale, rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); diff --git a/src/main/java/com/nvidia/spark/rapids/jni/JSONUtils.java b/src/main/java/com/nvidia/spark/rapids/jni/JSONUtils.java index 48f70d3f9..b25a3d983 100644 --- a/src/main/java/com/nvidia/spark/rapids/jni/JSONUtils.java +++ b/src/main/java/com/nvidia/spark/rapids/jni/JSONUtils.java @@ -260,6 +260,27 @@ public static ColumnVector castStringsToFloats(ColumnView input, DType outputTyp outputType.getTypeId().getNativeId(), allowNonNumericNumbers)); } + public static ColumnVector fromJSONToStructs(ColumnVector input, Schema schema, + int[] flattenedPrecision, JSONOptions opts, + boolean isUSLocale) { + assert (input.getType().equals(DType.STRING)) : "Input must be of STRING type"; + boolean cudfPruneSchema = schema.getColumnNames() != null && + schema.getColumnNames().length != 0 && + opts.shouldCudfPruneSchema(); + + return new ColumnVector(fromJSONToStructs(input.getNativeView(), + schema.getFlattenedColumnNames(), schema.getFlattenedNumChildren(), + schema.getFlattenedTypeIds(), schema.getFlattenedTypeScales(), + flattenedPrecision, + opts.isNormalizeSingleQuotes(), + opts.leadingZerosAllowed(), + opts.nonNumericNumbersAllowed(), + opts.unquotedControlChars(), + cudfPruneSchema, + isUSLocale)); + } + + private static native int getMaxJSONPathDepth(); private static native long getJsonObject(long input, @@ -293,4 +314,18 @@ private static native long[] 
getJsonObjectMultiplePaths(long input, private static native long removeQuotes(long input, boolean nullifyIfNotQuoted); private static native long castStringsToFloats(long input, int outputTypeId, boolean allowNonNumericNumbers); + + private static native long fromJSONToStructs(long input, + String[] names, + int[] numChildren, + int[] typeIds, + int[] typeScales, + int[] typePrecision, + boolean normalizeSingleQuotes, + boolean leadingZerosAllowed, + boolean nonNumericNumbersAllowed, + boolean unquotedControlChars, + boolean cudfPruneSchema, + boolean isUSLocale); + } From 1ea9cc8efe656747a6cd999acc255e88f5248e63 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Tue, 29 Oct 2024 11:43:50 -0700 Subject: [PATCH 21/58] Fix compile errors Signed-off-by: Nghia Truong --- src/main/cpp/src/JSONUtilsJni.cpp | 21 +- src/main/cpp/src/from_json_to_structs.cu | 208 +++++++++++++----- src/main/cpp/src/json_utils.hpp | 2 +- .../nvidia/spark/rapids/jni/JSONUtils.java | 18 +- 4 files changed, 173 insertions(+), 76 deletions(-) diff --git a/src/main/cpp/src/JSONUtilsJni.cpp b/src/main/cpp/src/JSONUtilsJni.cpp index 7325600b1..6d0ad8b21 100644 --- a/src/main/cpp/src/JSONUtilsJni.cpp +++ b/src/main/cpp/src/JSONUtilsJni.cpp @@ -213,8 +213,14 @@ Java_com_nvidia_spark_rapids_jni_JSONUtils_castStringsToBooleans(JNIEnv* env, jc CATCH_STD(env, 0); } -JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_JSONUtils_castStringsToDecimals( - JNIEnv* env, jclass, jlong j_input, jint j_output_type_id, jboolean is_us_locale) +JNIEXPORT jlong JNICALL +Java_com_nvidia_spark_rapids_jni_JSONUtils_castStringsToDecimals(JNIEnv* env, + jclass, + jlong j_input, + jint j_output_type_id, + jint precision, + jint scale, + jboolean is_us_locale) { JNI_NULL_CHECK(env, j_input, "j_input is null", 0); @@ -224,7 +230,10 @@ JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_JSONUtils_castStringsTo return cudf::jni::ptr_as_jlong( spark_rapids_jni::cast_strings_to_decimals( - input, 
cudf::data_type{static_cast(j_output_type_id)}, is_us_locale) + input, + cudf::data_type{static_cast(j_output_type_id), scale}, + precision, + is_us_locale) .release()); } CATCH_STD(env, 0); @@ -317,7 +326,6 @@ Java_com_nvidia_spark_rapids_jni_JSONUtils_fromJSONToStructs(JNIEnv* env, jboolean allow_leading_zeros, jboolean allow_nonnumeric_numbers, jboolean allow_unquoted_control, - jboolean prune_columns, jboolean is_us_locale) { JNI_NULL_CHECK(env, j_input, "j_input is null", 0); @@ -325,7 +333,7 @@ Java_com_nvidia_spark_rapids_jni_JSONUtils_fromJSONToStructs(JNIEnv* env, try { cudf::jni::auto_set_device(env); - auto const input = *reinterpret_cast(j_input).to_vector(); + auto const input = reinterpret_cast(j_input); auto const col_names = cudf::jni::native_jstringArray(env, j_col_names).as_cpp_vector(); auto const num_children = cudf::jni::native_jintArray(env, j_num_children).to_vector(); auto const types = cudf::jni::native_jintArray(env, j_types).to_vector(); @@ -333,7 +341,7 @@ Java_com_nvidia_spark_rapids_jni_JSONUtils_fromJSONToStructs(JNIEnv* env, auto const precisions = cudf::jni::native_jintArray(env, j_precisions).to_vector(); return cudf::jni::ptr_as_jlong( - spark_rapids_jni::from_json_to_structs(cudf::strings_column_view{input}, + spark_rapids_jni::from_json_to_structs(cudf::strings_column_view{*input}, col_names, num_children, types, @@ -343,7 +351,6 @@ Java_com_nvidia_spark_rapids_jni_JSONUtils_fromJSONToStructs(JNIEnv* env, allow_leading_zeros, allow_nonnumeric_numbers, allow_unquoted_control, - prune_columns, is_us_locale) .release()); } diff --git a/src/main/cpp/src/from_json_to_structs.cu b/src/main/cpp/src/from_json_to_structs.cu index 7ebd7a03e..07c26e051 100644 --- a/src/main/cpp/src/from_json_to_structs.cu +++ b/src/main/cpp/src/from_json_to_structs.cu @@ -99,7 +99,7 @@ std::tuple, std::unique_ptr, c // Check if the input rows are either null or empty. // This will be returned to the caller. 
- rmm::device_uvector is_null_or_empty(input.size(), stream, mr); + rmm::device_uvector is_invalid_or_empty(input.size(), stream, mr); thrust::for_each( rmm::exec_policy_nosync(stream), @@ -107,7 +107,7 @@ std::tuple, std::unique_ptr, c thrust::make_counting_iterator(input.size() * static_cast(cudf::detail::warp_size)), [input = *d_input_ptr, output = thrust::make_zip_iterator(thrust::make_tuple( - is_valid_input.begin(), is_null_or_empty.begin()))] __device__(int64_t tidx) { + is_valid_input.begin(), is_invalid_or_empty.begin()))] __device__(int64_t tidx) { // Execute one warp per row to minimize thread divergence. if ((tidx % cudf::detail::warp_size) != 0) { return; } auto const idx = tidx / cudf::detail::warp_size; @@ -239,7 +239,7 @@ std::tuple, std::unique_ptr, c stream, mr); - return {std::make_unique(std::move(is_null_or_empty), rmm::device_buffer{}, 0), + return {std::make_unique(std::move(is_invalid_or_empty), rmm::device_buffer{}, 0), std::move(concat_strings->release().data), delimiter}; } @@ -487,6 +487,7 @@ std::pair, rmm::device_uvector> cast_strings // TODO there is a bug here around 0 https://github.com/NVIDIA/spark-rapids/issues/10898 std::unique_ptr cast_strings_to_decimals(cudf::column_view const& input, cudf::data_type output_type, + int precision, bool is_us_locale, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) @@ -567,7 +568,7 @@ std::unique_ptr cast_strings_to_decimals(cudf::column_view const& // If the output strings column does not change in its total bytes, we know that it does not have // any '"' or ',' characters. 
if (bytes == input_sv.chars_size(stream)) { - return string_to_decimal(precision, scale, input_sv, false, false, stream, mr); + return string_to_decimal(precision, output_type.scale(), input_sv, false, false, stream, mr); } auto const out_offsets = @@ -612,7 +613,7 @@ std::unique_ptr cast_strings_to_decimals(cudf::column_view const& 0, rmm::device_buffer{0, stream, mr}); return string_to_decimal(precision, - scale, + output_type.scale(), cudf::strings_column_view{unquoted_strings->view()}, false, false, @@ -783,8 +784,18 @@ std::pair, rmm::device_uvector> remove_quote return {std::move(output), rmm::device_uvector(0, stream)}; } +/** + * @brief The struct similar to `cudf::io::schema_element` with adding decimal precision and + * preserving column order. + */ +struct schema_element_with_precision { + cudf::data_type type; + int precision; + std::vector> child_types; +}; + std::unique_ptr convert_column_type(std::unique_ptr& input, - json_schema_element const& schema, + schema_element_with_precision const& schema, bool allow_nonnumeric_numbers, bool is_us_locale, rmm::cuda_stream_view stream, @@ -806,7 +817,7 @@ std::unique_ptr convert_column_type(std::unique_ptr& if (cudf::is_fixed_point(schema.type)) { CUDF_EXPECTS(input->type().id() == cudf::type_id::STRING, "Input column should be STRING."); return ::spark_rapids_jni::cast_strings_to_decimals( - input->view(), schema.type, is_us_locale, stream, mr); + input->view(), schema.type, schema.precision, is_us_locale, stream, mr); } if (schema.type.id() == cudf::type_id::STRING) { CUDF_EXPECTS(input->type().id() == cudf::type_id::STRING, "Input column should be STRING."); @@ -839,7 +850,7 @@ std::unique_ptr convert_column_type(std::unique_ptr& } if (schema.type.id() == cudf::type_id::STRUCT) { - CUDF_EXPECTS(input->type().id() == cudf::type_id::STRUCT, "Input column should be LIST."); + CUDF_EXPECTS(input->type().id() == cudf::type_id::STRUCT, "Input column should be STRUCT."); auto const num_children = 
input->num_children(); std::vector> new_children; @@ -867,87 +878,137 @@ std::unique_ptr convert_column_type(std::unique_ptr& } void check_schema(cudf::io::column_name_info const& read_info, - std::pair const& column_schema) + std::pair const& column_schema) { CUDF_EXPECTS(read_info.name == column_schema.first, "Mismatched column name."); CUDF_EXPECTS(read_info.children.size() == column_schema.second.child_types.size(), "Mismatched number of children."); for (std::size_t i = 0; i < read_info.children.size(); ++i) { - // auto const find_it = column_schema.second.child_types.find(read_info.children[i].name); - // check_schema(read_info.children[i], - // find_it != column_schema.child_types.end() ? *find_it : {"", {}}); check_schema(read_info.children[i], column_schema.second.child_types[i]); } } +std::pair parse_schema_element( + std::size_t& index, + std::vector const& col_names, + std::vector const& num_children, + std::vector const& types, + std::vector const& scales, + std::vector const& precisions) +{ + auto const d_type = cudf::data_type{static_cast(types[index]), scales[index]}; + auto const precision = precisions[index]; + auto const col_num_children = num_children[index]; + index++; + std::map children; + std::vector> children_with_precisions; + std::vector child_names(col_num_children); + + if (d_type.id() == cudf::type_id::STRUCT || d_type.id() == cudf::type_id::LIST) { + for (int i = 0; i < col_num_children; i++) { + auto const& name = col_names[index]; + auto [child, child_with_precision] = + parse_schema_element(index, col_names, num_children, types, scales, precisions); + children.emplace(name, std::move(child)); + children_with_precisions.emplace_back(name, std::move(child_with_precision)); + child_names[i] = name; + } + } else if (col_num_children != 0) { + throw std::invalid_argument("Found children for a type that should have none."); + } + return {cudf::io::schema_element{d_type, std::move(children), {std::move(child_names)}}, + 
schema_element_with_precision{d_type, precision, std::move(children_with_precisions)}}; +} + +std::pair generate_struct_schema( + std::vector const& col_names, + std::vector const& num_children, + std::vector const& types, + std::vector const& scales, + std::vector const& precisions) +{ + std::map schema_cols; + std::vector> schema_cols_with_precisions; + std::vector name_order; + + std::size_t at = 0; + while (at < types.size()) { + auto const& name = col_names[at]; + auto [child, child_with_precision] = + parse_schema_element(at, col_names, num_children, types, scales, precisions); + schema_cols.emplace(name, std::move(child)); + schema_cols_with_precisions.emplace_back(name, std::move(child_with_precision)); + name_order.push_back(name); + } + return { + cudf::io::schema_element{ + cudf::data_type{cudf::type_id::STRUCT}, std::move(schema_cols), {std::move(name_order)}}, + schema_element_with_precision{ + cudf::data_type{cudf::type_id::STRUCT}, -1, std::move(schema_cols_with_precisions)}}; +} + } // namespace -std::unique_ptr from_json_to_structs( - cudf::strings_column_view const& input, - std::vector> const& schema, - cudf::io::json_reader_options const& json_options, - bool is_us_locale, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) +std::unique_ptr from_json_to_structs(cudf::strings_column_view const& input, + std::vector const& col_names, + std::vector const& num_children, + std::vector const& types, + std::vector const& scales, + std::vector const& precisions, + bool normalize_single_quotes, + bool allow_leading_zeros, + bool allow_nonnumeric_numbers, + bool allow_unquoted_control, + bool is_us_locale, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) { - auto const [is_null_or_empty, concat_input, delimiter] = concat_json(input, stream, mr); - - // cudf::io::json_reader_options_builder builder = - // cudf::io::json_reader_options::builder() - // .lines(true) - // 
.recovery_mode(cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL) - // .normalize_whitespace(true) - // .mixed_types_as_string(true) - // .keep_quotes(true) - // .experimental(true) - // .normalize_single_quotes(normalize_single_quotes) - // .strict_validation(true) - // .numeric_leading_zeros(allow_leading_zeros) - // .nonnumeric_numbers(allow_nonnumeric_numbers) - // .unquoted_control_chars(allow_unquoted_control) - // .prune_columns(prune_columns); + auto const [is_invalid_or_empty, concat_input, delimiter] = concat_json(input, stream, mr); + auto const [schema, schema_with_precision] = + generate_struct_schema(col_names, num_children, types, scales, precisions); auto opts_builder = cudf::io::json_reader_options::builder( cudf::io::source_info{cudf::device_span{ static_cast(concat_input->data()), concat_input->size()}}) - .dayfirst(json_options.is_enabled_dayfirst()) - .lines(json_options.is_enabled_lines()) - .recovery_mode(json_options.recovery_mode()) - .normalize_single_quotes(json_options.is_enabled_normalize_single_quotes()) - .normalize_whitespace(json_options.is_enabled_normalize_whitespace()) - .mixed_types_as_string(json_options.is_enabled_mixed_types_as_string()) - .delimiter(json_options.get_delimiter()) - .strict_validation(json_options.is_strict_validation()) - .keep_quotes(json_options.is_enabled_keep_quotes()) - .prune_columns(json_options.is_enabled_prune_columns()) - .experimental(json_options.is_enabled_experimental()); - if (json_options.is_strict_validation()) { - opts_builder.numeric_leading_zeros(json_options.is_allowed_numeric_leading_zeros()) - .nonnumeric_numbers(json_options.is_allowed_nonnumeric_numbers()) - .unquoted_control_chars(json_options.is_allowed_unquoted_control_chars()); - } + // fixed options + .lines(true) + .recovery_mode(cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL) + .normalize_whitespace(true) + .mixed_types_as_string(true) + .keep_quotes(true) + .experimental(true) + 
.normalize_single_quotes(normalize_single_quotes) + .strict_validation(true) + // + .delimiter(delimiter) + .numeric_leading_zeros(allow_leading_zeros) + .nonnumeric_numbers(allow_nonnumeric_numbers) + .unquoted_control_chars(allow_unquoted_control) + .dtypes(schema) + .prune_columns(schema.child_types.size() != 0); + auto const parsed_table_with_meta = cudf::io::read_json(opts_builder.build()); auto const& parsed_meta = parsed_table_with_meta.metadata; auto parsed_columns = parsed_table_with_meta.tbl->release(); - CUDF_EXPECTS(parsed_columns.size() == schema.size(), + CUDF_EXPECTS(parsed_columns.size() == schema.child_types.size(), "Numbers of columns in the input table is different from schema size."); std::vector> converted_cols(parsed_columns.size()); for (std::size_t i = 0; i < parsed_columns.size(); ++i) { - check_schema(parsed_meta.schema_info[i], schema[i]); + check_schema(parsed_meta.schema_info[i], schema_with_precision.child_types[i]); converted_cols[i] = convert_column_type(parsed_columns[i], - schema[i].second, - json_options.is_allowed_nonnumeric_numbers(), + schema_with_precision.child_types[i].second, + allow_nonnumeric_numbers, is_us_locale, stream, mr); } - auto const valid_it = is_null_or_empty->view().begin(); + auto const valid_it = is_invalid_or_empty->view().begin(); auto [null_mask, null_count] = cudf::detail::valid_if( - valid_it, valid_it + is_null_or_empty->size(), thrust::logical_not{}, stream, mr); + valid_it, valid_it + is_invalid_or_empty->size(), thrust::logical_not{}, stream, mr); return cudf::make_structs_column( input.size(), @@ -960,6 +1021,36 @@ std::unique_ptr from_json_to_structs( } // namespace detail +std::unique_ptr from_json_to_structs(cudf::strings_column_view const& input, + std::vector const& col_names, + std::vector const& num_children, + std::vector const& types, + std::vector const& scales, + std::vector const& precisions, + bool normalize_single_quotes, + bool allow_leading_zeros, + bool allow_nonnumeric_numbers, + 
bool allow_unquoted_control, + bool is_us_locale, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_FUNC_RANGE(); + return detail::from_json_to_structs(input, + col_names, + num_children, + types, + scales, + precisions, + normalize_single_quotes, + allow_leading_zeros, + allow_nonnumeric_numbers, + allow_unquoted_control, + is_us_locale, + stream, + mr); +} + std::tuple, std::unique_ptr, char> concat_json( cudf::strings_column_view const& input, rmm::cuda_stream_view stream, @@ -1023,13 +1114,14 @@ std::unique_ptr cast_strings_to_dates(cudf::column_view const& inp std::unique_ptr cast_strings_to_decimals(cudf::column_view const& input, cudf::data_type output_type, + int precision, bool is_us_locale, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::cast_strings_to_decimals(input, output_type, is_us_locale, stream, mr); + return detail::cast_strings_to_decimals(input, output_type, precision, is_us_locale, stream, mr); } std::unique_ptr remove_quotes(cudf::column_view const& input, diff --git a/src/main/cpp/src/json_utils.hpp b/src/main/cpp/src/json_utils.hpp index 8e29475b9..3522c0a8c 100644 --- a/src/main/cpp/src/json_utils.hpp +++ b/src/main/cpp/src/json_utils.hpp @@ -63,7 +63,6 @@ std::unique_ptr from_json_to_structs( bool allow_leading_zeros, bool allow_nonnumeric_numbers, bool allow_unquoted_control, - bool prune_columns, bool is_us_locale, rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); @@ -83,6 +82,7 @@ std::unique_ptr cast_strings_to_booleans( std::unique_ptr cast_strings_to_decimals( cudf::column_view const& input, cudf::data_type output_type, + int precision, bool is_us_locale, rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); diff --git a/src/main/java/com/nvidia/spark/rapids/jni/JSONUtils.java 
b/src/main/java/com/nvidia/spark/rapids/jni/JSONUtils.java index b25a3d983..dab6f07c7 100644 --- a/src/main/java/com/nvidia/spark/rapids/jni/JSONUtils.java +++ b/src/main/java/com/nvidia/spark/rapids/jni/JSONUtils.java @@ -226,10 +226,11 @@ public static ColumnVector castStringsToBooleans(ColumnView input) { } public static ColumnVector castStringsToDecimals(ColumnView input, DType outputType, + int precision, int scale, boolean isUSLocale) { assert (input.getType().equals(DType.STRING)) : "Input must be of STRING type"; return new ColumnVector(castStringsToDecimals(input.getNativeView(), - outputType.getTypeId().getNativeId(), isUSLocale)); + outputType.getTypeId().getNativeId(), precision, scale, isUSLocale)); } public static ColumnVector castStringsToIntegers(ColumnView input, DType output_type) { @@ -260,23 +261,18 @@ public static ColumnVector castStringsToFloats(ColumnView input, DType outputTyp outputType.getTypeId().getNativeId(), allowNonNumericNumbers)); } - public static ColumnVector fromJSONToStructs(ColumnVector input, Schema schema, - int[] flattenedPrecision, JSONOptions opts, + public static ColumnVector fromJSONToStructs(ColumnVector input, Schema schema, JSONOptions opts, boolean isUSLocale) { assert (input.getType().equals(DType.STRING)) : "Input must be of STRING type"; - boolean cudfPruneSchema = schema.getColumnNames() != null && - schema.getColumnNames().length != 0 && - opts.shouldCudfPruneSchema(); return new ColumnVector(fromJSONToStructs(input.getNativeView(), schema.getFlattenedColumnNames(), schema.getFlattenedNumChildren(), schema.getFlattenedTypeIds(), schema.getFlattenedTypeScales(), - flattenedPrecision, + schema.getFlattenedDecimalPrecisions(), opts.isNormalizeSingleQuotes(), opts.leadingZerosAllowed(), opts.nonNumericNumbersAllowed(), opts.unquotedControlChars(), - cudfPruneSchema, isUSLocale)); } @@ -305,7 +301,10 @@ private static native long[] getJsonObjectMultiplePaths(long input, private static native long 
castStringsToBooleans(long input); - private static native long castStringsToDecimals(long input, int outputTypeId, boolean isUSLocale); + private static native long castStringsToDecimals(long input, int outputTypeId, + int precision, + int scale, + boolean isUSLocale); private static native long castStringsToIntegers(long input, int outputType); @@ -325,7 +324,6 @@ private static native long fromJSONToStructs(long input, boolean leadingZerosAllowed, boolean nonNumericNumbersAllowed, boolean unquotedControlChars, - boolean cudfPruneSchema, boolean isUSLocale); } From c1bb2d4b5d686fd8b3443bfc47da73c895ac779b Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Tue, 29 Oct 2024 12:05:06 -0700 Subject: [PATCH 22/58] Cleanup Signed-off-by: Nghia Truong --- src/main/cpp/src/JSONUtilsJni.cpp | 157 ------------------ src/main/cpp/src/json_utils.hpp | 25 +-- .../nvidia/spark/rapids/jni/JSONUtils.java | 123 +------------- 3 files changed, 8 insertions(+), 297 deletions(-) diff --git a/src/main/cpp/src/JSONUtilsJni.cpp b/src/main/cpp/src/JSONUtilsJni.cpp index 6d0ad8b21..417217faa 100644 --- a/src/main/cpp/src/JSONUtilsJni.cpp +++ b/src/main/cpp/src/JSONUtilsJni.cpp @@ -156,163 +156,6 @@ JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_JSONUtils_extractRawMap CATCH_STD(env, 0); } -JNIEXPORT jlongArray JNICALL Java_com_nvidia_spark_rapids_jni_JSONUtils_concatenateJsonStrings( - JNIEnv* env, jclass, jlong j_input) -{ - JNI_NULL_CHECK(env, j_input, "j_input is null", 0); - - try { - cudf::jni::auto_set_device(env); - auto const input_cv = reinterpret_cast(j_input); - auto [is_valid, joined_strings, delimiter] = - spark_rapids_jni::concat_json(cudf::strings_column_view{*input_cv}); - - // The output array contains 5 elements: - // [0]: address of the cudf::column object `is_valid` in host memory - // [1]: address of data buffer of the concatenated strings in device memory - // [2]: data length - // [3]: address of the rmm::device_buffer object (of the concatenated strings) 
in host memory - // [4]: delimiter char - auto out_handles = cudf::jni::native_jlongArray(env, 5); - out_handles[0] = reinterpret_cast(is_valid.release()); - out_handles[1] = reinterpret_cast(joined_strings->data()); - out_handles[2] = static_cast(joined_strings->size()); - out_handles[3] = reinterpret_cast(joined_strings.release()); - out_handles[4] = static_cast(delimiter); - return out_handles.get_jArray(); - } - CATCH_STD(env, 0); -} - -JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_JSONUtils_makeStructs( - JNIEnv* env, jclass, jlongArray j_children, jlong j_is_null) -{ - JNI_NULL_CHECK(env, j_children, "j_children is null", 0); - JNI_NULL_CHECK(env, j_is_null, "j_is_null is null", 0); - - try { - cudf::jni::auto_set_device(env); - auto const children = - cudf::jni::native_jpointerArray{env, j_children}.get_dereferenced(); - auto const is_null = *reinterpret_cast(j_is_null); - return cudf::jni::ptr_as_jlong(spark_rapids_jni::make_structs(children, is_null).release()); - } - CATCH_STD(env, 0); -} - -JNIEXPORT jlong JNICALL -Java_com_nvidia_spark_rapids_jni_JSONUtils_castStringsToBooleans(JNIEnv* env, jclass, jlong j_input) -{ - JNI_NULL_CHECK(env, j_input, "j_input is null", 0); - - try { - cudf::jni::auto_set_device(env); - auto const input = *reinterpret_cast(j_input); - return cudf::jni::ptr_as_jlong(spark_rapids_jni::cast_strings_to_booleans(input).release()); - } - CATCH_STD(env, 0); -} - -JNIEXPORT jlong JNICALL -Java_com_nvidia_spark_rapids_jni_JSONUtils_castStringsToDecimals(JNIEnv* env, - jclass, - jlong j_input, - jint j_output_type_id, - jint precision, - jint scale, - jboolean is_us_locale) -{ - JNI_NULL_CHECK(env, j_input, "j_input is null", 0); - - try { - cudf::jni::auto_set_device(env); - auto const input = *reinterpret_cast(j_input); - - return cudf::jni::ptr_as_jlong( - spark_rapids_jni::cast_strings_to_decimals( - input, - cudf::data_type{static_cast(j_output_type_id), scale}, - precision, - is_us_locale) - .release()); - } - 
CATCH_STD(env, 0); -} - -JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_JSONUtils_castStringsToIntegers( - JNIEnv* env, jclass, jlong j_input, jint output_type_id) -{ - JNI_NULL_CHECK(env, j_input, "j_input is null", 0); - - try { - cudf::jni::auto_set_device(env); - auto const input = *reinterpret_cast(j_input); - - return cudf::jni::ptr_as_jlong( - spark_rapids_jni::cast_strings_to_integers( - input, cudf::data_type{static_cast(output_type_id)}) - .release()); - } - CATCH_STD(env, 0); -} - -JNIEXPORT jlong JNICALL -Java_com_nvidia_spark_rapids_jni_JSONUtils_castStringsToDates(JNIEnv* env, - jclass, - jlong j_input, - jstring j_date_regex, - jstring j_date_format, - jboolean error_if_invalid) -{ - JNI_NULL_CHECK(env, j_input, "j_input is null", 0); - JNI_NULL_CHECK(env, j_date_regex, "date_regex is null", 0); - JNI_NULL_CHECK(env, j_date_format, "date_format is null", 0); - - try { - cudf::jni::auto_set_device(env); - auto const input = *reinterpret_cast(j_input); - cudf::jni::native_jstring date_regex(env, j_date_regex); - cudf::jni::native_jstring date_format(env, j_date_format); - - auto output = spark_rapids_jni::cast_strings_to_dates( - input, date_regex.get(), date_format.get(), error_if_invalid); - if (output == nullptr) { return 0; } - return cudf::jni::ptr_as_jlong(output.release()); - } - CATCH_STD(env, 0); -} - -JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_JSONUtils_removeQuotes( - JNIEnv* env, jclass, jlong j_input, jboolean nullify_if_not_quoted) -{ - JNI_NULL_CHECK(env, j_input, "j_input is null", 0); - - try { - cudf::jni::auto_set_device(env); - auto const input = *reinterpret_cast(j_input); - return cudf::jni::ptr_as_jlong( - spark_rapids_jni::remove_quotes(input, nullify_if_not_quoted).release()); - } - CATCH_STD(env, 0); -} - -JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_JSONUtils_castStringsToFloats( - JNIEnv* env, jclass, jlong j_input, jint j_output_type_id, jboolean allow_nonnumeric_numbers) -{ - 
JNI_NULL_CHECK(env, j_input, "j_input is null", 0); - - try { - cudf::jni::auto_set_device(env); - auto const input = *reinterpret_cast(j_input); - return cudf::jni::ptr_as_jlong(spark_rapids_jni::cast_strings_to_floats( - input, - cudf::data_type{static_cast(j_output_type_id)}, - allow_nonnumeric_numbers) - .release()); - } - CATCH_STD(env, 0); -} - JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_JSONUtils_fromJSONToStructs(JNIEnv* env, jclass, diff --git a/src/main/cpp/src/json_utils.hpp b/src/main/cpp/src/json_utils.hpp index 3522c0a8c..3a720296d 100644 --- a/src/main/cpp/src/json_utils.hpp +++ b/src/main/cpp/src/json_utils.hpp @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -28,29 +29,10 @@ namespace spark_rapids_jni { -// TODO: replace rmm::mr::get_current_device_resource() by cudf - std::unique_ptr from_json_to_raw_map( cudf::strings_column_view const& input, rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); - -std::tuple, std::unique_ptr, char> concat_json( - cudf::strings_column_view const& input, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); - -std::unique_ptr make_structs( - std::vector const& input, - cudf::column_view const& is_null, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); - -struct json_schema_element { - cudf::data_type type; - - std::vector> child_types; -}; + rmm::device_async_resource_ref mr = cudf::get_current_device_resource()); std::unique_ptr from_json_to_structs( cudf::strings_column_view const& input, @@ -65,7 +47,7 @@ std::unique_ptr from_json_to_structs( bool allow_unquoted_control, bool is_us_locale, rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::device_async_resource_ref mr = 
rmm::mr::get_current_device_resource()); + rmm::device_async_resource_ref mr = cudf::get_current_device_resource()); // // @@ -113,5 +95,4 @@ std::unique_ptr cast_strings_to_floats( bool allow_nonnumeric_numbers, rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); - } // namespace spark_rapids_jni diff --git a/src/main/java/com/nvidia/spark/rapids/jni/JSONUtils.java b/src/main/java/com/nvidia/spark/rapids/jni/JSONUtils.java index dab6f07c7..7434a52cb 100644 --- a/src/main/java/com/nvidia/spark/rapids/jni/JSONUtils.java +++ b/src/main/java/com/nvidia/spark/rapids/jni/JSONUtils.java @@ -161,106 +161,13 @@ public static ColumnVector extractRawMapFromJsonString(ColumnView input) { } /** - * A class to hold the result when concatenating JSON strings. - *

- * A long with the concatenated data, the result also contains a vector that indicates - * whether each row in the input is null or empty, and the delimiter used for concatenation. - */ - public static class ConcatenatedJson implements AutoCloseable { - public final ColumnVector isNullOrEmpty; - public final DeviceMemoryBuffer data; - public final char delimiter; - - public ConcatenatedJson(ColumnVector isNullOrEmpty, DeviceMemoryBuffer data, char delimiter) { - this.isNullOrEmpty = isNullOrEmpty; - this.data = data; - this.delimiter = delimiter; - } - - @Override - public void close() { - isNullOrEmpty.close(); - data.close(); - } - } - - /** - * Concatenate JSON strings in the input column into a single JSON string. - *

- * During concatenation, the function also generates a boolean vector that indicates whether - * each row in the input is null or empty. The delimiter used for concatenation is also returned. - * - * @param input The input strings column to concatenate - * @return A {@link ConcatenatedJson} object that contains the concatenated output - */ - public static ConcatenatedJson concatenateJsonStrings(ColumnView input) { - assert (input.getType().equals(DType.STRING)) : "Input must be of STRING type"; - long[] concatenated = concatenateJsonStrings(input.getNativeView()); - return new ConcatenatedJson(new ColumnVector(concatenated[0]), - DeviceMemoryBuffer.fromRmm(concatenated[1], concatenated[2], concatenated[3]), - (char) concatenated[4]); - } - - /** - * Create a structs column from the given children columns and a boolean column specifying - * the rows at which the output column.should be null. - *

- * Note that the children columns are expected to have null rows at the same positions indicated - * by the input isNull column. * - * @param children The children columns of the output structs column - * @param isNull A boolean column specifying the rows at which the output column should be null - * @return A structs column created from the given children and the isNull column + * @param input + * @param schema + * @param opts + * @param isUSLocale + * @return */ - public static ColumnVector makeStructs(ColumnView[] children, ColumnView isNull) { - long[] handles = new long[children.length]; - for (int i = 0; i < children.length; i++) { - handles[i] = children[i].getNativeView(); - } - return new ColumnVector(makeStructs(handles, isNull.getNativeView())); - } - - public static ColumnVector castStringsToBooleans(ColumnView input) { - assert (input.getType().equals(DType.STRING)) : "Input must be of STRING type"; - return new ColumnVector(castStringsToBooleans(input.getNativeView())); - } - - public static ColumnVector castStringsToDecimals(ColumnView input, DType outputType, - int precision, int scale, - boolean isUSLocale) { - assert (input.getType().equals(DType.STRING)) : "Input must be of STRING type"; - return new ColumnVector(castStringsToDecimals(input.getNativeView(), - outputType.getTypeId().getNativeId(), precision, scale, isUSLocale)); - } - - public static ColumnVector castStringsToIntegers(ColumnView input, DType output_type) { - assert (input.getType().equals(DType.STRING)) : "Input must be of STRING type"; - return new ColumnVector(castStringsToIntegers(input.getNativeView(), - output_type.getTypeId().getNativeId())); - } - - public static ColumnVector castStringsToDates(ColumnView input, String dateRegex, - String dateFormat, boolean failOnInvalid) { - assert (input.getType().equals(DType.STRING)) : "Input must be of STRING type"; - long output = castStringsToDates(input.getNativeView(), dateRegex, dateFormat, failOnInvalid); - if (output == 0) { - 
return null; - } - return new ColumnVector(output); - } - - public static ColumnVector removeQuotes(ColumnView input, boolean nullifyIfNotQuoted) { - assert (input.getType().equals(DType.STRING)) : "Input must be of STRING type"; - return new ColumnVector(removeQuotes(input.getNativeView(), nullifyIfNotQuoted)); - } - - public static ColumnVector castStringsToFloats(ColumnView input, DType outputType, - boolean allowNonNumericNumbers) { - assert (input.getType().equals(DType.STRING)) : "Input must be of STRING type"; - return new ColumnVector(castStringsToFloats(input.getNativeView(), - outputType.getTypeId().getNativeId(), allowNonNumericNumbers)); - } - public static ColumnVector fromJSONToStructs(ColumnVector input, Schema schema, JSONOptions opts, boolean isUSLocale) { assert (input.getType().equals(DType.STRING)) : "Input must be of STRING type"; @@ -292,28 +199,8 @@ private static native long[] getJsonObjectMultiplePaths(long input, long memoryBudgetBytes, int parallelOverride); - private static native long extractRawMapFromJsonString(long input); - private static native long[] concatenateJsonStrings(long input); - - private static native long makeStructs(long[] children, long isNull); - - private static native long castStringsToBooleans(long input); - - private static native long castStringsToDecimals(long input, int outputTypeId, - int precision, - int scale, - boolean isUSLocale); - - private static native long castStringsToIntegers(long input, int outputType); - - private static native long castStringsToDates(long input, String dateRegex, String dateFormat, boolean failOnInvalid); - - private static native long removeQuotes(long input, boolean nullifyIfNotQuoted); - - private static native long castStringsToFloats(long input, int outputTypeId, boolean allowNonNumericNumbers); - private static native long fromJSONToStructs(long input, String[] names, int[] numChildren, From f6634b4cd5fc0e74fe5630e47259c8fe97bdfb47 Mon Sep 17 00:00:00 2001 From: Nghia 
Truong Date: Tue, 29 Oct 2024 13:43:24 -0700 Subject: [PATCH 23/58] Revert code as we still need them --- src/main/cpp/src/JSONUtilsJni.cpp | 87 +++++++++++++++++++ .../nvidia/spark/rapids/jni/JSONUtils.java | 53 +++++++++-- 2 files changed, 132 insertions(+), 8 deletions(-) diff --git a/src/main/cpp/src/JSONUtilsJni.cpp b/src/main/cpp/src/JSONUtilsJni.cpp index 417217faa..8eeb04021 100644 --- a/src/main/cpp/src/JSONUtilsJni.cpp +++ b/src/main/cpp/src/JSONUtilsJni.cpp @@ -156,6 +156,93 @@ JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_JSONUtils_extractRawMap CATCH_STD(env, 0); } +JNIEXPORT jlong JNICALL +Java_com_nvidia_spark_rapids_jni_JSONUtils_castStringsToBooleans(JNIEnv* env, jclass, jlong j_input) +{ + JNI_NULL_CHECK(env, j_input, "j_input is null", 0); + + try { + cudf::jni::auto_set_device(env); + auto const input = *reinterpret_cast(j_input); + return cudf::jni::ptr_as_jlong(spark_rapids_jni::cast_strings_to_booleans(input).release()); + } + CATCH_STD(env, 0); +} + +JNIEXPORT jlong JNICALL +Java_com_nvidia_spark_rapids_jni_JSONUtils_castStringsToDecimals(JNIEnv* env, + jclass, + jlong j_input, + jint j_output_type_id, + jint precision, + jint scale, + jboolean is_us_locale) +{ + JNI_NULL_CHECK(env, j_input, "j_input is null", 0); + + try { + cudf::jni::auto_set_device(env); + auto const input = *reinterpret_cast(j_input); + + return cudf::jni::ptr_as_jlong( + spark_rapids_jni::cast_strings_to_decimals( + input, + cudf::data_type{static_cast(j_output_type_id), scale}, + precision, + is_us_locale) + .release()); + } + CATCH_STD(env, 0); +} + +JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_JSONUtils_castStringsToIntegers( + JNIEnv* env, jclass, jlong j_input, jint output_type_id) +{ + JNI_NULL_CHECK(env, j_input, "j_input is null", 0); + + try { + cudf::jni::auto_set_device(env); + auto const input = *reinterpret_cast(j_input); + + return cudf::jni::ptr_as_jlong( + spark_rapids_jni::cast_strings_to_integers( + input, 
cudf::data_type{static_cast(output_type_id)}) + .release()); + } + CATCH_STD(env, 0); +} + +JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_JSONUtils_removeQuotes( + JNIEnv* env, jclass, jlong j_input, jboolean nullify_if_not_quoted) +{ + JNI_NULL_CHECK(env, j_input, "j_input is null", 0); + + try { + cudf::jni::auto_set_device(env); + auto const input = *reinterpret_cast(j_input); + return cudf::jni::ptr_as_jlong( + spark_rapids_jni::remove_quotes(input, nullify_if_not_quoted).release()); + } + CATCH_STD(env, 0); +} + +JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_JSONUtils_castStringsToFloats( + JNIEnv* env, jclass, jlong j_input, jint j_output_type_id, jboolean allow_nonnumeric_numbers) +{ + JNI_NULL_CHECK(env, j_input, "j_input is null", 0); + + try { + cudf::jni::auto_set_device(env); + auto const input = *reinterpret_cast(j_input); + return cudf::jni::ptr_as_jlong(spark_rapids_jni::cast_strings_to_floats( + input, + cudf::data_type{static_cast(j_output_type_id)}, + allow_nonnumeric_numbers) + .release()); + } + CATCH_STD(env, 0); +} + JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_JSONUtils_fromJSONToStructs(JNIEnv* env, jclass, diff --git a/src/main/java/com/nvidia/spark/rapids/jni/JSONUtils.java b/src/main/java/com/nvidia/spark/rapids/jni/JSONUtils.java index 7434a52cb..8564ce5d0 100644 --- a/src/main/java/com/nvidia/spark/rapids/jni/JSONUtils.java +++ b/src/main/java/com/nvidia/spark/rapids/jni/JSONUtils.java @@ -160,14 +160,37 @@ public static ColumnVector extractRawMapFromJsonString(ColumnView input) { return new ColumnVector(extractRawMapFromJsonString(input.getNativeView())); } - /** - * - * @param input - * @param schema - * @param opts - * @param isUSLocale - * @return - */ + public static ColumnVector castStringsToBooleans(ColumnView input) { + assert (input.getType().equals(DType.STRING)) : "Input must be of STRING type"; + return new ColumnVector(castStringsToBooleans(input.getNativeView())); + } + + public static 
ColumnVector castStringsToDecimals(ColumnView input, DType outputType, + int precision, int scale, + boolean isUSLocale) { + assert (input.getType().equals(DType.STRING)) : "Input must be of STRING type"; + return new ColumnVector(castStringsToDecimals(input.getNativeView(), + outputType.getTypeId().getNativeId(), precision, scale, isUSLocale)); + } + + public static ColumnVector castStringsToIntegers(ColumnView input, DType output_type) { + assert (input.getType().equals(DType.STRING)) : "Input must be of STRING type"; + return new ColumnVector(castStringsToIntegers(input.getNativeView(), + output_type.getTypeId().getNativeId())); + } + + public static ColumnVector removeQuotes(ColumnView input, boolean nullifyIfNotQuoted) { + assert (input.getType().equals(DType.STRING)) : "Input must be of STRING type"; + return new ColumnVector(removeQuotes(input.getNativeView(), nullifyIfNotQuoted)); + } + + public static ColumnVector castStringsToFloats(ColumnView input, DType outputType, + boolean allowNonNumericNumbers) { + assert (input.getType().equals(DType.STRING)) : "Input must be of STRING type"; + return new ColumnVector(castStringsToFloats(input.getNativeView(), + outputType.getTypeId().getNativeId(), allowNonNumericNumbers)); + } + public static ColumnVector fromJSONToStructs(ColumnVector input, Schema schema, JSONOptions opts, boolean isUSLocale) { assert (input.getType().equals(DType.STRING)) : "Input must be of STRING type"; @@ -199,8 +222,22 @@ private static native long[] getJsonObjectMultiplePaths(long input, long memoryBudgetBytes, int parallelOverride); + private static native long extractRawMapFromJsonString(long input); + private static native long castStringsToBooleans(long input); + + private static native long castStringsToDecimals(long input, int outputTypeId, + int precision, + int scale, + boolean isUSLocale); + + private static native long castStringsToIntegers(long input, int outputType); + + private static native long removeQuotes(long input, 
boolean nullifyIfNotQuoted); + + private static native long castStringsToFloats(long input, int outputTypeId, boolean allowNonNumericNumbers); + private static native long fromJSONToStructs(long input, String[] names, int[] numChildren, From 06b2c19364b3c2763bd90b82ce0e86584aeeeca7 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Tue, 29 Oct 2024 13:54:38 -0700 Subject: [PATCH 24/58] Add error check Signed-off-by: Nghia Truong --- src/main/cpp/src/JSONUtilsJni.cpp | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/main/cpp/src/JSONUtilsJni.cpp b/src/main/cpp/src/JSONUtilsJni.cpp index 8eeb04021..5027e7c39 100644 --- a/src/main/cpp/src/JSONUtilsJni.cpp +++ b/src/main/cpp/src/JSONUtilsJni.cpp @@ -259,6 +259,11 @@ Java_com_nvidia_spark_rapids_jni_JSONUtils_fromJSONToStructs(JNIEnv* env, jboolean is_us_locale) { JNI_NULL_CHECK(env, j_input, "j_input is null", 0); + JNI_NULL_CHECK(env, j_col_names, "j_col_names is null", 0); + JNI_NULL_CHECK(env, j_num_children, "j_num_children is null", 0); + JNI_NULL_CHECK(env, j_types, "j_types is null", 0); + JNI_NULL_CHECK(env, j_scales, "j_scales is null", 0); + JNI_NULL_CHECK(env, j_precisions, "j_precisions is null", 0); try { cudf::jni::auto_set_device(env); @@ -270,6 +275,12 @@ Java_com_nvidia_spark_rapids_jni_JSONUtils_fromJSONToStructs(JNIEnv* env, auto const scales = cudf::jni::native_jintArray(env, j_scales).to_vector(); auto const precisions = cudf::jni::native_jintArray(env, j_precisions).to_vector(); + CUDF_EXPECTS(col_names.size() > 0, "Invalid schema data."); + CUDF_EXPECTS(col_names.size() == num_children.size(), "Invalid schema data."); + CUDF_EXPECTS(col_names.size() == types.size(), "Invalid schema data."); + CUDF_EXPECTS(col_names.size() == scales.size(), "Invalid schema data."); + CUDF_EXPECTS(col_names.size() == precisions.size(), "Invalid schema data."); + return cudf::jni::ptr_as_jlong( spark_rapids_jni::from_json_to_structs(cudf::strings_column_view{*input}, col_names, From 
2dcdd11f1791486dc78d450f93ba5b9a9a0a381f Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Tue, 29 Oct 2024 14:11:04 -0700 Subject: [PATCH 25/58] Add more comments Signed-off-by: Nghia Truong --- src/main/cpp/src/from_json_to_structs.cu | 51 ++++++++++++++---------- 1 file changed, 31 insertions(+), 20 deletions(-) diff --git a/src/main/cpp/src/from_json_to_structs.cu b/src/main/cpp/src/from_json_to_structs.cu index 07c26e051..92b457bce 100644 --- a/src/main/cpp/src/from_json_to_structs.cu +++ b/src/main/cpp/src/from_json_to_structs.cu @@ -90,6 +90,8 @@ std::tuple, std::unique_ptr, c rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { + CUDF_FUNC_RANGE(); + auto const d_input_ptr = cudf::column_device_view::create(input.parent(), stream); auto const default_mr = rmm::mr::get_current_device_resource(); @@ -877,6 +879,8 @@ std::unique_ptr convert_column_type(std::unique_ptr& return nullptr; } +// Verify if the output column is matched with the input schema element. +// We do not check for type matching since we will perform conversion later on. void check_schema(cudf::io::column_name_info const& read_info, std::pair const& column_schema) { @@ -896,16 +900,18 @@ std::pair parse_schema_ std::vector const& scales, std::vector const& precisions) { + // Get data for the current column. 
auto const d_type = cudf::data_type{static_cast(types[index]), scales[index]}; auto const precision = precisions[index]; auto const col_num_children = num_children[index]; index++; + std::map children; std::vector> children_with_precisions; std::vector child_names(col_num_children); if (d_type.id() == cudf::type_id::STRUCT || d_type.id() == cudf::type_id::LIST) { - for (int i = 0; i < col_num_children; i++) { + for (int i = 0; i < col_num_children; ++i) { auto const& name = col_names[index]; auto [child, child_with_precision] = parse_schema_element(index, col_names, num_children, types, scales, precisions); @@ -913,13 +919,26 @@ std::pair parse_schema_ children_with_precisions.emplace_back(name, std::move(child_with_precision)); child_names[i] = name; } - } else if (col_num_children != 0) { - throw std::invalid_argument("Found children for a type that should have none."); + } else { + CUDF_EXPECTS(col_num_children == 0, + "Found children for a type that should have none.", + std::invalid_argument); } - return {cudf::io::schema_element{d_type, std::move(children), {std::move(child_names)}}, + + // Note that the first schema element always has type STRING, since we intentionally parse + // JSON into strings column for later post-processing. + return {cudf::io::schema_element{ + cudf::data_type{cudf::type_id::STRING}, std::move(children), {std::move(child_names)}}, schema_element_with_precision{d_type, precision, std::move(children_with_precisions)}}; } +// Travel the schema data by depth-first search order. +// Two separate schema is generated: +// - The first one is used as input to `cudf::read_json`, in which the data types of all columns +// are specified as STRING type. As such, the table returned by `cudf::read_json` will contain +// only strings columns. +// - The second schema is used for converting from STRING type to the desired types for the final +// output. 
std::pair generate_struct_schema( std::vector const& col_names, std::vector const& num_children, @@ -931,11 +950,11 @@ std::pair generate_stru std::vector> schema_cols_with_precisions; std::vector name_order; - std::size_t at = 0; - while (at < types.size()) { - auto const& name = col_names[at]; + std::size_t index = 0; + while (index < types.size()) { + auto const& name = col_names[index]; auto [child, child_with_precision] = - parse_schema_element(at, col_names, num_children, types, scales, precisions); + parse_schema_element(index, col_names, num_children, types, scales, precisions); schema_cols.emplace(name, std::move(child)); schema_cols_with_precisions.emplace_back(name, std::move(child_with_precision)); name_order.push_back(name); @@ -963,7 +982,8 @@ std::unique_ptr from_json_to_structs(cudf::strings_column_view con rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - auto const [is_invalid_or_empty, concat_input, delimiter] = concat_json(input, stream, mr); + auto const [is_invalid_or_empty, concat_input, delimiter] = + concat_json(input, stream, cudf::get_current_device_resource()); auto const [schema, schema_with_precision] = generate_struct_schema(col_names, num_children, types, scales, precisions); @@ -980,7 +1000,7 @@ std::unique_ptr from_json_to_structs(cudf::strings_column_view con .experimental(true) .normalize_single_quotes(normalize_single_quotes) .strict_validation(true) - // + // specifying parameters .delimiter(delimiter) .numeric_leading_zeros(allow_leading_zeros) .nonnumeric_numbers(allow_nonnumeric_numbers) @@ -993,7 +1013,7 @@ std::unique_ptr from_json_to_structs(cudf::strings_column_view con auto parsed_columns = parsed_table_with_meta.tbl->release(); CUDF_EXPECTS(parsed_columns.size() == schema.child_types.size(), - "Numbers of columns in the input table is different from schema size."); + "Numbers of output columns is different from schema size."); std::vector> converted_cols(parsed_columns.size()); for (std::size_t i = 
0; i < parsed_columns.size(); ++i) { @@ -1051,15 +1071,6 @@ std::unique_ptr from_json_to_structs(cudf::strings_column_view con mr); } -std::tuple, std::unique_ptr, char> concat_json( - cudf::strings_column_view const& input, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - CUDF_FUNC_RANGE(); - return detail::concat_json(input, stream, mr); -} - std::unique_ptr make_structs(std::vector const& children, cudf::column_view const& is_null, rmm::cuda_stream_view stream, From f3c391bdeee66fc819d3c3de3aee7cc733921bae Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Tue, 29 Oct 2024 14:22:39 -0700 Subject: [PATCH 26/58] Cleanup Signed-off-by: Nghia Truong --- src/main/cpp/src/from_json_to_structs.cu | 47 +++++++++++------------- 1 file changed, 21 insertions(+), 26 deletions(-) diff --git a/src/main/cpp/src/from_json_to_structs.cu b/src/main/cpp/src/from_json_to_structs.cu index 92b457bce..5e0c9db13 100644 --- a/src/main/cpp/src/from_json_to_structs.cu +++ b/src/main/cpp/src/from_json_to_structs.cu @@ -804,35 +804,32 @@ std::unique_ptr convert_column_type(std::unique_ptr& rmm::device_async_resource_ref mr) { if (schema.type.id() == cudf::type_id::BOOL8) { - CUDF_EXPECTS(input->type().id() == cudf::type_id::STRING, "Input column should be STRING."); return ::spark_rapids_jni::cast_strings_to_booleans(input->view(), stream, mr); } if (cudf::is_integral(schema.type)) { - CUDF_EXPECTS(input->type().id() == cudf::type_id::STRING, "Input column should be STRING."); return ::spark_rapids_jni::cast_strings_to_integers(input->view(), schema.type, stream, mr); } if (cudf::is_floating_point(schema.type)) { - CUDF_EXPECTS(input->type().id() == cudf::type_id::STRING, "Input column should be STRING."); return ::spark_rapids_jni::cast_strings_to_floats( input->view(), schema.type, allow_nonnumeric_numbers, stream, mr); } if (cudf::is_fixed_point(schema.type)) { - CUDF_EXPECTS(input->type().id() == cudf::type_id::STRING, "Input column should be STRING."); return 
::spark_rapids_jni::cast_strings_to_decimals( input->view(), schema.type, schema.precision, is_us_locale, stream, mr); } if (schema.type.id() == cudf::type_id::STRING) { - CUDF_EXPECTS(input->type().id() == cudf::type_id::STRING, "Input column should be STRING."); - return ::spark_rapids_jni::remove_quotes(input->view(), false, stream, mr); + return ::spark_rapids_jni::remove_quotes( + input->view(), /*nullify_if_not_quoted*/ false, stream, mr); } - auto const num_rows = input->size(); - auto const null_count = input->null_count(); + auto const num_rows = input->size(); + auto const null_count = input->null_count(); + auto const d_type = input->type().id(); + auto const num_children = input->num_children(); + auto input_content = input->release(); if (schema.type.id() == cudf::type_id::LIST) { - CUDF_EXPECTS(input->type().id() == cudf::type_id::LIST, "Input column should be LIST."); - - auto input_content = input->release(); + CUDF_EXPECTS(d_type == cudf::type_id::LIST, "Input column should be LIST."); auto new_child = convert_column_type(input_content.children[cudf::lists_column_view::child_column_index], schema.child_types.front().second, @@ -840,7 +837,6 @@ std::unique_ptr convert_column_type(std::unique_ptr& is_us_locale, stream, mr); - return cudf::make_lists_column( num_rows, std::move(input_content.children[cudf::lists_column_view::offsets_column_index]), @@ -852,21 +848,16 @@ std::unique_ptr convert_column_type(std::unique_ptr& } if (schema.type.id() == cudf::type_id::STRUCT) { - CUDF_EXPECTS(input->type().id() == cudf::type_id::STRUCT, "Input column should be STRUCT."); - - auto const num_children = input->num_children(); - std::vector> new_children; - new_children.reserve(num_children); - auto input_content = input->release(); - for (cudf::size_type i = 0; i < input->num_children(); ++i) { - new_children.emplace_back(convert_column_type(input_content.children[i], - schema.child_types[i].second, - allow_nonnumeric_numbers, - is_us_locale, - stream, - mr)); 
+ CUDF_EXPECTS(d_type == cudf::type_id::STRUCT, "Input column should be STRUCT."); + std::vector> new_children(num_children); + for (cudf::size_type i = 0; i < num_children; ++i) { + new_children[i] = convert_column_type(input_content.children[i], + schema.child_types[i].second, + allow_nonnumeric_numbers, + is_us_locale, + stream, + mr); } - return cudf::make_structs_column(num_rows, std::move(new_children), null_count, @@ -1017,6 +1008,10 @@ std::unique_ptr from_json_to_structs(cudf::strings_column_view con std::vector> converted_cols(parsed_columns.size()); for (std::size_t i = 0; i < parsed_columns.size(); ++i) { + auto const d_type = parsed_columns[i]->type().id(); + CUDF_EXPECTS(d_type == cudf::type_id::LIST || d_type == cudf::type_id::STRUCT || + d_type == cudf::type_id::STRING, + "Input column should be STRING or nested."); check_schema(parsed_meta.schema_info[i], schema_with_precision.child_types[i]); converted_cols[i] = convert_column_type(parsed_columns[i], schema_with_precision.child_types[i].second, From 52c42a621618339036eff5972443bbafea9e4013 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Tue, 29 Oct 2024 14:26:58 -0700 Subject: [PATCH 27/58] Return as-is if the column is date/time Signed-off-by: Nghia Truong --- src/main/cpp/src/from_json_to_structs.cu | 39 +++++++++--------------- 1 file changed, 15 insertions(+), 24 deletions(-) diff --git a/src/main/cpp/src/from_json_to_structs.cu b/src/main/cpp/src/from_json_to_structs.cu index 5e0c9db13..1cd2c3e06 100644 --- a/src/main/cpp/src/from_json_to_structs.cu +++ b/src/main/cpp/src/from_json_to_structs.cu @@ -803,6 +803,9 @@ std::unique_ptr convert_column_type(std::unique_ptr& rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { + // Date/time is not processed for now, but it should be handled later on in spark-rapids. 
+ if (cudf::is_chrono(schema.type)) { return std::move(input); } + if (schema.type.id() == cudf::type_id::BOOL8) { return ::spark_rapids_jni::cast_strings_to_booleans(input->view(), stream, mr); } @@ -870,19 +873,6 @@ std::unique_ptr convert_column_type(std::unique_ptr& return nullptr; } -// Verify if the output column is matched with the input schema element. -// We do not check for type matching since we will perform conversion later on. -void check_schema(cudf::io::column_name_info const& read_info, - std::pair const& column_schema) -{ - CUDF_EXPECTS(read_info.name == column_schema.first, "Mismatched column name."); - CUDF_EXPECTS(read_info.children.size() == column_schema.second.child_types.size(), - "Mismatched number of children."); - for (std::size_t i = 0; i < read_info.children.size(); ++i) { - check_schema(read_info.children[i], column_schema.second.child_types[i]); - } -} - std::pair parse_schema_element( std::size_t& index, std::vector const& col_names, @@ -916,10 +906,13 @@ std::pair parse_schema_ std::invalid_argument); } - // Note that the first schema element always has type STRING, since we intentionally parse - // JSON into strings column for later post-processing. - return {cudf::io::schema_element{ - cudf::data_type{cudf::type_id::STRING}, std::move(children), {std::move(child_names)}}, + // Note that if the first schema element does not has type STRUCT/LIST then it always has type + // STRING, since we intentionally parse JSON into strings column for later post-processing. + auto const schema_dtype = + d_type.id() == cudf::type_id::STRUCT || d_type.id() == cudf::type_id::LIST + ? 
d_type + : cudf::data_type{cudf::type_id::STRING}; + return {cudf::io::schema_element{schema_dtype, std::move(children), {std::move(child_names)}}, schema_element_with_precision{d_type, precision, std::move(children_with_precisions)}}; } @@ -1012,13 +1005,11 @@ std::unique_ptr from_json_to_structs(cudf::strings_column_view con CUDF_EXPECTS(d_type == cudf::type_id::LIST || d_type == cudf::type_id::STRUCT || d_type == cudf::type_id::STRING, "Input column should be STRING or nested."); - check_schema(parsed_meta.schema_info[i], schema_with_precision.child_types[i]); - converted_cols[i] = convert_column_type(parsed_columns[i], - schema_with_precision.child_types[i].second, - allow_nonnumeric_numbers, - is_us_locale, - stream, - mr); + + auto const& [col_name, col_schema] = schema_with_precision.child_types[i]; + CUDF_EXPECTS(parsed_meta.schema_info[i].name == col_name, "Mismatched column name."); + converted_cols[i] = convert_column_type( + parsed_columns[i], col_schema, allow_nonnumeric_numbers, is_us_locale, stream, mr); } auto const valid_it = is_invalid_or_empty->view().begin(); From 19c64be8576301129cfde4b708de40ed96860481 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Tue, 29 Oct 2024 22:41:10 -0700 Subject: [PATCH 28/58] Update test Signed-off-by: Nghia Truong --- src/main/cpp/tests/cast_float_to_string.cpp | 87 ++++++++++++++++++++- 1 file changed, 84 insertions(+), 3 deletions(-) diff --git a/src/main/cpp/tests/cast_float_to_string.cpp b/src/main/cpp/tests/cast_float_to_string.cpp index a9c8a332f..1daddcb97 100644 --- a/src/main/cpp/tests/cast_float_to_string.cpp +++ b/src/main/cpp/tests/cast_float_to_string.cpp @@ -18,6 +18,8 @@ #include #include +#include + #include #include @@ -28,8 +30,87 @@ struct FloatToStringTests : public cudf::test::BaseFixture {}; TEST_F(FloatToStringTests, FromFloats32) { - auto const input = cudf::test::strings_column_wrapper{R"("26/08/2015")"}; - auto out = spark_rapids_jni::remove_quotes(input, true); + std::string json_string 
= R"({"student": [{"name": "abc", "class": "junior"}]})"; + + { + cudf::io::json_reader_options in_options = + cudf::io::json_reader_options::builder( + cudf::io::source_info{json_string.data(), json_string.size()}) + .prune_columns(true) + .mixed_types_as_string(true) + .lines(true); + + cudf::io::schema_element dtype_schema{cudf::data_type{cudf::type_id::STRUCT}, + { + {"student", + {data_type{cudf::type_id::LIST}, + {{"element", + {data_type{cudf::type_id::STRUCT}, + { + {"name", {data_type{cudf::type_id::STRING}}}, + {"abc", {data_type{cudf::type_id::STRING}}}, + {"class", {data_type{cudf::type_id::STRING}}}, + }, + {{"name", "abc", "class"}}}}}}}, + }, + {{"student"}}}; + in_options.set_dtypes(dtype_schema); + + auto const parsed_table_with_meta = cudf::io::read_json(in_options); + // auto const& parsed_meta = parsed_table_with_meta.metadata; + auto parsed_columns = parsed_table_with_meta.tbl->release(); + for (auto& col : parsed_columns) { + cudf::test::print(*col); + } + } + + { + /* + * colname: +student, +element, +name, +abc, +class, +num child: +1, +3, +0, +0, +0, +num child: +1, +3, +0, +0, +0, +types: +24, +28, +23, +23, +23, + + */ + + std::vector col_names{"student", "element", "name", "abc", "class"}; + std::vector num_children{1, 3, 0, 0, 0}; + std::vector types{24, 28, 23, 23, 23}; + std::vector scales{0, 0, 0, 0, 0}; + std::vector precisions{-1, -1, -1, -1, -1}; - // cudf::test::print(out->view()); + auto const input = cudf::test::strings_column_wrapper{json_string}; + auto out = spark_rapids_jni::from_json_to_structs(cudf::strings_column_view{input}, + col_names, + num_children, + types, + scales, + precisions, + true, + true, + true, + true, + true); + cudf::test::print(*out); + } } From 5d07db1d7c23344a993cad6b143a9b8952b23cfa Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Wed, 30 Oct 2024 10:03:43 -0700 Subject: [PATCH 29/58] Update cudf Signed-off-by: Nghia Truong --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) 
diff --git a/thirdparty/cudf b/thirdparty/cudf index 6328ad679..f146ac075 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 6328ad679947eb5cbc352c345a28f079aa6b8005 +Subproject commit f146ac075b10c9a1685491d2761012e87b5c0d28 From 39e3a9bb9e73c272999bed57e4007fb49bccd5b4 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Wed, 30 Oct 2024 10:04:12 -0700 Subject: [PATCH 30/58] Revert "Update cudf" This reverts commit 5d07db1d7c23344a993cad6b143a9b8952b23cfa. --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index f146ac075..6328ad679 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit f146ac075b10c9a1685491d2761012e87b5c0d28 +Subproject commit 6328ad679947eb5cbc352c345a28f079aa6b8005 From df1428ddcc4ab7eee92ce3567be320718024eb27 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Wed, 30 Oct 2024 13:44:45 -0700 Subject: [PATCH 31/58] Update cudf --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 6c2eb4ef0..84cd5331b 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 6c2eb4ef03c56413000b3d28574868b68c86181f +Subproject commit 84cd5331b490e519674a7151190eac09e8971a0e From 1d489062e6ddf2edf2cba98a30b025b7adba9973 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 8 Nov 2024 15:30:48 -0800 Subject: [PATCH 32/58] Update cudf --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index adf32694e..b1f38c366 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit adf32694e7b4eb9f91e928bf6dbf0818b97bcf35 +Subproject commit b1f38c366843e8208ce45904e7fda354576b549b From d9e1db587f3c1127dfbc0c1b2e7f7b107ef96f34 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 8 Nov 2024 20:11:42 -0800 Subject: [PATCH 33/58] Change header --- 
src/main/cpp/src/json_utils.hpp | 75 +++++++++++---------------------- 1 file changed, 24 insertions(+), 51 deletions(-) diff --git a/src/main/cpp/src/json_utils.hpp b/src/main/cpp/src/json_utils.hpp index 8ed5d77b9..7a163d311 100644 --- a/src/main/cpp/src/json_utils.hpp +++ b/src/main/cpp/src/json_utils.hpp @@ -16,24 +16,30 @@ #pragma once -#include #include #include #include #include #include -#include #include namespace spark_rapids_jni { +/** + * @brief Extract a map column from the JSON strings given by an input strings column. + */ std::unique_ptr from_json_to_raw_map( cudf::strings_column_view const& input, rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource()); +/** + * @brief Parse JSON strings into a struct column followed by a given data schema. + * + * The data schema is specified as data arrays flattened by depth-first-search order. + */ std::unique_ptr from_json_to_structs( cudf::strings_column_view const& input, std::vector const& col_names, @@ -49,58 +55,25 @@ std::unique_ptr from_json_to_structs( rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource()); -// -// -// -// -// -// -// -std::unique_ptr cast_strings_to_booleans( - cudf::column_view const& input, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); - -std::unique_ptr cast_strings_to_decimals( - cudf::column_view const& input, - cudf::data_type output_type, - int precision, - bool is_us_locale, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); - -std::unique_ptr cast_strings_to_integers( - cudf::column_view const& input, - cudf::data_type output_type, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::device_async_resource_ref mr = 
rmm::mr::get_current_device_resource()); - -std::unique_ptr cast_strings_to_dates( - cudf::column_view const& input, - std::string const& date_regex, - std::string const& date_format, - bool error_if_invalid, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); - -std::unique_ptr remove_quotes( - cudf::column_view const& input, - bool nullify_if_not_quoted, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); - -std::unique_ptr cast_strings_to_floats( +/** + * @brief Convert the input column into a desired type given by a schema. + * + * The input column can be a nested column thus the given schema is specified as data arrays + * flattened by depth-first-search order. + */ +std::unique_ptr convert_data_type( cudf::column_view const& input, - cudf::data_type output_type, + std::vector const& num_children, + std::vector const& types, + std::vector const& scales, + std::vector const& precisions, + bool normalize_single_quotes, + bool allow_leading_zeros, bool allow_nonnumeric_numbers, + bool allow_unquoted_control, + bool is_us_locale, rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); - -std::unique_ptr make_structs( - std::vector const& input, - cudf::column_view const& is_null, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); + rmm::device_async_resource_ref mr = cudf::get_current_device_resource()); /** * @brief Concatenate the JSON objects given by a strings column into one single character buffer, From 0f053a64d77250f988d3464ffaca641fa448b984 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 8 Nov 2024 20:21:54 -0800 Subject: [PATCH 34/58] Rewrite JSONUtils.cpp --- src/main/cpp/src/JSONUtilsJni.cpp | 145 
+++++++++++------------------- 1 file changed, 53 insertions(+), 92 deletions(-) diff --git a/src/main/cpp/src/JSONUtilsJni.cpp b/src/main/cpp/src/JSONUtilsJni.cpp index 5027e7c39..ea56aaad7 100644 --- a/src/main/cpp/src/JSONUtilsJni.cpp +++ b/src/main/cpp/src/JSONUtilsJni.cpp @@ -156,93 +156,6 @@ JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_JSONUtils_extractRawMap CATCH_STD(env, 0); } -JNIEXPORT jlong JNICALL -Java_com_nvidia_spark_rapids_jni_JSONUtils_castStringsToBooleans(JNIEnv* env, jclass, jlong j_input) -{ - JNI_NULL_CHECK(env, j_input, "j_input is null", 0); - - try { - cudf::jni::auto_set_device(env); - auto const input = *reinterpret_cast(j_input); - return cudf::jni::ptr_as_jlong(spark_rapids_jni::cast_strings_to_booleans(input).release()); - } - CATCH_STD(env, 0); -} - -JNIEXPORT jlong JNICALL -Java_com_nvidia_spark_rapids_jni_JSONUtils_castStringsToDecimals(JNIEnv* env, - jclass, - jlong j_input, - jint j_output_type_id, - jint precision, - jint scale, - jboolean is_us_locale) -{ - JNI_NULL_CHECK(env, j_input, "j_input is null", 0); - - try { - cudf::jni::auto_set_device(env); - auto const input = *reinterpret_cast(j_input); - - return cudf::jni::ptr_as_jlong( - spark_rapids_jni::cast_strings_to_decimals( - input, - cudf::data_type{static_cast(j_output_type_id), scale}, - precision, - is_us_locale) - .release()); - } - CATCH_STD(env, 0); -} - -JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_JSONUtils_castStringsToIntegers( - JNIEnv* env, jclass, jlong j_input, jint output_type_id) -{ - JNI_NULL_CHECK(env, j_input, "j_input is null", 0); - - try { - cudf::jni::auto_set_device(env); - auto const input = *reinterpret_cast(j_input); - - return cudf::jni::ptr_as_jlong( - spark_rapids_jni::cast_strings_to_integers( - input, cudf::data_type{static_cast(output_type_id)}) - .release()); - } - CATCH_STD(env, 0); -} - -JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_JSONUtils_removeQuotes( - JNIEnv* env, jclass, jlong j_input, 
jboolean nullify_if_not_quoted) -{ - JNI_NULL_CHECK(env, j_input, "j_input is null", 0); - - try { - cudf::jni::auto_set_device(env); - auto const input = *reinterpret_cast(j_input); - return cudf::jni::ptr_as_jlong( - spark_rapids_jni::remove_quotes(input, nullify_if_not_quoted).release()); - } - CATCH_STD(env, 0); -} - -JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_JSONUtils_castStringsToFloats( - JNIEnv* env, jclass, jlong j_input, jint j_output_type_id, jboolean allow_nonnumeric_numbers) -{ - JNI_NULL_CHECK(env, j_input, "j_input is null", 0); - - try { - cudf::jni::auto_set_device(env); - auto const input = *reinterpret_cast(j_input); - return cudf::jni::ptr_as_jlong(spark_rapids_jni::cast_strings_to_floats( - input, - cudf::data_type{static_cast(j_output_type_id)}, - allow_nonnumeric_numbers) - .release()); - } - CATCH_STD(env, 0); -} - JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_JSONUtils_fromJSONToStructs(JNIEnv* env, jclass, @@ -275,11 +188,11 @@ Java_com_nvidia_spark_rapids_jni_JSONUtils_fromJSONToStructs(JNIEnv* env, auto const scales = cudf::jni::native_jintArray(env, j_scales).to_vector(); auto const precisions = cudf::jni::native_jintArray(env, j_precisions).to_vector(); - CUDF_EXPECTS(col_names.size() > 0, "Invalid schema data."); - CUDF_EXPECTS(col_names.size() == num_children.size(), "Invalid schema data."); - CUDF_EXPECTS(col_names.size() == types.size(), "Invalid schema data."); - CUDF_EXPECTS(col_names.size() == scales.size(), "Invalid schema data."); - CUDF_EXPECTS(col_names.size() == precisions.size(), "Invalid schema data."); + CUDF_EXPECTS(col_names.size() > 0, "Invalid schema data: col_names."); + CUDF_EXPECTS(col_names.size() == num_children.size(), "Invalid schema data: num_children."); + CUDF_EXPECTS(col_names.size() == types.size(), "Invalid schema data: types."); + CUDF_EXPECTS(col_names.size() == scales.size(), "Invalid schema data: scales."); + CUDF_EXPECTS(col_names.size() == precisions.size(), "Invalid 
schema data: precisions."); return cudf::jni::ptr_as_jlong( spark_rapids_jni::from_json_to_structs(cudf::strings_column_view{*input}, @@ -298,4 +211,52 @@ Java_com_nvidia_spark_rapids_jni_JSONUtils_fromJSONToStructs(JNIEnv* env, CATCH_STD(env, 0); } +JNIEXPORT jlong JNICALL +Java_com_nvidia_spark_rapids_jni_JSONUtils_convertDataType(JNIEnv* env, + jclass, + jlong j_input, + jintArray j_num_children, + jintArray j_types, + jintArray j_scales, + jintArray j_precisions, + jboolean normalize_single_quotes, + jboolean allow_leading_zeros, + jboolean allow_nonnumeric_numbers, + jboolean allow_unquoted_control, + jboolean is_us_locale) +{ + JNI_NULL_CHECK(env, j_input, "j_input is null", 0); + JNI_NULL_CHECK(env, j_num_children, "j_num_children is null", 0); + JNI_NULL_CHECK(env, j_types, "j_types is null", 0); + JNI_NULL_CHECK(env, j_scales, "j_scales is null", 0); + JNI_NULL_CHECK(env, j_precisions, "j_precisions is null", 0); + + try { + cudf::jni::auto_set_device(env); + + auto const input = reinterpret_cast(j_input); + auto const num_children = cudf::jni::native_jintArray(env, j_num_children).to_vector(); + auto const types = cudf::jni::native_jintArray(env, j_types).to_vector(); + auto const scales = cudf::jni::native_jintArray(env, j_scales).to_vector(); + auto const precisions = cudf::jni::native_jintArray(env, j_precisions).to_vector(); + + CUDF_EXPECTS(num_children.size() > 0, "Invalid schema data: num_children."); + CUDF_EXPECTS(num_children.size() == types.size(), "Invalid schema data: types."); + CUDF_EXPECTS(num_children.size() == scales.size(), "Invalid schema data: scales."); + CUDF_EXPECTS(num_children.size() == precisions.size(), "Invalid schema data: precisions."); + + return cudf::jni::ptr_as_jlong(spark_rapids_jni::convert_data_type(*input, + num_children, + types, + scales, + precisions, + normalize_single_quotes, + allow_leading_zeros, + allow_nonnumeric_numbers, + allow_unquoted_control, + is_us_locale) + .release()); + } + CATCH_STD(env, 0); +} } 
// extern "C" From 8912e0063fb5ae906e43a95d369887c3bff953a4 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Tue, 12 Nov 2024 07:13:52 -0800 Subject: [PATCH 35/58] Implement a common function for converting column Signed-off-by: Nghia Truong --- src/main/cpp/CMakeLists.txt | 5 +- src/main/cpp/src/from_json_to_structs.cu | 432 +++++++---------------- src/main/cpp/src/json_utils.cu | 230 ++++++++++++ 3 files changed, 369 insertions(+), 298 deletions(-) create mode 100644 src/main/cpp/src/json_utils.cu diff --git a/src/main/cpp/CMakeLists.txt b/src/main/cpp/CMakeLists.txt index 14882ab43..ff32cfa0b 100644 --- a/src/main/cpp/CMakeLists.txt +++ b/src/main/cpp/CMakeLists.txt @@ -206,16 +206,17 @@ add_library( src/bloom_filter.cu src/case_when.cu src/cast_decimal_to_string.cu - src/format_float.cu - src/from_json_to_structs.cu src/cast_float_to_string.cu src/cast_string.cu src/cast_string_to_float.cu src/datetime_rebase.cu src/decimal_utils.cu + src/format_float.cu src/from_json_to_raw_map.cu + src/from_json_to_structs.cu src/get_json_object.cu src/histogram.cu + src/json_utils.cu src/murmur_hash.cu src/parse_uri.cu src/regex_rewrite_utils.cu diff --git a/src/main/cpp/src/from_json_to_structs.cu b/src/main/cpp/src/from_json_to_structs.cu index 1cd2c3e06..65f800b2f 100644 --- a/src/main/cpp/src/from_json_to_structs.cu +++ b/src/main/cpp/src/from_json_to_structs.cu @@ -58,222 +58,6 @@ namespace detail { namespace { -constexpr bool not_whitespace(cudf::char_utf8 ch) -{ - return ch != ' ' && ch != '\r' && ch != '\n' && ch != '\t'; -} - -constexpr bool can_be_delimiter(char c) -{ - // The character list below is from `json_reader_options.set_delimiter`. 
- switch (c) { - case '{': - case '[': - case '}': - case ']': - case ',': - case ':': - case '"': - case '\'': - case '\\': - case ' ': - case '\t': - case '\r': return false; - default: return true; - } -} - -} // namespace - -std::tuple, std::unique_ptr, char> concat_json( - cudf::strings_column_view const& input, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - CUDF_FUNC_RANGE(); - - auto const d_input_ptr = cudf::column_device_view::create(input.parent(), stream); - auto const default_mr = rmm::mr::get_current_device_resource(); - - // Check if the input rows are either null, equal to `null` string literal, or empty. - // This will be used for masking out the input when doing string concatenation. - rmm::device_uvector is_valid_input(input.size(), stream, default_mr); - - // Check if the input rows are either null or empty. - // This will be returned to the caller. - rmm::device_uvector is_invalid_or_empty(input.size(), stream, mr); - - thrust::for_each( - rmm::exec_policy_nosync(stream), - thrust::make_counting_iterator(0L), - thrust::make_counting_iterator(input.size() * static_cast(cudf::detail::warp_size)), - [input = *d_input_ptr, - output = thrust::make_zip_iterator(thrust::make_tuple( - is_valid_input.begin(), is_invalid_or_empty.begin()))] __device__(int64_t tidx) { - // Execute one warp per row to minimize thread divergence. - if ((tidx % cudf::detail::warp_size) != 0) { return; } - auto const idx = tidx / cudf::detail::warp_size; - - if (input.is_null(idx)) { - output[idx] = thrust::make_tuple(false, true); - return; - } - - auto const d_str = input.element(idx); - auto const size = d_str.size_bytes(); - int i = 0; - char ch; - - // Skip the very first whitespace characters. 
- for (; i < size; ++i) { - ch = d_str[i]; - if (not_whitespace(ch)) { break; } - } - - if (i + 3 < size && - (d_str[i] == 'n' && d_str[i + 1] == 'u' && d_str[i + 2] == 'l' && d_str[i + 3] == 'l')) { - i += 4; - - // Skip the very last whitespace characters. - bool is_null_literal{true}; - for (; i < size; ++i) { - ch = d_str[i]; - if (not_whitespace(ch)) { - is_null_literal = false; - break; - } - } - - // The current row contains only `null` string literal and not any other non-whitespace - // characters. Such rows need to be masked out as null when doing concatenation. - if (is_null_literal) { - output[idx] = thrust::make_tuple(false, false); - return; - } - } - - auto const not_eol = i < size; - - // If the current row is not null or empty, it should start with `{`. Otherwise, we need to - // replace it by a null. This is necessary for libcudf's JSON reader to work. - // Note that if we want to support ARRAY schema, we need to check for `[` instead. - auto constexpr start_character = '{'; - if (not_eol && ch != start_character) { - output[idx] = thrust::make_tuple(false, false); - return; - } - - output[idx] = thrust::make_tuple(not_eol, !not_eol); - }); - - auto constexpr num_levels = 256; - auto constexpr lower_level = std::numeric_limits::min(); - auto constexpr upper_level = std::numeric_limits::max(); - auto const num_chars = input.chars_size(stream); - - rmm::device_uvector histogram(num_levels, stream, default_mr); - thrust::uninitialized_fill( - rmm::exec_policy_nosync(stream), histogram.begin(), histogram.end(), 0); - - size_t temp_storage_bytes = 0; - cub::DeviceHistogram::HistogramEven(nullptr, - temp_storage_bytes, - input.chars_begin(stream), - histogram.begin(), - num_levels, - lower_level, - upper_level, - num_chars, - stream.value()); - rmm::device_buffer d_temp(temp_storage_bytes, stream); - cub::DeviceHistogram::HistogramEven(d_temp.data(), - temp_storage_bytes, - input.chars_begin(stream), - histogram.begin(), - num_levels, - lower_level, - 
upper_level, - num_chars, - stream.value()); - - auto const it = thrust::make_counting_iterator(0); - auto const zero_level_idx = -lower_level; // the bin storing count for character `\0` - auto const zero_level_it = it + zero_level_idx; - auto const end = it + num_levels; - - auto const first_zero_count_pos = - thrust::find_if(rmm::exec_policy_nosync(stream), - zero_level_it, // ignore the negative characters - end, - [zero_level_idx, counts = histogram.begin()] __device__(auto idx) -> bool { - auto const count = counts[idx]; - if (count > 0) { return false; } - auto const first_non_existing_char = static_cast(idx - zero_level_idx); - return can_be_delimiter(first_non_existing_char); - }); - - // This should never happen since the input should never cover the entire char range. - if (first_zero_count_pos == end) { - throw std::logic_error( - "Cannot find any character suitable as delimiter during joining json strings."); - } - auto const delimiter = static_cast(thrust::distance(zero_level_it, first_zero_count_pos)); - - auto [null_mask, null_count] = cudf::detail::valid_if( - is_valid_input.begin(), is_valid_input.end(), thrust::identity{}, stream, default_mr); - // If the null count doesn't change, that mean we do not have any rows containing `null` string - // literal or empty rows. In such cases, just use the input column for concatenation. - auto const input_applied_null = - null_count == input.null_count() - ? cudf::column_view{} - : cudf::column_view{cudf::data_type{cudf::type_id::STRING}, - input.size(), - input.chars_begin(stream), - reinterpret_cast(null_mask.data()), - null_count, - 0, - std::vector{input.offsets()}}; - - auto concat_strings = cudf::strings::detail::join_strings( - null_count == input.null_count() ? 
input : cudf::strings_column_view{input_applied_null}, - cudf::string_scalar(std::string(1, delimiter), true, stream, default_mr), - cudf::string_scalar("{}", true, stream, default_mr), - stream, - mr); - - return {std::make_unique(std::move(is_invalid_or_empty), rmm::device_buffer{}, 0), - std::move(concat_strings->release().data), - delimiter}; -} - -std::unique_ptr make_structs(std::vector const& children, - cudf::column_view const& is_null, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - if (children.size() == 0) { return nullptr; } - - auto const row_count = children.front().size(); - for (auto const& col : children) { - CUDF_EXPECTS(col.size() == row_count, "All columns must have the same number of rows."); - } - - auto const [null_mask, null_count] = cudf::detail::valid_if( - is_null.begin(), is_null.end(), thrust::logical_not{}, stream, mr); - - auto const structs = - cudf::column_view(cudf::data_type{cudf::type_id::STRUCT}, - row_count, - nullptr, - reinterpret_cast(null_mask.data()), - null_count, - 0, - children); - return std::make_unique(structs, stream, mr); -} - -namespace { - using string_index_pair = thrust::pair; std::pair, rmm::device_uvector> cast_strings_to_booleans( @@ -796,83 +580,6 @@ struct schema_element_with_precision { std::vector> child_types; }; -std::unique_ptr convert_column_type(std::unique_ptr& input, - schema_element_with_precision const& schema, - bool allow_nonnumeric_numbers, - bool is_us_locale, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - // Date/time is not processed for now, but it should be handled later on in spark-rapids. 
- if (cudf::is_chrono(schema.type)) { return std::move(input); } - - if (schema.type.id() == cudf::type_id::BOOL8) { - return ::spark_rapids_jni::cast_strings_to_booleans(input->view(), stream, mr); - } - if (cudf::is_integral(schema.type)) { - return ::spark_rapids_jni::cast_strings_to_integers(input->view(), schema.type, stream, mr); - } - if (cudf::is_floating_point(schema.type)) { - return ::spark_rapids_jni::cast_strings_to_floats( - input->view(), schema.type, allow_nonnumeric_numbers, stream, mr); - } - if (cudf::is_fixed_point(schema.type)) { - return ::spark_rapids_jni::cast_strings_to_decimals( - input->view(), schema.type, schema.precision, is_us_locale, stream, mr); - } - if (schema.type.id() == cudf::type_id::STRING) { - return ::spark_rapids_jni::remove_quotes( - input->view(), /*nullify_if_not_quoted*/ false, stream, mr); - } - - auto const num_rows = input->size(); - auto const null_count = input->null_count(); - auto const d_type = input->type().id(); - auto const num_children = input->num_children(); - auto input_content = input->release(); - - if (schema.type.id() == cudf::type_id::LIST) { - CUDF_EXPECTS(d_type == cudf::type_id::LIST, "Input column should be LIST."); - auto new_child = - convert_column_type(input_content.children[cudf::lists_column_view::child_column_index], - schema.child_types.front().second, - allow_nonnumeric_numbers, - is_us_locale, - stream, - mr); - return cudf::make_lists_column( - num_rows, - std::move(input_content.children[cudf::lists_column_view::offsets_column_index]), - std::move(new_child), - null_count, - std::move(*input_content.null_mask), - stream, - mr); - } - - if (schema.type.id() == cudf::type_id::STRUCT) { - CUDF_EXPECTS(d_type == cudf::type_id::STRUCT, "Input column should be STRUCT."); - std::vector> new_children(num_children); - for (cudf::size_type i = 0; i < num_children; ++i) { - new_children[i] = convert_column_type(input_content.children[i], - schema.child_types[i].second, - 
allow_nonnumeric_numbers, - is_us_locale, - stream, - mr); - } - return cudf::make_structs_column(num_rows, - std::move(new_children), - null_count, - std::move(*input_content.null_mask), - stream, - mr); - } - - CUDF_FAIL("Unexpected column type for conversion."); - return nullptr; -} - std::pair parse_schema_element( std::size_t& index, std::vector const& col_names, @@ -1025,6 +732,114 @@ std::unique_ptr from_json_to_structs(cudf::strings_column_view con mr); } +template +std::unique_ptr convert_data_type(InputType&& input, + schema_element_with_precision const& schema, + bool allow_nonnumeric_numbers, + bool is_us_locale, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + using DecayInputT = std::decay_t; + auto constexpr input_is_const_cv = std::is_same_v; + auto constexpr input_is_column_ptr = std::is_same_v>; + static_assert(input_is_const_cv ^ input_is_column_ptr); + + auto const d_type = [&] { + if constexpr (input_is_column_ptr) { + return input->type().id(); + } else { + return input.type().id(); + } + }; + + // if (d_type == cudf::type_id::LIST || d_type == cudf::type_id::STRUCT || + // d_type == cudf::type_id::STRING) { + // // + // } + // std::unique_ptr convert_column_type(std::unique_ptr& input, + // schema_element_with_precision const& schema, + // bool allow_nonnumeric_numbers, + // bool is_us_locale, + // rmm::cuda_stream_view stream, + // rmm::device_async_resource_ref mr) + + if (cudf::is_chrono(schema.type)) { + if constexpr (input_is_column_ptr) { + // Date/time is not processed for now, but it should be handled later on in spark-rapids. 
+ return std::move(input); + } else { + CUDF_FAIL("Cannot convert data type to a chrono (date/time) type."); + } + } + + if (schema.type.id() == cudf::type_id::BOOL8) { + return ::spark_rapids_jni::cast_strings_to_booleans(input->view(), stream, mr); + } + if (cudf::is_integral(schema.type)) { + return ::spark_rapids_jni::cast_strings_to_integers(input->view(), schema.type, stream, mr); + } + if (cudf::is_floating_point(schema.type)) { + return ::spark_rapids_jni::cast_strings_to_floats( + input->view(), schema.type, allow_nonnumeric_numbers, stream, mr); + } + if (cudf::is_fixed_point(schema.type)) { + return ::spark_rapids_jni::cast_strings_to_decimals( + input->view(), schema.type, schema.precision, is_us_locale, stream, mr); + } + if (schema.type.id() == cudf::type_id::STRING) { + return ::spark_rapids_jni::remove_quotes( + input->view(), /*nullify_if_not_quoted*/ false, stream, mr); + } + + auto const num_rows = input->size(); + auto const null_count = input->null_count(); + auto const d_type = input->type().id(); + auto const num_children = input->num_children(); + auto input_content = input->release(); + + if (schema.type.id() == cudf::type_id::LIST) { + CUDF_EXPECTS(d_type == cudf::type_id::LIST, "Input column should be LIST."); + auto new_child = + convert_column_type(input_content.children[cudf::lists_column_view::child_column_index], + schema.child_types.front().second, + allow_nonnumeric_numbers, + is_us_locale, + stream, + mr); + return cudf::make_lists_column( + num_rows, + std::move(input_content.children[cudf::lists_column_view::offsets_column_index]), + std::move(new_child), + null_count, + std::move(*input_content.null_mask), + stream, + mr); + } + + if (schema.type.id() == cudf::type_id::STRUCT) { + CUDF_EXPECTS(d_type == cudf::type_id::STRUCT, "Input column should be STRUCT."); + std::vector> new_children(num_children); + for (cudf::size_type i = 0; i < num_children; ++i) { + new_children[i] = convert_column_type(input_content.children[i], + 
schema.child_types[i].second, + allow_nonnumeric_numbers, + is_us_locale, + stream, + mr); + } + return cudf::make_structs_column(num_rows, + std::move(new_children), + null_count, + std::move(*input_content.null_mask), + stream, + mr); + } + + CUDF_FAIL("Unexpected column type for conversion."); + return nullptr; +} + } // namespace detail std::unique_ptr from_json_to_structs(cudf::strings_column_view const& input, @@ -1117,7 +932,6 @@ std::unique_ptr cast_strings_to_decimals(cudf::column_view const& rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::cast_strings_to_decimals(input, output_type, precision, is_us_locale, stream, mr); } @@ -1127,7 +941,6 @@ std::unique_ptr remove_quotes(cudf::column_view const& input, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - auto [output, validity] = detail::remove_quotes(input, nullify_if_not_quoted, stream, mr); if (validity.size() > 0) { auto [null_mask, null_count] = @@ -1144,7 +957,6 @@ std::unique_ptr cast_strings_to_floats(cudf::column_view const& in rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - if (allow_nonnumeric_numbers) { auto [removed_quotes, validity] = detail::remove_quotes_for_floats(input, stream, mr); return string_to_float( @@ -1153,4 +965,32 @@ std::unique_ptr cast_strings_to_floats(cudf::column_view const& in return string_to_float(output_type, cudf::strings_column_view{input}, false, stream, mr); } +std::unique_ptr convert_data_type(cudf::column_view const& input, + std::vector const& num_children, + std::vector const& types, + std::vector const& scales, + std::vector const& precisions, + bool normalize_single_quotes, + bool allow_leading_zeros, + bool allow_nonnumeric_numbers, + bool allow_unquoted_control, + bool is_us_locale, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_FUNC_RANGE(); + return detail::convert_data_type(input, + num_children, + types, + scales, + precisions, + normalize_single_quotes, + allow_leading_zeros, + 
allow_nonnumeric_numbers, + allow_unquoted_control, + is_us_locale, + stream, + mr); +} + } // namespace spark_rapids_jni diff --git a/src/main/cpp/src/json_utils.cu b/src/main/cpp/src/json_utils.cu new file mode 100644 index 000000000..3305bec9a --- /dev/null +++ b/src/main/cpp/src/json_utils.cu @@ -0,0 +1,230 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace spark_rapids_jni { + +namespace detail { + +namespace { + +constexpr bool not_whitespace(cudf::char_utf8 ch) +{ + return ch != ' ' && ch != '\r' && ch != '\n' && ch != '\t'; +} + +constexpr bool can_be_delimiter(char c) +{ + // The character list below is from `json_reader_options.set_delimiter`. 
+ switch (c) { + case '{': + case '[': + case '}': + case ']': + case ',': + case ':': + case '"': + case '\'': + case '\\': + case ' ': + case '\t': + case '\r': return false; + default: return true; + } +} + +} // namespace + +std::tuple, char, std::unique_ptr> concat_json( + cudf::strings_column_view const& input, + bool nullify_invalid_rows, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + if (input.is_empty()) { + return {std::make_unique(0, stream, mr), + '\n', + std::make_unique( + rmm::device_uvector{0, stream, mr}, rmm::device_buffer{}, 0)}; + } + + auto const d_input_ptr = cudf::column_device_view::create(input.parent(), stream); + auto const default_mr = rmm::mr::get_current_device_resource(); + + // Check if the input rows are null, empty (containing only whitespaces), and invalid JSON. + // This will be used for masking out the null/empty/invalid input rows when doing string + // concatenation. + rmm::device_uvector is_valid_input(input.size(), stream, default_mr); + + // Check if the input rows are null, empty (containing only whitespaces), and may also check + // for invalid JSON strings. + // This will be returned to the caller to create null mask for the final output. + rmm::device_uvector should_be_nullified(input.size(), stream, mr); + + thrust::for_each( + rmm::exec_policy_nosync(stream), + thrust::make_counting_iterator(0L), + thrust::make_counting_iterator(input.size() * static_cast(cudf::detail::warp_size)), + [nullify_invalid_rows, + input = *d_input_ptr, + output = thrust::make_zip_iterator(thrust::make_tuple( + is_valid_input.begin(), should_be_nullified.begin()))] __device__(int64_t tidx) { + // Execute one warp per row to minimize thread divergence. 
+ if ((tidx % cudf::detail::warp_size) != 0) { return; } + auto const idx = tidx / cudf::detail::warp_size; + + if (input.is_null(idx)) { + output[idx] = thrust::make_tuple(false, true); + return; + } + + auto const d_str = input.element(idx); + auto const size = d_str.size_bytes(); + int i = 0; + char ch; + + // Skip the very first whitespace characters. + for (; i < size; ++i) { + ch = d_str[i]; + if (not_whitespace(ch)) { break; } + } + + auto const not_eol = i < size; + + // If the current row is not null or empty, it should start with `{`. Otherwise, we need to + // replace it by a null. This is necessary for libcudf's JSON reader to work. + // Note that if we want to support ARRAY schema, we need to check for `[` instead. + auto constexpr start_character = '{'; + if (not_eol && ch != start_character) { + output[idx] = thrust::make_tuple(false, nullify_invalid_rows); + return; + } + + output[idx] = thrust::make_tuple(not_eol, !not_eol); + }); + + auto constexpr num_levels = 256; + auto constexpr lower_level = std::numeric_limits::min(); + auto constexpr upper_level = std::numeric_limits::max(); + auto const num_chars = input.chars_size(stream); + + rmm::device_uvector histogram(num_levels, stream, default_mr); + thrust::uninitialized_fill( + rmm::exec_policy_nosync(stream), histogram.begin(), histogram.end(), 0); + + size_t temp_storage_bytes = 0; + cub::DeviceHistogram::HistogramEven(nullptr, + temp_storage_bytes, + input.chars_begin(stream), + histogram.begin(), + num_levels, + lower_level, + upper_level, + num_chars, + stream.value()); + rmm::device_buffer d_temp(temp_storage_bytes, stream); + cub::DeviceHistogram::HistogramEven(d_temp.data(), + temp_storage_bytes, + input.chars_begin(stream), + histogram.begin(), + num_levels, + lower_level, + upper_level, + num_chars, + stream.value()); + + auto const it = thrust::make_counting_iterator(0); + auto const zero_level_idx = -lower_level; // the bin storing count for character `\0` + auto const zero_level_it = 
it + zero_level_idx; + auto const end = it + num_levels; + + auto const first_zero_count_pos = + thrust::find_if(rmm::exec_policy_nosync(stream), + zero_level_it, // ignore the negative characters + end, + [zero_level_idx, counts = histogram.begin()] __device__(auto idx) -> bool { + auto const count = counts[idx]; + if (count > 0) { return false; } + auto const first_non_existing_char = static_cast(idx - zero_level_idx); + return can_be_delimiter(first_non_existing_char); + }); + + // This should never happen since the input should never cover the entire char range. + if (first_zero_count_pos == end) { + throw std::logic_error( + "Cannot find any character suitable as delimiter during joining json strings."); + } + auto const delimiter = static_cast(thrust::distance(zero_level_it, first_zero_count_pos)); + + auto [null_mask, null_count] = cudf::detail::valid_if( + is_valid_input.begin(), is_valid_input.end(), thrust::identity{}, stream, default_mr); + // If the null count doesn't change, that mean we do not have any rows containing `null` string + // literal or empty rows. In such cases, just use the input column for concatenation. + auto const input_applied_null = + null_count == input.null_count() + ? cudf::column_view{} + : cudf::column_view{cudf::data_type{cudf::type_id::STRING}, + input.size(), + input.chars_begin(stream), + reinterpret_cast(null_mask.data()), + null_count, + 0, + std::vector{input.offsets()}}; + + auto concat_strings = cudf::strings::detail::join_strings( + null_count == input.null_count() ? 
input : cudf::strings_column_view{input_applied_null}, + cudf::string_scalar(std::string(1, delimiter), true, stream, default_mr), + cudf::string_scalar("{}", true, stream, default_mr), + stream, + mr); + + return {std::move(concat_strings->release().data), + delimiter, + std::make_unique(std::move(should_be_nullified), rmm::device_buffer{}, 0)}; +} + +} // namespace detail + +std::tuple, char, std::unique_ptr> concat_json( + cudf::strings_column_view const& input, + bool nullify_invalid_rows, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_FUNC_RANGE(); + return detail::concat_json(input, nullify_invalid_rows, stream, mr); +} + +} // namespace spark_rapids_jni From 3614718bd345ce8a0543aa31fd2e282753496366 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Tue, 12 Nov 2024 14:40:10 -0800 Subject: [PATCH 36/58] Rewrite `convert_data_type` Signed-off-by: Nghia Truong --- src/main/cpp/src/from_json_to_structs.cu | 171 +++++++++++++++-------- 1 file changed, 113 insertions(+), 58 deletions(-) diff --git a/src/main/cpp/src/from_json_to_structs.cu b/src/main/cpp/src/from_json_to_structs.cu index 65f800b2f..2134e4c8b 100644 --- a/src/main/cpp/src/from_json_to_structs.cu +++ b/src/main/cpp/src/from_json_to_structs.cu @@ -740,33 +740,16 @@ std::unique_ptr convert_data_type(InputType&& input, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { + CUDF_FUNC_RANGE(); + using DecayInputT = std::decay_t; auto constexpr input_is_const_cv = std::is_same_v; auto constexpr input_is_column_ptr = std::is_same_v>; static_assert(input_is_const_cv ^ input_is_column_ptr); - auto const d_type = [&] { - if constexpr (input_is_column_ptr) { - return input->type().id(); - } else { - return input.type().id(); - } - }; - - // if (d_type == cudf::type_id::LIST || d_type == cudf::type_id::STRUCT || - // d_type == cudf::type_id::STRING) { - // // - // } - // std::unique_ptr convert_column_type(std::unique_ptr& input, - // schema_element_with_precision 
const& schema, - // bool allow_nonnumeric_numbers, - // bool is_us_locale, - // rmm::cuda_stream_view stream, - // rmm::device_async_resource_ref mr) - if (cudf::is_chrono(schema.type)) { + // Date/time is not processed here - it should be handled separately in spark-rapids. if constexpr (input_is_column_ptr) { - // Date/time is not processed for now, but it should be handled later on in spark-rapids. return std::move(input); } else { CUDF_FAIL("Cannot convert data type to a chrono (date/time) type."); @@ -774,66 +757,138 @@ std::unique_ptr convert_data_type(InputType&& input, } if (schema.type.id() == cudf::type_id::BOOL8) { - return ::spark_rapids_jni::cast_strings_to_booleans(input->view(), stream, mr); + if constexpr (input_is_column_ptr) { + return cast_strings_to_booleans(input->view(), stream, mr); + } else { + return cast_strings_to_booleans(input, stream, mr); + } } + + // if constexpr (input_is_column_ptr) { + // } else { + // } if (cudf::is_integral(schema.type)) { - return ::spark_rapids_jni::cast_strings_to_integers(input->view(), schema.type, stream, mr); + if constexpr (input_is_column_ptr) { + return cast_strings_to_integers(input->view(), schema.type, stream, mr); + } else { + return cast_strings_to_integers(input, schema.type, stream, mr); + } } + if (cudf::is_floating_point(schema.type)) { - return ::spark_rapids_jni::cast_strings_to_floats( - input->view(), schema.type, allow_nonnumeric_numbers, stream, mr); + if constexpr (input_is_column_ptr) { + return cast_strings_to_floats( + input->view(), schema.type, allow_nonnumeric_numbers, stream, mr); + } else { + return cast_strings_to_floats(input, schema.type, allow_nonnumeric_numbers, stream, mr); + } } if (cudf::is_fixed_point(schema.type)) { - return ::spark_rapids_jni::cast_strings_to_decimals( - input->view(), schema.type, schema.precision, is_us_locale, stream, mr); + if constexpr (input_is_column_ptr) { + return cast_strings_to_decimals( + input->view(), schema.type, schema.precision, 
is_us_locale, stream, mr); + } else { + return cast_strings_to_decimals( + input, schema.type, schema.precision, is_us_locale, stream, mr); + } } if (schema.type.id() == cudf::type_id::STRING) { - return ::spark_rapids_jni::remove_quotes( - input->view(), /*nullify_if_not_quoted*/ false, stream, mr); + if constexpr (input_is_column_ptr) { + return remove_quotes(input->view(), /*nullify_if_not_quoted*/ false, stream, mr); + } else { + return remove_quotes(input, /*nullify_if_not_quoted*/ false, stream, mr); + } } - auto const num_rows = input->size(); - auto const null_count = input->null_count(); - auto const d_type = input->type().id(); - auto const num_children = input->num_children(); - auto input_content = input->release(); + if constexpr (input_is_column_ptr) { + auto const d_type = input->type().id(); + auto const num_rows = input->size(); + auto const null_count = input->null_count(); + auto const num_children = input->num_children(); + auto input_content = input->release(); + + if (schema.type.id() == cudf::type_id::LIST) { + CUDF_EXPECTS(d_type == cudf::type_id::LIST, "Input column should be LIST."); + std::vector> new_children; + new_children.emplace_back( + std::move(input_content.children[cudf::lists_column_view::offsets_column_index])); + new_children.emplace_back( + convert_data_type(input_content.children[cudf::lists_column_view::child_column_index], + schema.child_types.front().second, + allow_nonnumeric_numbers, + is_us_locale, + stream, + mr)); + return std::make_unique(cudf::data_type{cudf::type_id::LIST}, + num_rows, + rmm::device_buffer{}, + std::move(*input_content.null_mask), + null_count, + std::move(new_children)); + } - if (schema.type.id() == cudf::type_id::LIST) { - CUDF_EXPECTS(d_type == cudf::type_id::LIST, "Input column should be LIST."); - auto new_child = - convert_column_type(input_content.children[cudf::lists_column_view::child_column_index], + if (schema.type.id() == cudf::type_id::STRUCT) { + CUDF_EXPECTS(d_type == 
cudf::type_id::STRUCT, "Input column should be STRUCT."); + std::vector> new_children(num_children); + for (cudf::size_type i = 0; i < num_children; ++i) { + new_children[i] = convert_data_type(input_content.children[i], + schema.child_types[i].second, + allow_nonnumeric_numbers, + is_us_locale, + stream, + mr); + } + return std::make_unique(cudf::data_type{cudf::type_id::STRUCT}, + num_rows, + rmm::device_buffer{}, + std::move(*input_content.null_mask), + null_count, + std::move(new_children)); + } + } else { + auto const d_type = input.type().id(); + auto const num_rows = input.size(); + auto const null_count = input.null_count(); + auto const num_children = input.num_children(); + + if (schema.type.id() == cudf::type_id::LIST) { + CUDF_EXPECTS(d_type == cudf::type_id::LIST, "Input column should be LIST."); + std::vector> new_children; + new_children.emplace_back(std::make_unique( + input.children[cudf::lists_column_view::offsets_column_index])); + new_children.emplace_back( + convert_data_type(input.child(cudf::lists_column_view::child_column_index), schema.child_types.front().second, allow_nonnumeric_numbers, is_us_locale, stream, - mr); - return cudf::make_lists_column( - num_rows, - std::move(input_content.children[cudf::lists_column_view::offsets_column_index]), - std::move(new_child), - null_count, - std::move(*input_content.null_mask), - stream, - mr); - } + mr)); + return std::make_unique(cudf::data_type{cudf::type_id::LIST}, + num_rows, + rmm::device_buffer{}, + cudf::detail::copy_bitmask(input, stream, mr), + null_count, + std::move(new_children)); + } - if (schema.type.id() == cudf::type_id::STRUCT) { - CUDF_EXPECTS(d_type == cudf::type_id::STRUCT, "Input column should be STRUCT."); - std::vector> new_children(num_children); - for (cudf::size_type i = 0; i < num_children; ++i) { - new_children[i] = convert_column_type(input_content.children[i], + if (schema.type.id() == cudf::type_id::STRUCT) { + CUDF_EXPECTS(d_type == cudf::type_id::STRUCT, "Input 
column should be STRUCT."); + std::vector> new_children(num_children); + for (cudf::size_type i = 0; i < num_children; ++i) { + new_children[i] = convert_data_type(input.child(i), schema.child_types[i].second, allow_nonnumeric_numbers, is_us_locale, stream, mr); + } + return std::make_unique(cudf::data_type{cudf::type_id::STRUCT}, + num_rows, + rmm::device_buffer{}, + cudf::detail::copy_bitmask(input, stream, mr), + null_count, + std::move(new_children)); } - return cudf::make_structs_column(num_rows, - std::move(new_children), - null_count, - std::move(*input_content.null_mask), - stream, - mr); } CUDF_FAIL("Unexpected column type for conversion."); From 6d9bbdc82092f0c4b05cb964ca22f1ff9f138256 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Tue, 12 Nov 2024 14:53:58 -0800 Subject: [PATCH 37/58] Remove `cast_strings_to_dates` Signed-off-by: Nghia Truong --- src/main/cpp/src/from_json_to_structs.cu | 114 ----------------------- 1 file changed, 114 deletions(-) diff --git a/src/main/cpp/src/from_json_to_structs.cu b/src/main/cpp/src/from_json_to_structs.cu index 2134e4c8b..4d299d67a 100644 --- a/src/main/cpp/src/from_json_to_structs.cu +++ b/src/main/cpp/src/from_json_to_structs.cu @@ -175,101 +175,6 @@ std::pair, rmm::device_uvector> cast_strings return {std::move(output), rmm::device_uvector(0, stream)}; } -std::pair, rmm::device_uvector> cast_strings_to_dates( - cudf::column_view const& input, - std::string const& date_regex, - std::string const& date_format, - bool error_if_invalid, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - auto const string_count = input.size(); - if (string_count == 0) { - return {cudf::make_empty_column(cudf::data_type{cudf::type_id::TIMESTAMP_DAYS}), - rmm::device_uvector(0, stream)}; - } - - // TODO: mr - auto const removed_quotes = remove_quotes(input, false, stream, mr); - - auto const input_sv = cudf::strings_column_view{removed_quotes->view()}; - auto const regex_prog = 
cudf::strings::regex_program::create( - date_regex, cudf::strings::regex_flags::DEFAULT, cudf::strings::capture_groups::NON_CAPTURE); - auto const is_matched = cudf::strings::matches_re(input_sv, *regex_prog, stream); - auto const is_timestamp = cudf::strings::is_timestamp(input_sv, date_format, stream); - auto const d_is_matched = is_matched->view().begin(); - auto const d_is_timestamp = is_timestamp->view().begin(); - - auto const d_input_ptr = cudf::column_device_view::create(removed_quotes->view(), stream); - auto const is_valid_it = cudf::detail::make_validity_iterator(*d_input_ptr); - auto const invalid_count = thrust::count_if( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(string_count), - [is_valid = is_valid_it, is_matched = d_is_matched, is_timestamp = d_is_timestamp] __device__( - auto idx) { return is_valid[idx] && (!is_matched[idx] || !is_timestamp[idx]); }); - - if (invalid_count == 0) { - auto output = cudf::strings::to_timestamps( - input_sv, cudf::data_type{cudf::type_id::TIMESTAMP_DAYS}, date_format, stream, mr); - return {std::move(output), rmm::device_uvector(0, stream)}; - } - - // From here we have invalid_count > 0. 
- if (error_if_invalid) { return {nullptr, rmm::device_uvector(0, stream)}; } - - auto const input_offsets_it = - cudf::detail::offsetalator_factory::make_input_iterator(input_sv.offsets()); - auto string_pairs = rmm::device_uvector(string_count, stream); - - thrust::tabulate( - rmm::exec_policy_nosync(stream), - string_pairs.begin(), - string_pairs.end(), - [chars = input_sv.chars_begin(stream), - offsets = input_offsets_it, - is_valid = is_valid_it, - is_matched = d_is_matched, - is_timestamp = d_is_timestamp] __device__(cudf::size_type idx) -> string_index_pair { - if (!is_valid[idx] || !is_matched[idx] || !is_timestamp[idx]) { return {nullptr, 0}; } - - auto const start_offset = offsets[idx]; - auto const end_offset = offsets[idx + 1]; - return {chars + start_offset, end_offset - start_offset}; - }); - - auto const size_it = cudf::detail::make_counting_transform_iterator( - 0, - cuda::proclaim_return_type( - [string_pairs = string_pairs.begin()] __device__(cudf::size_type idx) -> cudf::size_type { - return string_pairs[idx].second; - })); - auto [offsets_column, bytes] = - cudf::strings::detail::make_offsets_child_column(size_it, size_it + string_count, stream, mr); - auto chars_data = cudf::strings::detail::make_chars_buffer( - offsets_column->view(), bytes, string_pairs.begin(), string_count, stream, mr); - - // Don't care about the null mask, as nulls imply empty strings, and will be nullified. 
- auto const sanitized_input = - cudf::make_strings_column(string_count, std::move(offsets_column), chars_data.release(), 0, {}); - - auto output = cudf::strings::to_timestamps(cudf::strings_column_view{sanitized_input->view()}, - cudf::data_type{cudf::type_id::TIMESTAMP_DAYS}, - date_format, - stream, - mr); - - auto validity = rmm::device_uvector(string_count, stream); - thrust::transform(rmm::exec_policy_nosync(stream), - string_pairs.begin(), - string_pairs.end(), - validity.begin(), - [] __device__(string_index_pair const& pair) { return pair.first != nullptr; }); - - // Null mask and null count will be updated later from the validity vector. - return {std::move(output), std::move(validity)}; -} - // TODO there is a bug here around 0 https://github.com/NVIDIA/spark-rapids/issues/10898 std::unique_ptr cast_strings_to_decimals(cudf::column_view const& input, cudf::data_type output_type, @@ -960,25 +865,6 @@ std::unique_ptr cast_strings_to_integers(cudf::column_view const& return std::move(output); } -std::unique_ptr cast_strings_to_dates(cudf::column_view const& input, - std::string const& date_regex, - std::string const& date_format, - bool error_if_invalid, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - CUDF_FUNC_RANGE(); - - auto [output, validity] = - detail::cast_strings_to_dates(input, date_regex, date_format, error_if_invalid, stream, mr); - - if (output == nullptr) { return nullptr; } - auto [null_mask, null_count] = - cudf::detail::valid_if(validity.begin(), validity.end(), thrust::identity{}, stream, mr); - if (null_count > 0) { output->set_null_mask(std::move(null_mask), null_count); } - return std::move(output); -} - std::unique_ptr cast_strings_to_decimals(cudf::column_view const& input, cudf::data_type output_type, int precision, From a832938734b644e2e33f5f26dabc8244b247bb7f Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Tue, 12 Nov 2024 19:48:00 -0800 Subject: [PATCH 38/58] Implement `convert_data_type` Signed-off-by: 
Nghia Truong --- src/main/cpp/src/from_json_to_structs.cu | 387 ++++++++++++----------- 1 file changed, 194 insertions(+), 193 deletions(-) diff --git a/src/main/cpp/src/from_json_to_structs.cu b/src/main/cpp/src/from_json_to_structs.cu index 4d299d67a..9992c7292 100644 --- a/src/main/cpp/src/from_json_to_structs.cu +++ b/src/main/cpp/src/from_json_to_structs.cu @@ -562,88 +562,75 @@ std::pair generate_stru cudf::data_type{cudf::type_id::STRUCT}, -1, std::move(schema_cols_with_precisions)}}; } -} // namespace - -std::unique_ptr from_json_to_structs(cudf::strings_column_view const& input, - std::vector const& col_names, - std::vector const& num_children, - std::vector const& types, - std::vector const& scales, - std::vector const& precisions, - bool normalize_single_quotes, - bool allow_leading_zeros, - bool allow_nonnumeric_numbers, - bool allow_unquoted_control, - bool is_us_locale, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) +std::unique_ptr make_column_from_pair( + std::pair, rmm::device_uvector>& input, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) { - auto const [is_invalid_or_empty, concat_input, delimiter] = - concat_json(input, stream, cudf::get_current_device_resource()); - auto const [schema, schema_with_precision] = - generate_struct_schema(col_names, num_children, types, scales, precisions); - - auto opts_builder = - cudf::io::json_reader_options::builder( - cudf::io::source_info{cudf::device_span{ - static_cast(concat_input->data()), concat_input->size()}}) - // fixed options - .lines(true) - .recovery_mode(cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL) - .normalize_whitespace(true) - .mixed_types_as_string(true) - .keep_quotes(true) - .experimental(true) - .normalize_single_quotes(normalize_single_quotes) - .strict_validation(true) - // specifying parameters - .delimiter(delimiter) - .numeric_leading_zeros(allow_leading_zeros) - .nonnumeric_numbers(allow_nonnumeric_numbers) - 
.unquoted_control_chars(allow_unquoted_control) - .dtypes(schema) - .prune_columns(schema.child_types.size() != 0); + auto& [output, validity] = input; + if (validity.size() > 0) { + auto [null_mask, null_count] = + cudf::detail::valid_if(validity.begin(), validity.end(), thrust::identity{}, stream, mr); + if (null_count > 0) { output->set_null_mask(std::move(null_mask), null_count); } + } + return std::move(output); +} - auto const parsed_table_with_meta = cudf::io::read_json(opts_builder.build()); - auto const& parsed_meta = parsed_table_with_meta.metadata; - auto parsed_columns = parsed_table_with_meta.tbl->release(); +std::vector> make_column_array_from_pairs( + std::vector, rmm::device_uvector>>& input, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + auto const num_columns = input.size(); + std::vector null_masks; + null_masks.reserve(num_columns); + + rmm::device_uvector d_valid_counts(num_columns, stream, mr); + thrust::uninitialized_fill( + rmm::exec_policy_nosync(stream), d_valid_counts.begin(), d_valid_counts.end(), 0); + + for (std::size_t idx = 0; idx < num_columns; ++idx) { + auto const col_size = input[idx].first->size(); + if (col_size == 0) { + null_masks.emplace_back(rmm::device_buffer{}); // placeholder + continue; + } - CUDF_EXPECTS(parsed_columns.size() == schema.child_types.size(), - "Numbers of output columns is different from schema size."); + null_masks.emplace_back( + cudf::create_null_mask(col_size, mask_state::UNINITIALIZED, stream, mr)); + constexpr size_type block_size{256}; + auto const grid = cudf::detail::grid_1d{static_cast(col_size), block_size}; + cudf::detail::valid_if_kernel + <<>>( + reinterpret_cast(null_masks.back().data()), + input[idx].second.data(), + col_size, + thrust::identity{}, + d_valid_counts.data() + idx); + } - std::vector> converted_cols(parsed_columns.size()); - for (std::size_t i = 0; i < parsed_columns.size(); ++i) { - auto const d_type = parsed_columns[i]->type().id(); - 
CUDF_EXPECTS(d_type == cudf::type_id::LIST || d_type == cudf::type_id::STRUCT || - d_type == cudf::type_id::STRING, - "Input column should be STRING or nested."); + auto const valid_counts = cudf::detail::make_std_vector_sync(d_valid_counts, stream); + std::vector> output(num_columns); - auto const& [col_name, col_schema] = schema_with_precision.child_types[i]; - CUDF_EXPECTS(parsed_meta.schema_info[i].name == col_name, "Mismatched column name."); - converted_cols[i] = convert_column_type( - parsed_columns[i], col_schema, allow_nonnumeric_numbers, is_us_locale, stream, mr); + for (std::size_t idx = 0; idx < num_columns; ++idx) { + auto const col_size = input[idx].first->size(); + auto const valid_count = valid_counts[idx]; + auto const null_count = col_size - valid_count; + output[idx] = std::move(input[idx]); + if (null_count > 0) { output[idx]->set_null_mask(std::move(null_masks[idx]), null_count); } } - auto const valid_it = is_invalid_or_empty->view().begin(); - auto [null_mask, null_count] = cudf::detail::valid_if( - valid_it, valid_it + is_invalid_or_empty->size(), thrust::logical_not{}, stream, mr); - - return cudf::make_structs_column( - input.size(), - std::move(converted_cols), - null_count, - null_count > 0 ? 
std::move(null_mask) : rmm::device_buffer{0, stream, mr}, - stream, - mr); + return output; } template -std::unique_ptr convert_data_type(InputType&& input, - schema_element_with_precision const& schema, - bool allow_nonnumeric_numbers, - bool is_us_locale, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) +std::pair, rmm::device_uvector> convert_data_type( + InputType&& input, + schema_element_with_precision const& schema, + bool allow_nonnumeric_numbers, + bool is_us_locale, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); @@ -655,9 +642,10 @@ std::unique_ptr convert_data_type(InputType&& input, if (cudf::is_chrono(schema.type)) { // Date/time is not processed here - it should be handled separately in spark-rapids. if constexpr (input_is_column_ptr) { - return std::move(input); + return {std::move(input), rmm::device_uvector{0, stream, mr}}; } else { CUDF_FAIL("Cannot convert data type to a chrono (date/time) type."); + return {nullptr, rmm::device_uvector{0, stream, mr}}; } } @@ -669,9 +657,6 @@ std::unique_ptr convert_data_type(InputType&& input, } } - // if constexpr (input_is_column_ptr) { - // } else { - // } if (cudf::is_integral(schema.type)) { if constexpr (input_is_column_ptr) { return cast_strings_to_integers(input->view(), schema.type, stream, mr); @@ -688,6 +673,7 @@ std::unique_ptr convert_data_type(InputType&& input, return cast_strings_to_floats(input, schema.type, allow_nonnumeric_numbers, stream, mr); } } + if (cudf::is_fixed_point(schema.type)) { if constexpr (input_is_column_ptr) { return cast_strings_to_decimals( @@ -697,6 +683,7 @@ std::unique_ptr convert_data_type(InputType&& input, input, schema.type, schema.precision, is_us_locale, stream, mr); } } + if (schema.type.id() == cudf::type_id::STRING) { if constexpr (input_is_column_ptr) { return remove_quotes(input->view(), /*nullify_if_not_quoted*/ false, stream, mr); @@ -717,38 +704,45 @@ std::unique_ptr 
convert_data_type(InputType&& input, std::vector> new_children; new_children.emplace_back( std::move(input_content.children[cudf::lists_column_view::offsets_column_index])); - new_children.emplace_back( + new_children.emplace_back(make_column_from_pair( convert_data_type(input_content.children[cudf::lists_column_view::child_column_index], schema.child_types.front().second, allow_nonnumeric_numbers, is_us_locale, stream, - mr)); - return std::make_unique(cudf::data_type{cudf::type_id::LIST}, - num_rows, - rmm::device_buffer{}, - std::move(*input_content.null_mask), - null_count, - std::move(new_children)); + mr), + stream, + mr)); + return {std::make_unique(cudf::data_type{cudf::type_id::LIST}, + num_rows, + rmm::device_buffer{}, + std::move(*input_content.null_mask), + null_count, + std::move(new_children)), + rmm::device_uvector{0, stream, mr}}; } if (schema.type.id() == cudf::type_id::STRUCT) { CUDF_EXPECTS(d_type == cudf::type_id::STRUCT, "Input column should be STRUCT."); - std::vector> new_children(num_children); + std::vector, rmm::device_uvector>> + new_children_with_validity(num_children); for (cudf::size_type i = 0; i < num_children; ++i) { - new_children[i] = convert_data_type(input_content.children[i], - schema.child_types[i].second, - allow_nonnumeric_numbers, - is_us_locale, - stream, - mr); + new_children_with_validity[i] = convert_data_type(input_content.children[i], + schema.child_types[i].second, + allow_nonnumeric_numbers, + is_us_locale, + stream, + mr); } - return std::make_unique(cudf::data_type{cudf::type_id::STRUCT}, - num_rows, - rmm::device_buffer{}, - std::move(*input_content.null_mask), - null_count, - std::move(new_children)); + + return {std::make_unique( + cudf::data_type{cudf::type_id::STRUCT}, + num_rows, + rmm::device_buffer{}, + std::move(*input_content.null_mask), + null_count, + make_column_array_from_pairs(new_children_with_validity, stream, mr)), + rmm::device_uvector{0, stream, mr}}; } } else { auto const d_type = 
input.type().id(); @@ -761,43 +755,124 @@ std::unique_ptr convert_data_type(InputType&& input, std::vector> new_children; new_children.emplace_back(std::make_unique( input.children[cudf::lists_column_view::offsets_column_index])); - new_children.emplace_back( + new_children.emplace_back(make_column_from_pair( convert_data_type(input.child(cudf::lists_column_view::child_column_index), schema.child_types.front().second, allow_nonnumeric_numbers, is_us_locale, stream, - mr)); - return std::make_unique(cudf::data_type{cudf::type_id::LIST}, - num_rows, - rmm::device_buffer{}, - cudf::detail::copy_bitmask(input, stream, mr), - null_count, - std::move(new_children)); + mr), + stream, + mr)); + return {std::make_unique(cudf::data_type{cudf::type_id::LIST}, + num_rows, + rmm::device_buffer{}, + cudf::detail::copy_bitmask(input, stream, mr), + null_count, + std::move(new_children)), + rmm::device_uvector{0, stream, mr}}; } if (schema.type.id() == cudf::type_id::STRUCT) { CUDF_EXPECTS(d_type == cudf::type_id::STRUCT, "Input column should be STRUCT."); - std::vector> new_children(num_children); + std::vector, rmm::device_uvector>> + new_children_with_validity(num_children); for (cudf::size_type i = 0; i < num_children; ++i) { - new_children[i] = convert_data_type(input.child(i), - schema.child_types[i].second, - allow_nonnumeric_numbers, - is_us_locale, - stream, - mr); + new_children_with_validity[i] = convert_data_type(input.child(i), + schema.child_types[i].second, + allow_nonnumeric_numbers, + is_us_locale, + stream, + mr); } - return std::make_unique(cudf::data_type{cudf::type_id::STRUCT}, - num_rows, - rmm::device_buffer{}, - cudf::detail::copy_bitmask(input, stream, mr), - null_count, - std::move(new_children)); + return {std::make_unique( + cudf::data_type{cudf::type_id::STRUCT}, + num_rows, + rmm::device_buffer{}, + cudf::detail::copy_bitmask(input, stream, mr), + null_count, + make_column_array_from_pairs(new_children_with_validity, stream, mr)), + 
rmm::device_uvector{0, stream, mr}}; } } CUDF_FAIL("Unexpected column type for conversion."); - return nullptr; + return {nullptr, rmm::device_uvector{0, stream, mr}}; +} + +} // namespace + +std::unique_ptr from_json_to_structs(cudf::strings_column_view const& input, + std::vector const& col_names, + std::vector const& num_children, + std::vector const& types, + std::vector const& scales, + std::vector const& precisions, + bool normalize_single_quotes, + bool allow_leading_zeros, + bool allow_nonnumeric_numbers, + bool allow_unquoted_control, + bool is_us_locale, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + auto const [is_invalid_or_empty, concat_input, delimiter] = + concat_json(input, stream, cudf::get_current_device_resource()); + auto const [schema, schema_with_precision] = + generate_struct_schema(col_names, num_children, types, scales, precisions); + + auto opts_builder = + cudf::io::json_reader_options::builder( + cudf::io::source_info{cudf::device_span{ + static_cast(concat_input->data()), concat_input->size()}}) + // fixed options + .lines(true) + .recovery_mode(cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL) + .normalize_whitespace(true) + .mixed_types_as_string(true) + .keep_quotes(true) + .experimental(true) + .normalize_single_quotes(normalize_single_quotes) + .strict_validation(true) + // specifying parameters + .delimiter(delimiter) + .numeric_leading_zeros(allow_leading_zeros) + .nonnumeric_numbers(allow_nonnumeric_numbers) + .unquoted_control_chars(allow_unquoted_control) + .dtypes(schema) + .prune_columns(schema.child_types.size() != 0); + + auto const parsed_table_with_meta = cudf::io::read_json(opts_builder.build()); + auto const& parsed_meta = parsed_table_with_meta.metadata; + auto parsed_columns = parsed_table_with_meta.tbl->release(); + + CUDF_EXPECTS(parsed_columns.size() == schema.child_types.size(), + "Numbers of output columns is different from schema size."); + + std::vector> 
converted_cols(parsed_columns.size()); + for (std::size_t i = 0; i < parsed_columns.size(); ++i) { + auto const d_type = parsed_columns[i]->type().id(); + CUDF_EXPECTS(d_type == cudf::type_id::LIST || d_type == cudf::type_id::STRUCT || + d_type == cudf::type_id::STRING, + "Input column should be STRING or nested."); + + auto const& [col_name, col_schema] = schema_with_precision.child_types[i]; + CUDF_EXPECTS(parsed_meta.schema_info[i].name == col_name, "Mismatched column name."); + converted_cols[i] = convert_column_type( + parsed_columns[i], col_schema, allow_nonnumeric_numbers, is_us_locale, stream, mr); + } + + auto const valid_it = is_invalid_or_empty->view().begin(); + auto [null_mask, null_count] = cudf::detail::valid_if( + valid_it, valid_it + is_invalid_or_empty->size(), thrust::logical_not{}, stream, mr); + + return cudf::make_structs_column( + input.size(), + std::move(converted_cols), + null_count, + null_count > 0 ? std::move(null_mask) : rmm::device_buffer{0, stream, mr}, + stream, + mr); } } // namespace detail @@ -832,80 +907,6 @@ std::unique_ptr from_json_to_structs(cudf::strings_column_view con mr); } -std::unique_ptr make_structs(std::vector const& children, - cudf::column_view const& is_null, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - CUDF_FUNC_RANGE(); - return detail::make_structs(children, is_null, stream, mr); -} - -std::unique_ptr cast_strings_to_booleans(cudf::column_view const& input, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - CUDF_FUNC_RANGE(); - - auto [output, validity] = detail::cast_strings_to_booleans(input, stream, mr); - auto [null_mask, null_count] = - cudf::detail::valid_if(validity.begin(), validity.end(), thrust::identity{}, stream, mr); - if (null_count > 0) { output->set_null_mask(std::move(null_mask), null_count); } - return std::move(output); -} - -std::unique_ptr cast_strings_to_integers(cudf::column_view const& input, - cudf::data_type output_type, - 
rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - CUDF_FUNC_RANGE(); - - auto [output, validity] = detail::cast_strings_to_integers(input, output_type, stream, mr); - return std::move(output); -} - -std::unique_ptr cast_strings_to_decimals(cudf::column_view const& input, - cudf::data_type output_type, - int precision, - bool is_us_locale, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - CUDF_FUNC_RANGE(); - return detail::cast_strings_to_decimals(input, output_type, precision, is_us_locale, stream, mr); -} - -std::unique_ptr remove_quotes(cudf::column_view const& input, - bool nullify_if_not_quoted, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - CUDF_FUNC_RANGE(); - auto [output, validity] = detail::remove_quotes(input, nullify_if_not_quoted, stream, mr); - if (validity.size() > 0) { - auto [null_mask, null_count] = - cudf::detail::valid_if(validity.begin(), validity.end(), thrust::identity{}, stream, mr); - if (null_count > 0) { output->set_null_mask(std::move(null_mask), null_count); } - } - return std::move(output); -} - -std::unique_ptr cast_strings_to_floats(cudf::column_view const& input, - cudf::data_type output_type, - bool allow_nonnumeric_numbers, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - CUDF_FUNC_RANGE(); - if (allow_nonnumeric_numbers) { - auto [removed_quotes, validity] = detail::remove_quotes_for_floats(input, stream, mr); - return string_to_float( - output_type, cudf::strings_column_view{removed_quotes->view()}, false, stream, mr); - } - return string_to_float(output_type, cudf::strings_column_view{input}, false, stream, mr); -} - std::unique_ptr convert_data_type(cudf::column_view const& input, std::vector const& num_children, std::vector const& types, From 44b885b26fd2e5269c3f324d353bf44a2635523b Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Tue, 12 Nov 2024 22:35:27 -0800 Subject: [PATCH 39/58] Fix compile errors Signed-off-by: Nghia 
Truong --- src/main/cpp/src/JSONUtilsJni.cpp | 18 +- src/main/cpp/src/from_json_to_structs.cu | 330 +++++++++++++---------- src/main/cpp/src/json_utils.hpp | 3 - 3 files changed, 185 insertions(+), 166 deletions(-) diff --git a/src/main/cpp/src/JSONUtilsJni.cpp b/src/main/cpp/src/JSONUtilsJni.cpp index ea56aaad7..b0618ed8d 100644 --- a/src/main/cpp/src/JSONUtilsJni.cpp +++ b/src/main/cpp/src/JSONUtilsJni.cpp @@ -219,10 +219,7 @@ Java_com_nvidia_spark_rapids_jni_JSONUtils_convertDataType(JNIEnv* env, jintArray j_types, jintArray j_scales, jintArray j_precisions, - jboolean normalize_single_quotes, - jboolean allow_leading_zeros, jboolean allow_nonnumeric_numbers, - jboolean allow_unquoted_control, jboolean is_us_locale) { JNI_NULL_CHECK(env, j_input, "j_input is null", 0); @@ -245,17 +242,10 @@ Java_com_nvidia_spark_rapids_jni_JSONUtils_convertDataType(JNIEnv* env, CUDF_EXPECTS(num_children.size() == scales.size(), "Invalid schema data: scales."); CUDF_EXPECTS(num_children.size() == precisions.size(), "Invalid schema data: precisions."); - return cudf::jni::ptr_as_jlong(spark_rapids_jni::convert_data_type(*input, - num_children, - types, - scales, - precisions, - normalize_single_quotes, - allow_leading_zeros, - allow_nonnumeric_numbers, - allow_unquoted_control, - is_us_locale) - .release()); + return cudf::jni::ptr_as_jlong( + spark_rapids_jni::convert_data_type( + *input, num_children, types, scales, precisions, allow_nonnumeric_numbers, is_us_locale) + .release()); } CATCH_STD(env, 0); } diff --git a/src/main/cpp/src/from_json_to_structs.cu b/src/main/cpp/src/from_json_to_structs.cu index 9992c7292..a6604ab8f 100644 --- a/src/main/cpp/src/from_json_to_structs.cu +++ b/src/main/cpp/src/from_json_to_structs.cu @@ -175,16 +175,121 @@ std::pair, rmm::device_uvector> cast_strings return {std::move(output), rmm::device_uvector(0, stream)}; } +// TODO: extract common code for this and `remove_quotes`. +// This function always returns a zero-size validity array. 
+std::pair, rmm::device_uvector> remove_quotes_for_floats( + cudf::column_view const& input, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) +{ + auto const string_count = input.size(); + if (string_count == 0) { + return {cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING}), + rmm::device_uvector(0, stream)}; + } + + auto const input_sv = cudf::strings_column_view{input}; + auto const input_offsets_it = + cudf::detail::offsetalator_factory::make_input_iterator(input_sv.offsets()); + auto const d_input_ptr = cudf::column_device_view::create(input, stream); + auto const is_valid_it = cudf::detail::make_validity_iterator(*d_input_ptr); + + auto string_pairs = rmm::device_uvector(string_count, stream); + thrust::tabulate(rmm::exec_policy_nosync(stream), + string_pairs.begin(), + string_pairs.end(), + [chars = input_sv.chars_begin(stream), + offsets = input_offsets_it, + is_valid = is_valid_it] __device__(cudf::size_type idx) -> string_index_pair { + if (!is_valid[idx]) { return {nullptr, 0}; } + + auto const start_offset = offsets[idx]; + auto const end_offset = offsets[idx + 1]; + auto const size = end_offset - start_offset; + auto const str = chars + start_offset; + + // Need to check for size, since the input string may contain just a single + // character `"`. Such input should not be considered as quoted. + auto const is_quoted = size > 1 && str[0] == '"' && str[size - 1] == '"'; + + // We check and remove quotes only for the special cases (non-numeric numbers + // wrapped in double quotes) that are accepted in `from_json`. + // They are "NaN", "+INF", "-INF", "+Infinity", "Infinity", "-Infinity". 
+ if (is_quoted) { + // "NaN" + auto accepted = size == 5 && str[1] == 'N' && str[2] == 'a' && str[3] == 'N'; + + // "+INF" and "-INF" + accepted = accepted || (size == 6 && (str[1] == '+' || str[1] == '-') && + str[2] == 'I' && str[3] == 'N' && str[4] == 'F'); + + // "Infinity" + accepted = accepted || (size == 10 && str[1] == 'I' && str[2] == 'n' && + str[3] == 'f' && str[4] == 'i' && str[5] == 'n' && + str[6] == 'i' && str[7] == 't' && str[8] == 'y'); + + // "+Infinity" and "-Infinity" + accepted = accepted || (size == 11 && (str[1] == '+' || str[1] == '-') && + str[2] == 'I' && str[3] == 'n' && str[4] == 'f' && + str[5] == 'i' && str[6] == 'n' && str[7] == 'i' && + str[8] == 't' && str[9] == 'y'); + + if (accepted) { return {str + 1, size - 2}; } + } + + return {str, size}; + }); + + auto const size_it = cudf::detail::make_counting_transform_iterator( + 0, + cuda::proclaim_return_type( + [string_pairs = string_pairs.begin()] __device__(cudf::size_type idx) -> cudf::size_type { + return string_pairs[idx].second; + })); + auto [offsets_column, bytes] = + cudf::strings::detail::make_offsets_child_column(size_it, size_it + string_count, stream, mr); + auto chars_data = cudf::strings::detail::make_chars_buffer( + offsets_column->view(), bytes, string_pairs.begin(), string_count, stream, mr); + + auto output = cudf::make_strings_column(string_count, + std::move(offsets_column), + chars_data.release(), + input.null_count(), + cudf::detail::copy_bitmask(input, stream, mr)); + + return {std::move(output), rmm::device_uvector(0, stream)}; +} + +std::pair, rmm::device_uvector> cast_strings_to_floats( + cudf::column_view const& input, + cudf::data_type output_type, + bool allow_nonnumeric_numbers, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_FUNC_RANGE(); + if (allow_nonnumeric_numbers) { + auto [removed_quotes, validity] = remove_quotes_for_floats(input, stream, mr); + return {::spark_rapids_jni::string_to_float( + output_type, 
cudf::strings_column_view{removed_quotes->view()}, false, stream, mr), + rmm::device_uvector{0, stream, mr}}; + } + return {::spark_rapids_jni::string_to_float( + output_type, cudf::strings_column_view{input}, false, stream, mr), + rmm::device_uvector{0, stream, mr}}; +} + // TODO there is a bug here around 0 https://github.com/NVIDIA/spark-rapids/issues/10898 -std::unique_ptr cast_strings_to_decimals(cudf::column_view const& input, - cudf::data_type output_type, - int precision, - bool is_us_locale, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) +std::pair, rmm::device_uvector> cast_strings_to_decimals( + cudf::column_view const& input, + cudf::data_type output_type, + int precision, + bool is_us_locale, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) { auto const string_count = input.size(); - if (string_count == 0) { return cudf::make_empty_column(output_type); } + if (string_count == 0) { + return {cudf::make_empty_column(output_type), rmm::device_uvector{0, stream, mr}}; + } CUDF_EXPECTS(is_us_locale, "String to decimal conversion is only supported in US locale."); @@ -259,7 +364,8 @@ std::unique_ptr cast_strings_to_decimals(cudf::column_view const& // If the output strings column does not change in its total bytes, we know that it does not have // any '"' or ',' characters. 
if (bytes == input_sv.chars_size(stream)) { - return string_to_decimal(precision, output_type.scale(), input_sv, false, false, stream, mr); + return {string_to_decimal(precision, output_type.scale(), input_sv, false, false, stream, mr), + rmm::device_uvector{0, stream, mr}}; } auto const out_offsets = @@ -303,13 +409,14 @@ std::unique_ptr cast_strings_to_decimals(cudf::column_view const& chars_data.release(), 0, rmm::device_buffer{0, stream, mr}); - return string_to_decimal(precision, - output_type.scale(), - cudf::strings_column_view{unquoted_strings->view()}, - false, - false, - stream, - mr); + return {string_to_decimal(precision, + output_type.scale(), + cudf::strings_column_view{unquoted_strings->view()}, + false, + false, + stream, + mr), + rmm::device_uvector{0, stream, mr}}; } std::pair, rmm::device_uvector> remove_quotes( @@ -393,88 +500,6 @@ std::pair, rmm::device_uvector> remove_quote } } -// TODO: extract commond code for this and `remove_quotes`. -std::pair, rmm::device_uvector> remove_quotes_for_floats( - cudf::column_view const& input, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) -{ - auto const string_count = input.size(); - if (string_count == 0) { - return {cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING}), - rmm::device_uvector(0, stream)}; - } - - auto const input_sv = cudf::strings_column_view{input}; - auto const input_offsets_it = - cudf::detail::offsetalator_factory::make_input_iterator(input_sv.offsets()); - auto const d_input_ptr = cudf::column_device_view::create(input, stream); - auto const is_valid_it = cudf::detail::make_validity_iterator(*d_input_ptr); - - auto string_pairs = rmm::device_uvector(string_count, stream); - thrust::tabulate(rmm::exec_policy_nosync(stream), - string_pairs.begin(), - string_pairs.end(), - [chars = input_sv.chars_begin(stream), - offsets = input_offsets_it, - is_valid = is_valid_it] __device__(cudf::size_type idx) -> string_index_pair { - if (!is_valid[idx]) { return 
{nullptr, 0}; } - - auto const start_offset = offsets[idx]; - auto const end_offset = offsets[idx + 1]; - auto const size = end_offset - start_offset; - auto const str = chars + start_offset; - - // Need to check for size, since the input string may contain just a single - // character `"`. Such input should not be considered as quoted. - auto const is_quoted = size > 1 && str[0] == '"' && str[size - 1] == '"'; - - // We check and remove quotes only for the special cases (non-numeric numbers - // wrapped in double quotes) that are accepted in `from_json`. - // They are "NaN", "+INF", "-INF", "+Infinity", "Infinity", "-Infinity". - if (is_quoted) { - // "NaN" - auto accepted = size == 5 && str[1] == 'N' && str[2] == 'a' && str[3] == 'N'; - - // "+INF" and "-INF" - accepted = accepted || (size == 6 && (str[1] == '+' || str[1] == '-') && - str[2] == 'I' && str[3] == 'N' && str[4] == 'F'); - - // "Infinity" - accepted = accepted || (size == 10 && str[1] == 'I' && str[2] == 'n' && - str[3] == 'f' && str[4] == 'i' && str[5] == 'n' && - str[6] == 'i' && str[7] == 't' && str[8] == 'y'); - - // "+Infinity" and "-Infinity" - accepted = accepted || (size == 11 && (str[1] == '+' || str[1] == '-') && - str[2] == 'I' && str[3] == 'n' && str[4] == 'f' && - str[5] == 'i' && str[6] == 'n' && str[7] == 'i' && - str[8] == 't' && str[9] == 'y'); - - if (accepted) { return {str + 1, size - 2}; } - } - - return {str, size}; - }); - - auto const size_it = cudf::detail::make_counting_transform_iterator( - 0, - cuda::proclaim_return_type( - [string_pairs = string_pairs.begin()] __device__(cudf::size_type idx) -> cudf::size_type { - return string_pairs[idx].second; - })); - auto [offsets_column, bytes] = - cudf::strings::detail::make_offsets_child_column(size_it, size_it + string_count, stream, mr); - auto chars_data = cudf::strings::detail::make_chars_buffer( - offsets_column->view(), bytes, string_pairs.begin(), string_count, stream, mr); - - auto output = 
cudf::make_strings_column(string_count, - std::move(offsets_column), - chars_data.release(), - input.null_count(), - cudf::detail::copy_bitmask(input, stream, mr)); - - return {std::move(output), rmm::device_uvector(0, stream)}; -} - /** * @brief The struct similar to `cudf::io::schema_element` with adding decimal precision and * preserving column order. @@ -563,7 +588,7 @@ std::pair generate_stru } std::unique_ptr make_column_from_pair( - std::pair, rmm::device_uvector>& input, + std::pair, rmm::device_uvector>&& input, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { @@ -585,7 +610,7 @@ std::vector> make_column_array_from_pairs( std::vector null_masks; null_masks.reserve(num_columns); - rmm::device_uvector d_valid_counts(num_columns, stream, mr); + rmm::device_uvector d_valid_counts(num_columns, stream, mr); thrust::uninitialized_fill( rmm::exec_policy_nosync(stream), d_valid_counts.begin(), d_valid_counts.end(), 0); @@ -597,12 +622,13 @@ std::vector> make_column_array_from_pairs( } null_masks.emplace_back( - cudf::create_null_mask(col_size, mask_state::UNINITIALIZED, stream, mr)); - constexpr size_type block_size{256}; - auto const grid = cudf::detail::grid_1d{static_cast(col_size), block_size}; + cudf::create_null_mask(col_size, cudf::mask_state::UNINITIALIZED, stream, mr)); + constexpr cudf::size_type block_size{256}; + auto const grid = + cudf::detail::grid_1d{static_cast(col_size), block_size}; cudf::detail::valid_if_kernel <<>>( - reinterpret_cast(null_masks.back().data()), + reinterpret_cast(null_masks.back().data()), input[idx].second.data(), col_size, thrust::identity{}, @@ -610,13 +636,13 @@ std::vector> make_column_array_from_pairs( } auto const valid_counts = cudf::detail::make_std_vector_sync(d_valid_counts, stream); - std::vector> output(num_columns); + std::vector> output(num_columns); for (std::size_t idx = 0; idx < num_columns; ++idx) { auto const col_size = input[idx].first->size(); auto const valid_count = valid_counts[idx]; 
auto const null_count = col_size - valid_count; - output[idx] = std::move(input[idx]); + output[idx] = std::move(input[idx].first); if (null_count > 0) { output[idx]->set_null_mask(std::move(null_masks[idx]), null_count); } } @@ -705,12 +731,13 @@ std::pair, rmm::device_uvector> convert_data new_children.emplace_back( std::move(input_content.children[cudf::lists_column_view::offsets_column_index])); new_children.emplace_back(make_column_from_pair( - convert_data_type(input_content.children[cudf::lists_column_view::child_column_index], - schema.child_types.front().second, - allow_nonnumeric_numbers, - is_us_locale, - stream, - mr), + convert_data_type( + std::move(input_content.children[cudf::lists_column_view::child_column_index]), + schema.child_types.front().second, + allow_nonnumeric_numbers, + is_us_locale, + stream, + mr), stream, mr)); return {std::make_unique(cudf::data_type{cudf::type_id::LIST}, @@ -725,14 +752,16 @@ std::pair, rmm::device_uvector> convert_data if (schema.type.id() == cudf::type_id::STRUCT) { CUDF_EXPECTS(d_type == cudf::type_id::STRUCT, "Input column should be STRUCT."); std::vector, rmm::device_uvector>> - new_children_with_validity(num_children); + new_children_with_validity; + new_children_with_validity.reserve(num_children); for (cudf::size_type i = 0; i < num_children; ++i) { - new_children_with_validity[i] = convert_data_type(input_content.children[i], - schema.child_types[i].second, - allow_nonnumeric_numbers, - is_us_locale, - stream, - mr); + new_children_with_validity.emplace_back( + convert_data_type(std::move(input_content.children[i]), + schema.child_types[i].second, + allow_nonnumeric_numbers, + is_us_locale, + stream, + mr)); } return {std::make_unique( @@ -744,7 +773,7 @@ std::pair, rmm::device_uvector> convert_data make_column_array_from_pairs(new_children_with_validity, stream, mr)), rmm::device_uvector{0, stream, mr}}; } - } else { + } else { // input_is_const_cv auto const d_type = input.type().id(); auto const num_rows 
= input.size(); auto const null_count = input.null_count(); @@ -753,8 +782,8 @@ std::pair, rmm::device_uvector> convert_data if (schema.type.id() == cudf::type_id::LIST) { CUDF_EXPECTS(d_type == cudf::type_id::LIST, "Input column should be LIST."); std::vector> new_children; - new_children.emplace_back(std::make_unique( - input.children[cudf::lists_column_view::offsets_column_index])); + new_children.emplace_back( + std::make_unique(input.child(cudf::lists_column_view::offsets_column_index))); new_children.emplace_back(make_column_from_pair( convert_data_type(input.child(cudf::lists_column_view::child_column_index), schema.child_types.front().second, @@ -776,14 +805,15 @@ std::pair, rmm::device_uvector> convert_data if (schema.type.id() == cudf::type_id::STRUCT) { CUDF_EXPECTS(d_type == cudf::type_id::STRUCT, "Input column should be STRUCT."); std::vector, rmm::device_uvector>> - new_children_with_validity(num_children); + new_children_with_validity; + new_children_with_validity.reserve(num_children); for (cudf::size_type i = 0; i < num_children; ++i) { - new_children_with_validity[i] = convert_data_type(input.child(i), - schema.child_types[i].second, - allow_nonnumeric_numbers, - is_us_locale, - stream, - mr); + new_children_with_validity.emplace_back(convert_data_type(input.child(i), + schema.child_types[i].second, + allow_nonnumeric_numbers, + is_us_locale, + stream, + mr)); } return {std::make_unique( cudf::data_type{cudf::type_id::STRUCT}, @@ -816,8 +846,8 @@ std::unique_ptr from_json_to_structs(cudf::strings_column_view con rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - auto const [is_invalid_or_empty, concat_input, delimiter] = - concat_json(input, stream, cudf::get_current_device_resource()); + auto const [concat_input, delimiter, is_invalid_or_empty] = + concat_json(input, false, stream, cudf::get_current_device_resource()); auto const [schema, schema_with_precision] = generate_struct_schema(col_names, num_children, types, scales, 
precisions); @@ -849,7 +879,9 @@ std::unique_ptr from_json_to_structs(cudf::strings_column_view con CUDF_EXPECTS(parsed_columns.size() == schema.child_types.size(), "Numbers of output columns is different from schema size."); - std::vector> converted_cols(parsed_columns.size()); + std::vector, rmm::device_uvector>> + converted_cols_with_validity; + converted_cols_with_validity.reserve(parsed_columns.size()); for (std::size_t i = 0; i < parsed_columns.size(); ++i) { auto const d_type = parsed_columns[i]->type().id(); CUDF_EXPECTS(d_type == cudf::type_id::LIST || d_type == cudf::type_id::STRUCT || @@ -858,8 +890,12 @@ std::unique_ptr from_json_to_structs(cudf::strings_column_view con auto const& [col_name, col_schema] = schema_with_precision.child_types[i]; CUDF_EXPECTS(parsed_meta.schema_info[i].name == col_name, "Mismatched column name."); - converted_cols[i] = convert_column_type( - parsed_columns[i], col_schema, allow_nonnumeric_numbers, is_us_locale, stream, mr); + converted_cols_with_validity.emplace_back(convert_data_type(std::move(parsed_columns[i]), + col_schema, + allow_nonnumeric_numbers, + is_us_locale, + stream, + mr)); } auto const valid_it = is_invalid_or_empty->view().begin(); @@ -868,7 +904,7 @@ std::unique_ptr from_json_to_structs(cudf::strings_column_view con return cudf::make_structs_column( input.size(), - std::move(converted_cols), + make_column_array_from_pairs(converted_cols_with_validity, stream, mr), null_count, null_count > 0 ? 
std::move(null_mask) : rmm::device_buffer{0, stream, mr}, stream, @@ -912,27 +948,23 @@ std::unique_ptr convert_data_type(cudf::column_view const& input, std::vector const& types, std::vector const& scales, std::vector const& precisions, - bool normalize_single_quotes, - bool allow_leading_zeros, bool allow_nonnumeric_numbers, - bool allow_unquoted_control, bool is_us_locale, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::convert_data_type(input, - num_children, - types, - scales, - precisions, - normalize_single_quotes, - allow_leading_zeros, - allow_nonnumeric_numbers, - allow_unquoted_control, - is_us_locale, - stream, - mr); + [[maybe_unused]] auto const [schema, schema_with_precision] = detail::generate_struct_schema( + /*dummy col_names*/ std::vector(num_children.size(), std::string{}), + num_children, + types, + scales, + precisions); + return detail::make_column_from_pair( + detail::convert_data_type( + input, schema_with_precision, allow_nonnumeric_numbers, is_us_locale, stream, mr), + stream, + mr); } } // namespace spark_rapids_jni diff --git a/src/main/cpp/src/json_utils.hpp b/src/main/cpp/src/json_utils.hpp index 7a163d311..44bc8d93e 100644 --- a/src/main/cpp/src/json_utils.hpp +++ b/src/main/cpp/src/json_utils.hpp @@ -67,10 +67,7 @@ std::unique_ptr convert_data_type( std::vector const& types, std::vector const& scales, std::vector const& precisions, - bool normalize_single_quotes, - bool allow_leading_zeros, bool allow_nonnumeric_numbers, - bool allow_unquoted_control, bool is_us_locale, rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource()); From ab45de8e914da85e499c0eaabdbf88c7c12d9d0a Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Tue, 12 Nov 2024 22:43:23 -0800 Subject: [PATCH 40/58] Add `CUDF_FUNC_RANGE();` Signed-off-by: Nghia Truong --- src/main/cpp/src/from_json_to_structs.cu | 11 +++++++++++ 1 file changed, 
11 insertions(+) diff --git a/src/main/cpp/src/from_json_to_structs.cu b/src/main/cpp/src/from_json_to_structs.cu index a6604ab8f..24894a632 100644 --- a/src/main/cpp/src/from_json_to_structs.cu +++ b/src/main/cpp/src/from_json_to_structs.cu @@ -63,6 +63,8 @@ using string_index_pair = thrust::pair; std::pair, rmm::device_uvector> cast_strings_to_booleans( cudf::column_view const& input, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { + CUDF_FUNC_RANGE(); + auto const string_count = input.size(); if (string_count == 0) { return {cudf::make_empty_column(cudf::data_type{cudf::type_id::BOOL8}), @@ -117,6 +119,8 @@ std::pair, rmm::device_uvector> cast_strings rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { + CUDF_FUNC_RANGE(); + auto const string_count = input.size(); if (string_count == 0) { return {cudf::make_empty_column(output_type), rmm::device_uvector(0, stream)}; @@ -180,6 +184,8 @@ std::pair, rmm::device_uvector> cast_strings std::pair, rmm::device_uvector> remove_quotes_for_floats( cudf::column_view const& input, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { + CUDF_FUNC_RANGE(); + auto const string_count = input.size(); if (string_count == 0) { return {cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING}), @@ -266,6 +272,7 @@ std::pair, rmm::device_uvector> cast_strings rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); + if (allow_nonnumeric_numbers) { auto [removed_quotes, validity] = remove_quotes_for_floats(input, stream, mr); return {::spark_rapids_jni::string_to_float( @@ -286,6 +293,8 @@ std::pair, rmm::device_uvector> cast_strings rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { + CUDF_FUNC_RANGE(); + auto const string_count = input.size(); if (string_count == 0) { return {cudf::make_empty_column(output_type), rmm::device_uvector{0, stream, mr}}; @@ -425,6 +434,8 @@ std::pair, rmm::device_uvector> remove_quote rmm::cuda_stream_view stream, 
rmm::device_async_resource_ref mr) { + CUDF_FUNC_RANGE(); + auto const string_count = input.size(); if (string_count == 0) { return {cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING}), From 89e74a014375abe8fdc0dda6f8b9633288e49754 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Tue, 12 Nov 2024 22:46:16 -0800 Subject: [PATCH 41/58] Fix schema Signed-off-by: Nghia Truong --- src/main/cpp/src/from_json_to_structs.cu | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/src/main/cpp/src/from_json_to_structs.cu b/src/main/cpp/src/from_json_to_structs.cu index 24894a632..79632987d 100644 --- a/src/main/cpp/src/from_json_to_structs.cu +++ b/src/main/cpp/src/from_json_to_structs.cu @@ -939,6 +939,7 @@ std::unique_ptr from_json_to_structs(cudf::strings_column_view con rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); + return detail::from_json_to_structs(input, col_names, num_children, @@ -965,15 +966,23 @@ std::unique_ptr convert_data_type(cudf::column_view const& input, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); + [[maybe_unused]] auto const [schema, schema_with_precision] = detail::generate_struct_schema( /*dummy col_names*/ std::vector(num_children.size(), std::string{}), num_children, types, scales, precisions); + CUDF_EXPECTS(schema_with_precision.child_types.size() == 1, + "The input schema must have exactly one column."); + return detail::make_column_from_pair( - detail::convert_data_type( - input, schema_with_precision, allow_nonnumeric_numbers, is_us_locale, stream, mr), + detail::convert_data_type(input, + schema_with_precision.child_types.front().second, + allow_nonnumeric_numbers, + is_us_locale, + stream, + mr), stream, mr); } From 27ef532f26a1ba0e40c6a486079965a152fafc19 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Wed, 13 Nov 2024 10:27:49 -0800 Subject: [PATCH 42/58] Complete `from_json_to_structs` Signed-off-by: Nghia Truong --- src/main/cpp/src/JSONUtilsJni.cpp | 15 +++ 
src/main/cpp/src/from_json_to_structs.cu | 11 ++ src/main/cpp/src/json_utils.hpp | 13 +++ .../nvidia/spark/rapids/jni/JSONUtils.java | 101 ++++++++++-------- 4 files changed, 94 insertions(+), 46 deletions(-) diff --git a/src/main/cpp/src/JSONUtilsJni.cpp b/src/main/cpp/src/JSONUtilsJni.cpp index b0618ed8d..28969cfba 100644 --- a/src/main/cpp/src/JSONUtilsJni.cpp +++ b/src/main/cpp/src/JSONUtilsJni.cpp @@ -249,4 +249,19 @@ Java_com_nvidia_spark_rapids_jni_JSONUtils_convertDataType(JNIEnv* env, } CATCH_STD(env, 0); } + +JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_JSONUtils_removeQuotes( + JNIEnv* env, jclass, jlong j_input, jboolean nullify_if_not_quoted) +{ + JNI_NULL_CHECK(env, j_input, "j_input is null", 0); + + try { + cudf::jni::auto_set_device(env); + auto const input = reinterpret_cast(j_input); + return cudf::jni::ptr_as_jlong( + spark_rapids_jni::remove_quotes(*input, nullify_if_not_quoted).release()); + } + CATCH_STD(env, 0); +} + } // extern "C" diff --git a/src/main/cpp/src/from_json_to_structs.cu b/src/main/cpp/src/from_json_to_structs.cu index 79632987d..e0fee063a 100644 --- a/src/main/cpp/src/from_json_to_structs.cu +++ b/src/main/cpp/src/from_json_to_structs.cu @@ -987,4 +987,15 @@ std::unique_ptr convert_data_type(cudf::column_view const& input, mr); } +std::unique_ptr remove_quotes(cudf::column_view const& input, + bool nullify_if_not_quoted, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_FUNC_RANGE(); + + return detail::make_column_from_pair( + detail::remove_quotes(input, nullify_if_not_quoted, stream, mr), stream, mr); +} + } // namespace spark_rapids_jni diff --git a/src/main/cpp/src/json_utils.hpp b/src/main/cpp/src/json_utils.hpp index 44bc8d93e..6edadb24b 100644 --- a/src/main/cpp/src/json_utils.hpp +++ b/src/main/cpp/src/json_utils.hpp @@ -72,6 +72,19 @@ std::unique_ptr convert_data_type( rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = 
cudf::get_current_device_resource()); +/** + * @brief remove_quotes + * @param input + * @param stream + * @param mr + * @return + */ +std::unique_ptr remove_quotes( + cudf::column_view const& input, + bool nullify_if_not_quoted, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); + /** * @brief Concatenate the JSON objects given by a strings column into one single character buffer, * in which each JSON objects is delimited by a special character that does not exist in the input. diff --git a/src/main/java/com/nvidia/spark/rapids/jni/JSONUtils.java b/src/main/java/com/nvidia/spark/rapids/jni/JSONUtils.java index 8564ce5d0..432453bfb 100644 --- a/src/main/java/com/nvidia/spark/rapids/jni/JSONUtils.java +++ b/src/main/java/com/nvidia/spark/rapids/jni/JSONUtils.java @@ -160,53 +160,66 @@ public static ColumnVector extractRawMapFromJsonString(ColumnView input) { return new ColumnVector(extractRawMapFromJsonString(input.getNativeView())); } - public static ColumnVector castStringsToBooleans(ColumnView input) { - assert (input.getType().equals(DType.STRING)) : "Input must be of STRING type"; - return new ColumnVector(castStringsToBooleans(input.getNativeView())); - } - - public static ColumnVector castStringsToDecimals(ColumnView input, DType outputType, - int precision, int scale, - boolean isUSLocale) { + /** + * + * @param input + * @param schema + * @param opts + * @param isUSLocale + * @return + */ + public static ColumnVector fromJSONToStructs(ColumnView input, Schema schema, JSONOptions opts, + boolean isUSLocale) { assert (input.getType().equals(DType.STRING)) : "Input must be of STRING type"; - return new ColumnVector(castStringsToDecimals(input.getNativeView(), - outputType.getTypeId().getNativeId(), precision, scale, isUSLocale)); + return new ColumnVector(fromJSONToStructs(input.getNativeView(), + schema.getFlattenedColumnNames(), + schema.getFlattenedNumChildren(), + 
schema.getFlattenedTypeIds(), + schema.getFlattenedTypeScales(), + schema.getFlattenedDecimalPrecisions(), + opts.isNormalizeSingleQuotes(), + opts.leadingZerosAllowed(), + opts.nonNumericNumbersAllowed(), + opts.unquotedControlChars(), + isUSLocale)); } - public static ColumnVector castStringsToIntegers(ColumnView input, DType output_type) { + /** + * + * @param input + * @param schema + * @param opts + * @param isUSLocale + * @return + */ + public static ColumnVector convertDataType(ColumnView input, Schema schema, JSONOptions opts, + boolean isUSLocale) { assert (input.getType().equals(DType.STRING)) : "Input must be of STRING type"; - return new ColumnVector(castStringsToIntegers(input.getNativeView(), - output_type.getTypeId().getNativeId())); + return new ColumnVector(convertDataType(input.getNativeView(), + schema.getFlattenedNumChildren(), + schema.getFlattenedTypeIds(), + schema.getFlattenedTypeScales(), + schema.getFlattenedDecimalPrecisions(), + opts.nonNumericNumbersAllowed(), + isUSLocale)); } + /** + * + * @param input + * @param nullifyIfNotQuoted + * @return + */ public static ColumnVector removeQuotes(ColumnView input, boolean nullifyIfNotQuoted) { assert (input.getType().equals(DType.STRING)) : "Input must be of STRING type"; return new ColumnVector(removeQuotes(input.getNativeView(), nullifyIfNotQuoted)); } - public static ColumnVector castStringsToFloats(ColumnView input, DType outputType, - boolean allowNonNumericNumbers) { - assert (input.getType().equals(DType.STRING)) : "Input must be of STRING type"; - return new ColumnVector(castStringsToFloats(input.getNativeView(), - outputType.getTypeId().getNativeId(), allowNonNumericNumbers)); - } - - public static ColumnVector fromJSONToStructs(ColumnVector input, Schema schema, JSONOptions opts, - boolean isUSLocale) { + public static ColumnVector removeQuotes(ColumnView input) { assert (input.getType().equals(DType.STRING)) : "Input must be of STRING type"; - - return new 
ColumnVector(fromJSONToStructs(input.getNativeView(), - schema.getFlattenedColumnNames(), schema.getFlattenedNumChildren(), - schema.getFlattenedTypeIds(), schema.getFlattenedTypeScales(), - schema.getFlattenedDecimalPrecisions(), - opts.isNormalizeSingleQuotes(), - opts.leadingZerosAllowed(), - opts.nonNumericNumbersAllowed(), - opts.unquotedControlChars(), - isUSLocale)); + return new ColumnVector(removeQuotes(input.getNativeView(), true)); } - private static native int getMaxJSONPathDepth(); private static native long getJsonObject(long input, @@ -225,19 +238,6 @@ private static native long[] getJsonObjectMultiplePaths(long input, private static native long extractRawMapFromJsonString(long input); - private static native long castStringsToBooleans(long input); - - private static native long castStringsToDecimals(long input, int outputTypeId, - int precision, - int scale, - boolean isUSLocale); - - private static native long castStringsToIntegers(long input, int outputType); - - private static native long removeQuotes(long input, boolean nullifyIfNotQuoted); - - private static native long castStringsToFloats(long input, int outputTypeId, boolean allowNonNumericNumbers); - private static native long fromJSONToStructs(long input, String[] names, int[] numChildren, @@ -250,4 +250,13 @@ private static native long fromJSONToStructs(long input, boolean unquotedControlChars, boolean isUSLocale); + private static native long convertDataType(long input, + int[] numChildren, + int[] typeIds, + int[] typeScales, + int[] typePrecision, + boolean nonNumericNumbersAllowed, + boolean isUSLocale); + + private static native long removeQuotes(long input, boolean nullifyIfNotQuoted); } From 5b657124ea849a43b58d6ac2159cb8acd89d7ff9 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Wed, 13 Nov 2024 11:14:36 -0800 Subject: [PATCH 43/58] Fix null mask Signed-off-by: Nghia Truong --- src/main/cpp/src/from_json_to_structs.cu | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 
deletions(-) diff --git a/src/main/cpp/src/from_json_to_structs.cu b/src/main/cpp/src/from_json_to_structs.cu index e0fee063a..7a1f49f42 100644 --- a/src/main/cpp/src/from_json_to_structs.cu +++ b/src/main/cpp/src/from_json_to_structs.cu @@ -626,8 +626,9 @@ std::vector> make_column_array_from_pairs( rmm::exec_policy_nosync(stream), d_valid_counts.begin(), d_valid_counts.end(), 0); for (std::size_t idx = 0; idx < num_columns; ++idx) { - auto const col_size = input[idx].first->size(); - if (col_size == 0) { + auto const col_size = input[idx].first->size(); + auto const validity_size = input[idx].second.size(); + if (col_size == 0 || validity_size == 0) { null_masks.emplace_back(rmm::device_buffer{}); // placeholder continue; } @@ -650,10 +651,14 @@ std::vector> make_column_array_from_pairs( std::vector> output(num_columns); for (std::size_t idx = 0; idx < num_columns; ++idx) { - auto const col_size = input[idx].first->size(); + auto const col_size = input[idx].first->size(); + output[idx] = std::move(input[idx].first); + + auto const validity_size = input[idx].second.size(); + if (col_size == 0 || validity_size == 0) { continue; } + auto const valid_count = valid_counts[idx]; auto const null_count = col_size - valid_count; - output[idx] = std::move(input[idx].first); if (null_count > 0) { output[idx]->set_null_mask(std::move(null_masks[idx]), null_count); } } From 6788471af3154350ddd4f78cab8000d6c4b62a96 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Wed, 13 Nov 2024 13:50:50 -0800 Subject: [PATCH 44/58] Write Javadoc Signed-off-by: Nghia Truong --- .../nvidia/spark/rapids/jni/JSONUtils.java | 39 +++++++++---------- 1 file changed, 19 insertions(+), 20 deletions(-) diff --git a/src/main/java/com/nvidia/spark/rapids/jni/JSONUtils.java b/src/main/java/com/nvidia/spark/rapids/jni/JSONUtils.java index 432453bfb..57eb9b3e9 100644 --- a/src/main/java/com/nvidia/spark/rapids/jni/JSONUtils.java +++ b/src/main/java/com/nvidia/spark/rapids/jni/JSONUtils.java @@ -161,12 
+161,13 @@ public static ColumnVector extractRawMapFromJsonString(ColumnView input) { } /** + * Parse a JSON string into a struct column following by the given data schema. * - * @param input - * @param schema - * @param opts - * @param isUSLocale - * @return + * @param input The input strings column in which each row specifies a json object + * @param schema The schema of the output struct column + * @param opts The options for parsing JSON strings + * @param isUSLocale Whether the current local is US locale + * @return A struct column in which each row is parsed from the corresponding json string */ public static ColumnVector fromJSONToStructs(ColumnView input, Schema schema, JSONOptions opts, boolean isUSLocale) { @@ -185,14 +186,16 @@ public static ColumnVector fromJSONToStructs(ColumnView input, Schema schema, JS } /** + * Convert the data type of a strings column to the desired type given by a data schema. * - * @param input - * @param schema - * @param opts - * @param isUSLocale - * @return + * @param input The input strings column + * @param schema The schema of the output column + * @param allowedNonNumericNumbers Whether non-numeric numbers are allowed + * @param isUSLocale Whether the current local is US locale + * @return A column with the desired data type */ - public static ColumnVector convertDataType(ColumnView input, Schema schema, JSONOptions opts, + public static ColumnVector convertDataType(ColumnView input, Schema schema, + boolean allowedNonNumericNumbers, boolean isUSLocale) { assert (input.getType().equals(DType.STRING)) : "Input must be of STRING type"; return new ColumnVector(convertDataType(input.getNativeView(), @@ -200,26 +203,22 @@ public static ColumnVector convertDataType(ColumnView input, Schema schema, JSON schema.getFlattenedTypeIds(), schema.getFlattenedTypeScales(), schema.getFlattenedDecimalPrecisions(), - opts.nonNumericNumbersAllowed(), + allowedNonNumericNumbers, isUSLocale)); } /** + * Remove quotes from each string in the 
given strings column. * - * @param input - * @param nullifyIfNotQuoted - * @return + * @param input The input strings column + * @param nullifyIfNotQuoted Whether to nullify the output if the input string is not quoted + * @return A strings column in which quotes are removed from all strings */ public static ColumnVector removeQuotes(ColumnView input, boolean nullifyIfNotQuoted) { assert (input.getType().equals(DType.STRING)) : "Input must be of STRING type"; return new ColumnVector(removeQuotes(input.getNativeView(), nullifyIfNotQuoted)); } - public static ColumnVector removeQuotes(ColumnView input) { - assert (input.getType().equals(DType.STRING)) : "Input must be of STRING type"; - return new ColumnVector(removeQuotes(input.getNativeView(), true)); - } - private static native int getMaxJSONPathDepth(); private static native long getJsonObject(long input, From 49c78ce62bc09612adfd97787b8f12ec45974ec7 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Wed, 13 Nov 2024 13:52:05 -0800 Subject: [PATCH 45/58] Rewrite JNI Signed-off-by: Nghia Truong --- src/main/cpp/src/JSONUtilsJni.cpp | 16 +++++++++++----- src/main/cpp/src/from_json_to_structs.cu | 10 ++++++---- src/main/cpp/src/json_utils.hpp | 19 +++++++++---------- 3 files changed, 26 insertions(+), 19 deletions(-) diff --git a/src/main/cpp/src/JSONUtilsJni.cpp b/src/main/cpp/src/JSONUtilsJni.cpp index 28969cfba..df1ee4270 100644 --- a/src/main/cpp/src/JSONUtilsJni.cpp +++ b/src/main/cpp/src/JSONUtilsJni.cpp @@ -231,7 +231,7 @@ Java_com_nvidia_spark_rapids_jni_JSONUtils_convertDataType(JNIEnv* env, try { cudf::jni::auto_set_device(env); - auto const input = reinterpret_cast(j_input); + auto const input_cv = reinterpret_cast(j_input); auto const num_children = cudf::jni::native_jintArray(env, j_num_children).to_vector(); auto const types = cudf::jni::native_jintArray(env, j_types).to_vector(); auto const scales = cudf::jni::native_jintArray(env, j_scales).to_vector(); @@ -243,8 +243,13 @@ 
Java_com_nvidia_spark_rapids_jni_JSONUtils_convertDataType(JNIEnv* env, CUDF_EXPECTS(num_children.size() == precisions.size(), "Invalid schema data: precisions."); return cudf::jni::ptr_as_jlong( - spark_rapids_jni::convert_data_type( - *input, num_children, types, scales, precisions, allow_nonnumeric_numbers, is_us_locale) + spark_rapids_jni::convert_data_type(cudf::strings_column_view{*input_cv}, + num_children, + types, + scales, + precisions, + allow_nonnumeric_numbers, + is_us_locale) .release()); } CATCH_STD(env, 0); @@ -257,9 +262,10 @@ JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_JSONUtils_removeQuotes( try { cudf::jni::auto_set_device(env); - auto const input = reinterpret_cast(j_input); + auto const input_cv = reinterpret_cast(j_input); return cudf::jni::ptr_as_jlong( - spark_rapids_jni::remove_quotes(*input, nullify_if_not_quoted).release()); + spark_rapids_jni::remove_quotes(cudf::strings_column_view{*input_cv}, nullify_if_not_quoted) + .release()); } CATCH_STD(env, 0); } diff --git a/src/main/cpp/src/from_json_to_structs.cu b/src/main/cpp/src/from_json_to_structs.cu index 7a1f49f42..2a2f0c12e 100644 --- a/src/main/cpp/src/from_json_to_structs.cu +++ b/src/main/cpp/src/from_json_to_structs.cu @@ -960,7 +960,7 @@ std::unique_ptr from_json_to_structs(cudf::strings_column_view con mr); } -std::unique_ptr convert_data_type(cudf::column_view const& input, +std::unique_ptr convert_data_type(cudf::strings_column_view const& input, std::vector const& num_children, std::vector const& types, std::vector const& scales, @@ -981,8 +981,9 @@ std::unique_ptr convert_data_type(cudf::column_view const& input, CUDF_EXPECTS(schema_with_precision.child_types.size() == 1, "The input schema must have exactly one column."); + auto const input_cv = input.parent(); return detail::make_column_from_pair( - detail::convert_data_type(input, + detail::convert_data_type(input_cv, schema_with_precision.child_types.front().second, allow_nonnumeric_numbers, is_us_locale, @@ 
-992,15 +993,16 @@ std::unique_ptr convert_data_type(cudf::column_view const& input, mr); } -std::unique_ptr remove_quotes(cudf::column_view const& input, +std::unique_ptr remove_quotes(cudf::strings_column_view const& input, bool nullify_if_not_quoted, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); + auto const input_cv = input.parent(); return detail::make_column_from_pair( - detail::remove_quotes(input, nullify_if_not_quoted, stream, mr), stream, mr); + detail::remove_quotes(input_cv, nullify_if_not_quoted, stream, mr), stream, mr); } } // namespace spark_rapids_jni diff --git a/src/main/cpp/src/json_utils.hpp b/src/main/cpp/src/json_utils.hpp index 6edadb24b..8db1bcba0 100644 --- a/src/main/cpp/src/json_utils.hpp +++ b/src/main/cpp/src/json_utils.hpp @@ -56,13 +56,13 @@ std::unique_ptr from_json_to_structs( rmm::device_async_resource_ref mr = cudf::get_current_device_resource()); /** - * @brief Convert the input column into a desired type given by a schema. + * @brief Convert the input column into a desired type given by a data schema. * * The input column can be a nested column thus the given schema is specified as data arrays * flattened by depth-first-search order. */ std::unique_ptr convert_data_type( - cudf::column_view const& input, + cudf::strings_column_view const& input, std::vector const& num_children, std::vector const& types, std::vector const& scales, @@ -73,17 +73,16 @@ std::unique_ptr convert_data_type( rmm::device_async_resource_ref mr = cudf::get_current_device_resource()); /** - * @brief remove_quotes - * @param input - * @param stream - * @param mr - * @return + * @brief Remove quotes from each string in the given strings column. + * + * If the input string is not quoted, the corresponding row can be a null depending on the value of + * `nullify_if_not_quoted`. 
*/ std::unique_ptr remove_quotes( - cudf::column_view const& input, + cudf::strings_column_view const& input, bool nullify_if_not_quoted, rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); + rmm::device_async_resource_ref mr = cudf::get_current_device_resource()); /** * @brief Concatenate the JSON objects given by a strings column into one single character buffer, @@ -109,6 +108,6 @@ std::tuple, char, std::unique_ptr Date: Wed, 13 Nov 2024 20:16:34 -0800 Subject: [PATCH 46/58] Remove deprecated function Signed-off-by: Nghia Truong --- .../com/nvidia/spark/rapids/jni/JSONUtils.java | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/src/main/java/com/nvidia/spark/rapids/jni/JSONUtils.java b/src/main/java/com/nvidia/spark/rapids/jni/JSONUtils.java index 11b7061b4..e96d83dda 100644 --- a/src/main/java/com/nvidia/spark/rapids/jni/JSONUtils.java +++ b/src/main/java/com/nvidia/spark/rapids/jni/JSONUtils.java @@ -165,23 +165,6 @@ public static ColumnVector extractRawMapFromJsonString(ColumnView input, JSONOpt opts.unquotedControlChars())); } - /** - * Extract key-value pairs for each output map from the given json strings. This method is - * similar to {@link #extractRawMapFromJsonString(ColumnView, JSONOptions)} but is deprecated. - * - * @deprecated This method is deprecated since it does not have parameters to control various - * JSON reader behaviors. 
- * - * @param input The input strings column in which each row specifies a json object - * @return A map column (i.e., a column of type {@code List>}) in - * which the key-value pairs are extracted directly from the input json strings - */ - public static ColumnVector extractRawMapFromJsonString(ColumnView input) { - assert (input.getType().equals(DType.STRING)) : "Input must be of STRING type"; - return new ColumnVector(extractRawMapFromJsonString(input.getNativeView(), - true, true, true, true)); - } - /** * Parse a JSON string into a struct column following by the given data schema. * From 1243599df120b939523e20ad32d5b54d842c2a9f Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Wed, 13 Nov 2024 21:17:08 -0800 Subject: [PATCH 47/58] Revert test Signed-off-by: Nghia Truong --- src/main/cpp/tests/cast_float_to_string.cpp | 127 ++++++++------------ 1 file changed, 47 insertions(+), 80 deletions(-) diff --git a/src/main/cpp/tests/cast_float_to_string.cpp b/src/main/cpp/tests/cast_float_to_string.cpp index 1daddcb97..edf9ff86e 100644 --- a/src/main/cpp/tests/cast_float_to_string.cpp +++ b/src/main/cpp/tests/cast_float_to_string.cpp @@ -16,101 +16,68 @@ #include #include -#include - -#include #include -#include +#include + +#include using namespace cudf; +constexpr cudf::test::debug_output_level verbosity{cudf::test::debug_output_level::FIRST_ERROR}; + struct FloatToStringTests : public cudf::test::BaseFixture {}; TEST_F(FloatToStringTests, FromFloats32) { - std::string json_string = R"({"student": [{"name": "abc", "class": "junior"}]})"; + auto const floats = + cudf::test::fixed_width_column_wrapper{100.0f, + 654321.25f, + -12761.125f, + 0.f, + 5.0f, + -4.0f, + std::numeric_limits::quiet_NaN(), + 123456789012.34f, + -0.0f}; - { - cudf::io::json_reader_options in_options = - cudf::io::json_reader_options::builder( - cudf::io::source_info{json_string.data(), json_string.size()}) - .prune_columns(true) - .mixed_types_as_string(true) - .lines(true); + auto results = 
spark_rapids_jni::float_to_string(floats, cudf::get_default_stream()); - cudf::io::schema_element dtype_schema{cudf::data_type{cudf::type_id::STRUCT}, - { - {"student", - {data_type{cudf::type_id::LIST}, - {{"element", - {data_type{cudf::type_id::STRUCT}, - { - {"name", {data_type{cudf::type_id::STRING}}}, - {"abc", {data_type{cudf::type_id::STRING}}}, - {"class", {data_type{cudf::type_id::STRING}}}, - }, - {{"name", "abc", "class"}}}}}}}, - }, - {{"student"}}}; - in_options.set_dtypes(dtype_schema); + auto const expected = cudf::test::strings_column_wrapper{ + "100.0", "654321.25", "-12761.125", "0.0", "5.0", "-4.0", "NaN", "1.2345679E11", "-0.0"}; - auto const parsed_table_with_meta = cudf::io::read_json(in_options); - // auto const& parsed_meta = parsed_table_with_meta.metadata; - auto parsed_columns = parsed_table_with_meta.tbl->release(); - for (auto& col : parsed_columns) { - cudf::test::print(*col); - } - } + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected, verbosity); +} - { - /* - * colname: -student, -element, -name, -abc, -class, -num child: -1, -3, -0, -0, -0, -num child: -1, -3, -0, -0, -0, -types: -24, -28, -23, -23, -23, +TEST_F(FloatToStringTests, FromFloats64) +{ + auto const floats = + cudf::test::fixed_width_column_wrapper{100.0d, + 654321.25d, + -12761.125d, + 1.123456789123456789d, + 0.000000000000000000123456789123456789d, + 0.0d, + 5.0d, + -4.0d, + std::numeric_limits::quiet_NaN(), + 839542223232.794248339d, + -0.0d}; - */ + auto results = spark_rapids_jni::float_to_string(floats, cudf::get_default_stream()); - std::vector col_names{"student", "element", "name", "abc", "class"}; - std::vector num_children{1, 3, 0, 0, 0}; - std::vector types{24, 28, 23, 23, 23}; - std::vector scales{0, 0, 0, 0, 0}; - std::vector precisions{-1, -1, -1, -1, -1}; + auto const expected = cudf::test::strings_column_wrapper{"100.0", + "654321.25", + "-12761.125", + "1.1234567891234568", + "1.234567891234568E-19", + "0.0", + "5.0", + "-4.0", + "NaN", + 
"8.395422232327942E11", + "-0.0"}; - auto const input = cudf::test::strings_column_wrapper{json_string}; - auto out = spark_rapids_jni::from_json_to_structs(cudf::strings_column_view{input}, - col_names, - num_children, - types, - scales, - precisions, - true, - true, - true, - true, - true); - cudf::test::print(*out); - } + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected, verbosity); } From 6f89fcde6d042464f40543afb9f82acc105142ab Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Wed, 13 Nov 2024 21:21:48 -0800 Subject: [PATCH 48/58] Remove header Signed-off-by: Nghia Truong --- src/main/cpp/src/JSONUtilsJni.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/main/cpp/src/JSONUtilsJni.cpp b/src/main/cpp/src/JSONUtilsJni.cpp index 745876217..cd78f9fdb 100644 --- a/src/main/cpp/src/JSONUtilsJni.cpp +++ b/src/main/cpp/src/JSONUtilsJni.cpp @@ -18,7 +18,6 @@ #include "get_json_object.hpp" #include "json_utils.hpp" -#include #include #include From deb3ebf098b6e8831b7439c2e6502c36dd75809f Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Wed, 13 Nov 2024 21:21:54 -0800 Subject: [PATCH 49/58] Rewrite Javadoc Signed-off-by: Nghia Truong --- src/main/java/com/nvidia/spark/rapids/jni/JSONUtils.java | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/main/java/com/nvidia/spark/rapids/jni/JSONUtils.java b/src/main/java/com/nvidia/spark/rapids/jni/JSONUtils.java index e96d83dda..e8820c162 100644 --- a/src/main/java/com/nvidia/spark/rapids/jni/JSONUtils.java +++ b/src/main/java/com/nvidia/spark/rapids/jni/JSONUtils.java @@ -214,9 +214,12 @@ public static ColumnVector convertDataType(ColumnView input, Schema schema, /** * Remove quotes from each string in the given strings column. + *

+ * If `nullifyIfNotQuoted` is true, an input string that is not quoted will result in a null. + * Otherwise, the output will be the same as the unquoted input. * * @param input The input strings column - * @param nullifyIfNotQuoted Whether to nullify the output if the input string is not quoted + * @param nullifyIfNotQuoted Whether to output a null row if the input string is not quoted * @return A strings column in which quotes are removed from all strings */ public static ColumnVector removeQuotes(ColumnView input, boolean nullifyIfNotQuoted) { From 9dc641fa9cddf3239ee551a5044d568cc8844f45 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Wed, 13 Nov 2024 21:22:41 -0800 Subject: [PATCH 50/58] Rename variable Signed-off-by: Nghia Truong --- src/main/cpp/src/JSONUtilsJni.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/cpp/src/JSONUtilsJni.cpp b/src/main/cpp/src/JSONUtilsJni.cpp index cd78f9fdb..d74dfd57c 100644 --- a/src/main/cpp/src/JSONUtilsJni.cpp +++ b/src/main/cpp/src/JSONUtilsJni.cpp @@ -191,7 +191,7 @@ Java_com_nvidia_spark_rapids_jni_JSONUtils_fromJSONToStructs(JNIEnv* env, try { cudf::jni::auto_set_device(env); - auto const input = reinterpret_cast(j_input); + auto const input_cv = reinterpret_cast(j_input); auto const col_names = cudf::jni::native_jstringArray(env, j_col_names).as_cpp_vector(); auto const num_children = cudf::jni::native_jintArray(env, j_num_children).to_vector(); auto const types = cudf::jni::native_jintArray(env, j_types).to_vector(); @@ -205,7 +205,7 @@ Java_com_nvidia_spark_rapids_jni_JSONUtils_fromJSONToStructs(JNIEnv* env, CUDF_EXPECTS(col_names.size() == precisions.size(), "Invalid schema data: precisions."); return cudf::jni::ptr_as_jlong( - spark_rapids_jni::from_json_to_structs(cudf::strings_column_view{*input}, + spark_rapids_jni::from_json_to_structs(cudf::strings_column_view{*input_cv}, col_names, num_children, types, From 53b121dbcd4a8809a229d1230ed46f62b7c8ef82 Mon Sep 17 00:00:00 2001 
From: Nghia Truong Date: Wed, 13 Nov 2024 21:28:21 -0800 Subject: [PATCH 51/58] Rewrite docs Signed-off-by: Nghia Truong --- src/main/cpp/src/json_utils.hpp | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/src/main/cpp/src/json_utils.hpp b/src/main/cpp/src/json_utils.hpp index c42b3f6b7..641e9b839 100644 --- a/src/main/cpp/src/json_utils.hpp +++ b/src/main/cpp/src/json_utils.hpp @@ -60,10 +60,9 @@ std::unique_ptr from_json_to_structs( rmm::device_async_resource_ref mr = cudf::get_current_device_resource()); /** - * @brief Convert the input column into a desired type given by a data schema. + * @brief Convert the input strings column into a desired type given by a data schema. * - * The input column can be a nested column thus the given schema is specified as data arrays - * flattened by depth-first-search order. + * The given column schema is specified as data arrays flattened by depth-first-search order. */ std::unique_ptr convert_data_type( cudf::strings_column_view const& input, @@ -79,8 +78,8 @@ std::unique_ptr convert_data_type( /** * @brief Remove quotes from each string in the given strings column. * - * If the input string is not quoted, the corresponding row can be a null depending on the value of - * `nullify_if_not_quoted`. + * If `nullify_if_not_quoted` is true, an input string that is not quoted will result in a null. + * Otherwise, the output will be the same as the unquoted input. 
*/ std::unique_ptr remove_quotes( cudf::strings_column_view const& input, From 69265b4f1768b59d2473e9613a7eb4d74de11e40 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Wed, 13 Nov 2024 21:29:46 -0800 Subject: [PATCH 52/58] Revert test Signed-off-by: Nghia Truong --- src/main/cpp/tests/cast_float_to_string.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/cpp/tests/cast_float_to_string.cpp b/src/main/cpp/tests/cast_float_to_string.cpp index edf9ff86e..a118ec7fe 100644 --- a/src/main/cpp/tests/cast_float_to_string.cpp +++ b/src/main/cpp/tests/cast_float_to_string.cpp @@ -80,4 +80,4 @@ TEST_F(FloatToStringTests, FromFloats64) "-0.0"}; CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected, verbosity); -} +} \ No newline at end of file From da4d1f625ab14246e7f3509c1806426fcccc0e1d Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Thu, 14 Nov 2024 10:35:06 -0800 Subject: [PATCH 53/58] Cleanup headers Signed-off-by: Nghia Truong --- src/main/cpp/src/from_json_to_structs.cu | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/src/main/cpp/src/from_json_to_structs.cu b/src/main/cpp/src/from_json_to_structs.cu index 2a2f0c12e..59f88b78d 100644 --- a/src/main/cpp/src/from_json_to_structs.cu +++ b/src/main/cpp/src/from_json_to_structs.cu @@ -24,24 +24,19 @@ #include #include #include -#include -#include -#include +#include #include -#include -#include #include #include #include +#include #include #include -#include -#include #include #include -#include +#include #include #include #include @@ -50,8 +45,6 @@ #include #include -#include - namespace spark_rapids_jni { namespace detail { From 1d91e648ff28eeb6993753af0983315a639da771 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Thu, 14 Nov 2024 10:43:28 -0800 Subject: [PATCH 54/58] Cleanup Signed-off-by: Nghia Truong --- src/main/cpp/src/from_json_to_structs.cu | 54 +++++++++++++++++------- 1 file changed, 39 insertions(+), 15 deletions(-) diff --git 
a/src/main/cpp/src/from_json_to_structs.cu b/src/main/cpp/src/from_json_to_structs.cu index 59f88b78d..0c4bf8a0f 100644 --- a/src/main/cpp/src/from_json_to_structs.cu +++ b/src/main/cpp/src/from_json_to_structs.cu @@ -125,7 +125,12 @@ std::pair, rmm::device_uvector> cast_strings auto const d_input_ptr = cudf::column_device_view::create(input, stream); auto const is_valid_it = cudf::detail::make_validity_iterator(*d_input_ptr); + // We need to nullify the invalid string rows. + // Technically, we should just mask out these rows as invalid and ignore them. + // However, spark_rapids_jni::string_to_integer cannot handle these non-empty null rows, + // thus we have to materialzie the valid strings into a new strings column. auto string_pairs = rmm::device_uvector(string_count, stream); + // Since the strings store integer numbers, they should be very short. // As such, using one thread per string should be good. thrust::tabulate(rmm::exec_policy_nosync(stream), @@ -162,12 +167,16 @@ std::pair, rmm::device_uvector> cast_strings auto chars_data = cudf::strings::detail::make_chars_buffer( offsets_column->view(), bytes, string_pairs.begin(), string_count, stream, mr); - // Don't care about the null mask, as nulls imply empty strings, and will be nullified. + // Don't care about the null mask, as nulls imply empty strings, which will also result in nulls. 
auto const sanitized_input = cudf::make_strings_column(string_count, std::move(offsets_column), chars_data.release(), 0, {}); - auto output = string_to_integer( - output_type, cudf::strings_column_view{sanitized_input->view()}, false, false, stream, mr); + auto output = string_to_integer(output_type, + cudf::strings_column_view{sanitized_input->view()}, + /*ansi_mode*/ false, + /*strip*/ false, + stream, + mr); return {std::move(output), rmm::device_uvector(0, stream)}; } @@ -543,7 +552,7 @@ std::pair parse_schema_ } } else { CUDF_EXPECTS(col_num_children == 0, - "Found children for a type that should have none.", + "Found children for a non-nested type that should have none.", std::invalid_argument); } @@ -561,9 +570,10 @@ std::pair parse_schema_ // Two separate schema is generated: // - The first one is used as input to `cudf::read_json`, in which the data types of all columns // are specified as STRING type. As such, the table returned by `cudf::read_json` will contain -// only strings columns. -// - The second schema is used for converting from STRING type to the desired types for the final -// output. +// only strings columns or nested (LIST/STRUCT) columns. +// - The second schema contains decimal precision (if available) and preserves schema column types +// as well as the column order, used for converting from STRING type to the desired types for the +// final output. std::pair generate_struct_schema( std::vector const& col_names, std::vector const& num_children, @@ -591,6 +601,7 @@ std::pair generate_stru cudf::data_type{cudf::type_id::STRUCT}, -1, std::move(schema_cols_with_precisions)}}; } +// For the input pair of column-validity, create null mask for the column. std::unique_ptr make_column_from_pair( std::pair, rmm::device_uvector>&& input, rmm::cuda_stream_view stream, @@ -605,6 +616,8 @@ std::unique_ptr make_column_from_pair( return std::move(output); } +// For each pair of column-validity, create null mask for the column. 
+// This is done asynchronously for all columns with only one stream sync at the end. std::vector> make_column_array_from_pairs( std::vector, rmm::device_uvector>>& input, rmm::cuda_stream_view stream, @@ -640,6 +653,7 @@ std::vector> make_column_array_from_pairs( d_valid_counts.data() + idx); } + // This is the only stream sync for all columns. auto const valid_counts = cudf::detail::make_std_vector_sync(d_valid_counts, stream); std::vector> output(num_columns); @@ -672,7 +686,9 @@ std::pair, rmm::device_uvector> convert_data using DecayInputT = std::decay_t; auto constexpr input_is_const_cv = std::is_same_v; auto constexpr input_is_column_ptr = std::is_same_v>; - static_assert(input_is_const_cv ^ input_is_column_ptr); + static_assert(input_is_const_cv ^ input_is_column_ptr, + "Input to `convert_data_type` must either be `cudf::column_view const&` or " + "`std::unique_ptr`"); if (cudf::is_chrono(schema.type)) { // Date/time is not processed here - it should be handled separately in spark-rapids. @@ -749,6 +765,8 @@ std::pair, rmm::device_uvector> convert_data mr), stream, mr)); + // Do not use `cudf::make_lists_column` since we do not need to call `purge_nonempty_nulls` + // on the child column as it does not have non-empty nulls. return {std::make_unique(cudf::data_type{cudf::type_id::LIST}, num_rows, rmm::device_buffer{}, @@ -773,6 +791,8 @@ std::pair, rmm::device_uvector> convert_data mr)); } + // Do not use `cudf::make_structs_column` since we do not need to call `superimpose_nulls` + // on the children columns. return {std::make_unique( cudf::data_type{cudf::type_id::STRUCT}, num_rows, @@ -802,6 +822,8 @@ std::pair, rmm::device_uvector> convert_data mr), stream, mr)); + // Do not use `cudf::make_lists_column` since we do not need to call `purge_nonempty_nulls` + // on the child column as it does not have non-empty nulls. 
return {std::make_unique(cudf::data_type{cudf::type_id::LIST}, num_rows, rmm::device_buffer{}, @@ -824,6 +846,8 @@ std::pair, rmm::device_uvector> convert_data stream, mr)); } + // Do not use `cudf::make_structs_column` since we do not need to call `superimpose_nulls` + // on the children columns. return {std::make_unique( cudf::data_type{cudf::type_id::STRUCT}, num_rows, @@ -839,8 +863,6 @@ std::pair, rmm::device_uvector> convert_data return {nullptr, rmm::device_uvector{0, stream, mr}}; } -} // namespace - std::unique_ptr from_json_to_structs(cudf::strings_column_view const& input, std::vector const& col_names, std::vector const& num_children, @@ -855,7 +877,7 @@ std::unique_ptr from_json_to_structs(cudf::strings_column_view con rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - auto const [concat_input, delimiter, is_invalid_or_empty] = + auto const [concat_input, delimiter, should_be_nullified] = concat_json(input, false, stream, cudf::get_current_device_resource()); auto const [schema, schema_with_precision] = generate_struct_schema(col_names, num_children, types, scales, precisions); @@ -895,7 +917,7 @@ std::unique_ptr from_json_to_structs(cudf::strings_column_view con auto const d_type = parsed_columns[i]->type().id(); CUDF_EXPECTS(d_type == cudf::type_id::LIST || d_type == cudf::type_id::STRUCT || d_type == cudf::type_id::STRING, - "Input column should be STRING or nested."); + "Parsed JSON columns should be STRING or nested."); auto const& [col_name, col_schema] = schema_with_precision.child_types[i]; CUDF_EXPECTS(parsed_meta.schema_info[i].name == col_name, "Mismatched column name."); @@ -907,9 +929,9 @@ std::unique_ptr from_json_to_structs(cudf::strings_column_view con mr)); } - auto const valid_it = is_invalid_or_empty->view().begin(); + auto const valid_it = should_be_nullified->view().begin(); auto [null_mask, null_count] = cudf::detail::valid_if( - valid_it, valid_it + is_invalid_or_empty->size(), thrust::logical_not{}, stream, mr); 
+ valid_it, valid_it + should_be_nullified->size(), thrust::logical_not{}, stream, mr); return cudf::make_structs_column( input.size(), @@ -920,6 +942,8 @@ std::unique_ptr from_json_to_structs(cudf::strings_column_view con mr); } +} // namespace + } // namespace detail std::unique_ptr from_json_to_structs(cudf::strings_column_view const& input, @@ -972,7 +996,7 @@ std::unique_ptr convert_data_type(cudf::strings_column_view const& scales, precisions); CUDF_EXPECTS(schema_with_precision.child_types.size() == 1, - "The input schema must have exactly one column."); + "The input schema to convert must have exactly one column."); auto const input_cv = input.parent(); return detail::make_column_from_pair( From d0fa2aeebe18f29ce5cc5f24936a4986c5256079 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Thu, 14 Nov 2024 13:42:18 -0800 Subject: [PATCH 55/58] Rewrite the conversion functions Signed-off-by: Nghia Truong --- src/main/cpp/src/from_json_to_structs.cu | 421 ++++++++--------------- 1 file changed, 150 insertions(+), 271 deletions(-) diff --git a/src/main/cpp/src/from_json_to_structs.cu b/src/main/cpp/src/from_json_to_structs.cu index 0c4bf8a0f..804ab1707 100644 --- a/src/main/cpp/src/from_json_to_structs.cu +++ b/src/main/cpp/src/from_json_to_structs.cu @@ -53,16 +53,14 @@ namespace { using string_index_pair = thrust::pair; -std::pair, rmm::device_uvector> cast_strings_to_booleans( - cudf::column_view const& input, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) +std::unique_ptr cast_strings_to_booleans(cudf::column_view const& input, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); auto const string_count = input.size(); - if (string_count == 0) { - return {cudf::make_empty_column(cudf::data_type{cudf::type_id::BOOL8}), - rmm::device_uvector(0, stream)}; - } + if (string_count == 0) { return cudf::make_empty_column(cudf::data_type{cudf::type_id::BOOL8}); } auto output = cudf::make_fixed_width_column( 
cudf::data_type{cudf::type_id::BOOL8}, string_count, cudf::mask_state::UNALLOCATED, stream, mr); @@ -100,24 +98,22 @@ std::pair, rmm::device_uvector> cast_strings return {false, false}; }); - // Reset null count, as it is invalidated after calling to `mutable_view()`. - output->set_null_mask(rmm::device_buffer{0, stream, mr}, 0); + auto [null_mask, null_count] = + cudf::detail::valid_if(validity.begin(), validity.end(), thrust::identity{}, stream, mr); + if (null_count > 0) { output->set_null_mask(std::move(null_mask), null_count); } - return {std::move(output), std::move(validity)}; + return output; } -std::pair, rmm::device_uvector> cast_strings_to_integers( - cudf::column_view const& input, - cudf::data_type output_type, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) +std::unique_ptr cast_strings_to_integers(cudf::column_view const& input, + cudf::data_type output_type, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); auto const string_count = input.size(); - if (string_count == 0) { - return {cudf::make_empty_column(output_type), rmm::device_uvector(0, stream)}; - } + if (string_count == 0) { return cudf::make_empty_column(output_type); } auto const input_sv = cudf::strings_column_view{input}; auto const input_offsets_it = @@ -171,28 +167,23 @@ std::pair, rmm::device_uvector> cast_strings auto const sanitized_input = cudf::make_strings_column(string_count, std::move(offsets_column), chars_data.release(), 0, {}); - auto output = string_to_integer(output_type, - cudf::strings_column_view{sanitized_input->view()}, - /*ansi_mode*/ false, - /*strip*/ false, - stream, - mr); - - return {std::move(output), rmm::device_uvector(0, stream)}; + return string_to_integer(output_type, + cudf::strings_column_view{sanitized_input->view()}, + /*ansi_mode*/ false, + /*strip*/ false, + stream, + mr); } // TODO: extract commond code for this and `remove_quotes`. -// This function always return zero size validity array. 
-std::pair, rmm::device_uvector> remove_quotes_for_floats( - cudf::column_view const& input, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) +std::unique_ptr remove_quotes_for_floats(cudf::column_view const& input, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); auto const string_count = input.size(); - if (string_count == 0) { - return {cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING}), - rmm::device_uvector(0, stream)}; - } + if (string_count == 0) { return cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING}); } auto const input_sv = cudf::strings_column_view{input}; auto const input_offsets_it = @@ -257,50 +248,41 @@ std::pair, rmm::device_uvector> remove_quote auto chars_data = cudf::strings::detail::make_chars_buffer( offsets_column->view(), bytes, string_pairs.begin(), string_count, stream, mr); - auto output = cudf::make_strings_column(string_count, - std::move(offsets_column), - chars_data.release(), - input.null_count(), - cudf::detail::copy_bitmask(input, stream, mr)); - - return {std::move(output), rmm::device_uvector(0, stream)}; + return cudf::make_strings_column(string_count, + std::move(offsets_column), + chars_data.release(), + input.null_count(), + cudf::detail::copy_bitmask(input, stream, mr)); } -std::pair, rmm::device_uvector> cast_strings_to_floats( - cudf::column_view const& input, - cudf::data_type output_type, - bool allow_nonnumeric_numbers, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) +std::unique_ptr cast_strings_to_floats(cudf::column_view const& input, + cudf::data_type output_type, + bool allow_nonnumeric_numbers, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); if (allow_nonnumeric_numbers) { - auto [removed_quotes, validity] = remove_quotes_for_floats(input, stream, mr); - return {::spark_rapids_jni::string_to_float( - output_type, cudf::strings_column_view{removed_quotes->view()}, 
false, stream, mr), - rmm::device_uvector{0, stream, mr}}; + auto const removed_quotes = remove_quotes_for_floats(input, stream, mr); + return string_to_float( + output_type, cudf::strings_column_view{removed_quotes->view()}, false, stream, mr); } - return {::spark_rapids_jni::string_to_float( - output_type, cudf::strings_column_view{input}, false, stream, mr), - rmm::device_uvector{0, stream, mr}}; + return string_to_float(output_type, cudf::strings_column_view{input}, false, stream, mr); } // TODO there is a bug here around 0 https://github.com/NVIDIA/spark-rapids/issues/10898 -std::pair, rmm::device_uvector> cast_strings_to_decimals( - cudf::column_view const& input, - cudf::data_type output_type, - int precision, - bool is_us_locale, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) +std::unique_ptr cast_strings_to_decimals(cudf::column_view const& input, + cudf::data_type output_type, + int precision, + bool is_us_locale, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); auto const string_count = input.size(); - if (string_count == 0) { - return {cudf::make_empty_column(output_type), rmm::device_uvector{0, stream, mr}}; - } + if (string_count == 0) { return cudf::make_empty_column(output_type); } CUDF_EXPECTS(is_us_locale, "String to decimal conversion is only supported in US locale."); @@ -375,8 +357,7 @@ std::pair, rmm::device_uvector> cast_strings // If the output strings column does not change in its total bytes, we know that it does not have // any '"' or ',' characters. 
if (bytes == input_sv.chars_size(stream)) { - return {string_to_decimal(precision, output_type.scale(), input_sv, false, false, stream, mr), - rmm::device_uvector{0, stream, mr}}; + return string_to_decimal(precision, output_type.scale(), input_sv, false, false, stream, mr); } auto const out_offsets = @@ -415,39 +396,32 @@ std::pair, rmm::device_uvector> cast_strings } }); - auto const unquoted_strings = cudf::make_strings_column(string_count, - std::move(offsets_column), - chars_data.release(), - 0, - rmm::device_buffer{0, stream, mr}); - return {string_to_decimal(precision, - output_type.scale(), - cudf::strings_column_view{unquoted_strings->view()}, - false, - false, - stream, - mr), - rmm::device_uvector{0, stream, mr}}; + // Don't care about the null mask, as nulls imply empty strings, which will also result in nulls. + auto const unquoted_strings = + cudf::make_strings_column(string_count, std::move(offsets_column), chars_data.release(), 0, {}); + + return string_to_decimal(precision, + output_type.scale(), + cudf::strings_column_view{unquoted_strings->view()}, + false, + false, + stream, + mr); } -std::pair, rmm::device_uvector> remove_quotes( - cudf::column_view const& input, - bool nullify_if_not_quoted, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) +std::unique_ptr remove_quotes(cudf::strings_column_view const& input, + bool nullify_if_not_quoted, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); auto const string_count = input.size(); - if (string_count == 0) { - return {cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING}), - rmm::device_uvector(0, stream)}; - } + if (string_count == 0) { return cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING}); } - auto const input_sv = cudf::strings_column_view{input}; auto const input_offsets_it = - cudf::detail::offsetalator_factory::make_input_iterator(input_sv.offsets()); - auto const d_input_ptr = 
cudf::column_device_view::create(input, stream); + cudf::detail::offsetalator_factory::make_input_iterator(input.offsets()); + auto const d_input_ptr = cudf::column_device_view::create(input.parent(), stream); auto const is_valid_it = cudf::detail::make_validity_iterator(*d_input_ptr); auto string_pairs = rmm::device_uvector(string_count, stream); @@ -455,7 +429,7 @@ std::pair, rmm::device_uvector> remove_quote string_pairs.begin(), string_pairs.end(), [nullify_if_not_quoted, - chars = input_sv.chars_begin(stream), + chars = input.chars_begin(stream), offsets = input_offsets_it, is_valid = is_valid_it] __device__(cudf::size_type idx) -> string_index_pair { if (!is_valid[idx]) { return {nullptr, 0}; } @@ -486,31 +460,28 @@ std::pair, rmm::device_uvector> remove_quote offsets_column->view(), bytes, string_pairs.begin(), string_count, stream, mr); if (nullify_if_not_quoted) { - auto validity = rmm::device_uvector(string_count, stream); - thrust::transform( - rmm::exec_policy_nosync(stream), - string_pairs.begin(), - string_pairs.end(), - validity.begin(), - [] __device__(string_index_pair const& pair) { return pair.first != nullptr; }); - - // Null mask and null count will be updated later from the validity vector. 
auto output = cudf::make_strings_column(string_count, std::move(offsets_column), chars_data.release(), 0, rmm::device_buffer{0, stream, mr}); - return {std::move(output), std::move(validity)}; - } else { - auto output = cudf::make_strings_column(string_count, - std::move(offsets_column), - chars_data.release(), - input.null_count(), - cudf::detail::copy_bitmask(input, stream, mr)); + auto [null_mask, null_count] = cudf::detail::valid_if( + string_pairs.begin(), + string_pairs.end(), + [] __device__(string_index_pair const& pair) { return pair.first != nullptr; }, + stream, + mr); + if (null_count > 0) { output->set_null_mask(std::move(null_mask), null_count); } - return {std::move(output), rmm::device_uvector(0, stream)}; + return output; } + + return cudf::make_strings_column(string_count, + std::move(offsets_column), + chars_data.release(), + input.null_count(), + cudf::detail::copy_bitmask(input.parent(), stream, mr)); } /** @@ -601,85 +572,13 @@ std::pair generate_stru cudf::data_type{cudf::type_id::STRUCT}, -1, std::move(schema_cols_with_precisions)}}; } -// For the input pair of column-validity, create null mask for the column. -std::unique_ptr make_column_from_pair( - std::pair, rmm::device_uvector>&& input, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - auto& [output, validity] = input; - if (validity.size() > 0) { - auto [null_mask, null_count] = - cudf::detail::valid_if(validity.begin(), validity.end(), thrust::identity{}, stream, mr); - if (null_count > 0) { output->set_null_mask(std::move(null_mask), null_count); } - } - return std::move(output); -} - -// For each pair of column-validity, create null mask for the column. -// This is done asynchronously for all columns with only one stream sync at the end. 
-std::vector> make_column_array_from_pairs( - std::vector, rmm::device_uvector>>& input, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - auto const num_columns = input.size(); - std::vector null_masks; - null_masks.reserve(num_columns); - - rmm::device_uvector d_valid_counts(num_columns, stream, mr); - thrust::uninitialized_fill( - rmm::exec_policy_nosync(stream), d_valid_counts.begin(), d_valid_counts.end(), 0); - - for (std::size_t idx = 0; idx < num_columns; ++idx) { - auto const col_size = input[idx].first->size(); - auto const validity_size = input[idx].second.size(); - if (col_size == 0 || validity_size == 0) { - null_masks.emplace_back(rmm::device_buffer{}); // placeholder - continue; - } - - null_masks.emplace_back( - cudf::create_null_mask(col_size, cudf::mask_state::UNINITIALIZED, stream, mr)); - constexpr cudf::size_type block_size{256}; - auto const grid = - cudf::detail::grid_1d{static_cast(col_size), block_size}; - cudf::detail::valid_if_kernel - <<>>( - reinterpret_cast(null_masks.back().data()), - input[idx].second.data(), - col_size, - thrust::identity{}, - d_valid_counts.data() + idx); - } - - // This is the only stream sync for all columns. 
- auto const valid_counts = cudf::detail::make_std_vector_sync(d_valid_counts, stream); - std::vector> output(num_columns); - - for (std::size_t idx = 0; idx < num_columns; ++idx) { - auto const col_size = input[idx].first->size(); - output[idx] = std::move(input[idx].first); - - auto const validity_size = input[idx].second.size(); - if (col_size == 0 || validity_size == 0) { continue; } - - auto const valid_count = valid_counts[idx]; - auto const null_count = col_size - valid_count; - if (null_count > 0) { output[idx]->set_null_mask(std::move(null_masks[idx]), null_count); } - } - - return output; -} - template -std::pair, rmm::device_uvector> convert_data_type( - InputType&& input, - schema_element_with_precision const& schema, - bool allow_nonnumeric_numbers, - bool is_us_locale, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) +std::unique_ptr convert_data_type(InputType&& input, + schema_element_with_precision const& schema, + bool allow_nonnumeric_numbers, + bool is_us_locale, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); @@ -693,10 +592,10 @@ std::pair, rmm::device_uvector> convert_data if (cudf::is_chrono(schema.type)) { // Date/time is not processed here - it should be handled separately in spark-rapids. 
if constexpr (input_is_column_ptr) { - return {std::move(input), rmm::device_uvector{0, stream, mr}}; + return std::move(input); } else { CUDF_FAIL("Cannot convert data type to a chrono (date/time) type."); - return {nullptr, rmm::device_uvector{0, stream, mr}}; + return nullptr; } } @@ -755,52 +654,44 @@ std::pair, rmm::device_uvector> convert_data std::vector> new_children; new_children.emplace_back( std::move(input_content.children[cudf::lists_column_view::offsets_column_index])); - new_children.emplace_back(make_column_from_pair( - convert_data_type( - std::move(input_content.children[cudf::lists_column_view::child_column_index]), - schema.child_types.front().second, - allow_nonnumeric_numbers, - is_us_locale, - stream, - mr), + new_children.emplace_back(convert_data_type( + std::move(input_content.children[cudf::lists_column_view::child_column_index]), + schema.child_types.front().second, + allow_nonnumeric_numbers, + is_us_locale, stream, mr)); // Do not use `cudf::make_lists_column` since we do not need to call `purge_nonempty_nulls` // on the child column as it does not have non-empty nulls. 
- return {std::make_unique(cudf::data_type{cudf::type_id::LIST}, - num_rows, - rmm::device_buffer{}, - std::move(*input_content.null_mask), - null_count, - std::move(new_children)), - rmm::device_uvector{0, stream, mr}}; + return std::make_unique(cudf::data_type{cudf::type_id::LIST}, + num_rows, + rmm::device_buffer{}, + std::move(*input_content.null_mask), + null_count, + std::move(new_children)); } if (schema.type.id() == cudf::type_id::STRUCT) { CUDF_EXPECTS(d_type == cudf::type_id::STRUCT, "Input column should be STRUCT."); - std::vector, rmm::device_uvector>> - new_children_with_validity; - new_children_with_validity.reserve(num_children); + std::vector> new_children; + new_children.reserve(num_children); for (cudf::size_type i = 0; i < num_children; ++i) { - new_children_with_validity.emplace_back( - convert_data_type(std::move(input_content.children[i]), - schema.child_types[i].second, - allow_nonnumeric_numbers, - is_us_locale, - stream, - mr)); + new_children.emplace_back(convert_data_type(std::move(input_content.children[i]), + schema.child_types[i].second, + allow_nonnumeric_numbers, + is_us_locale, + stream, + mr)); } // Do not use `cudf::make_structs_column` since we do not need to call `superimpose_nulls` // on the children columns. 
- return {std::make_unique( - cudf::data_type{cudf::type_id::STRUCT}, - num_rows, - rmm::device_buffer{}, - std::move(*input_content.null_mask), - null_count, - make_column_array_from_pairs(new_children_with_validity, stream, mr)), - rmm::device_uvector{0, stream, mr}}; + return std::make_unique(cudf::data_type{cudf::type_id::STRUCT}, + num_rows, + rmm::device_buffer{}, + std::move(*input_content.null_mask), + null_count, + std::move(new_children)); } } else { // input_is_const_cv auto const d_type = input.type().id(); @@ -813,54 +704,48 @@ std::pair, rmm::device_uvector> convert_data std::vector> new_children; new_children.emplace_back( std::make_unique(input.child(cudf::lists_column_view::offsets_column_index))); - new_children.emplace_back(make_column_from_pair( + new_children.emplace_back( convert_data_type(input.child(cudf::lists_column_view::child_column_index), schema.child_types.front().second, allow_nonnumeric_numbers, is_us_locale, stream, - mr), - stream, - mr)); + mr)); // Do not use `cudf::make_lists_column` since we do not need to call `purge_nonempty_nulls` // on the child column as it does not have non-empty nulls. 
- return {std::make_unique(cudf::data_type{cudf::type_id::LIST}, - num_rows, - rmm::device_buffer{}, - cudf::detail::copy_bitmask(input, stream, mr), - null_count, - std::move(new_children)), - rmm::device_uvector{0, stream, mr}}; + return std::make_unique(cudf::data_type{cudf::type_id::LIST}, + num_rows, + rmm::device_buffer{}, + cudf::detail::copy_bitmask(input, stream, mr), + null_count, + std::move(new_children)); } if (schema.type.id() == cudf::type_id::STRUCT) { CUDF_EXPECTS(d_type == cudf::type_id::STRUCT, "Input column should be STRUCT."); - std::vector, rmm::device_uvector>> - new_children_with_validity; - new_children_with_validity.reserve(num_children); + std::vector> new_children; + new_children.reserve(num_children); for (cudf::size_type i = 0; i < num_children; ++i) { - new_children_with_validity.emplace_back(convert_data_type(input.child(i), - schema.child_types[i].second, - allow_nonnumeric_numbers, - is_us_locale, - stream, - mr)); + new_children.emplace_back(convert_data_type(input.child(i), + schema.child_types[i].second, + allow_nonnumeric_numbers, + is_us_locale, + stream, + mr)); } // Do not use `cudf::make_structs_column` since we do not need to call `superimpose_nulls` // on the children columns. 
- return {std::make_unique( - cudf::data_type{cudf::type_id::STRUCT}, - num_rows, - rmm::device_buffer{}, - cudf::detail::copy_bitmask(input, stream, mr), - null_count, - make_column_array_from_pairs(new_children_with_validity, stream, mr)), - rmm::device_uvector{0, stream, mr}}; + return std::make_unique(cudf::data_type{cudf::type_id::STRUCT}, + num_rows, + rmm::device_buffer{}, + cudf::detail::copy_bitmask(input, stream, mr), + null_count, + std::move(new_children)); } } CUDF_FAIL("Unexpected column type for conversion."); - return {nullptr, rmm::device_uvector{0, stream, mr}}; + return nullptr; } std::unique_ptr from_json_to_structs(cudf::strings_column_view const& input, @@ -910,9 +795,8 @@ std::unique_ptr from_json_to_structs(cudf::strings_column_view con CUDF_EXPECTS(parsed_columns.size() == schema.child_types.size(), "Numbers of output columns is different from schema size."); - std::vector, rmm::device_uvector>> - converted_cols_with_validity; - converted_cols_with_validity.reserve(parsed_columns.size()); + std::vector> converted_cols; + converted_cols.reserve(parsed_columns.size()); for (std::size_t i = 0; i < parsed_columns.size(); ++i) { auto const d_type = parsed_columns[i]->type().id(); CUDF_EXPECTS(d_type == cudf::type_id::LIST || d_type == cudf::type_id::STRUCT || @@ -921,12 +805,12 @@ std::unique_ptr from_json_to_structs(cudf::strings_column_view con auto const& [col_name, col_schema] = schema_with_precision.child_types[i]; CUDF_EXPECTS(parsed_meta.schema_info[i].name == col_name, "Mismatched column name."); - converted_cols_with_validity.emplace_back(convert_data_type(std::move(parsed_columns[i]), - col_schema, - allow_nonnumeric_numbers, - is_us_locale, - stream, - mr)); + converted_cols.emplace_back(convert_data_type(std::move(parsed_columns[i]), + col_schema, + allow_nonnumeric_numbers, + is_us_locale, + stream, + mr)); } auto const valid_it = should_be_nullified->view().begin(); @@ -935,7 +819,7 @@ std::unique_ptr 
from_json_to_structs(cudf::strings_column_view con return cudf::make_structs_column( input.size(), - make_column_array_from_pairs(converted_cols_with_validity, stream, mr), + std::move(converted_cols), null_count, null_count > 0 ? std::move(null_mask) : rmm::device_buffer{0, stream, mr}, stream, @@ -999,15 +883,12 @@ std::unique_ptr convert_data_type(cudf::strings_column_view const& "The input schema to convert must have exactly one column."); auto const input_cv = input.parent(); - return detail::make_column_from_pair( - detail::convert_data_type(input_cv, - schema_with_precision.child_types.front().second, - allow_nonnumeric_numbers, - is_us_locale, - stream, - mr), - stream, - mr); + return detail::convert_data_type(input_cv, + schema_with_precision.child_types.front().second, + allow_nonnumeric_numbers, + is_us_locale, + stream, + mr); } std::unique_ptr remove_quotes(cudf::strings_column_view const& input, @@ -1017,9 +898,7 @@ std::unique_ptr remove_quotes(cudf::strings_column_view const& inp { CUDF_FUNC_RANGE(); - auto const input_cv = input.parent(); - return detail::make_column_from_pair( - detail::remove_quotes(input_cv, nullify_if_not_quoted, stream, mr), stream, mr); + return detail::remove_quotes(input, nullify_if_not_quoted, stream, mr); } } // namespace spark_rapids_jni From f375a4da2aa1c032d09479b672f416a02473b4f9 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Thu, 14 Nov 2024 13:56:37 -0800 Subject: [PATCH 56/58] Move code Signed-off-by: Nghia Truong --- src/main/cpp/src/from_json_to_structs.cu | 176 +++++++++++------------ 1 file changed, 88 insertions(+), 88 deletions(-) diff --git a/src/main/cpp/src/from_json_to_structs.cu b/src/main/cpp/src/from_json_to_structs.cu index 804ab1707..da7719275 100644 --- a/src/main/cpp/src/from_json_to_structs.cu +++ b/src/main/cpp/src/from_json_to_structs.cu @@ -51,6 +51,94 @@ namespace detail { namespace { +/** + * @brief The struct similar to `cudf::io::schema_element` with adding decimal precision and + * 
preserving column order. + */ +struct schema_element_with_precision { + cudf::data_type type; + int precision; + std::vector> child_types; +}; + +std::pair parse_schema_element( + std::size_t& index, + std::vector const& col_names, + std::vector const& num_children, + std::vector const& types, + std::vector const& scales, + std::vector const& precisions) +{ + // Get data for the current column. + auto const d_type = cudf::data_type{static_cast(types[index]), scales[index]}; + auto const precision = precisions[index]; + auto const col_num_children = num_children[index]; + index++; + + std::map children; + std::vector> children_with_precisions; + std::vector child_names(col_num_children); + + if (d_type.id() == cudf::type_id::STRUCT || d_type.id() == cudf::type_id::LIST) { + for (int i = 0; i < col_num_children; ++i) { + auto const& name = col_names[index]; + auto [child, child_with_precision] = + parse_schema_element(index, col_names, num_children, types, scales, precisions); + children.emplace(name, std::move(child)); + children_with_precisions.emplace_back(name, std::move(child_with_precision)); + child_names[i] = name; + } + } else { + CUDF_EXPECTS(col_num_children == 0, + "Found children for a non-nested type that should have none.", + std::invalid_argument); + } + + // Note that if the first schema element does not has type STRUCT/LIST then it always has type + // STRING, since we intentionally parse JSON into strings column for later post-processing. + auto const schema_dtype = + d_type.id() == cudf::type_id::STRUCT || d_type.id() == cudf::type_id::LIST + ? d_type + : cudf::data_type{cudf::type_id::STRING}; + return {cudf::io::schema_element{schema_dtype, std::move(children), {std::move(child_names)}}, + schema_element_with_precision{d_type, precision, std::move(children_with_precisions)}}; +} + +// Generate struct type schemas by traveling the schema data by depth-first search order. 
+// Two separate schemas is generated: +// - The first one is used as input to `cudf::read_json`, in which the data types of all columns +// are specified as STRING type. As such, the table returned by `cudf::read_json` will contain +// only strings columns or nested (LIST/STRUCT) columns. +// - The second schema contains decimal precision (if available) and preserves schema column types +// as well as the column order, used for converting from STRING type to the desired types for the +// final output. +std::pair generate_struct_schema( + std::vector const& col_names, + std::vector const& num_children, + std::vector const& types, + std::vector const& scales, + std::vector const& precisions) +{ + std::map schema_cols; + std::vector> schema_cols_with_precisions; + std::vector name_order; + + std::size_t index = 0; + while (index < types.size()) { + auto const& name = col_names[index]; + auto [child, child_with_precision] = + parse_schema_element(index, col_names, num_children, types, scales, precisions); + schema_cols.emplace(name, std::move(child)); + schema_cols_with_precisions.emplace_back(name, std::move(child_with_precision)); + name_order.push_back(name); + } + return { + cudf::io::schema_element{ + cudf::data_type{cudf::type_id::STRUCT}, std::move(schema_cols), {std::move(name_order)}}, + schema_element_with_precision{ + cudf::data_type{cudf::type_id::STRUCT}, -1, std::move(schema_cols_with_precisions)}}; +} + using string_index_pair = thrust::pair; std::unique_ptr cast_strings_to_booleans(cudf::column_view const& input, @@ -484,94 +572,6 @@ std::unique_ptr remove_quotes(cudf::strings_column_view const& inp cudf::detail::copy_bitmask(input.parent(), stream, mr)); } -/** - * @brief The struct similar to `cudf::io::schema_element` with adding decimal precision and - * preserving column order. 
- */ -struct schema_element_with_precision { - cudf::data_type type; - int precision; - std::vector> child_types; -}; - -std::pair parse_schema_element( - std::size_t& index, - std::vector const& col_names, - std::vector const& num_children, - std::vector const& types, - std::vector const& scales, - std::vector const& precisions) -{ - // Get data for the current column. - auto const d_type = cudf::data_type{static_cast(types[index]), scales[index]}; - auto const precision = precisions[index]; - auto const col_num_children = num_children[index]; - index++; - - std::map children; - std::vector> children_with_precisions; - std::vector child_names(col_num_children); - - if (d_type.id() == cudf::type_id::STRUCT || d_type.id() == cudf::type_id::LIST) { - for (int i = 0; i < col_num_children; ++i) { - auto const& name = col_names[index]; - auto [child, child_with_precision] = - parse_schema_element(index, col_names, num_children, types, scales, precisions); - children.emplace(name, std::move(child)); - children_with_precisions.emplace_back(name, std::move(child_with_precision)); - child_names[i] = name; - } - } else { - CUDF_EXPECTS(col_num_children == 0, - "Found children for a non-nested type that should have none.", - std::invalid_argument); - } - - // Note that if the first schema element does not has type STRUCT/LIST then it always has type - // STRING, since we intentionally parse JSON into strings column for later post-processing. - auto const schema_dtype = - d_type.id() == cudf::type_id::STRUCT || d_type.id() == cudf::type_id::LIST - ? d_type - : cudf::data_type{cudf::type_id::STRING}; - return {cudf::io::schema_element{schema_dtype, std::move(children), {std::move(child_names)}}, - schema_element_with_precision{d_type, precision, std::move(children_with_precisions)}}; -} - -// Travel the schema data by depth-first search order. 
-// Two separate schema is generated: -// - The first one is used as input to `cudf::read_json`, in which the data types of all columns -// are specified as STRING type. As such, the table returned by `cudf::read_json` will contain -// only strings columns or nested (LIST/STRUCT) columns. -// - The second schema contains decimal precision (if available) and preserves schema column types -// as well as the column order, used for converting from STRING type to the desired types for the -// final output. -std::pair generate_struct_schema( - std::vector const& col_names, - std::vector const& num_children, - std::vector const& types, - std::vector const& scales, - std::vector const& precisions) -{ - std::map schema_cols; - std::vector> schema_cols_with_precisions; - std::vector name_order; - - std::size_t index = 0; - while (index < types.size()) { - auto const& name = col_names[index]; - auto [child, child_with_precision] = - parse_schema_element(index, col_names, num_children, types, scales, precisions); - schema_cols.emplace(name, std::move(child)); - schema_cols_with_precisions.emplace_back(name, std::move(child_with_precision)); - name_order.push_back(name); - } - return { - cudf::io::schema_element{ - cudf::data_type{cudf::type_id::STRUCT}, std::move(schema_cols), {std::move(name_order)}}, - schema_element_with_precision{ - cudf::data_type{cudf::type_id::STRUCT}, -1, std::move(schema_cols_with_precisions)}}; -} - template std::unique_ptr convert_data_type(InputType&& input, schema_element_with_precision const& schema, From 034a5ec1750de1bc755b2ff32d2ba4bbeda40226 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Thu, 14 Nov 2024 14:26:04 -0800 Subject: [PATCH 57/58] Remove call to `make_structs_column` Signed-off-by: Nghia Truong --- src/main/cpp/src/from_json_to_structs.cu | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/src/main/cpp/src/from_json_to_structs.cu b/src/main/cpp/src/from_json_to_structs.cu index da7719275..d45bf8cb1 
100644 --- a/src/main/cpp/src/from_json_to_structs.cu +++ b/src/main/cpp/src/from_json_to_structs.cu @@ -817,13 +817,15 @@ std::unique_ptr from_json_to_structs(cudf::strings_column_view con auto [null_mask, null_count] = cudf::detail::valid_if( valid_it, valid_it + should_be_nullified->size(), thrust::logical_not{}, stream, mr); - return cudf::make_structs_column( + // Do not use `cudf::make_structs_column` since we do not need to call `superimpose_nulls` + // on the children columns. + return std::make_unique( + cudf::data_type{cudf::type_id::STRUCT}, input.size(), - std::move(converted_cols), - null_count, + rmm::device_buffer{}, null_count > 0 ? std::move(null_mask) : rmm::device_buffer{0, stream, mr}, - stream, - mr); + null_count, + std::move(converted_cols)); } } // namespace From 74d858c33c5ed4d7a83fcdabe4c2d1bf9206e98f Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Thu, 14 Nov 2024 14:31:04 -0800 Subject: [PATCH 58/58] Cleanup Signed-off-by: Nghia Truong --- src/main/cpp/src/from_json_to_structs.cu | 32 ++++++++++++++++++------ 1 file changed, 24 insertions(+), 8 deletions(-) diff --git a/src/main/cpp/src/from_json_to_structs.cu b/src/main/cpp/src/from_json_to_structs.cu index d45bf8cb1..c989cea84 100644 --- a/src/main/cpp/src/from_json_to_structs.cu +++ b/src/main/cpp/src/from_json_to_structs.cu @@ -188,7 +188,8 @@ std::unique_ptr cast_strings_to_booleans(cudf::column_view const& auto [null_mask, null_count] = cudf::detail::valid_if(validity.begin(), validity.end(), thrust::identity{}, stream, mr); - if (null_count > 0) { output->set_null_mask(std::move(null_mask), null_count); } + output->set_null_mask(null_count > 0 ? std::move(null_mask) : rmm::device_buffer{0, stream, mr}, + null_count); return output; } @@ -211,7 +212,7 @@ std::unique_ptr cast_strings_to_integers(cudf::column_view const& // We need to nullify the invalid string rows. // Technically, we should just mask out these rows as invalid and ignore them. 
- // However, spark_rapids_jni::string_to_integer cannot handle these non-empty null rows, + // However, `spark_rapids_jni::string_to_integer` cannot handle these non-empty null rows, // thus we have to materialzie the valid strings into a new strings column. auto string_pairs = rmm::device_uvector(string_count, stream); @@ -263,7 +264,6 @@ std::unique_ptr cast_strings_to_integers(cudf::column_view const& mr); } -// TODO: extract commond code for this and `remove_quotes`. std::unique_ptr remove_quotes_for_floats(cudf::column_view const& input, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) @@ -351,6 +351,9 @@ std::unique_ptr cast_strings_to_floats(cudf::column_view const& in { CUDF_FUNC_RANGE(); + auto const string_count = input.size(); + if (string_count == 0) { return cudf::make_empty_column(output_type); } + if (allow_nonnumeric_numbers) { auto const removed_quotes = remove_quotes_for_floats(input, stream, mr); return string_to_float( @@ -445,7 +448,13 @@ std::unique_ptr cast_strings_to_decimals(cudf::column_view const& // If the output strings column does not change in its total bytes, we know that it does not have // any '"' or ',' characters. 
if (bytes == input_sv.chars_size(stream)) { - return string_to_decimal(precision, output_type.scale(), input_sv, false, false, stream, mr); + return string_to_decimal(precision, + output_type.scale(), + input_sv, + /*ansi_mode*/ false, + /*strip*/ false, + stream, + mr); } auto const out_offsets = @@ -491,8 +500,8 @@ std::unique_ptr cast_strings_to_decimals(cudf::column_view const& return string_to_decimal(precision, output_type.scale(), cudf::strings_column_view{unquoted_strings->view()}, - false, - false, + /*ansi_mode*/ false, + /*strip*/ false, stream, mr); } @@ -532,8 +541,8 @@ std::unique_ptr remove_quotes(cudf::strings_column_view const& inp auto const is_quoted = size > 1 && str[0] == '"' && str[size - 1] == '"'; if (nullify_if_not_quoted && !is_quoted) { return {nullptr, 0}; } - auto const output_size = is_quoted ? size - 2 : size; - return {chars + start_offset + (is_quoted ? 1 : 0), output_size}; + if (is_quoted) { return {chars + start_offset + 1, size - 2}; } + return {chars + start_offset, size}; }); auto const size_it = cudf::detail::make_counting_transform_iterator( @@ -651,6 +660,7 @@ std::unique_ptr convert_data_type(InputType&& input, if (schema.type.id() == cudf::type_id::LIST) { CUDF_EXPECTS(d_type == cudf::type_id::LIST, "Input column should be LIST."); + std::vector> new_children; new_children.emplace_back( std::move(input_content.children[cudf::lists_column_view::offsets_column_index])); @@ -661,6 +671,7 @@ std::unique_ptr convert_data_type(InputType&& input, is_us_locale, stream, mr)); + // Do not use `cudf::make_lists_column` since we do not need to call `purge_nonempty_nulls` // on the child column as it does not have non-empty nulls. 
return std::make_unique(cudf::data_type{cudf::type_id::LIST}, @@ -673,6 +684,7 @@ std::unique_ptr convert_data_type(InputType&& input, if (schema.type.id() == cudf::type_id::STRUCT) { CUDF_EXPECTS(d_type == cudf::type_id::STRUCT, "Input column should be STRUCT."); + std::vector> new_children; new_children.reserve(num_children); for (cudf::size_type i = 0; i < num_children; ++i) { @@ -701,6 +713,7 @@ std::unique_ptr convert_data_type(InputType&& input, if (schema.type.id() == cudf::type_id::LIST) { CUDF_EXPECTS(d_type == cudf::type_id::LIST, "Input column should be LIST."); + std::vector> new_children; new_children.emplace_back( std::make_unique(input.child(cudf::lists_column_view::offsets_column_index))); @@ -711,6 +724,7 @@ std::unique_ptr convert_data_type(InputType&& input, is_us_locale, stream, mr)); + // Do not use `cudf::make_lists_column` since we do not need to call `purge_nonempty_nulls` // on the child column as it does not have non-empty nulls. return std::make_unique(cudf::data_type{cudf::type_id::LIST}, @@ -723,6 +737,7 @@ std::unique_ptr convert_data_type(InputType&& input, if (schema.type.id() == cudf::type_id::STRUCT) { CUDF_EXPECTS(d_type == cudf::type_id::STRUCT, "Input column should be STRUCT."); + std::vector> new_children; new_children.reserve(num_children); for (cudf::size_type i = 0; i < num_children; ++i) { @@ -733,6 +748,7 @@ std::unique_ptr convert_data_type(InputType&& input, stream, mr)); } + // Do not use `cudf::make_structs_column` since we do not need to call `superimpose_nulls` // on the children columns. return std::make_unique(cudf::data_type{cudf::type_id::STRUCT},