From 66e1c1ff276a5fe37bc6fc8027a41e12784e0f30 Mon Sep 17 00:00:00 2001
From: Junming Chen
Date: Sun, 12 Nov 2023 23:06:28 +0800
Subject: [PATCH] ComputeNullValues

---
 cpp/src/arrow/array/array_dict.cc | 72 +++++++++++++++++++++++++++++++
 cpp/src/arrow/array/array_dict.h  | 12 +++++
 2 files changed, 84 insertions(+)

diff --git a/cpp/src/arrow/array/array_dict.cc b/cpp/src/arrow/array/array_dict.cc
index 28fccdbfcffee..0b714ddc2b9ee 100644
--- a/cpp/src/arrow/array/array_dict.cc
+++ b/cpp/src/arrow/array/array_dict.cc
@@ -212,6 +212,64 @@ Result<std::shared_ptr<ArrayData>> TransposeDictIndices(
   return out_data;
 }
 
+// Visitor counting the non-null indices of a dictionary-encoded ArrayData that
+// refer to a null dictionary entry; the count is added to `out_null_count`.
+struct CompactDictionaryNullValuesVistor {
+  const std::shared_ptr<ArrayData>& data;
+  int64_t& out_null_count;
+
+  template <typename IndexArrowType>
+  Status CompactDictionaryNullValuesImpl() {
+    using CType = typename IndexArrowType::c_type;
+
+    const int64_t index_length = data->length;
+    const ArrayData& dictionary = *data->dictionary;
+    const int64_t dict_length = dictionary.length;
+    const CType* indices_data = data->GetValues<CType>(1);
+    for (int64_t i = 0; i < index_length; i++) {
+      if (data->IsNull(i)) {
+        continue;
+      }
+
+      // Widen to int64_t before the bounds check: narrowing `dict_length` to
+      // CType could wrap around for dictionaries longer than CType's maximum.
+      const int64_t current_index = static_cast<int64_t>(indices_data[i]);
+      if (current_index < 0 || current_index >= dict_length) {
+        return Status::IndexError(
+            "Index out of bounds while counting dictionary array: ", current_index,
+            " (dictionary is ", dict_length, " long) at position ", i);
+      }
+      // ArrayData::IsNull() takes the dictionary's own offset into account,
+      // which indexing the raw validity bitmap with the index alone does not.
+      if (dictionary.IsNull(current_index)) {
+        out_null_count++;
+      }
+    }
+    return Status::OK();
+  }
+
+  template <typename Type>
+  enable_if_integer<Type, Status> Visit(const Type&) {
+    return CompactDictionaryNullValuesImpl<Type>();
+  }
+
+  // Fallback for index types that are not integers.
+  Status Visit(const DataType& type) {
+    return Status::TypeError("Expected an Index Type of Int or UInt");
+  }
+};
+
+// Returns the number of non-null indices of the dictionary-encoded `data` that
+// refer to a null dictionary entry.
+Result<int64_t> CompactDictionaryNullValues(const std::shared_ptr<ArrayData>& data) {
+  int64_t out_null_count = 0;
+  const auto& dict_type = checked_cast<const DictionaryType&>(*data->type);
+  CompactDictionaryNullValuesVistor vistor{data, out_null_count};
+  RETURN_NOT_OK(VisitTypeInline(*dict_type.index_type(), &vistor));
+
+  return out_null_count;
+}
+
 struct CompactTransposeMapVistor {
   const std::shared_ptr<ArrayData>& data;
   arrow::MemoryPool* pool;
@@ -323,6 +381,20 @@ Result<std::shared_ptr<Array>> DictionaryArray::Transpose(
   return MakeArray(std::move(transposed));
 }
 
+Result<int64_t> DictionaryArray::CountNullValues() const {
+  const int64_t indices_null_count = this->indices()->null_count();
+  // Without nulls in the dictionary, only null indices decode to null. The
+  // dictionary must still be scanned when the indices hold no nulls, because
+  // non-null indices may refer to null dictionary entries.
+  if (this->dictionary()->null_count() == 0) {
+    return indices_null_count;
+  }
+
+  ARROW_ASSIGN_OR_RAISE(int64_t dictionary_null_count,
+                        CompactDictionaryNullValues(data_));
+  return dictionary_null_count + indices_null_count;
+}
+
 Result<std::shared_ptr<Array>> DictionaryArray::Compact(MemoryPool* pool) const {
   std::shared_ptr<Array> compact_dictionary;
   ARROW_ASSIGN_OR_RAISE(std::unique_ptr<Buffer> transpose_map,
diff --git a/cpp/src/arrow/array/array_dict.h b/cpp/src/arrow/array/array_dict.h
index 9aa0a7bcc2d66..bbc224c3d86d8 100644
--- a/cpp/src/arrow/array/array_dict.h
+++ b/cpp/src/arrow/array/array_dict.h
@@ -96,6 +96,18 @@ class ARROW_EXPORT DictionaryArray : public Array {
       const std::shared_ptr<DataType>& type, const std::shared_ptr<Array>& dictionary,
       const int32_t* transpose_map, MemoryPool* pool = default_memory_pool()) const;
 
+  /// \brief Count the number of null values as the dictionary array is decoded.
+  ///
+  /// Both the null indices and the non-null indices referring to a null
+  /// dictionary value are counted.
+  Result<int64_t> CountNullValues() const;
+
+  /// \brief Compact this DictionaryArray
+  ///
+  /// This method returns a compacted dictionary array. All the
+  /// values in the dictionary are referenced by indices.
+  ///
+  /// \param[in] pool a pool to allocate the array data from
   Result<std::shared_ptr<Array>> Compact(MemoryPool* pool = default_memory_pool()) const;
 
   /// \brief Determine whether dictionary arrays may be compared without unification