From adaca6ddf866ae1a6b668785a737237630f12925 Mon Sep 17 00:00:00 2001 From: Anton Piontkovskii Date: Fri, 27 Dec 2024 22:07:42 +0300 Subject: [PATCH] Improve roaring UDF --- ydb/apps/ydbd/ya.make | 1 + .../yql/udfs/common/roaring/roaring.cpp | 144 ++++++++++++++++++ .../common/roaring/test/canondata/result.json | 5 + .../test.test_intersect_/results.txt | 31 ++++ .../test.test_run_optimize_/results.txt | 35 +++++ .../results.txt | 33 ++++ .../common/roaring/test/cases/intersect.sql | 2 + .../roaring/test/cases/run_optimize.sql | 1 + .../test/cases/serialize_deserialize.sql | 3 + 9 files changed, 255 insertions(+) create mode 100644 ydb/library/yql/udfs/common/roaring/test/canondata/test.test_run_optimize_/results.txt create mode 100644 ydb/library/yql/udfs/common/roaring/test/cases/run_optimize.sql diff --git a/ydb/apps/ydbd/ya.make b/ydb/apps/ydbd/ya.make index a3357b493d99..75cb114d9319 100644 --- a/ydb/apps/ydbd/ya.make +++ b/ydb/apps/ydbd/ya.make @@ -53,6 +53,7 @@ PEERDIR( yql/essentials/udfs/common/hyperloglog yql/essentials/udfs/common/ip_base ydb/library/yql/udfs/common/knn + ydb/library/yql/udfs/common/roaring yql/essentials/udfs/common/json yql/essentials/udfs/common/json2 yql/essentials/udfs/common/math diff --git a/ydb/library/yql/udfs/common/roaring/roaring.cpp b/ydb/library/yql/udfs/common/roaring/roaring.cpp index 9699f8c7a601..5adbef9cf211 100644 --- a/ydb/library/yql/udfs/common/roaring/roaring.cpp +++ b/ydb/library/yql/udfs/common/roaring/roaring.cpp @@ -7,6 +7,7 @@ #include #include +#include #include #include #include @@ -30,6 +31,11 @@ namespace { { } + TRoaringWrapper(roaring_bitmap_t* bitmap) + : Roaring(bitmap) + { + } + ~TRoaringWrapper() { roaring_bitmap_free(Roaring); } @@ -105,6 +111,47 @@ namespace { } }; + class TRoaringAndNotWithBinary: public TBoxedValue { + public: + TRoaringAndNotWithBinary() { + } + + static TStringRef Name() { + return TStringRef::Of("AndNotWithBinary"); + } + + private: + TUnboxedValue Run(const IValueBuilder* valueBuilder, + const TUnboxedValuePod* args) const override { + Y_UNUSED(valueBuilder); + auto binaryString = args[1].AsStringRef(); + auto bitmap = DeserializePortable(binaryString); + + roaring_bitmap_andnot_inplace(GetBitmapFromArg(args[0]), bitmap); + roaring_bitmap_free(bitmap); + + return args[0]; + } + }; + + class TRoaringAndNot: public TBoxedValue { + public: + TRoaringAndNot() { + } + + static TStringRef Name() { + return TStringRef::Of("AndNot"); + } + + private: + TUnboxedValue Run(const IValueBuilder* valueBuilder, + const TUnboxedValuePod* args) const override { + Y_UNUSED(valueBuilder); + roaring_bitmap_andnot_inplace(GetBitmapFromArg(args[0]), GetBitmapFromArg(args[1])); + return args[0]; + } + }; + class TRoaringOr: public TBoxedValue { public: TRoaringOr() { @@ -223,6 +270,46 @@ namespace { TSourcePosition Pos_; }; + class TRoaringFromUint32List: public TBoxedValue { + public: + TRoaringFromUint32List(TSourcePosition pos) + : Pos_(pos) + { + } + + static TStringRef Name() { + return TStringRef::Of("FromUint32List"); + } + + private: + TUnboxedValue Run(const IValueBuilder* valueBuilder, + const TUnboxedValuePod* args) const override { + Y_UNUSED(valueBuilder); + try { + auto *b = roaring_bitmap_create(); + + const auto vector = args[0]; + const auto* elements = vector.GetElements(); + if (elements) { + for (auto& value : TArrayRef{elements, vector.GetListLength()}) { + roaring_bitmap_add(b, value.Get()); + } + } else { + TUnboxedValue value; + const auto it = vector.GetListIterator(); + while (it.Next(value)) { + roaring_bitmap_add(b, value.Get()); + } + } + + return TUnboxedValuePod(new TRoaringWrapper(b)); + } catch (const std::exception& e) { + UdfTerminate((TStringBuilder() << Pos_ << " " << e.what()).data()); + } + } + TSourcePosition Pos_; + }; + class TRoaringSerialize: public TBoxedValue { public: TRoaringSerialize() { @@ -266,6 +353,25 @@ namespace { } }; + class TRoaringRunOptimize: public TBoxedValue { + public: + TRoaringRunOptimize() { + } + + static TStringRef Name() { + return TStringRef::Of("RunOptimize"); + } + + private: + TUnboxedValue Run(const IValueBuilder* valueBuilder, + const TUnboxedValuePod* args) const override { + Y_UNUSED(valueBuilder); + auto bitmap = GetBitmapFromArg(args[0]); + roaring_bitmap_run_optimize(bitmap); + return args[0]; + } + }; + class TRoaringModule: public IUdfModule { public: TRoaringModule() { @@ -282,6 +388,7 @@ namespace { void GetAllFunctions(IFunctionsSink& sink) const final { sink.Add(TRoaringSerialize::Name()); sink.Add(TRoaringDeserialize::Name()); + sink.Add(TRoaringFromUint32List::Name()); sink.Add(TRoaringCardinality::Name()); @@ -292,6 +399,11 @@ namespace { sink.Add(TRoaringAndWithBinary::Name()); sink.Add(TRoaringAnd::Name()); + + sink.Add(TRoaringAndNotWithBinary::Name()); + sink.Add(TRoaringAndNot::Name()); + + sink.Add(TRoaringRunOptimize::Name()); } void CleanupOnTerminate() const final { @@ -312,6 +424,12 @@ namespace { if (!typesOnly) { builder.Implementation(new TRoaringDeserialize(builder.GetSourcePosition())); } + } else if (TRoaringFromUint32List::Name() == name) { + builder.Returns>().Args()->Add>(); + + if (!typesOnly) { + builder.Implementation(new TRoaringFromUint32List(builder.GetSourcePosition())); + } } else if (TRoaringSerialize::Name() == name) { builder.Returns(builder.SimpleType()) .Args() @@ -372,6 +490,32 @@ namespace { if (!typesOnly) { builder.Implementation(new TRoaringAnd()); } + } else if (TRoaringAndNotWithBinary::Name() == name) { + builder.Returns>() + .Args() + ->Add>>() + .Add>(); + + if (!typesOnly) { + builder.Implementation(new TRoaringAndNotWithBinary()); + } + } else if (TRoaringAndNot::Name() == name) { + builder.Returns>() + .Args() + ->Add>>() + .Add>>(); + + if (!typesOnly) { + builder.Implementation(new TRoaringAndNot()); + } + } else if (TRoaringRunOptimize::Name() == name) { + builder.Returns>() + .Args() + ->Add>>(); + + if (!typesOnly) { + builder.Implementation(new TRoaringRunOptimize()); + } } else { TStringBuilder sb; sb << "Unknown function: " << name.Data(); diff --git a/ydb/library/yql/udfs/common/roaring/test/canondata/result.json b/ydb/library/yql/udfs/common/roaring/test/canondata/result.json index 8c592fb352ad..e3ecf0d42e9d 100644 --- a/ydb/library/yql/udfs/common/roaring/test/canondata/result.json +++ b/ydb/library/yql/udfs/common/roaring/test/canondata/result.json @@ -18,5 +18,10 @@ { "uri": "file://test.test_union_/results.txt" } + ], + "test.test[run_optimize]": [ + { + "uri": "file://test.test_run_optimize_/results.txt" + } ] } diff --git a/ydb/library/yql/udfs/common/roaring/test/canondata/test.test_intersect_/results.txt b/ydb/library/yql/udfs/common/roaring/test/canondata/test.test_intersect_/results.txt index 326216669869..c464e4dddc4a 100644 --- a/ydb/library/yql/udfs/common/roaring/test/canondata/test.test_intersect_/results.txt +++ b/ydb/library/yql/udfs/common/roaring/test/canondata/test.test_intersect_/results.txt @@ -102,5 +102,36 @@ ] } ] + }; + { + "Write" = [ + { + "Type" = [ + "ListType"; + [ + "StructType"; + [ + [ + "AndNotList"; + [ + "ListType"; + [ + "DataType"; + "Uint32" + ] + ] + ] + ] + ] + ]; + "Data" = [ + [ + [ + "42" + ] + ] + ] + } + ] } ] \ No newline at end of file diff --git a/ydb/library/yql/udfs/common/roaring/test/canondata/test.test_run_optimize_/results.txt b/ydb/library/yql/udfs/common/roaring/test/canondata/test.test_run_optimize_/results.txt new file mode 100644 index 000000000000..e5dbe3672bbb --- /dev/null +++ b/ydb/library/yql/udfs/common/roaring/test/canondata/test.test_run_optimize_/results.txt @@ -0,0 +1,35 @@ +[ + { + "Write" = [ + { + "Type" = [ + "ListType"; + [ + "StructType"; + [ + [ + "RunOptimizeList"; + [ + "ListType"; + [ + "DataType"; + "Uint32" + ] + ] + ] + ] + ] + ]; + "Data" = [ + [ + [ + "10"; + "42"; + "567" + ] + ] + ] + } + ] + } +] \ No newline at end of file diff --git a/ydb/library/yql/udfs/common/roaring/test/canondata/test.test_serialize_deserialize_/results.txt b/ydb/library/yql/udfs/common/roaring/test/canondata/test.test_serialize_deserialize_/results.txt index 05f944c62233..3608154230a7 100644 --- a/ydb/library/yql/udfs/common/roaring/test/canondata/test.test_serialize_deserialize_/results.txt +++ b/ydb/library/yql/udfs/common/roaring/test/canondata/test.test_serialize_deserialize_/results.txt @@ -172,5 +172,38 @@ ] } ] + }; + { + "Write" = [ + { + "Type" = [ + "ListType"; + [ + "StructType"; + [ + [ + "DeserializedList"; + [ + "ListType"; + [ + "DataType"; + "Uint32" + ] + ] + ] + ] + ] + ]; + "Data" = [ + [ + [ + "10"; + "42"; + "567" + ] + ] + ] + } + ] } ] \ No newline at end of file diff --git a/ydb/library/yql/udfs/common/roaring/test/cases/intersect.sql b/ydb/library/yql/udfs/common/roaring/test/cases/intersect.sql index 60c968946ead..7220ab8dcdec 100644 --- a/ydb/library/yql/udfs/common/roaring/test/cases/intersect.sql +++ b/ydb/library/yql/udfs/common/roaring/test/cases/intersect.sql @@ -1,3 +1,5 @@ SELECT Roaring::Uint32List(Roaring::And(Roaring::Deserialize(left), Roaring::Deserialize(right))) AS AndList FROM Input; SELECT Roaring::Uint32List(Roaring::AndWithBinary(Roaring::Deserialize(right), left)) AS AndWithBinaryList FROM Input; SELECT Roaring::Uint32List(Roaring::AndWithBinary(Roaring::Deserialize(right), NULL)) AS AndWithBinaryListEmpty FROM Input; + +SELECT Roaring::Uint32List(Roaring::AndNot(Roaring::FromUint32List(AsList(1, 10, 42)), Roaring::FromUint32List(AsList(10, 1)))) AS AndNotList; diff --git a/ydb/library/yql/udfs/common/roaring/test/cases/run_optimize.sql b/ydb/library/yql/udfs/common/roaring/test/cases/run_optimize.sql new file mode 100644 index 000000000000..ef8be8afc5af --- /dev/null +++ b/ydb/library/yql/udfs/common/roaring/test/cases/run_optimize.sql @@ -0,0 +1 @@ +SELECT Roaring::Uint32List(Roaring::RunOptimize(Roaring::FromUint32List(AsList(10, 567, 42)))) AS RunOptimizeList; \ No newline at end of file diff --git a/ydb/library/yql/udfs/common/roaring/test/cases/serialize_deserialize.sql b/ydb/library/yql/udfs/common/roaring/test/cases/serialize_deserialize.sql index 9735432181c9..fd951962485b 100644 --- a/ydb/library/yql/udfs/common/roaring/test/cases/serialize_deserialize.sql +++ b/ydb/library/yql/udfs/common/roaring/test/cases/serialize_deserialize.sql @@ -13,3 +13,6 @@ FROM Input; SELECT ListTake(ListSkip(Roaring::Uint32List(Roaring::Deserialize(binaryString)), 10), 1) AS EmptyList FROM Input; + +SELECT Roaring::Uint32List(Roaring::FromUint32List(AsList(10, 567, 42))) AS DeserializedList +FROM Input; \ No newline at end of file