From 5a8ec6d7296c1c78bcf298f1bf0a527d9ce1589a Mon Sep 17 00:00:00 2001 From: Elias Carvalho <73039601+eliascarv@users.noreply.github.com> Date: Wed, 10 Apr 2024 14:38:38 -0300 Subject: [PATCH] Add 'DropNaN' transform (#274) --- docs/src/transforms.md | 6 ++ src/TableTransforms.jl | 1 + src/transforms.jl | 1 + src/transforms/dropnan.jl | 55 ++++++++++++++++++ test/shows.jl | 14 +++++ test/transforms.jl | 1 + test/transforms/dropnan.jl | 111 +++++++++++++++++++++++++++++++++++++ 7 files changed, 189 insertions(+) create mode 100644 src/transforms/dropnan.jl create mode 100644 test/transforms/dropnan.jl diff --git a/docs/src/transforms.md b/docs/src/transforms.md index 4b80c04c..ba467a73 100644 --- a/docs/src/transforms.md +++ b/docs/src/transforms.md @@ -80,6 +80,12 @@ Filter DropMissing ``` +## DropNaN + +```@docs +DropNaN +``` + ## DropExtrema ```@docs diff --git a/src/TableTransforms.jl b/src/TableTransforms.jl index f850822f..f4d9142a 100644 --- a/src/TableTransforms.jl +++ b/src/TableTransforms.jl @@ -62,6 +62,7 @@ export Sample, Filter, DropMissing, + DropNaN, DropExtrema, DropUnits, AbsoluteUnits, diff --git a/src/transforms.jl b/src/transforms.jl index 66ba95e0..7abc755b 100644 --- a/src/transforms.jl +++ b/src/transforms.jl @@ -275,6 +275,7 @@ include("transforms/sort.jl") include("transforms/sample.jl") include("transforms/filter.jl") include("transforms/dropmissing.jl") +include("transforms/dropnan.jl") include("transforms/dropextrema.jl") include("transforms/dropunits.jl") include("transforms/dropconstant.jl") diff --git a/src/transforms/dropnan.jl b/src/transforms/dropnan.jl new file mode 100644 index 00000000..979ecd3a --- /dev/null +++ b/src/transforms/dropnan.jl @@ -0,0 +1,55 @@ +# ------------------------------------------------------------------ +# Licensed under the MIT License. See LICENSE in the project root. +# ------------------------------------------------------------------ + +""" + DropNaN() + DropNaN(:) + +Drop all rows with NaN values in table. + + DropNaN(col₁, col₂, ..., colₙ) + DropNaN([col₁, col₂, ..., colₙ]) + DropNaN((col₁, col₂, ..., colₙ)) + +Drop all rows with NaN values in selected columns `col₁`, `col₂`, ..., `colₙ`. + + DropNaN(regex) + +Drop all rows with NaN values in columns that match with `regex`. + +# Examples + +```julia +DropNaN(2, 3, 4) +DropNaN([:b, :c, :d]) +DropNaN(("b", "c", "d")) +DropNaN(r"[bcd]") +``` +""" +struct DropNaN{S<:ColumnSelector} <: StatelessFeatureTransform + selector::S +end + +DropNaN() = DropNaN(AllSelector()) +DropNaN(cols) = DropNaN(selector(cols)) +DropNaN(cols::C...) where {C<:Column} = DropNaN(selector(cols)) + +isrevertible(::Type{<:DropNaN}) = false + +_isnan(_) = false +_isnan(x::Number) = isnan(x) + +function preprocess(transform::DropNaN, feat) + cols = Tables.columns(feat) + names = Tables.columnnames(cols) + snames = transform.selector(names) + ftrans = Filter(row -> all(!_isnan(row[nm]) for nm in snames)) + fprep = preprocess(ftrans, feat) + ftrans, fprep +end + +function applyfeat(::DropNaN, feat, prep) + ftrans, fprep = prep + applyfeat(ftrans, feat, fprep) +end diff --git a/test/shows.jl b/test/shows.jl index 46f79bdd..9c044334 100644 --- a/test/shows.jl +++ b/test/shows.jl @@ -175,6 +175,20 @@ └─ selector = [:a, :b, :c]""" end + @testset "DropNaN" begin + T = DropNaN(:a, :b, :c) + + # compact mode + iostr = sprint(show, T) + @test iostr == "DropNaN([:a, :b, :c])" + + # full mode + iostr = sprint(show, MIME("text/plain"), T) + @test iostr == """ + DropNaN transform + └─ selector = [:a, :b, :c]""" + end + @testset "DropExtrema" begin T = DropExtrema("a", low=0.25, high=0.75) diff --git a/test/transforms.jl b/test/transforms.jl index 8208cfbd..46a2c4ab 100644 --- a/test/transforms.jl +++ b/test/transforms.jl @@ -9,6 +9,7 @@ transformfiles = [ "sample.jl", "filter.jl", "dropmissing.jl", + "dropnan.jl", "dropextrema.jl", "dropunits.jl", "dropconstant.jl", diff --git a/test/transforms/dropnan.jl b/test/transforms/dropnan.jl new file mode 100644 index 00000000..68c5f22b --- /dev/null +++ b/test/transforms/dropnan.jl @@ -0,0 +1,111 @@ +@testset "DropNaN" begin + @test !isrevertible(DropNaN()) + + a = [1.8, 0.5, 1.2, 3.7, 5.0, NaN] + b = [6.0f0, 5.4f0, 5.4f0, NaN32, 5.5f0, 2.6f0] + c = [4.9, 5.1, NaN, 5.1, 8.6, 4.4] * u"m" + d = [NaN32, 1.0f0, 8.8f0, 0.1f0, 1.5f0, 9.5f0] * u"m" + e = ["yes", "no", "no", "yes", "yes", "no"] + t = Table(; a, b, c, d, e) + + T = DropNaN() + n, c = apply(T, t) + @test n.a == [0.5, 5.0] + @test n.b == [5.4f0, 5.5f0] + @test n.c == [5.1, 8.6] * u"m" + @test n.d == [1.0f0, 1.5f0] * u"m" + @test n.e == ["no", "yes"] + + # args... + # integers + T = DropNaN(1, 3) + n, c = apply(T, t) + @test isequal(n.a, [1.8, 0.5, 3.7, 5.0]) + @test isequal(n.b, [6.0f0, 5.4f0, NaN32, 5.5f0]) + @test isequal(n.c, [4.9, 5.1, 5.1, 8.6] * u"m") + @test isequal(n.d, [NaN32, 1.0f0, 0.1f0, 1.5f0] * u"m") + @test isequal(n.e, ["yes", "no", "yes", "yes"]) + + # symbols + T = DropNaN(:a, :c) + n, c = apply(T, t) + @test isequal(n.a, [1.8, 0.5, 3.7, 5.0]) + @test isequal(n.b, [6.0f0, 5.4f0, NaN32, 5.5f0]) + @test isequal(n.c, [4.9, 5.1, 5.1, 8.6] * u"m") + @test isequal(n.d, [NaN32, 1.0f0, 0.1f0, 1.5f0] * u"m") + @test isequal(n.e, ["yes", "no", "yes", "yes"]) + + # strings + T = DropNaN("a", "c") + n, c = apply(T, t) + @test isequal(n.a, [1.8, 0.5, 3.7, 5.0]) + @test isequal(n.b, [6.0f0, 5.4f0, NaN32, 5.5f0]) + @test isequal(n.c, [4.9, 5.1, 5.1, 8.6] * u"m") + @test isequal(n.d, [NaN32, 1.0f0, 0.1f0, 1.5f0] * u"m") + @test isequal(n.e, ["yes", "no", "yes", "yes"]) + + # vector + # integers + T = DropNaN([2, 4]) + n, c = apply(T, t) + @test isequal(n.a, [0.5, 1.2, 5.0, NaN]) + @test isequal(n.b, [5.4f0, 5.4f0, 5.5f0, 2.6f0]) + @test isequal(n.c, [5.1, NaN, 8.6, 4.4] * u"m") + @test isequal(n.d, [1.0f0, 8.8f0, 1.5f0, 9.5f0] * u"m") + @test isequal(n.e, ["no", "no", "yes", "no"]) + + # symbols + T = DropNaN([:b, :d]) + n, c = apply(T, t) + @test isequal(n.a, [0.5, 1.2, 5.0, NaN]) + @test isequal(n.b, [5.4f0, 5.4f0, 5.5f0, 2.6f0]) + @test isequal(n.c, [5.1, NaN, 8.6, 4.4] * u"m") + @test isequal(n.d, [1.0f0, 8.8f0, 1.5f0, 9.5f0] * u"m") + @test isequal(n.e, ["no", "no", "yes", "no"]) + + # strings + T = DropNaN(["b", "d"]) + n, c = apply(T, t) + @test isequal(n.a, [0.5, 1.2, 5.0, NaN]) + @test isequal(n.b, [5.4f0, 5.4f0, 5.5f0, 2.6f0]) + @test isequal(n.c, [5.1, NaN, 8.6, 4.4] * u"m") + @test isequal(n.d, [1.0f0, 8.8f0, 1.5f0, 9.5f0] * u"m") + @test isequal(n.e, ["no", "no", "yes", "no"]) + + # tuple + # integers + T = DropNaN((1, 2, 3)) + n, c = apply(T, t) + @test isequal(n.a, [1.8, 0.5, 5.0]) + @test isequal(n.b, [6.0f0, 5.4f0, 5.5f0]) + @test isequal(n.c, [4.9, 5.1, 8.6] * u"m") + @test isequal(n.d, [NaN32, 1.0f0, 1.5f0] * u"m") + @test isequal(n.e, ["yes", "no", "yes"]) + + # symbols + T = DropNaN((:a, :b, :c)) + n, c = apply(T, t) + @test isequal(n.a, [1.8, 0.5, 5.0]) + @test isequal(n.b, [6.0f0, 5.4f0, 5.5f0]) + @test isequal(n.c, [4.9, 5.1, 8.6] * u"m") + @test isequal(n.d, [NaN32, 1.0f0, 1.5f0] * u"m") + @test isequal(n.e, ["yes", "no", "yes"]) + + # strings + T = DropNaN(("a", "b", "c")) + n, c = apply(T, t) + @test isequal(n.a, [1.8, 0.5, 5.0]) + @test isequal(n.b, [6.0f0, 5.4f0, 5.5f0]) + @test isequal(n.c, [4.9, 5.1, 8.6] * u"m") + @test isequal(n.d, [NaN32, 1.0f0, 1.5f0] * u"m") + @test isequal(n.e, ["yes", "no", "yes"]) + + # regex + T = DropNaN(r"[bcd]") + n, c = apply(T, t) + @test isequal(n.a, [0.5, 5.0, NaN]) + @test isequal(n.b, [5.4f0, 5.5f0, 2.6f0]) + @test isequal(n.c, [5.1, 8.6, 4.4] * u"m") + @test isequal(n.d, [1.0f0, 1.5f0, 9.5f0] * u"m") + @test isequal(n.e, ["no", "yes", "no"]) +end