Skip to content

Commit

Permalink
Add new distance functions for Multiclass and OrderedFactor. (#5)
Browse files Browse the repository at this point in the history
* Add new distance functions for Multiclass and OrderedFactor.

* Update src/TableDistances.jl

* Move CategoricalArrays to before CoDa. Remove unnecessary levelcode and type conversion.

* Add MulticlassDistance and OrderedFactorDistance to tests.

* Update result_type to return Bool instead of Float64 for MulticlassDistance.

* Add docstring and move from the top of file the new defined distances.

* Quick update.

* Update src/distances.jl

* Update src/distances.jl

Co-authored-by: Júlio Hoffimann <[email protected]>
  • Loading branch information
mrr00b00t and juliohm authored Oct 5, 2021
1 parent 0700289 commit 43eacfc
Show file tree
Hide file tree
Showing 5 changed files with 40 additions and 3 deletions.
2 changes: 2 additions & 0 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ authors = ["Júlio Hoffimann <[email protected]>", "José Augusto <jose.
version = "0.1.0"

[deps]
CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597"
CoDa = "5900dafe-f573-5c72-b367-76665857777b"
Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7"
ScientificTypes = "321657f4-b219-11e9-178b-2701a2544e81"
Expand All @@ -13,6 +14,7 @@ TableOperations = "ab02a1b2-a7df-11e8-156e-fb1833f50b87"
Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"

[compat]
CategoricalArrays = "0.10"
CoDa = "0.6"
Distances = "0.10"
ScientificTypes = "2.3"
Expand Down
3 changes: 2 additions & 1 deletion src/TableDistances.jl
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,11 @@ using TableOperations
using ScientificTypes
using Distances
using StringDistances
using CategoricalArrays
using CoDa
using Statistics

import Distances: pairwise
import Distances: pairwise, result_type

include("distances.jl")
include("normalizations.jl")
Expand Down
26 changes: 24 additions & 2 deletions src/distances.jl
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,32 @@
# Licensed under the MIT License. See LICENCE in the project root.
# ------------------------------------------------------------------

"""
MulticlassDistance()(x, y)
Return `true` if x and y are different classes, else return `false`.
"""
struct MulticlassDistance <: Metric end

(::MulticlassDistance)(x, y) = x != y

result_type(::MulticlassDistance, x, y) = Bool

"""
OrderedFactorDistance()(x, y)
Return the absolute value of the difference between the categorical codes of x and y.
"""
struct OrderedFactorDistance <: Metric end

(::OrderedFactorDistance)(x, y) = abs(levelcode(x) - levelcode(y))

result_type(::OrderedFactorDistance, x, y) = Float64

# Default distance for each scientific type. Every scientific type is mapped
# exactly once: the former `Hamming`/`Chebyshev` methods for `Multiclass` and
# `OrderedFactor` were overwritten by the definitions below, so only the
# effective ones are kept.
default_distance(::Type{Continuous}) = Distances.Euclidean()
default_distance(::Type{Count}) = Distances.Cityblock()
default_distance(::Type{<:Multiclass}) = MulticlassDistance()
default_distance(::Type{<:OrderedFactor}) = OrderedFactorDistance()
default_distance(::Type{Textual}) = StringDistances.Levenshtein()
default_distance(::Type{<:Compositional}) = CoDa.CoDaDistance()

Expand Down
1 change: 1 addition & 0 deletions test/Project.toml
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
[deps]
CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597"
CoDa = "5900dafe-f573-5c72-b367-76665857777b"
Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7"
ScientificTypes = "321657f4-b219-11e9-178b-2701a2544e81"
Expand Down
11 changes: 11 additions & 0 deletions test/runtests.jl
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
using TableDistances
using Tables
using Test
using CategoricalArrays
using CoDa
using Distances
using ScientificTypes
Expand All @@ -10,12 +11,15 @@ using ScientificTypes
# test data
table₁ = (a = rand(4), b = rand(Composition{5}, 4))
table₂ = (a = rand(6), b = rand(Composition{5}, 6))
table₃ = (a = categorical(["a", "b", "a", "c"]), b = categorical([1, 4, 1, 5]))

# specific columns
euclidcol₁ = Tables.getcolumn(table₁, :a)
euclidcol₂ = Tables.getcolumn(table₂, :a)
codacol₁ = Tables.getcolumn(table₁, :b)
codacol₂ = Tables.getcolumn(table₂, :b)
multiclass = Tables.getcolumn(table₃, :a)
ordered = Tables.getcolumn(table₃, :b)

# column normalization
D₁ = pairwise(TableDistance(normalize=true), table₁, table₂)
Expand All @@ -36,5 +40,12 @@ using ScientificTypes
D₂ = 0.5*pairwise(Euclidean(), euclidcol₁) +
0.5*pairwise(CoDaDistance(), codacol₁)
@test D₁ ≈ D₂

# pairwise with multiclass and ordered factor
table₃ = coerce(table₃, :a => Multiclass, :b => OrderedFactor)
D₁ = pairwise(TableDistance(normalize=false), table₃)
D₂ = 0.5*pairwise(TableDistances.MulticlassDistance(), multiclass) +
0.5*pairwise(TableDistances.OrderedFactorDistance(), ordered)
@test D₁ ≈ D₂
end
end

0 comments on commit 43eacfc

Please sign in to comment.