From 77c12f1ba9fa99f8c37d99827f43ded799a551ce Mon Sep 17 00:00:00 2001 From: "Paulito Palmes, PhD" Date: Fri, 18 Oct 2024 14:58:13 +0100 Subject: [PATCH] refactor and reformat by lsp --- src/skcrossvalidator.jl | 112 +++++++-------- src/sklearners.jl | 303 ++++++++++++++++++++-------------------- src/skpreprocessor.jl | 290 +++++++++++++++++++------------------- 3 files changed, 350 insertions(+), 355 deletions(-) diff --git a/src/skcrossvalidator.jl b/src/skcrossvalidator.jl index 713c45c..cbe37e3 100644 --- a/src/skcrossvalidator.jl +++ b/src/skcrossvalidator.jl @@ -1,7 +1,7 @@ module SKCrossValidators import PythonCall -const PYC=PythonCall +const PYC = PythonCall # standard included modules using DataFrames @@ -16,49 +16,49 @@ const metric_dict = Dict{String,PYC.Py}() const SKM = PYC.pynew() function __init__() - PYC.pycopy!(SKM, PYC.pyimport("sklearn.metrics")) - - metric_dict["roc_auc_score"] = SKM.roc_auc_score - metric_dict["accuracy_score"] = SKM.accuracy_score - metric_dict["auc"] = SKM.auc - metric_dict["average_precision_score"] = SKM.average_precision_score - metric_dict["balanced_accuracy_score"] = SKM.balanced_accuracy_score - metric_dict["brier_score_loss"] = SKM.brier_score_loss - metric_dict["classification_report"] = SKM.classification_report - metric_dict["cohen_kappa_score"] = SKM.cohen_kappa_score - metric_dict["confusion_matrix"] = SKM.confusion_matrix - metric_dict["f1_score"] = SKM.f1_score - metric_dict["fbeta_score"] = SKM.fbeta_score - metric_dict["hamming_loss"] = SKM.hamming_loss - metric_dict["hinge_loss"] = SKM.hinge_loss - metric_dict["log_loss"] = SKM.log_loss - metric_dict["matthews_corrcoef"] = SKM.matthews_corrcoef - metric_dict["multilabel_confusion_matrix"] = SKM.multilabel_confusion_matrix - metric_dict["precision_recall_curve"] = SKM.precision_recall_curve - metric_dict["precision_recall_fscore_support"] = SKM.precision_recall_fscore_support - metric_dict["precision_score"] = SKM.precision_score - metric_dict["recall_score"] = SKM.recall_score - metric_dict["roc_auc_score"] = SKM.roc_auc_score - metric_dict["roc_curve"] = SKM.roc_curve - metric_dict["jaccard_score"] = SKM.jaccard_score - metric_dict["zero_one_loss"] = SKM.zero_one_loss - # regression - metric_dict["mean_squared_error"] = SKM.mean_squared_error - metric_dict["mean_squared_log_error"] = SKM.mean_squared_log_error - metric_dict["mean_absolute_error"] = SKM.mean_absolute_error - metric_dict["median_absolute_error"] = SKM.median_absolute_error - metric_dict["r2_score"] = SKM.r2_score - metric_dict["max_error"] = SKM.max_error - metric_dict["mean_poisson_deviance"] = SKM.mean_poisson_deviance - metric_dict["mean_gamma_deviance"] = SKM.mean_gamma_deviance - metric_dict["mean_tweedie_deviance"] = SKM.mean_tweedie_deviance - metric_dict["explained_variance_score"] = SKM.explained_variance_score + PYC.pycopy!(SKM, PYC.pyimport("sklearn.metrics")) + + metric_dict["roc_auc_score"] = SKM.roc_auc_score + metric_dict["accuracy_score"] = SKM.accuracy_score + metric_dict["auc"] = SKM.auc + metric_dict["average_precision_score"] = SKM.average_precision_score + metric_dict["balanced_accuracy_score"] = SKM.balanced_accuracy_score + metric_dict["brier_score_loss"] = SKM.brier_score_loss + metric_dict["classification_report"] = SKM.classification_report + metric_dict["cohen_kappa_score"] = SKM.cohen_kappa_score + metric_dict["confusion_matrix"] = SKM.confusion_matrix + metric_dict["f1_score"] = SKM.f1_score + metric_dict["fbeta_score"] = SKM.fbeta_score + metric_dict["hamming_loss"] = SKM.hamming_loss + metric_dict["hinge_loss"] = SKM.hinge_loss + metric_dict["log_loss"] = SKM.log_loss + metric_dict["matthews_corrcoef"] = SKM.matthews_corrcoef + metric_dict["multilabel_confusion_matrix"] = SKM.multilabel_confusion_matrix + metric_dict["precision_recall_curve"] = SKM.precision_recall_curve + metric_dict["precision_recall_fscore_support"] = SKM.precision_recall_fscore_support + metric_dict["precision_score"] = SKM.precision_score + metric_dict["recall_score"] = SKM.recall_score + metric_dict["roc_auc_score"] = SKM.roc_auc_score + metric_dict["roc_curve"] = SKM.roc_curve + metric_dict["jaccard_score"] = SKM.jaccard_score + metric_dict["zero_one_loss"] = SKM.zero_one_loss + # regression + metric_dict["mean_squared_error"] = SKM.mean_squared_error + metric_dict["mean_squared_log_error"] = SKM.mean_squared_log_error + metric_dict["mean_absolute_error"] = SKM.mean_absolute_error + metric_dict["median_absolute_error"] = SKM.median_absolute_error + metric_dict["r2_score"] = SKM.r2_score + metric_dict["max_error"] = SKM.max_error + metric_dict["mean_poisson_deviance"] = SKM.mean_poisson_deviance + metric_dict["mean_gamma_deviance"] = SKM.mean_gamma_deviance + metric_dict["mean_tweedie_deviance"] = SKM.mean_tweedie_deviance + metric_dict["explained_variance_score"] = SKM.explained_variance_score end function checkfun(sfunc::String) if !(sfunc in keys(metric_dict)) println("$sfunc metric is not supported") - println("metric: ",keys(metric_dict)) + println("metric: ", keys(metric_dict)) error("Metric keyword error") end end @@ -87,39 +87,39 @@ and the following metrics for regression: - "max_error" - "explained_variance_score" """ -function crossvalidate(pl::Machine,X::DataFrame,Y::Vector, - sfunc::String; nfolds=10,verbose::Bool=true) +function crossvalidate(pl::Machine, X::DataFrame, Y::Vector, + sfunc::String; nfolds=10, verbose::Bool=true) - YC=Y + YC = Y if !(eltype(YC) <: Real) - YC = Y |> Vector{String} + YC = Y |> Vector{String} end checkfun(sfunc) pfunc = metric_dict[sfunc] - metric(a,b) = pfunc(a,b) |> (x -> PYC.pyconvert(Float64,x)) - crossvalidate(pl,X,YC,metric,nfolds,verbose) + metric(a, b) = pfunc(a, b) |> (x -> PYC.pyconvert(Float64, x)) + crossvalidate(pl, X, YC, metric, nfolds, verbose) end -function crossvalidate(pl::Machine,X::DataFrame,Y::Vector,sfunc::String,nfolds::Int) - crossvalidate(pl,X,Y,sfunc; nfolds) +function crossvalidate(pl::Machine, X::DataFrame, Y::Vector, sfunc::String, nfolds::Int) + crossvalidate(pl, X, Y, sfunc; nfolds) end -function crossvalidate(pl::Machine,X::DataFrame,Y::Vector,sfunc::String,verbose::Bool) - crossvalidate(pl,X,Y,sfunc; verbose) +function crossvalidate(pl::Machine, X::DataFrame, Y::Vector, sfunc::String, verbose::Bool) + crossvalidate(pl, X, Y, sfunc; verbose) end -function crossvalidate(pl::Machine,X::DataFrame,Y::Vector, - sfunc::String, nfolds::Int,verbose::Bool) - crossvalidate(pl,X,Y,sfunc; nfolds,verbose) +function crossvalidate(pl::Machine, X::DataFrame, Y::Vector, + sfunc::String, nfolds::Int, verbose::Bool) + crossvalidate(pl, X, Y, sfunc; nfolds, verbose) end -function crossvalidate(pl::Machine,X::DataFrame,Y::Vector, - sfunc::String,averagetype::String;nfolds=10,verbose::Bool=true) +function crossvalidate(pl::Machine, X::DataFrame, Y::Vector, + sfunc::String, averagetype::String; nfolds=10, verbose::Bool=true) checkfun(sfunc) pfunc = metric_dict[sfunc] - metric(a,b) = pfunc(a,b,average=averagetype) |> (x -> PYC.pyconvert(Float64,x)) - crossvalidate(pl,X,Y,metric,nfolds,verbose) + metric(a, b) = pfunc(a, b, average=averagetype) |> (x -> PYC.pyconvert(Float64, x)) + crossvalidate(pl, X, Y, metric, nfolds, verbose) end diff --git a/src/sklearners.jl b/src/sklearners.jl index 2c8111d..42e3469 100644 --- a/src/sklearners.jl +++ b/src/sklearners.jl @@ -1,7 +1,7 @@ module SKLearners import PythonCall -const PYC=PythonCall +const PYC = PythonCall # standard included modules using DataFrames @@ -14,81 +14,81 @@ export fit, fit!, transform, transform! export SKLearner, sklearners const learner_dict = Dict{String,PYC.Py}() -const ENS = PYC.pynew() -const LM = PYC.pynew() -const DA = PYC.pynew() -const NN = PYC.pynew() -const SVM = PYC.pynew() -const TREE = PYC.pynew() -const ANN = PYC.pynew() -const GP = PYC.pynew() -const KR = PYC.pynew() -const NB = PYC.pynew() -const ISO = PYC.pynew() +const ENS = PYC.pynew() +const LM = PYC.pynew() +const DA = PYC.pynew() +const NN = PYC.pynew() +const SVM = PYC.pynew() +const TREE = PYC.pynew() +const ANN = PYC.pynew() +const GP = PYC.pynew() +const KR = PYC.pynew() +const NB = PYC.pynew() +const ISO = PYC.pynew() function __init__() - PYC.pycopy!(ENS , PYC.pyimport("sklearn.ensemble")) - PYC.pycopy!(LM , PYC.pyimport("sklearn.linear_model")) - PYC.pycopy!(DA , PYC.pyimport("sklearn.discriminant_analysis")) - PYC.pycopy!(NN , PYC.pyimport("sklearn.neighbors")) - PYC.pycopy!(SVM , PYC.pyimport("sklearn.svm")) - PYC.pycopy!(TREE, PYC.pyimport("sklearn.tree")) - PYC.pycopy!(ANN , PYC.pyimport("sklearn.neural_network")) - PYC.pycopy!(GP , PYC.pyimport("sklearn.gaussian_process")) - PYC.pycopy!(KR , PYC.pyimport("sklearn.kernel_ridge")) - PYC.pycopy!(NB , PYC.pyimport("sklearn.naive_bayes")) - PYC.pycopy!(ISO , PYC.pyimport("sklearn.isotonic")) - - # Available scikit-learn learners. - learner_dict["AdaBoostClassifier"] = ENS - learner_dict["BaggingClassifier"] = ENS - learner_dict["ExtraTreesClassifier"] = ENS - learner_dict["VotingClassifier"] = ENS - learner_dict["GradientBoostingClassifier"] = ENS - learner_dict["RandomForestClassifier"] = ENS - learner_dict["QuadraticDiscriminantAnalysis"] = DA - learner_dict["LinearDiscriminantAnalysis"] = DA - learner_dict["LogisticRegression"] = LM - learner_dict["PassiveAggressiveClassifier"] = LM - learner_dict["RidgeClassifier"] = LM - learner_dict["RidgeClassifierCV"] = LM - learner_dict["SGDClassifier"] = LM - learner_dict["KNeighborsClassifier"] = NN - learner_dict["RadiusNeighborsClassifier"] = NN - learner_dict["NearestCentroid"] = NN - learner_dict["SVC"] = SVM - learner_dict["LinearSVC"] = SVM - learner_dict["NuSVC"] = SVM - learner_dict["MLPClassifier"] = ANN - learner_dict["GaussianProcessClassifier"] = GP - learner_dict["DecisionTreeClassifier"] = TREE - learner_dict["GaussianNB"] = NB - learner_dict["MultinomialNB"] = NB - learner_dict["ComplementNB"] = NB - learner_dict["BernoulliNB"] = NB - learner_dict["SVR"] = SVM - learner_dict["Ridge"] = LM - learner_dict["RidgeCV"] = LM - learner_dict["Lasso"] = LM - learner_dict["ElasticNet"] = LM - learner_dict["Lars"] = LM - learner_dict["LassoLars"] = LM - learner_dict["OrthogonalMatchingPursuit"] = LM - learner_dict["BayesianRidge"] = LM - learner_dict["ARDRegression"] = LM - learner_dict["SGDRegressor"] = LM - learner_dict["PassiveAggressiveRegressor"] = LM - learner_dict["KernelRidge"] = KR - learner_dict["KNeighborsRegressor"] = NN - learner_dict["RadiusNeighborsRegressor"] = NN - learner_dict["GaussianProcessRegressor"] = GP - learner_dict["DecisionTreeRegressor"] = TREE - learner_dict["RandomForestRegressor"] = ENS - learner_dict["ExtraTreesRegressor"] = ENS - learner_dict["AdaBoostRegressor"] = ENS - learner_dict["GradientBoostingRegressor"] = ENS - learner_dict["IsotonicRegression"] = ISO - learner_dict["MLPRegressor"] = ANN + PYC.pycopy!(ENS, PYC.pyimport("sklearn.ensemble")) + PYC.pycopy!(LM, PYC.pyimport("sklearn.linear_model")) + PYC.pycopy!(DA, PYC.pyimport("sklearn.discriminant_analysis")) + PYC.pycopy!(NN, PYC.pyimport("sklearn.neighbors")) + PYC.pycopy!(SVM, PYC.pyimport("sklearn.svm")) + PYC.pycopy!(TREE, PYC.pyimport("sklearn.tree")) + PYC.pycopy!(ANN, PYC.pyimport("sklearn.neural_network")) + PYC.pycopy!(GP, PYC.pyimport("sklearn.gaussian_process")) + PYC.pycopy!(KR, PYC.pyimport("sklearn.kernel_ridge")) + PYC.pycopy!(NB, PYC.pyimport("sklearn.naive_bayes")) + PYC.pycopy!(ISO, PYC.pyimport("sklearn.isotonic")) + + # Available scikit-learn learners. + learner_dict["AdaBoostClassifier"] = ENS + learner_dict["BaggingClassifier"] = ENS + learner_dict["ExtraTreesClassifier"] = ENS + learner_dict["VotingClassifier"] = ENS + learner_dict["GradientBoostingClassifier"] = ENS + learner_dict["RandomForestClassifier"] = ENS + learner_dict["QuadraticDiscriminantAnalysis"] = DA + learner_dict["LinearDiscriminantAnalysis"] = DA + learner_dict["LogisticRegression"] = LM + learner_dict["PassiveAggressiveClassifier"] = LM + learner_dict["RidgeClassifier"] = LM + learner_dict["RidgeClassifierCV"] = LM + learner_dict["SGDClassifier"] = LM + learner_dict["KNeighborsClassifier"] = NN + learner_dict["RadiusNeighborsClassifier"] = NN + learner_dict["NearestCentroid"] = NN + learner_dict["SVC"] = SVM + learner_dict["LinearSVC"] = SVM + learner_dict["NuSVC"] = SVM + learner_dict["MLPClassifier"] = ANN + learner_dict["GaussianProcessClassifier"] = GP + learner_dict["DecisionTreeClassifier"] = TREE + learner_dict["GaussianNB"] = NB + learner_dict["MultinomialNB"] = NB + learner_dict["ComplementNB"] = NB + learner_dict["BernoulliNB"] = NB + learner_dict["SVR"] = SVM + learner_dict["Ridge"] = LM + learner_dict["RidgeCV"] = LM + learner_dict["Lasso"] = LM + learner_dict["ElasticNet"] = LM + learner_dict["Lars"] = LM + learner_dict["LassoLars"] = LM + learner_dict["OrthogonalMatchingPursuit"] = LM + learner_dict["BayesianRidge"] = LM + learner_dict["ARDRegression"] = LM + learner_dict["SGDRegressor"] = LM + learner_dict["PassiveAggressiveRegressor"] = LM + learner_dict["KernelRidge"] = KR + learner_dict["KNeighborsRegressor"] = NN + learner_dict["RadiusNeighborsRegressor"] = NN + learner_dict["GaussianProcessRegressor"] = GP + learner_dict["DecisionTreeRegressor"] = TREE + learner_dict["RandomForestRegressor"] = ENS + learner_dict["ExtraTreesRegressor"] = ENS + learner_dict["AdaBoostRegressor"] = ENS + learner_dict["GradientBoostingRegressor"] = ENS + learner_dict["IsotonicRegression"] = ISO + learner_dict["MLPRegressor"] = ANN end """ @@ -101,44 +101,44 @@ consult Scikitlearn documentation for arguments to pass. Implements `fit!` and `transform!`. """ mutable struct SKLearner <: Learner - name::String - model::Dict{Symbol,Any} - - function SKLearner(args=Dict{Symbol,Any}()) - default_args=Dict{Symbol,Any}( - :name => "sklearner", - :output => :class, - :learner => "LinearSVC", - :impl_args => Dict{Symbol,Any}() - ) - cargs = nested_dict_merge(default_args, args) - cargs[:name] = cargs[:name]*"_"*randstring(3) - skl = cargs[:learner] - if !(skl in keys(learner_dict)) - println("$skl is not supported.") - println() - sklearners() - error("Argument keyword error") - end - new(cargs[:name],cargs) - end + name::String + model::Dict{Symbol,Any} + + function SKLearner(args=Dict{Symbol,Any}()) + default_args = Dict{Symbol,Any}( + :name => "sklearner", + :output => :class, + :learner => "LinearSVC", + :impl_args => Dict{Symbol,Any}() + ) + cargs = nested_dict_merge(default_args, args) + cargs[:name] = cargs[:name] * "_" * randstring(3) + skl = cargs[:learner] + if !(skl in keys(learner_dict)) + println("$skl is not supported.") + println() + sklearners() + error("Argument keyword error") + end + new(cargs[:name], cargs) + end end function SKLearner(learner::String, args::Dict) - SKLearner(Dict(:learner => learner,:name=>learner, args...)) + SKLearner(Dict(:learner => learner, :name => learner, args...)) end function SKLearner(learner::String; args...) - SKLearner(Dict(:learner => learner,:name=>learner,:impl_args=>Dict(pairs(args)))) + SKLearner(Dict(:learner => learner, :name => learner, :impl_args => Dict(pairs(args)))) end -function (skl::SKLearner)(;objargs...) - skl.model[:impl_args] = Dict(pairs(objargs)) - skname = skl.model[:learner] - skobj = getproperty(learner_dict[skname],skname) - newskobj = skobj(;objargs...) - skl.model[:sklearner] = newskobj - return skl +function (skl::SKLearner)(; objargs...) + skl.model[:impl_args] = Dict(pairs(objargs)) + skname = skl.model[:learner] + skobj = getproperty(learner_dict[skname], skname) + newskobj = skobj(; objargs...) + skl.model[:sklearner] = newskobj + return skl end """ @@ -147,65 +147,64 @@ end List the available scikitlearn machine learners. """ function sklearners() - learners = keys(learner_dict) |> collect |> x-> sort(x,lt=(x,y)->lowercase(x) collect |> x -> sort(x, lt=(x, y) -> lowercase(x) < lowercase(y)) + println("syntax: SKLearner(name::String, args::Dict=Dict())") + println("where 'name' can be one of:") + println() + [print(learner, " ") for learner in learners] + println() + println() + println("and 'args' are the corresponding learner's initial parameters.") + println("Note: Consult Scikitlearn's online help for more details about the learner's arguments.") end function fit!(skl::SKLearner, xx::DataFrame, yy::Vector)::Nothing - # normalize inputs - x = xx |> Array - y = yy - skl.model[:predtype] = :numeric - if !(eltype(yy) <: Real) - y = yy |> Vector{String} - skl.model[:predtype] = :alpha - end - - impl_args = copy(skl.model[:impl_args]) - learner = skl.model[:learner] - py_learner = getproperty(learner_dict[learner],learner) - - # Assign CombineML-specific defaults if required - if learner == "RadiusNeighborsClassifier" - if get(impl_args, :outlier_label, nothing) == nothing - impl_options[:outlier_label] = labels[rand(1:size(labels, 1))] + # normalize inputs + x = xx |> Array + y = yy + skl.model[:predtype] = :numeric + if !(eltype(yy) <: Real) + y = yy |> Vector{String} + skl.model[:predtype] = :alpha + end + + impl_args = copy(skl.model[:impl_args]) + learner = skl.model[:learner] + py_learner = getproperty(learner_dict[learner], learner) + + # Assign CombineML-specific defaults if required + if learner == "RadiusNeighborsClassifier" + if get(impl_args, :outlier_label, nothing) === nothing + impl_args[:outlier_label] = "radiusnn" * "_" * randstring(3) + end end - end - - # Train - modelobj = py_learner(;impl_args...) - modelobj.fit(x,y) - skl.model[:sklearner] = modelobj - skl.model[:impl_args] = impl_args - return nothing + + # Train + modelobj = py_learner(; impl_args...) + modelobj.fit(x, y) + skl.model[:sklearner] = modelobj + skl.model[:impl_args] = impl_args + return nothing end function fit(skl::SKLearner, xx::DataFrame, y::Vector)::SKLearner - fit!(skl,xx,y) - return deepcopy(skl) + fit!(skl, xx, y) + return deepcopy(skl) end function transform!(skl::SKLearner, xx::DataFrame)::Vector - x = deepcopy(xx) |> Array - sklearner = skl.model[:sklearner] - res = sklearner.predict(x) - if skl.model[:predtype] == :numeric - predn = PYC.pyconvert(Vector{Float64},res) - return predn - else - predc = PYC.pyconvert(Vector{String},res) - return predc - end + x = deepcopy(xx) |> Array + sklearner = skl.model[:sklearner] + res = sklearner.predict(x) + if skl.model[:predtype] == :numeric + predn = PYC.pyconvert(Vector{Float64}, res) + return predn + else + predc = PYC.pyconvert(Vector{String}, res) + return predc + end end -transform(skl::SKLearner, xx::DataFrame)::Vector = transform!(skl,xx) +transform(skl::SKLearner, xx::DataFrame)::Vector = transform!(skl, xx) end - diff --git a/src/skpreprocessor.jl b/src/skpreprocessor.jl index 47c0f96..a38d135 100644 --- a/src/skpreprocessor.jl +++ b/src/skpreprocessor.jl @@ -1,7 +1,7 @@ module SKPreprocessors import PythonCall -const PYC=PythonCall +const PYC = PythonCall # standard included modules using DataFrames @@ -14,83 +14,83 @@ export fit, fit!, transform, transform! export SKPreprocessor, skpreprocessors const preprocessor_dict = Dict{String,PYC.Py}() -const DEC = PYC.pynew() -const FS = PYC.pynew() -const IMP = PYC.pynew() +const DEC = PYC.pynew() +const FS = PYC.pynew() +const IMP = PYC.pynew() const PREP = PYC.pynew() function __init__() - PYC.pycopy!(DEC , PYC.pyimport("sklearn.decomposition")) - PYC.pycopy!(FS , PYC.pyimport("sklearn.feature_selection",)) - PYC.pycopy!(IMP , PYC.pyimport("sklearn.impute")) - PYC.pycopy!(PREP, PYC.pyimport("sklearn.preprocessing")) - - # Available scikit-learn learners. - preprocessor_dict["DictionaryLearning"] = DEC - preprocessor_dict["FactorAnalysis"] = DEC - preprocessor_dict["FastICA"] = DEC - preprocessor_dict["IncrementalPCA"] = DEC - preprocessor_dict["KernelPCA"] = DEC - preprocessor_dict["LatentDirichletAllocation"] = DEC - preprocessor_dict["MiniBatchDictionaryLearning"] = DEC - preprocessor_dict["MiniBatchSparsePCA"] = DEC - preprocessor_dict["NMF"] = DEC - preprocessor_dict["PCA"] = DEC - preprocessor_dict["SparsePCA"] = DEC - preprocessor_dict["SparseCoder"] = DEC - preprocessor_dict["TruncatedSVD"] = DEC - preprocessor_dict["dict_learning"] = DEC - preprocessor_dict["dict_learning_online"] = DEC - preprocessor_dict["fastica"] = DEC - preprocessor_dict["non_negative_factorization"] = DEC - preprocessor_dict["sparse_encode"] = DEC - preprocessor_dict["GenericUnivariateSelect"] = FS - preprocessor_dict["SelectPercentile"] = FS - preprocessor_dict["SelectKBest"] = FS - preprocessor_dict["SelectFpr"] = FS - preprocessor_dict["SelectFdr"] = FS - preprocessor_dict["SelectFromModel"] = FS - preprocessor_dict["SelectFwe"] = FS - preprocessor_dict["RFE"] = FS - preprocessor_dict["RFECV"] = FS - preprocessor_dict["VarianceThreshold"] = FS - preprocessor_dict["chi2"] = FS - preprocessor_dict["f_classif"] = FS - preprocessor_dict["f_regression"] = FS - preprocessor_dict["mutual_info_classif"] = FS - preprocessor_dict["mutual_info_regression"] = FS - preprocessor_dict["SimpleImputer"] = IMP - preprocessor_dict["MissingIndicator"] = IMP - preprocessor_dict["Binarizer"] = PREP - preprocessor_dict["FunctionTransformer"] = PREP - preprocessor_dict["KBinsDiscretizer"] = PREP - preprocessor_dict["KernelCenterer"] = PREP - preprocessor_dict["LabelBinarizer"] = PREP - preprocessor_dict["LabelEncoder"] = PREP - preprocessor_dict["MultiLabelBinarizer"] = PREP - preprocessor_dict["MaxAbsScaler"] = PREP - preprocessor_dict["MinMaxScaler"] = PREP - preprocessor_dict["Normalizer"] = PREP - preprocessor_dict["OneHotEncoder"] = PREP - preprocessor_dict["OrdinalEncoder"] = PREP - preprocessor_dict["PolynomialFeatures"] = PREP - preprocessor_dict["PowerTransformer"] = PREP - preprocessor_dict["QuantileTransformer"] = PREP - preprocessor_dict["RobustScaler"] = PREP - preprocessor_dict["StandardScaler"] = PREP - #"IterativeImputer" => IMP.IterativeImputer, - #"KNNImputer" => IMP.KNNImputer, - #"add_dummy_feature" => PREP.add_dummy_feature, - #"binarize" => PREP.binarize, - #"label_binarize" => PREP.label_binarize, - #"maxabs_scale" => PREP.maxabs_scale, - #"minmax_scale" => PREP.minmax_scale, - #"normalize" => PREP.normalize, - #"quantile_transform" => PREP.quantile_transform, - #"robust_scale" => PREP.robust_scale, - #"scale" => PREP.scale, - #"power_transform" => PREP.power_transform + PYC.pycopy!(DEC, PYC.pyimport("sklearn.decomposition")) + PYC.pycopy!(FS, PYC.pyimport("sklearn.feature_selection",)) + PYC.pycopy!(IMP, PYC.pyimport("sklearn.impute")) + PYC.pycopy!(PREP, PYC.pyimport("sklearn.preprocessing")) + + # Available scikit-learn learners. + preprocessor_dict["DictionaryLearning"] = DEC + preprocessor_dict["FactorAnalysis"] = DEC + preprocessor_dict["FastICA"] = DEC + preprocessor_dict["IncrementalPCA"] = DEC + preprocessor_dict["KernelPCA"] = DEC + preprocessor_dict["LatentDirichletAllocation"] = DEC + preprocessor_dict["MiniBatchDictionaryLearning"] = DEC + preprocessor_dict["MiniBatchSparsePCA"] = DEC + preprocessor_dict["NMF"] = DEC + preprocessor_dict["PCA"] = DEC + preprocessor_dict["SparsePCA"] = DEC + preprocessor_dict["SparseCoder"] = DEC + preprocessor_dict["TruncatedSVD"] = DEC + preprocessor_dict["dict_learning"] = DEC + preprocessor_dict["dict_learning_online"] = DEC + preprocessor_dict["fastica"] = DEC + preprocessor_dict["non_negative_factorization"] = DEC + preprocessor_dict["sparse_encode"] = DEC + preprocessor_dict["GenericUnivariateSelect"] = FS + preprocessor_dict["SelectPercentile"] = FS + preprocessor_dict["SelectKBest"] = FS + preprocessor_dict["SelectFpr"] = FS + preprocessor_dict["SelectFdr"] = FS + preprocessor_dict["SelectFromModel"] = FS + preprocessor_dict["SelectFwe"] = FS + preprocessor_dict["RFE"] = FS + preprocessor_dict["RFECV"] = FS + preprocessor_dict["VarianceThreshold"] = FS + preprocessor_dict["chi2"] = FS + preprocessor_dict["f_classif"] = FS + preprocessor_dict["f_regression"] = FS + preprocessor_dict["mutual_info_classif"] = FS + preprocessor_dict["mutual_info_regression"] = FS + preprocessor_dict["SimpleImputer"] = IMP + preprocessor_dict["MissingIndicator"] = IMP + preprocessor_dict["Binarizer"] = PREP + preprocessor_dict["FunctionTransformer"] = PREP + preprocessor_dict["KBinsDiscretizer"] = PREP + preprocessor_dict["KernelCenterer"] = PREP + preprocessor_dict["LabelBinarizer"] = PREP + preprocessor_dict["LabelEncoder"] = PREP + preprocessor_dict["MultiLabelBinarizer"] = PREP + preprocessor_dict["MaxAbsScaler"] = PREP + preprocessor_dict["MinMaxScaler"] = PREP + preprocessor_dict["Normalizer"] = PREP + preprocessor_dict["OneHotEncoder"] = PREP + preprocessor_dict["OrdinalEncoder"] = PREP + preprocessor_dict["PolynomialFeatures"] = PREP + preprocessor_dict["PowerTransformer"] = PREP + preprocessor_dict["QuantileTransformer"] = PREP + preprocessor_dict["RobustScaler"] = PREP + preprocessor_dict["StandardScaler"] = PREP + #"IterativeImputer" => IMP.IterativeImputer, + #"KNNImputer" => IMP.KNNImputer, + #"add_dummy_feature" => PREP.add_dummy_feature, + #"binarize" => PREP.binarize, + #"label_binarize" => PREP.label_binarize, + #"maxabs_scale" => PREP.maxabs_scale, + #"minmax_scale" => PREP.minmax_scale, + #"normalize" => PREP.normalize, + #"quantile_transform" => PREP.quantile_transform, + #"robust_scale" => PREP.robust_scale, + #"scale" => PREP.scale, + #"power_transform" => PREP.power_transform end """ @@ -104,99 +104,95 @@ documentation for arguments to pass. Implements `fit!` and `transform!`. """ mutable struct SKPreprocessor <: Transformer - name::String - model::Dict{Symbol,Any} - - function SKPreprocessor(args=Dict()) - default_args=Dict( - :name => "skprep", - :preprocessor => "PCA", - :autocomponent=>false, - :impl_args => Dict() - ) - cargs = nested_dict_merge(default_args, args) - cargs[:name] = cargs[:name]*"_"*randstring(3) - prep = cargs[:preprocessor] - if !(prep in keys(preprocessor_dict)) - println("$prep is not supported.") - println() - skpreprocessors() - error("Argument keyword error") - end - new(cargs[:name],cargs) - end + name::String + model::Dict{Symbol,Any} + + function SKPreprocessor(args=Dict()) + default_args = Dict( + :name => "skprep", + :preprocessor => "PCA", + :autocomponent => false, + :impl_args => Dict() + ) + cargs = nested_dict_merge(default_args, args) + cargs[:name] = cargs[:name] * "_" * randstring(3) + prep = cargs[:preprocessor] + if !(prep in keys(preprocessor_dict)) + println("$prep is not supported.") + println() + skpreprocessors() + error("Argument keyword error") + end + new(cargs[:name], cargs) + end end -function SKPreprocessor(prep::String,args::Dict) - SKPreprocessor(Dict(:preprocessor => prep,:name=>prep,args...)) +function SKPreprocessor(prep::String, args::Dict) + SKPreprocessor(Dict(:preprocessor => prep, :name => prep, args...)) end function SKPreprocessor(prep::String; args...) - SKPreprocessor(Dict(:preprocessor => prep,:name=>prep,:impl_args=>Dict(pairs(args)))) + SKPreprocessor(Dict(:preprocessor => prep, :name => prep, :impl_args => Dict(pairs(args)))) end -function (skp::SKPreprocessor)(;objargs...) - skp.model[:impl_args] = Dict(pairs(objargs)) - prepname = skp.model[:preprocessor] - skobj = getproperty(preprocessor_dict[prepname],prepname) - newskobj = skobj(;objargs...) - skp.model[:skpreprocessor] = newskobj - return skp +function (skp::SKPreprocessor)(; objargs...) + skp.model[:impl_args] = Dict(pairs(objargs)) + prepname = skp.model[:preprocessor] + skobj = getproperty(preprocessor_dict[prepname], prepname) + newskobj = skobj(; objargs...) + skp.model[:skpreprocessor] = newskobj + return skp end function skpreprocessors() - processors = keys(preprocessor_dict) |> collect |> x-> sort(x,lt=(x,y)->lowercase(x) collect |> x -> sort(x, lt=(x, y) -> lowercase(x) < lowercase(y)) + println("syntax: SKPreprocessor(name::String, args::Dict=Dict())") + println("where *name* can be one of:") + println() + [print(processor, " ") for processor in processors] + println() + println() + println("and *args* are the corresponding preprocessor's initial parameters.") + println("Note: Please consult Scikitlearn's online help for more details about the preprocessor's arguments.") end -function fit!(skp::SKPreprocessor, x::DataFrame, yc::Vector=[])::Nothing - features = x |> Array - y = yc - #if !(eltype(yc) <: Real) - # y = yc |> Vector{String} - #end - - impl_args = copy(skp.model[:impl_args]) - autocomp = skp.model[:autocomponent] - if autocomp == true - cols = ncol(x) - ncomponents = 1 - if cols > 0 - ncomponents = round(sqrt(cols),digits=0) |> Integer - push!(impl_args,:n_components => ncomponents) - end - end - preprocessor = skp.model[:preprocessor] - py_preprocessor = getproperty(preprocessor_dict[preprocessor],preprocessor) - - # Train model - preproc = py_preprocessor(;impl_args...) - preproc.fit(features) - skp.model[:skpreprocessor] = preproc - skp.model[:impl_args] = impl_args - return nothing +function fit!(skp::SKPreprocessor, x::DataFrame, _::Vector=[])::Nothing + features = x |> Array + + impl_args = copy(skp.model[:impl_args]) + autocomp = skp.model[:autocomponent] + if autocomp == true + cols = ncol(x) + ncomponents = 1 + if cols > 0 + ncomponents = round(sqrt(cols), digits=0) |> Integer + push!(impl_args, :n_components => ncomponents) + end + end + preprocessor = skp.model[:preprocessor] + py_preprocessor = getproperty(preprocessor_dict[preprocessor], preprocessor) + + # Train model + preproc = py_preprocessor(; impl_args...) + preproc.fit(features) + skp.model[:skpreprocessor] = preproc + skp.model[:impl_args] = impl_args + return nothing end function fit(skp::SKPreprocessor, x::DataFrame, y::Vector=[])::SKPreprocessor - fit!(skp,x,y) - return deepcopy(skp) + fit!(skp, x, y) + return deepcopy(skp) end function transform!(skp::SKPreprocessor, x::DataFrame)::DataFrame - features = deepcopy(x) |> Array - model=skp.model[:skpreprocessor] - res = (model.transform(features)) - PYC.pyconvert(Matrix,res) |> x->DataFrame(x,:auto) + features = deepcopy(x) |> Array + model = skp.model[:skpreprocessor] + res = (model.transform(features)) + PYC.pyconvert(Matrix, res) |> x -> DataFrame(x, :auto) end -transform(skp::SKPreprocessor, x::DataFrame)::DataFrame = transform!(skp,x) +transform(skp::SKPreprocessor, x::DataFrame)::DataFrame = transform!(skp, x) end