From 9cab2d6ebce3cf205923a26140b4bda2443907b3 Mon Sep 17 00:00:00 2001 From: Tyler Thomas <36181311+tylerjthomas9@users.noreply.github.com> Date: Thu, 13 Oct 2022 08:59:39 -0700 Subject: [PATCH] Switch from PyCall.jl to PythonCall.jl (#15) * Initial conversion from PyCall to PythonCall * Remove PyCall references * run format check * fix __init__ (do not return cv) * bump julia version in docs to v1.6 * commit @erikphanson's suggestions * fix to_pandas * fix to_pandas Co-authored-by: Tyler Thomas --- .github/workflows/docs.yml | 2 +- .gitignore | 1 + CondaPkg.toml | 3 ++ Project.toml | 10 +++--- README.md | 18 ++-------- deps/build.jl | 6 ---- examples/binary.jl | 8 +++-- examples/cross_validation.jl | 13 +++---- examples/learning_to_rank.jl | 15 ++++---- examples/multiclass.jl | 15 ++++---- examples/regression.jl | 7 ++-- examples/return_best.jl | 9 ++--- src/CatBoost.jl | 66 ++++++++---------------------------- test/runtests.jl | 4 +-- 14 files changed, 66 insertions(+), 111 deletions(-) create mode 100644 CondaPkg.toml delete mode 100644 deps/build.jl diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index 9853c1b..f7e4e80 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -21,7 +21,7 @@ jobs: - uses: actions/checkout@v2 - uses: julia-actions/setup-julia@latest with: - version: 1.5 # earliest supported version + version: 1.6 # earliest supported version - uses: actions/cache@v2 with: path: ~/.julia/artifacts diff --git a/.gitignore b/.gitignore index 41665f8..f8f509c 100644 --- a/.gitignore +++ b/.gitignore @@ -25,3 +25,4 @@ Manifest.toml # Custom. 
catboost_info +.CondaPkg* diff --git a/CondaPkg.toml b/CondaPkg.toml new file mode 100644 index 0000000..aa91286 --- /dev/null +++ b/CondaPkg.toml @@ -0,0 +1,3 @@ +[deps.catboost] +channel = "conda-forge" +version = "=1.1" diff --git a/Project.toml b/Project.toml index dba6272..7d05a7a 100644 --- a/Project.toml +++ b/Project.toml @@ -1,23 +1,21 @@ name = "CatBoost" uuid = "e2e10f9a-a85d-4fa9-b6b2-639a32100a12" authors = ["Beacon Biosignals, Inc."] -version = "0.1.2" +version = "0.2.0" [deps] -Conda = "8f4d0f93-b110-5947-807f-2305c1781a2d" DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" OrderedCollections = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" -PyCall = "438e738f-606a-5dbb-bf0a-cddfbfd45ab0" +PythonCall = "6099a3de-0909-46bc-b1f4-468b9a2dfc0d" Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c" [compat] Aqua = "0.5" -Conda = "1.5" DataFrames = "0.22, 1" OrderedCollections = "1.4" -PyCall = "1.9" +PythonCall = "0.9" Tables = "1.4" -julia = "1.5" +julia = "1.6" [extras] Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595" diff --git a/README.md b/README.md index e7d08f3..5789c7d 100644 --- a/README.md +++ b/README.md @@ -10,18 +10,6 @@ Julia interface to [CatBoost](https://catboost.ai/). -## Setting up PyCall - -Please follow the PyCall guidelines described in [PyCall.jl](https://github.com/JuliaPy/PyCall.jl). - -We highly recommend using a Julia-specific Python environment to handle dependencies. We recommend that users follow the build instructions in [Conda.jl](https://github.com/JuliaPy/Conda.jl). 
- -If users have installed [miniconda](https://docs.conda.io/en/latest/miniconda.html) on their local machine, we recommend checking out the Julia-specific Python environment (which is usually located at `$HOME/.julia/conda/3`) and installing `catboost` there with `pip`: - -``` -pip install catboost -``` - ## Example ```julia @@ -29,9 +17,9 @@ module Regression using CatBoost -train_data = [[1, 4, 5, 6], [4, 5, 6, 7], [30, 40, 50, 60]] -eval_data = [[2, 4, 6, 8], [1, 4, 50, 60]] -train_labels = [10, 20, 30] +train_data = PyList([[1, 4, 5, 6], [4, 5, 6, 7], [30, 40, 50, 60]]) +eval_data = PyList([[2, 4, 6, 8], [1, 4, 50, 60]]) +train_labels = PyList([10, 20, 30]) # Initialize CatBoostRegressor model = CatBoostRegressor(iterations = 2, learning_rate = 1, depth = 2) diff --git a/deps/build.jl b/deps/build.jl deleted file mode 100644 index 7939a91..0000000 --- a/deps/build.jl +++ /dev/null @@ -1,6 +0,0 @@ -using PyCall - -let dependencies = ["catboost==0.24.8"] - pip = pyimport("pip") - pip.main(["install"; split(get(ENV, "PIPFLAGS", "")); dependencies]) -end diff --git a/examples/binary.jl b/examples/binary.jl index 52b3c7c..991de67 100644 --- a/examples/binary.jl +++ b/examples/binary.jl @@ -1,12 +1,14 @@ module Binary -using CatBoost, DataFrames +using CatBoost +using DataFrames +using PythonCall # Initialize data -cat_features = [0, 1] +cat_features = pylist([0, 1]) train_data = DataFrame([["a", "a", "c"], ["b", "b", "d"], [1, 4, 30], [4, 5, 40], [5, 6, 50], [6, 7, 60]], :auto) -train_labels = [1, 1, -1] +train_labels = pylist([1, 1, -1]) eval_data = DataFrame([["a", "a"], ["b", "d"], [2, 1], [4, 4], [6, 50], [8, 60]], :auto) # Initialize CatBoostClassifier diff --git a/examples/cross_validation.jl b/examples/cross_validation.jl index 72ed6ee..dddaf34 100644 --- a/examples/cross_validation.jl +++ b/examples/cross_validation.jl @@ -3,18 +3,19 @@ module CrossValidation using CatBoost using DataFrames +using PythonCall -cv_data = [["France", 1924, 44], ["USA", 1932, 
37], ["Switzerland", 1928, 25], - ["Norway", 1952, 30], ["Japan", 1972, 35], ["Mexico", 1968, 112]] +cv_data = pylist([["France", 1924, 44], ["USA", 1932, 37], ["Switzerland", 1928, 25], + ["Norway", 1952, 30], ["Japan", 1972, 35], ["Mexico", 1968, 112]]) -labels = [1, 1, 0, 0, 0, 1] +labels = pylist([1, 1, 0, 0, 0, 1]) -cat_features = [0] +cat_features = pylist([0]) cv_dataset = Pool(; data=cv_data, label=labels, cat_features=cat_features) -params = Dict("iterations" => 100, "depth" => 2, "loss_function" => "Logloss", - "verbose" => false) +params = PyDict(Dict("iterations" => 100, "depth" => 2, "loss_function" => "Logloss", + "verbose" => false)) scores = cv(cv_dataset; fold_count=2, params) diff --git a/examples/learning_to_rank.jl b/examples/learning_to_rank.jl index f6267d3..4b59b78 100644 --- a/examples/learning_to_rank.jl +++ b/examples/learning_to_rank.jl @@ -2,8 +2,11 @@ module LearningToRank using CatBoost using DataFrames +using PythonCall train, test = load_dataset(:msrank_10k) +train.sort_values(2; inplace=true) +test.sort_values(2; inplace=true) x_train = train.drop([0, 1]; axis=1).values y_train = train[1].values queries_train = train[2].values @@ -13,8 +16,8 @@ y_test = test[1].values queries_test = test[2].values # Important dims. 
-num_documents, num_features = size(x_train) -num_queries = size(unique(queries_train))[1] +num_documents, num_features = size(pyconvert(Array, x_train)) +num_queries = length(unique(pyconvert(Array, queries_train))) println("Data dims: $((num_documents, num_features))") println("Num queries: $(num_queries)") @@ -28,10 +31,10 @@ train = Pool(; data=x_train, label=y_train, group_id=queries_train) test = Pool(; data=x_test, label=y_test, group_id=queries_test) # small number of iterations to not slow down CI too much -default_parameters = Dict("iterations" => 10, "loss_function" => "RMSE", - "custom_metric" => ["MAP:top=10", "PrecisionAt:top=10", - "RecallAt:top=10"], "verbose" => false, - "random_seed" => 314159) +default_parameters = PyDict(Dict("iterations" => 10, "loss_function" => "RMSE", + "custom_metric" => ["MAP:top=10", "PrecisionAt:top=10", + "RecallAt:top=10"], "verbose" => false, + "random_seed" => 314159)) function fit_model(params, train_pool, test_pool) model = catboost.CatBoost(params) diff --git a/examples/multiclass.jl b/examples/multiclass.jl index c581f48..2efa3f5 100644 --- a/examples/multiclass.jl +++ b/examples/multiclass.jl @@ -1,17 +1,18 @@ module MultiClass using CatBoost +using PythonCall -train_data = [["summer", 1924, 44], ["summer", 1932, 37], ["winter", 1980, 37], - ["summer", 2012, 204]] +train_data = PyList([["summer", 1924, 44], ["summer", 1932, 37], ["winter", 1980, 37], + ["summer", 2012, 204]]) -eval_data = [["winter", 1996, 197], ["winter", 1968, 37], ["summer", 2002, 77], - ["summer", 1948, 59]] +eval_data = PyList([["winter", 1996, 197], ["winter", 1968, 37], ["summer", 2002, 77], + ["summer", 1948, 59]]) -cat_features = [0] +cat_features = PyList([0]) -train_label = ["France", "USA", "USA", "UK"] -eval_label = ["USA", "France", "USA", "UK"] +train_label = PyList(["France", "USA", "USA", "UK"]) +eval_label = PyList(["USA", "France", "USA", "UK"]) train_dataset = Pool(; data=train_data, label=train_label, cat_features=cat_features) 
diff --git a/examples/regression.jl b/examples/regression.jl index dd97d30..cf5768f 100644 --- a/examples/regression.jl +++ b/examples/regression.jl @@ -1,10 +1,11 @@ module Regression using CatBoost +using PythonCall -train_data = [[1, 4, 5, 6], [4, 5, 6, 7], [30, 40, 50, 60]] -eval_data = [[2, 4, 6, 8], [1, 4, 50, 60]] -train_labels = [10, 20, 30] +train_data = PyList([[1, 4, 5, 6], [4, 5, 6, 7], [30, 40, 50, 60]]) +eval_data = PyList([[2, 4, 6, 8], [1, 4, 50, 60]]) +train_labels = PyList([10, 20, 30]) # Initialize CatBoostRegressor model = CatBoostRegressor(; iterations=2, learning_rate=1, depth=2) diff --git a/examples/return_best.jl b/examples/return_best.jl index 780a49c..32b55bb 100644 --- a/examples/return_best.jl +++ b/examples/return_best.jl @@ -1,14 +1,15 @@ module ReturnBestOnMetric using CatBoost +using PythonCall -train_data = [[0, 3], [4, 1], [8, 1], [9, 1]] +train_data = PyList([[0, 3], [4, 1], [8, 1], [9, 1]]) -train_labels = [0, 0, 1, 1] +train_labels = PyList([0, 0, 1, 1]) -eval_data = [[2, 1], [3, 1], [9, 0], [5, 3]] +eval_data = PyList([[2, 1], [3, 1], [9, 0], [5, 3]]) -eval_labels = [0, 1, 1, 0] +eval_labels = PyList([0, 1, 1, 0]) eval_dataset = Pool(; data=eval_data, label=eval_labels) diff --git a/src/CatBoost.jl b/src/CatBoost.jl index 49514a4..3d8a198 100644 --- a/src/CatBoost.jl +++ b/src/CatBoost.jl @@ -1,7 +1,6 @@ module CatBoost -using PyCall -using Conda +using PythonCall using DataFrames using OrderedCollections using Tables @@ -24,40 +23,17 @@ export load_dataset ##### _init_ ##### -const catboost = PyNULL() -const catboost_datasets = PyNULL() -const pandas = PyNULL() - -function load_python_deps!() - copy!(catboost, pyimport("catboost")) - copy!(catboost_datasets, pyimport("catboost.datasets")) - copy!(pandas, pyimport("pandas")) - return nothing -end +const catboost = PythonCall.pynew() +const catboost_datasets = PythonCall.pynew() +const pandas = PythonCall.pynew() function __init__() - try - load_python_deps!() - catch ee - if 
PyCall.conda - Conda.pip_interop(true) - Conda.pip("install", ["catboost", "pandas"]) - load_python_deps!() - else - typeof(ee) <: PyCall.PyError || rethrow(ee) - @warn(""" - Python Dependencies not installed - Please either: - - Rebuild PyCall to use Conda, by running in the julia REPL: - - `ENV["PYTHON"]=""; Pkg.build("PyCall"); Pkg.build("CatBoost")` - - Or install the depencences, eg by running pip - - `pip install catboost pandas` - """) - end - end + PythonCall.pycopy!(catboost, pyimport("catboost")) + PythonCall.pycopy!(catboost_datasets, pyimport("catboost.datasets")) + PythonCall.pycopy!(pandas, pyimport("pandas")) @doc """ - cv(pool::PyObject; kwargs...) -> DataFrame + cv(pool::Py; kwargs...) -> DataFrame Accepts a [`CatBoost.Pool`](@ref) positional argument to specify the training data, and keyword arguments to configure the settings. See the python documentation below @@ -83,7 +59,7 @@ end pairs=nothing, delimiter='\t', has_header=false, weight=nothing, group_id = nothing, group_weight=nothing, subgroup_id=nothing, pairs_weight=nothing, baseline=nothing, features_names=nothing, - thread_count = -1) -> PyObject + thread_count = -1) -> Py Creates a `Pool` object holding training data and labels. `data` may also be passed as a keyword argument. @@ -112,7 +88,7 @@ end ##### Cross validation ##### -cv(pool::PyObject; kwargs...) = pandas_to_df(catboost.cv(pool; kwargs...)) +cv(pool::Py; kwargs...) = pandas_to_df(catboost.cv(pool; kwargs...)) ##### ##### Conversion utilities @@ -122,8 +98,7 @@ cv(pool::PyObject; kwargs...) = pandas_to_df(catboost.cv(pool; kwargs...)) to_catboost(arg) `to_catboost` is called on each argument passed to [`fit`](@ref), [`predict`](@ref), [`predict_proba`](@ref), and [`cv`](@ref) -to allow customization of the conversion of Julia types to python types. If `to_catboost` emits a Julia type, then -PyCall will try to convert it appropriately (automatically). +to allow customization of the conversion of Julia types to python types. 
By default, `to_catboost` simply checks if the argument satisfies `Tables.istable(arg)`, and if so, it outputs a corresponding pandas table, and otherwise passes it on. @@ -135,25 +110,12 @@ to_catboost(arg) = Tables.istable(arg) ? to_pandas(arg) : arg # utility for calling `to_catboost` on each argument of a function all_to_catboost(args) = (to_catboost(arg) for arg in args) -# the Julia-side code does not copy the columns, but the `pandas.DataFrame` -# constructor seems to make a copy here. Maybe that can be avoided? function to_pandas(tbl) - # ensure we have a column table - col_table = Tables.columns(tbl) - # write it in a way that pandas will understand (after PyCall conversion) - dict_table = OrderedDict(col => Tables.getcolumn(col_table, col) - for col in Tables.columnnames(col_table)) - return pandas.DataFrame(; data=dict_table) + return pytable(tbl, :pandas) end -function pandas_to_df(pandas_df::PyObject) - colnames = map(pandas_df.columns) do c - ret = c isa PyObject ? PyAny(c) : c - return ret isa Int ? ret + 1 : ret - end - df = DataFrame(Any[Array(getproperty(pandas_df, c).values) for c in colnames], - map(Symbol, colnames)) - return df +function pandas_to_df(pandas_df::Py) + return DataFrame(PyTable(pandas_df)) end ##### diff --git a/test/runtests.jl b/test/runtests.jl index c7b907f..13d7d1a 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,4 +1,4 @@ -using Test, CatBoost, DataFrames, PyCall +using Test, CatBoost, DataFrames, PythonCall using Aqua EXAMPLES_DIR = joinpath(@__DIR__, "..", "examples") @@ -6,7 +6,7 @@ EXAMPLES_DIR = joinpath(@__DIR__, "..", "examples") @testset "`to_pandas` and `pandas_to_df`" begin df = DataFrame(; floats=0.5:0.5:3.0, ints=1:6) pd = CatBoost.to_pandas(df) - @test pd isa PyObject + @test pd isa Py df2 = pandas_to_df(pd) @test df2 == df end