From 9cab2d6ebce3cf205923a26140b4bda2443907b3 Mon Sep 17 00:00:00 2001
From: Tyler Thomas <36181311+tylerjthomas9@users.noreply.github.com>
Date: Thu, 13 Oct 2022 08:59:39 -0700
Subject: [PATCH] Switch from PyCall.jl to PythonCall.jl (#15)

* Initial conversion from PyCall to PythonCall

* Remove PyCall references

* run format check

* fix __init__ (do not return cv)

* bump julia version in docs to v1.6

* commit @ericphanson's suggestions

* fix to_pandas

* fix to_pandas

Co-authored-by: Tyler Thomas <tyler@saguarocm.om>
---
 .github/workflows/docs.yml   |  2 +-
 .gitignore                   |  1 +
 CondaPkg.toml                |  3 ++
 Project.toml                 | 10 +++---
 README.md                    | 18 ++--------
 deps/build.jl                |  6 ----
 examples/binary.jl           |  8 +++--
 examples/cross_validation.jl | 13 +++----
 examples/learning_to_rank.jl | 15 ++++----
 examples/multiclass.jl       | 15 ++++----
 examples/regression.jl       |  7 ++--
 examples/return_best.jl      |  9 ++---
 src/CatBoost.jl              | 66 ++++++++----------------------------
 test/runtests.jl             |  4 +--
 14 files changed, 66 insertions(+), 111 deletions(-)
 create mode 100644 CondaPkg.toml
 delete mode 100644 deps/build.jl

diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml
index 9853c1b..f7e4e80 100644
--- a/.github/workflows/docs.yml
+++ b/.github/workflows/docs.yml
@@ -21,7 +21,7 @@ jobs:
       - uses: actions/checkout@v2
       - uses: julia-actions/setup-julia@latest
         with:
-          version: 1.5 # earliest supported version
+          version: '1.6' # earliest supported version; quoted so YAML keeps it a string
       - uses: actions/cache@v2
         with:
           path: ~/.julia/artifacts
diff --git a/.gitignore b/.gitignore
index 41665f8..f8f509c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -25,3 +25,4 @@ Manifest.toml
 
 # Custom.
 catboost_info
+.CondaPkg*
diff --git a/CondaPkg.toml b/CondaPkg.toml
new file mode 100644
index 0000000..aa91286
--- /dev/null
+++ b/CondaPkg.toml
@@ -0,0 +1,3 @@
+[deps.catboost]
+channel = "conda-forge"
+version = "=1.1"
diff --git a/Project.toml b/Project.toml
index dba6272..7d05a7a 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,23 +1,21 @@
 name = "CatBoost"
 uuid = "e2e10f9a-a85d-4fa9-b6b2-639a32100a12"
 authors = ["Beacon Biosignals, Inc."]
-version = "0.1.2"
+version = "0.2.0"
 
 [deps]
-Conda = "8f4d0f93-b110-5947-807f-2305c1781a2d"
 DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
 OrderedCollections = "bac558e1-5e72-5ebc-8fee-abe8a469f55d"
-PyCall = "438e738f-606a-5dbb-bf0a-cddfbfd45ab0"
+PythonCall = "6099a3de-0909-46bc-b1f4-468b9a2dfc0d"
 Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"
 
 [compat]
 Aqua = "0.5"
-Conda = "1.5"
 DataFrames = "0.22, 1"
 OrderedCollections = "1.4"
-PyCall = "1.9"
+PythonCall = "0.9"
 Tables = "1.4"
-julia = "1.5"
+julia = "1.6"
 
 [extras]
 Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595"
diff --git a/README.md b/README.md
index e7d08f3..5789c7d 100644
--- a/README.md
+++ b/README.md
@@ -10,18 +10,6 @@
 
 Julia interface to [CatBoost](https://catboost.ai/).
 
-## Setting up PyCall
-
-Please follow the PyCall guidelines described in [PyCall.jl](https://github.com/JuliaPy/PyCall.jl).
-
-We highly recommend using a Julia-specific Python environment to handle dependencies. We recommend that users follow the build instructions in [Conda.jl](https://github.com/JuliaPy/Conda.jl).
-
-If users have installed [miniconda](https://docs.conda.io/en/latest/miniconda.html) on their local machine, we recommend checking out the Julia-specific Python environment (which is usually located at `$HOME/.julia/conda/3`) and installing `catboost` there with `pip`:
-
-```
-pip install catboost
-```
-
 ## Example
 
 ```julia
@@ -29,9 +17,9 @@ module Regression
 
 using CatBoost
 
-train_data = [[1, 4, 5, 6], [4, 5, 6, 7], [30, 40, 50, 60]]
-eval_data = [[2, 4, 6, 8], [1, 4, 50, 60]]
-train_labels = [10, 20, 30]
+train_data = PyList([[1, 4, 5, 6], [4, 5, 6, 7], [30, 40, 50, 60]])
+eval_data = PyList([[2, 4, 6, 8], [1, 4, 50, 60]])
+train_labels = PyList([10, 20, 30])
 
 # Initialize CatBoostRegressor
 model = CatBoostRegressor(iterations = 2, learning_rate = 1, depth = 2)
diff --git a/deps/build.jl b/deps/build.jl
deleted file mode 100644
index 7939a91..0000000
--- a/deps/build.jl
+++ /dev/null
@@ -1,6 +0,0 @@
-using PyCall
-
-let dependencies = ["catboost==0.24.8"]
-    pip = pyimport("pip")
-    pip.main(["install"; split(get(ENV, "PIPFLAGS", "")); dependencies])
-end
diff --git a/examples/binary.jl b/examples/binary.jl
index 52b3c7c..991de67 100644
--- a/examples/binary.jl
+++ b/examples/binary.jl
@@ -1,12 +1,14 @@
 module Binary
 
-using CatBoost, DataFrames
+using CatBoost
+using DataFrames
+using PythonCall
 
 # Initialize data
-cat_features = [0, 1]
+cat_features = pylist([0, 1])
 train_data = DataFrame([["a", "a", "c"], ["b", "b", "d"], [1, 4, 30], [4, 5, 40],
                         [5, 6, 50], [6, 7, 60]], :auto)
-train_labels = [1, 1, -1]
+train_labels = pylist([1, 1, -1])
 eval_data = DataFrame([["a", "a"], ["b", "d"], [2, 1], [4, 4], [6, 50], [8, 60]], :auto)
 
 # Initialize CatBoostClassifier
diff --git a/examples/cross_validation.jl b/examples/cross_validation.jl
index 72ed6ee..dddaf34 100644
--- a/examples/cross_validation.jl
+++ b/examples/cross_validation.jl
@@ -3,18 +3,19 @@ module CrossValidation
 
 using CatBoost
 using DataFrames
+using PythonCall
 
-cv_data = [["France", 1924, 44], ["USA", 1932, 37], ["Switzerland", 1928, 25],
-           ["Norway", 1952, 30], ["Japan", 1972, 35], ["Mexico", 1968, 112]]
+cv_data = pylist([["France", 1924, 44], ["USA", 1932, 37], ["Switzerland", 1928, 25],
+                  ["Norway", 1952, 30], ["Japan", 1972, 35], ["Mexico", 1968, 112]])
 
-labels = [1, 1, 0, 0, 0, 1]
+labels = pylist([1, 1, 0, 0, 0, 1])
 
-cat_features = [0]
+cat_features = pylist([0])
 
 cv_dataset = Pool(; data=cv_data, label=labels, cat_features=cat_features)
 
-params = Dict("iterations" => 100, "depth" => 2, "loss_function" => "Logloss",
-              "verbose" => false)
+params = PyDict(Dict("iterations" => 100, "depth" => 2, "loss_function" => "Logloss",
+                     "verbose" => false))
 
 scores = cv(cv_dataset; fold_count=2, params)
 
diff --git a/examples/learning_to_rank.jl b/examples/learning_to_rank.jl
index f6267d3..4b59b78 100644
--- a/examples/learning_to_rank.jl
+++ b/examples/learning_to_rank.jl
@@ -2,8 +2,11 @@ module LearningToRank
 
 using CatBoost
 using DataFrames
+using PythonCall
 
 train, test = load_dataset(:msrank_10k)
+train.sort_values(2; inplace=true)
+test.sort_values(2; inplace=true)
 x_train = train.drop([0, 1]; axis=1).values
 y_train = train[1].values
 queries_train = train[2].values
@@ -13,8 +16,8 @@ y_test = test[1].values
 queries_test = test[2].values
 
 # Important dims.
-num_documents, num_features = size(x_train)
-num_queries = size(unique(queries_train))[1]
+num_documents, num_features = size(pyconvert(Array, x_train))
+num_queries = length(unique(pyconvert(Array, queries_train)))  # distinct query ids
 println("Data dims: $((num_documents, num_features))")
 println("Num queries: $(num_queries)")
 
@@ -28,10 +31,10 @@ train = Pool(; data=x_train, label=y_train, group_id=queries_train)
 test = Pool(; data=x_test, label=y_test, group_id=queries_test)
 
 # small number of iterations to not slow down CI too much
-default_parameters = Dict("iterations" => 10, "loss_function" => "RMSE",
-                          "custom_metric" => ["MAP:top=10", "PrecisionAt:top=10",
-                                              "RecallAt:top=10"], "verbose" => false,
-                          "random_seed" => 314159)
+default_parameters = PyDict(Dict("iterations" => 10, "loss_function" => "RMSE",
+                                 "custom_metric" => ["MAP:top=10", "PrecisionAt:top=10",
+                                                     "RecallAt:top=10"], "verbose" => false,
+                                 "random_seed" => 314159))
 
 function fit_model(params, train_pool, test_pool)
     model = catboost.CatBoost(params)
diff --git a/examples/multiclass.jl b/examples/multiclass.jl
index c581f48..2efa3f5 100644
--- a/examples/multiclass.jl
+++ b/examples/multiclass.jl
@@ -1,17 +1,18 @@
 module MultiClass
 
 using CatBoost
+using PythonCall
 
-train_data = [["summer", 1924, 44], ["summer", 1932, 37], ["winter", 1980, 37],
-              ["summer", 2012, 204]]
+train_data = PyList([["summer", 1924, 44], ["summer", 1932, 37], ["winter", 1980, 37],
+                     ["summer", 2012, 204]])
 
-eval_data = [["winter", 1996, 197], ["winter", 1968, 37], ["summer", 2002, 77],
-             ["summer", 1948, 59]]
+eval_data = PyList([["winter", 1996, 197], ["winter", 1968, 37], ["summer", 2002, 77],
+                    ["summer", 1948, 59]])
 
-cat_features = [0]
+cat_features = PyList([0])
 
-train_label = ["France", "USA", "USA", "UK"]
-eval_label = ["USA", "France", "USA", "UK"]
+train_label = PyList(["France", "USA", "USA", "UK"])
+eval_label = PyList(["USA", "France", "USA", "UK"])
 
 train_dataset = Pool(; data=train_data, label=train_label, cat_features=cat_features)
 
diff --git a/examples/regression.jl b/examples/regression.jl
index dd97d30..cf5768f 100644
--- a/examples/regression.jl
+++ b/examples/regression.jl
@@ -1,10 +1,11 @@
 module Regression
 
 using CatBoost
+using PythonCall
 
-train_data = [[1, 4, 5, 6], [4, 5, 6, 7], [30, 40, 50, 60]]
-eval_data = [[2, 4, 6, 8], [1, 4, 50, 60]]
-train_labels = [10, 20, 30]
+train_data = PyList([[1, 4, 5, 6], [4, 5, 6, 7], [30, 40, 50, 60]])
+eval_data = PyList([[2, 4, 6, 8], [1, 4, 50, 60]])
+train_labels = PyList([10, 20, 30])
 
 # Initialize CatBoostRegressor
 model = CatBoostRegressor(; iterations=2, learning_rate=1, depth=2)
diff --git a/examples/return_best.jl b/examples/return_best.jl
index 780a49c..32b55bb 100644
--- a/examples/return_best.jl
+++ b/examples/return_best.jl
@@ -1,14 +1,15 @@
 module ReturnBestOnMetric
 
 using CatBoost
+using PythonCall
 
-train_data = [[0, 3], [4, 1], [8, 1], [9, 1]]
+train_data = PyList([[0, 3], [4, 1], [8, 1], [9, 1]])
 
-train_labels = [0, 0, 1, 1]
+train_labels = PyList([0, 0, 1, 1])
 
-eval_data = [[2, 1], [3, 1], [9, 0], [5, 3]]
+eval_data = PyList([[2, 1], [3, 1], [9, 0], [5, 3]])
 
-eval_labels = [0, 1, 1, 0]
+eval_labels = PyList([0, 1, 1, 0])
 
 eval_dataset = Pool(; data=eval_data, label=eval_labels)
 
diff --git a/src/CatBoost.jl b/src/CatBoost.jl
index 49514a4..3d8a198 100644
--- a/src/CatBoost.jl
+++ b/src/CatBoost.jl
@@ -1,7 +1,6 @@
 module CatBoost
 
-using PyCall
-using Conda
+using PythonCall
 using DataFrames
 using OrderedCollections
 using Tables
@@ -24,40 +23,17 @@ export load_dataset
 ##### _init_
 #####
 
-const catboost = PyNULL()
-const catboost_datasets = PyNULL()
-const pandas = PyNULL()
-
-function load_python_deps!()
-    copy!(catboost, pyimport("catboost"))
-    copy!(catboost_datasets, pyimport("catboost.datasets"))
-    copy!(pandas, pyimport("pandas"))
-    return nothing
-end
+const catboost = PythonCall.pynew()
+const catboost_datasets = PythonCall.pynew()
+const pandas = PythonCall.pynew()
 
 function __init__()
-    try
-        load_python_deps!()
-    catch ee
-        if PyCall.conda
-            Conda.pip_interop(true)
-            Conda.pip("install", ["catboost", "pandas"])
-            load_python_deps!()
-        else
-            typeof(ee) <: PyCall.PyError || rethrow(ee)
-            @warn("""
-                 Python Dependencies not installed
-                 Please either:
-                 - Rebuild PyCall to use Conda, by running in the julia REPL:
-                 - `ENV["PYTHON"]=""; Pkg.build("PyCall"); Pkg.build("CatBoost")`
-                 - Or install the depencences, eg by running pip
-                 - `pip install catboost pandas`
-                 """)
-        end
-    end
+    PythonCall.pycopy!(catboost, pyimport("catboost"))
+    PythonCall.pycopy!(catboost_datasets, pyimport("catboost.datasets"))
+    PythonCall.pycopy!(pandas, pyimport("pandas"))
 
     @doc """
-        cv(pool::PyObject; kwargs...) -> DataFrame
+        cv(pool::Py; kwargs...) -> DataFrame
 
     Accepts a [`CatBoost.Pool`](@ref) positional argument to specify the training data,
     and keyword arguments to configure the settings. See the python documentation below
@@ -83,7 +59,7 @@ end
          pairs=nothing, delimiter='\t', has_header=false, weight=nothing,
          group_id = nothing, group_weight=nothing, subgroup_id=nothing,
          pairs_weight=nothing, baseline=nothing, features_names=nothing,
-         thread_count = -1) -> PyObject
+         thread_count = -1) -> Py
 
 Creates a `Pool` object holding training data and labels. `data` may also be passed
 as a keyword argument.
@@ -112,7 +88,7 @@ end
 ##### Cross validation
 #####
 
-cv(pool::PyObject; kwargs...) = pandas_to_df(catboost.cv(pool; kwargs...))
+cv(pool::Py; kwargs...) = pandas_to_df(catboost.cv(pool; kwargs...))
 
 #####
 ##### Conversion utilities
@@ -122,8 +98,7 @@ cv(pool::PyObject; kwargs...) = pandas_to_df(catboost.cv(pool; kwargs...))
     to_catboost(arg)
 
 `to_catboost` is called on each argument passed to [`fit`](@ref), [`predict`](@ref), [`predict_proba`](@ref), and [`cv`](@ref)
-to allow customization of the conversion of Julia types to python types. If `to_catboost` emits a Julia type, then
-PyCall will try to convert it appropriately (automatically).
+to allow customization of the conversion of Julia types to python types.
 
 By default, `to_catboost` simply checks if the argument satisfies `Tables.istable(arg)`, and if so, it outputs
 a corresponding pandas table, and otherwise passes it on.
@@ -135,25 +110,12 @@ to_catboost(arg) = Tables.istable(arg) ? to_pandas(arg) : arg
 # utility for calling `to_catboost` on each argument of a function
 all_to_catboost(args) = (to_catboost(arg) for arg in args)
 
-# the Julia-side code does not copy the columns, but the `pandas.DataFrame`
-# constructor seems to make a copy here. Maybe that can be avoided?
 function to_pandas(tbl)
-    # ensure we have a column table
-    col_table = Tables.columns(tbl)
-    # write it in a way that pandas will understand (after PyCall conversion)
-    dict_table = OrderedDict(col => Tables.getcolumn(col_table, col)
-                             for col in Tables.columnnames(col_table))
-    return pandas.DataFrame(; data=dict_table)
+    return pytable(tbl, :pandas)
 end
 
-function pandas_to_df(pandas_df::PyObject)
-    colnames = map(pandas_df.columns) do c
-        ret = c isa PyObject ? PyAny(c) : c
-        return ret isa Int ? ret + 1 : ret
-    end
-    df = DataFrame(Any[Array(getproperty(pandas_df, c).values) for c in colnames],
-                   map(Symbol, colnames))
-    return df
+function pandas_to_df(pandas_df::Py)
+    return DataFrame(PyTable(pandas_df))
 end
 
 #####
diff --git a/test/runtests.jl b/test/runtests.jl
index c7b907f..13d7d1a 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -1,4 +1,4 @@
-using Test, CatBoost, DataFrames, PyCall
+using Test, CatBoost, DataFrames, PythonCall
 using Aqua
 
 EXAMPLES_DIR = joinpath(@__DIR__, "..", "examples")
@@ -6,7 +6,7 @@ EXAMPLES_DIR = joinpath(@__DIR__, "..", "examples")
 @testset "`to_pandas` and `pandas_to_df`" begin
     df = DataFrame(; floats=0.5:0.5:3.0, ints=1:6)
     pd = CatBoost.to_pandas(df)
-    @test pd isa PyObject
+    @test pd isa Py
     df2 = pandas_to_df(pd)
     @test df2 == df
 end