From dc9b3d227ffc251fb6ae5dd4dc6e4d2d9b86f6d3 Mon Sep 17 00:00:00 2001 From: rileyh Date: Thu, 19 Dec 2024 10:47:49 -0600 Subject: [PATCH] [#183] Document that training.param_grid is deprecated --- docs/.buildinfo | 2 +- docs/.buildinfo.bak | 2 +- docs/_sources/config.md.txt | 28 ++++++++++++++++---- docs/_sources/use_examples.md.txt | 2 +- docs/_static/documentation_options.js | 2 +- docs/column_mappings.html | 4 +-- docs/comparison_features.html | 4 +-- docs/comparisons.html | 4 +-- docs/config.html | 34 ++++++++++++++++++------- docs/feature_selection_transforms.html | 4 +-- docs/genindex.html | 4 +-- docs/index.html | 4 +-- docs/installation.html | 4 +-- docs/introduction.html | 4 +-- docs/link_tasks.html | 4 +-- docs/models.html | 4 +-- docs/objects.inv | Bin 501 -> 503 bytes docs/pipeline_features.html | 4 +-- docs/running_the_program.html | 4 +-- docs/search.html | 4 +-- docs/searchindex.js | 2 +- docs/substitutions.html | 4 +-- docs/use_examples.html | 6 ++--- sphinx-docs/config.md | 28 ++++++++++++++++---- sphinx-docs/use_examples.md | 2 +- 25 files changed, 108 insertions(+), 56 deletions(-) diff --git a/docs/.buildinfo b/docs/.buildinfo index bcf68bc..f042f08 100644 --- a/docs/.buildinfo +++ b/docs/.buildinfo @@ -1,4 +1,4 @@ # Sphinx build info version 1 # This file records the configuration used when building these files. When it is not found, a full rebuild will be done. -config: 3d084ea912736a6c4043e49bc2b58167 +config: 346c22873853f51d4bd34095fc5e3354 tags: 645f666f9bcd5a90fca523b33c5a78b7 diff --git a/docs/.buildinfo.bak b/docs/.buildinfo.bak index 2190a1c..bcf68bc 100644 --- a/docs/.buildinfo.bak +++ b/docs/.buildinfo.bak @@ -1,4 +1,4 @@ # Sphinx build info version 1 # This file records the configuration used when building these files. When it is not found, a full rebuild will be done. -config: a706061ae4b2d0ec765440a2505ca382 +config: 3d084ea912736a6c4043e49bc2b58167 tags: 645f666f9bcd5a90fca523b33c5a78b7 diff --git a/docs/_sources/config.md.txt b/docs/_sources/config.md.txt index b5ec9f7..21562ee 100644 --- a/docs/_sources/config.md.txt +++ b/docs/_sources/config.md.txt @@ -334,7 +334,7 @@ split_by_id_a = true decision = "drop_duplicate_with_threshold_ratio" n_training_iterations = 2 -param_grid = true +model_parameter_search = {strategy = "grid"} model_parameters = [ { type = "random_forest", maxDepth = [7], numTrees = [100], threshold = [0.05, 0.005], threshold_ratio = [1.2, 1.3] }, { type = "logistic_regression", threshold = [0.50, 0.65, 0.80], threshold_ratio = [1.0, 1.1] } @@ -360,7 +360,7 @@ split_by_id_a = true decision = "drop_duplicate_with_threshold_ratio" n_training_iterations = 10 -param_grid = false +model_parameter_search = {strategy = "explicit"} model_parameters = [ { type = "random_forest", maxDepth = 6, numTrees = 50, threshold = 0.5, threshold_ratio = 1.0 }, { type = "probit", threshold = 0.5, threshold_ratio = 1.0 } @@ -743,7 +743,6 @@ splits = [-1,0,6,11,9999] * `decision` -- Type: `string`. Optional. Specifies which decision function to use to create the final prediction. The first option is `drop_duplicate_a`, which drops any links for which a record in the `a` data set has a predicted match more than one time. The second option is `drop_duplicate_with_threshold_ratio` which only takes links for which the `a` record has the highest probability out of any other potential links, and the second best link for the `a` record is less than the `threshold_ratio`. * `threshold_ratio` -- Type: `float`. Optional. For use when `decision` is `drop_duplicate_with_threshold_ratio` . Specifies the smallest possible ratio to accept between a best and second best link for a given record. Can be used to specify a threshold ratio (beta threshold) to use for all models. Alternatively, unique threshold ratios can be specified in each individual `chosen_model` and `model_parameters` specification. * `model_parameters` -- Type: `list`. Specifies models to test out in the `model_exploration` task. See the [models](models) section for more information on model specifications. - * `param_grid` -- Type: `boolean`. Optional. If you would like to evaluate multiple hyper-parameters for a single model type in your `model_parameters` specification, you can give hyper-parameter inputs as arrays of length >= 1 instead of integers to allow one model per row specification with multiple model eval outputs. * `score_with_model` -- Type: `boolean`. If set to false, will skip the `apply_model` step of the matching task. Use this if you want to use the `run_all_steps` command and are just trying to generate potential links, such as for the creation of training data. * `n_training_iterations` -- Type: `integer`. Optional; default value is 10. The number of training iterations to use during the `model_exploration` task. * `scale_data` -- Type: `boolean`. Optional. Whether to scale the data as part of the machine learning pipeline. @@ -752,6 +751,25 @@ splits = [-1,0,6,11,9999] * `feature_importances` -- Type: `boolean`. Optional. Whether to record feature importances or coefficients for the training features when training the ML model. Set this to true to enable training step 3. +* Deprecated Attributes: + * `param_grid` (*Deprecated in version 4.0.0*) -- Type: `boolean`. Optional. + `param_grid` has been deprecated and will eventually be removed. Please use + the more flexible `model_parameter_search` option by replacing `param_grid + = false` + + with + + ```toml + model_parameter_search = {strategy = "explicit"} + ``` + + and replacing `param_grid = true` + + with + + ```toml + model_parameter_search = {strategy = "grid"} + ``` ``` @@ -769,7 +787,7 @@ feature_importances = true decision = "drop_duplicate_with_threshold_ratio" n_training_iterations = 10 -param_grid = false +model_parameter_search = {strategy = "explicit"} model_parameters = [ { type = "random_forest", maxDepth = 6, numTrees = 50 }, { type = "probit", threshold = 0.5} @@ -805,7 +823,7 @@ score_with_model = true feature_importances = true decision = "drop_duplicate_with_threshold_ratio" -param_grid = true +model_parameter_search = {strategy = "grid"} n_training_iterations = 10 model_parameters = [ { type = "logistic_regression", threshold = [0.5], threshold_ratio = [1.1]}, diff --git a/docs/_sources/use_examples.md.txt b/docs/_sources/use_examples.md.txt index 4d41811..bd1c2be 100644 --- a/docs/_sources/use_examples.md.txt +++ b/docs/_sources/use_examples.md.txt @@ -88,7 +88,7 @@ However, when this training data set is used for other years, the program does n score_with_model = true feature_importances = false decision = "drop_duplicate_with_threshold_ratio" - param_grid = true + model_parameter_search = {strategy = "grid"} n_training_iterations = 10 model_parameters = [ { type = "logistic_regression", threshold = [0.5], threshold_ratio = [1.0, 1.1]}, diff --git a/docs/_static/documentation_options.js b/docs/_static/documentation_options.js index 8650bf1..3cc34bf 100644 --- a/docs/_static/documentation_options.js +++ b/docs/_static/documentation_options.js @@ -1,5 +1,5 @@ const DOCUMENTATION_OPTIONS = { - VERSION: '3.8.0', + VERSION: '4.0.0a1', LANGUAGE: 'en', COLLAPSE_INDEX: false, BUILDER: 'html', diff --git a/docs/column_mappings.html b/docs/column_mappings.html index af2f1f3..008b5c5 100644 --- a/docs/column_mappings.html +++ b/docs/column_mappings.html @@ -5,11 +5,11 @@ - Column Mappings — hlink 3.8.0 documentation + Column Mappings — hlink 4.0.0a1 documentation - + diff --git a/docs/comparison_features.html b/docs/comparison_features.html index 46e4b84..285f3fc 100644 --- a/docs/comparison_features.html +++ b/docs/comparison_features.html @@ -5,11 +5,11 @@ - Comparison Features — hlink 3.8.0 documentation + Comparison Features — hlink 4.0.0a1 documentation - + diff --git a/docs/comparisons.html b/docs/comparisons.html index fb65d55..e6b9441 100644 --- a/docs/comparisons.html +++ b/docs/comparisons.html @@ -5,11 +5,11 @@ - Comparisons — hlink 3.8.0 documentation + Comparisons — hlink 4.0.0a1 documentation - + diff --git a/docs/config.html b/docs/config.html index 3bc9b5e..f5dfc52 100644 --- a/docs/config.html +++ b/docs/config.html @@ -5,11 +5,11 @@ - Configuration — hlink 3.8.0 documentation + Configuration — hlink 4.0.0a1 documentation - + @@ -367,7 +367,7 @@

Advanced Config Filedecision = "drop_duplicate_with_threshold_ratio" n_training_iterations = 2 -param_grid = true +model_parameter_search = {strategy = "grid"} model_parameters = [ { type = "random_forest", maxDepth = [7], numTrees = [100], threshold = [0.05, 0.005], threshold_ratio = [1.2, 1.3] }, { type = "logistic_regression", threshold = [0.50, 0.65, 0.80], threshold_ratio = [1.0, 1.1] } @@ -393,7 +393,7 @@

Advanced Config Filedecision = "drop_duplicate_with_threshold_ratio" n_training_iterations = 10 -param_grid = false +model_parameter_search = {strategy = "explicit"} model_parameters = [ { type = "random_forest", maxDepth = 6, numTrees = 50, threshold = 0.5, threshold_ratio = 1.0 }, { type = "probit", threshold = 0.5, threshold_ratio = 1.0 } @@ -798,13 +798,13 @@

Training and models

-