From 6c6caeffb0ecc847634633df56893c486fd9aec5 Mon Sep 17 00:00:00 2001
From: Genevieve Warren <24882762+gewarren@users.noreply.github.com>
Date: Thu, 12 Dec 2024 11:56:44 -0800
Subject: [PATCH 1/3] fix up docs for MLContext
---
src/Microsoft.ML.Data/MLContext.cs | 47 +++++++++++++++++-------------
1 file changed, 27 insertions(+), 20 deletions(-)
diff --git a/src/Microsoft.ML.Data/MLContext.cs b/src/Microsoft.ML.Data/MLContext.cs
index c966e5b6be..4212b8a21f 100644
--- a/src/Microsoft.ML.Data/MLContext.cs
+++ b/src/Microsoft.ML.Data/MLContext.cs
@@ -11,59 +11,65 @@
namespace Microsoft.ML
{
///
- /// The common context for all ML.NET operations. Once instantiated by the user, it provides a way to
+ /// Represents the common context for all ML.NET operations.
+ ///
+ ///
+ /// Once instantiated by the user, this class provides a way to
/// create components for data preparation, feature engineering, training, prediction, and model evaluation.
/// It also allows logging, execution control, and the ability to set repeatable random numbers.
- ///
+ ///
public sealed class MLContext : IHostEnvironmentInternal
{
// REVIEW: consider making LocalEnvironment and MLContext the same class instead of encapsulation.
private readonly LocalEnvironment _env;
///
- /// Trainers and tasks specific to binary classification problems.
+ /// Gets the trainers and tasks specific to binary classification problems.
///
public BinaryClassificationCatalog BinaryClassification { get; }
+
///
- /// Trainers and tasks specific to multiclass classification problems.
+ /// Gets the trainers and tasks specific to multiclass classification problems.
///
public MulticlassClassificationCatalog MulticlassClassification { get; }
+
///
- /// Trainers and tasks specific to regression problems.
+ /// Gets the trainers and tasks specific to regression problems.
///
public RegressionCatalog Regression { get; }
+
///
- /// Trainers and tasks specific to clustering problems.
+ /// Gets the trainers and tasks specific to clustering problems.
///
public ClusteringCatalog Clustering { get; }
///
- /// Trainers and tasks specific to ranking problems.
+ /// Gets the trainers and tasks specific to ranking problems.
///
public RankingCatalog Ranking { get; }
///
- /// Trainers and tasks specific to anomaly detection problems.
+ /// Gets the trainers and tasks specific to anomaly detection problems.
///
public AnomalyDetectionCatalog AnomalyDetection { get; }
///
- /// Trainers and tasks specific to forecasting problems.
+ /// Gets the trainers and tasks specific to forecasting problems.
///
public ForecastingCatalog Forecasting { get; }
///
- /// Data processing operations.
+ /// Gets the data processing operations.
///
public TransformsCatalog Transforms { get; }
///
- /// Operations with trained models.
+ /// Gets the operations with trained models.
///
public ModelOperationsCatalog Model { get; }
///
- /// Data loading and saving.
+ /// Gets the data loading and saving operations.
///
public DataOperationsCatalog Data { get; }
@@ -71,12 +77,12 @@ public sealed class MLContext : IHostEnvironmentInternal
// and expand if and when necessary. Exposing classes like ChannelMessage, MessageSensitivity and so on
// looks premature at this point.
///
- /// The handler for the log messages.
+ /// Occurs when a log message is produced.
///
public event EventHandler Log;
///
- /// This is a catalog of components that will be used for model loading.
+ /// Gets the catalog of components that will be used for model loading.
///
public ComponentCatalog ComponentCatalog => _env.ComponentCatalog;
@@ -90,7 +96,8 @@ public string TempFilePath
}
///
- /// Allow falling back to run on CPU if couldn't run on GPU.
+ /// Gets or sets a value that indicates whether the CPU will
+ /// be used if the task couldn't run on GPU.
///
public bool FallbackToCpu
{
@@ -99,7 +106,7 @@ public bool FallbackToCpu
}
///
- /// GPU device ID to run execution on, to run on CPU.
+ /// Gets or sets the GPU device ID to run execution on, or <see langword="null" /> to run on CPU.
///
public int? GpuDeviceId
{
@@ -120,17 +127,17 @@ public int? GpuDeviceId
///
/// If a fixed seed is provided by , MLContext environment becomes
/// deterministic, meaning that the results are repeatable and will remain the same across multiple runs.
- /// For instance in many of ML.NET's API reference example code snippets, a seed is provided.
+ /// For instance, in many of ML.NET's API reference example code snippets, a seed is provided.
/// That's because we want the users to get the same output as what's included in example comments,
/// when they run the example on their own machine.
///
/// Generally though, repeatability is not a requirement and that's the default behavior.
- /// If a seed is not provided by , i.e. it's set to ,
+ /// If a seed is not provided by , that is, it's set to ,
/// MLContext environment becomes non-deterministic and outputs change across multiple runs.
///
/// There are many operations in ML.NET that don't use any randomness, such as
- /// min-max normalization, concatenating columns, missing value indication, etc.
- /// The behavior of those operations are deterministic regardless of the seed value.
+ /// min-max normalization, concatenating columns, and missing value indication.
+ /// The behavior of those operations is deterministic regardless of the seed value.
///
/// Also ML.NET trainers don't use randomness *after* the training is finished.
/// So, the predictions from a loaded model don't depend on the seed value.
From e5b69703e3ee9be66cd25484bd7ad3a448400f7d Mon Sep 17 00:00:00 2001
From: Genevieve Warren <24882762+gewarren@users.noreply.github.com>
Date: Fri, 13 Dec 2024 13:02:32 -0800
Subject: [PATCH 2/3] some more fixes
---
.../API/ColumnInference.cs | 40 +++++++++----------
.../ColumnInference/ColumnInformationUtil.cs | 2 +-
2 files changed, 21 insertions(+), 21 deletions(-)
diff --git a/src/Microsoft.ML.AutoML/API/ColumnInference.cs b/src/Microsoft.ML.AutoML/API/ColumnInference.cs
index bca0c7a97c..ce77aa4e35 100644
--- a/src/Microsoft.ML.AutoML/API/ColumnInference.cs
+++ b/src/Microsoft.ML.AutoML/API/ColumnInference.cs
@@ -15,7 +15,7 @@ namespace Microsoft.ML.AutoML
public sealed class ColumnInferenceResults
{
///
- /// Inferred for the dataset.
+ /// Gets the inferred <see cref="TextLoader.Options" /> for the dataset.
///
///
/// Can be used to instantiate a new to load
@@ -25,69 +25,69 @@ public sealed class ColumnInferenceResults
public TextLoader.Options TextLoaderOptions { get; internal set; }
///
- /// Information about the inferred columns in the dataset.
+ /// Gets information about the inferred columns in the dataset.
///
///
/// Contains the inferred purposes of each column. See for more details.
- /// This can be fed to the AutoML API when running an experiment.
- /// See
- /// for example.
+ /// This value can be fed to the AutoML API when running an experiment.
+ /// See , for example.
///
[JsonProperty(DefaultValueHandling = DefaultValueHandling.Include)]
public ColumnInformation ColumnInformation { get; internal set; }
}
///
- /// Information about the columns in a dataset.
+ /// Provides information about the columns in a dataset.
///
///
/// Contains information about the purpose of each column in the dataset. For instance,
/// it enumerates the dataset columns that AutoML should treat as categorical,
/// the columns AutoML should ignore, which column is the label, etc.
/// can be fed to the AutoML API when running an experiment.
- /// See
- /// for example.
+ /// See , for example.
///
public sealed class ColumnInformation
{
///
- /// The dataset column to use as the label.
+ /// Gets or sets the dataset column to use as the label.
///
/// The default value is "Label".
public string LabelColumnName { get; set; }
///
- /// The dataset column to use as a user ID for computation.
+ /// Gets or sets the dataset column to use as a user ID for computation.
///
public string UserIdColumnName { get; set; }
///
- /// The dataset column to use as a group ID for computation in a Ranking Task.
+ /// Gets or sets the dataset column to use as a group ID for computation in a Ranking Task.
/// If a SamplingKeyColumnName is provided, then it should be the same as this column.
///
public string GroupIdColumnName { get; set; }
///
- /// The dataset column to use as a item ID for computation.
+ /// Gets or sets the dataset column to use as an item ID for computation.
///
public string ItemIdColumnName { get; set; }
///
- /// The dataset column to use for example weight.
+ /// Gets or sets the dataset column to use for example weight.
///
public string ExampleWeightColumnName { get; set; }
///
- /// The dataset column to use for grouping rows.
+ /// Gets or sets the dataset column to use for grouping rows.
+ ///
+ ///
/// If two examples share the same sampling key column name,
/// they are guaranteed to appear in the same subset (train or test).
/// This can be used to ensure no label leakage from the train to the test set.
/// If , no row grouping will be performed.
- ///
+ ///
public string SamplingKeyColumnName { get; set; }
///
- /// The dataset columns that are categorical.
+ /// Gets the dataset columns that are categorical.
///
/// The default value is a new, empty .
///
@@ -97,28 +97,28 @@ public sealed class ColumnInformation
public ICollection CategoricalColumnNames { get; private set; }
///
- /// The dataset columns that are numeric.
+ /// Gets the dataset columns that are numeric.
///
/// The default value is a new, empty .
[JsonProperty]
public ICollection NumericColumnNames { get; private set; }
///
- /// The dataset columns that are text.
+ /// Gets the dataset columns that are text.
///
/// The default value is a new, empty .
[JsonProperty]
public ICollection TextColumnNames { get; private set; }
///
- /// The dataset columns that AutoML should ignore.
+ /// Gets the dataset columns that AutoML should ignore.
///
/// The default value is a new, empty .
[JsonProperty]
public ICollection IgnoredColumnNames { get; private set; }
///
- /// The dataset columns that are image paths.
+ /// Gets the dataset columns that are image paths.
///
/// The default value is a new, empty .
[JsonProperty]
diff --git a/src/Microsoft.ML.AutoML/ColumnInference/ColumnInformationUtil.cs b/src/Microsoft.ML.AutoML/ColumnInference/ColumnInformationUtil.cs
index a33f830298..40b83064b8 100644
--- a/src/Microsoft.ML.AutoML/ColumnInference/ColumnInformationUtil.cs
+++ b/src/Microsoft.ML.AutoML/ColumnInference/ColumnInformationUtil.cs
@@ -122,7 +122,7 @@ public static ColumnInformation BuildColumnInfo(IEnumerable c
}
///
- /// Get all column names that are in .
+ /// Gets all column names that are in .
///
/// Column information.
public static IEnumerable GetColumnNames(ColumnInformation columnInformation)
From 608bd929977be6fb5b3a168026019b76aa4e98cf Mon Sep 17 00:00:00 2001
From: Genevieve Warren <24882762+gewarren@users.noreply.github.com>
Date: Fri, 13 Dec 2024 14:01:24 -0800
Subject: [PATCH 3/3] text class and sentence similarity trainers
---
.../NasBert/SentenceSimilarityTrainer.cs | 21 ++++++++++---------
.../NasBert/TextClassificationTrainer.cs | 21 ++++++++++---------
2 files changed, 22 insertions(+), 20 deletions(-)
diff --git a/src/Microsoft.ML.TorchSharp/NasBert/SentenceSimilarityTrainer.cs b/src/Microsoft.ML.TorchSharp/NasBert/SentenceSimilarityTrainer.cs
index 31f6ae2997..026c486a34 100644
--- a/src/Microsoft.ML.TorchSharp/NasBert/SentenceSimilarityTrainer.cs
+++ b/src/Microsoft.ML.TorchSharp/NasBert/SentenceSimilarityTrainer.cs
@@ -27,31 +27,32 @@
namespace Microsoft.ML.TorchSharp.NasBert
{
///
- /// The for training a Deep Neural Network(DNN) to classify text.
+ /// Represents the for training a Deep Neural Network (DNN) to determine sentence similarity.
///
///
/// type and the sentence columns must be of type.
+ /// ### Input and output columns
+ /// The input label column data must be type and the sentence columns must be of type .
///
/// This trainer outputs the following columns:
///
- /// | Output Column Name | Column Type | Description|
+ /// | Output column name | Column type | Description|
/// | -- | -- | -- |
- /// | `Score` | | The degree of similarity between the 2 sentences. |
- /// ### Trainer Characteristics
- /// | | |
+ /// | `Score` | | The degree of similarity between the two sentences. |
+ ///
+ /// ### Trainer characteristics
+ /// | Characteristic | Value |
/// | -- | -- |
- /// | Machine learning task | Rregression |
+ /// | Machine learning task | Regression |
/// | Is normalization required? | No |
/// | Is caching required? | No |
/// | Required NuGet in addition to Microsoft.ML | Microsoft.ML.TorchSharp and libtorch-cpu or libtorch-cuda-11.3 or any of the OS specific variants. |
/// | Exportable to ONNX | No |
///
- /// ### Training Algorithm Details
- /// Trains a Deep Neural Network(DNN) by leveraging an existing pre-trained NAS-BERT roBERTa model for the purpose of determining sentence similarity.
+ /// ### Training algorithm details
+ /// Trains a Deep Neural Network (DNN) by leveraging an existing, pretrained NAS-BERT RoBERTa model for the purpose of determining sentence similarity.
/// ]]>
///
///
diff --git a/src/Microsoft.ML.TorchSharp/NasBert/TextClassificationTrainer.cs b/src/Microsoft.ML.TorchSharp/NasBert/TextClassificationTrainer.cs
index 3120994054..a552dddc40 100644
--- a/src/Microsoft.ML.TorchSharp/NasBert/TextClassificationTrainer.cs
+++ b/src/Microsoft.ML.TorchSharp/NasBert/TextClassificationTrainer.cs
@@ -28,23 +28,24 @@
namespace Microsoft.ML.TorchSharp.NasBert
{
///
- /// The for training a Deep Neural Network(DNN) to classify text.
+ /// The for training a Deep Neural Network (DNN) to classify text.
///
///
/// .
+ /// ### Input and output columns
+ /// The input label column data must be [key](xref:Microsoft.ML.Data.KeyDataViewType) type and the sentence columns must be of type .
///
/// This trainer outputs the following columns:
///
- /// | Output Column Name | Column Type | Description|
+ /// | Output column name | Column type | Description|
/// | -- | -- | -- |
- /// | `PredictedLabel` | [key](xref:Microsoft.ML.Data.KeyDataViewType) type | The predicted label's index. If its value is i, the actual label would be the i-th category in the key-valued input label type. |
- /// | `Score` | Vector of | The scores of all classes.Higher value means higher probability to fall into the associated class. If the i-th element has the largest value, the predicted label index would be i.Note that i is zero-based index. |
- /// ### Trainer Characteristics
- /// | | |
+ /// | `PredictedLabel` | [key](xref:Microsoft.ML.Data.KeyDataViewType) type | The predicted label's index. If its value is `i`, the actual label would be the `i`-th category in the key-valued input label type. |
+ /// | `Score` | Vector of | The scores of all classes. A higher value means a higher probability of falling into the associated class. If the `i`-th element has the largest value, the predicted label index would be `i`. Note that `i` is a zero-based index. |
+ ///
+ /// ### Trainer characteristics
+ /// | Characteristic | Value |
/// | -- | -- |
/// | Machine learning task | Multiclass classification |
/// | Is normalization required? | No |
@@ -52,8 +53,8 @@ namespace Microsoft.ML.TorchSharp.NasBert
/// | Required NuGet in addition to Microsoft.ML | Microsoft.ML.TorchSharp and libtorch-cpu or libtorch-cuda-11.3 or any of the OS specific variants. |
/// | Exportable to ONNX | No |
///
- /// ### Training Algorithm Details
- /// Trains a Deep Neural Network(DNN) by leveraging an existing pre-trained NAS-BERT roBERTa model for the purpose of classifying text.
+ /// ### Training algorithm details
+ /// Trains a Deep Neural Network (DNN) by leveraging an existing, pretrained NAS-BERT RoBERTa model for the purpose of classifying text.
/// ]]>
///
///