From 6c6caeffb0ecc847634633df56893c486fd9aec5 Mon Sep 17 00:00:00 2001 From: Genevieve Warren <24882762+gewarren@users.noreply.github.com> Date: Thu, 12 Dec 2024 11:56:44 -0800 Subject: [PATCH 1/3] fix up docs for MLContext --- src/Microsoft.ML.Data/MLContext.cs | 47 +++++++++++++++++------------- 1 file changed, 27 insertions(+), 20 deletions(-) diff --git a/src/Microsoft.ML.Data/MLContext.cs b/src/Microsoft.ML.Data/MLContext.cs index c966e5b6be..4212b8a21f 100644 --- a/src/Microsoft.ML.Data/MLContext.cs +++ b/src/Microsoft.ML.Data/MLContext.cs @@ -11,59 +11,65 @@ namespace Microsoft.ML { /// - /// The common context for all ML.NET operations. Once instantiated by the user, it provides a way to + /// Represents the common context for all ML.NET operations. + /// + /// + /// Once instantiated by the user, this class provides a way to /// create components for data preparation, feature engineering, training, prediction, and model evaluation. /// It also allows logging, execution control, and the ability to set repeatable random numbers. - /// + /// public sealed class MLContext : IHostEnvironmentInternal { // REVIEW: consider making LocalEnvironment and MLContext the same class instead of encapsulation. private readonly LocalEnvironment _env; /// - /// Trainers and tasks specific to binary classification problems. + /// Gets the trainers and tasks specific to binary classification problems. /// public BinaryClassificationCatalog BinaryClassification { get; } + /// - /// Trainers and tasks specific to multiclass classification problems. + /// Gets the trainers and tasks specific to multiclass classification problems. /// public MulticlassClassificationCatalog MulticlassClassification { get; } + /// - /// Trainers and tasks specific to regression problems. + /// Gets the trainers and tasks specific to regression problems. /// public RegressionCatalog Regression { get; } + /// - /// Trainers and tasks specific to clustering problems. 
+ /// Gets the trainers and tasks specific to clustering problems. /// public ClusteringCatalog Clustering { get; } /// - /// Trainers and tasks specific to ranking problems. + /// Gets the trainers and tasks specific to ranking problems. /// public RankingCatalog Ranking { get; } /// - /// Trainers and tasks specific to anomaly detection problems. + /// Gets the trainers and tasks specific to anomaly detection problems. /// public AnomalyDetectionCatalog AnomalyDetection { get; } /// - /// Trainers and tasks specific to forecasting problems. + /// Gets the trainers and tasks specific to forecasting problems. /// public ForecastingCatalog Forecasting { get; } /// - /// Data processing operations. + /// Gets the data processing operations. /// public TransformsCatalog Transforms { get; } /// - /// Operations with trained models. + /// Gets the operations with trained models. /// public ModelOperationsCatalog Model { get; } /// - /// Data loading and saving. + /// Gets the data loading and saving operations. /// public DataOperationsCatalog Data { get; } @@ -71,12 +77,12 @@ public sealed class MLContext : IHostEnvironmentInternal // and expand if and when necessary. Exposing classes like ChannelMessage, MessageSensitivity and so on // looks premature at this point. /// - /// The handler for the log messages. + /// Represents the callback method that will handle the log messages. /// public event EventHandler Log; /// - /// This is a catalog of components that will be used for model loading. + /// Gets the catalog of components that will be used for model loading. /// public ComponentCatalog ComponentCatalog => _env.ComponentCatalog; @@ -90,7 +96,8 @@ public string TempFilePath } /// - /// Allow falling back to run on CPU if couldn't run on GPU. + /// Gets or sets a value that indicates whether the CPU will + /// be used if the task couldn't run on GPU. 
/// public bool FallbackToCpu { @@ -99,7 +106,7 @@ public bool FallbackToCpu } /// - /// GPU device ID to run execution on, <see langword="null" /> to run on CPU. + /// Gets or sets the GPU device ID to run execution on, or <see langword="null" /> to run on CPU. /// public int? GpuDeviceId { @@ -120,17 +127,17 @@ public int? GpuDeviceId /// /// If a fixed seed is provided by <paramref name="seed"/>, MLContext environment becomes /// deterministic, meaning that the results are repeatable and will remain the same across multiple runs. - /// For instance in many of ML.NET's API reference example code snippets, a seed is provided. + /// For instance, in many of ML.NET's API reference example code snippets, a seed is provided. /// That's because we want the users to get the same output as what's included in example comments, /// when they run the example on their own machine. /// /// Generally though, repeatability is not a requirement and that's the default behavior. - /// If a seed is not provided by <paramref name="seed"/>, i.e. it's set to <see langword="null"/>, + /// If a seed is not provided by <paramref name="seed"/>, that is, it's set to <see langword="null"/>, /// MLContext environment becomes non-deterministic and outputs change across multiple runs. /// /// There are many operations in ML.NET that don't use any randomness, such as - /// min-max normalization, concatenating columns, missing value indication, etc. - /// The behavior of those operations are deterministic regardless of the seed value. + /// min-max normalization, concatenating columns, and missing value indication. + /// The behavior of those operations is deterministic regardless of the seed value. /// /// Also ML.NET trainers don't use randomness *after* the training is finished. /// So, the predictions from a loaded model don't depend on the seed value. 
From e5b69703e3ee9be66cd25484bd7ad3a448400f7d Mon Sep 17 00:00:00 2001 From: Genevieve Warren <24882762+gewarren@users.noreply.github.com> Date: Fri, 13 Dec 2024 13:02:32 -0800 Subject: [PATCH 2/3] some more fixes --- .../API/ColumnInference.cs | 40 +++++++++---------- .../ColumnInference/ColumnInformationUtil.cs | 2 +- 2 files changed, 21 insertions(+), 21 deletions(-) diff --git a/src/Microsoft.ML.AutoML/API/ColumnInference.cs b/src/Microsoft.ML.AutoML/API/ColumnInference.cs index bca0c7a97c..ce77aa4e35 100644 --- a/src/Microsoft.ML.AutoML/API/ColumnInference.cs +++ b/src/Microsoft.ML.AutoML/API/ColumnInference.cs @@ -15,7 +15,7 @@ namespace Microsoft.ML.AutoML public sealed class ColumnInferenceResults { /// - /// Inferred for the dataset. + /// Gets the inferred for the dataset. /// /// /// Can be used to instantiate a new to load @@ -25,69 +25,69 @@ public sealed class ColumnInferenceResults public TextLoader.Options TextLoaderOptions { get; internal set; } /// - /// Information about the inferred columns in the dataset. + /// Gets information about the inferred columns in the dataset. /// /// /// Contains the inferred purposes of each column. See for more details. - /// This can be fed to the AutoML API when running an experiment. - /// See - /// for example. + /// This value can be fed to the AutoML API when running an experiment. + /// See , for example. /// [JsonProperty(DefaultValueHandling = DefaultValueHandling.Include)] public ColumnInformation ColumnInformation { get; internal set; } } /// - /// Information about the columns in a dataset. + /// Provides information about the columns in a dataset. /// /// /// Contains information about the purpose of each column in the dataset. For instance, /// it enumerates the dataset columns that AutoML should treat as categorical, /// the columns AutoML should ignore, which column is the label, etc. /// can be fed to the AutoML API when running an experiment. - /// See - /// for example. + /// See , for example. 
/// public sealed class ColumnInformation { /// - /// The dataset column to use as the label. + /// Gets or sets the dataset column to use as the label. /// /// The default value is "Label". public string LabelColumnName { get; set; } /// - /// The dataset column to use as a user ID for computation. + /// Gets or sets the dataset column to use as a user ID for computation. /// public string UserIdColumnName { get; set; } /// - /// The dataset column to use as a group ID for computation in a Ranking Task. + /// Gets or sets the dataset column to use as a group ID for computation in a Ranking Task. /// If a SamplingKeyColumnName is provided, then it should be the same as this column. /// public string GroupIdColumnName { get; set; } /// - /// The dataset column to use as a item ID for computation. + /// Gets or sets the dataset column to use as an item ID for computation. /// public string ItemIdColumnName { get; set; } /// - /// The dataset column to use for example weight. + /// Gets or sets the dataset column to use for example weight. /// public string ExampleWeightColumnName { get; set; } /// - /// The dataset column to use for grouping rows. + /// Gets or sets the dataset column to use for grouping rows. + /// + /// /// If two examples share the same sampling key column name, /// they are guaranteed to appear in the same subset (train or test). /// This can be used to ensure no label leakage from the train to the test set. /// If , no row grouping will be performed. - /// + /// public string SamplingKeyColumnName { get; set; } /// - /// The dataset columns that are categorical. + /// Gets the dataset columns that are categorical. /// /// The default value is a new, empty . /// @@ -97,28 +97,28 @@ public sealed class ColumnInformation public ICollection CategoricalColumnNames { get; private set; } /// - /// The dataset columns that are numeric. + /// Gets the dataset columns that are numeric. /// /// The default value is a new, empty . 
[JsonProperty] public ICollection NumericColumnNames { get; private set; } /// - /// The dataset columns that are text. + /// Gets the dataset columns that are text. /// /// The default value is a new, empty . [JsonProperty] public ICollection TextColumnNames { get; private set; } /// - /// The dataset columns that AutoML should ignore. + /// Gets the dataset columns that AutoML should ignore. /// /// The default value is a new, empty . [JsonProperty] public ICollection IgnoredColumnNames { get; private set; } /// - /// The dataset columns that are image paths. + /// Gets the dataset columns that are image paths. /// /// The default value is a new, empty . [JsonProperty] diff --git a/src/Microsoft.ML.AutoML/ColumnInference/ColumnInformationUtil.cs b/src/Microsoft.ML.AutoML/ColumnInference/ColumnInformationUtil.cs index a33f830298..40b83064b8 100644 --- a/src/Microsoft.ML.AutoML/ColumnInference/ColumnInformationUtil.cs +++ b/src/Microsoft.ML.AutoML/ColumnInference/ColumnInformationUtil.cs @@ -122,7 +122,7 @@ public static ColumnInformation BuildColumnInfo(IEnumerable c } /// - /// Get all column names that are in . + /// Gets all column names that are in . /// /// Column information. 
public static IEnumerable GetColumnNames(ColumnInformation columnInformation) From 608bd929977be6fb5b3a168026019b76aa4e98cf Mon Sep 17 00:00:00 2001 From: Genevieve Warren <24882762+gewarren@users.noreply.github.com> Date: Fri, 13 Dec 2024 14:01:24 -0800 Subject: [PATCH 3/3] text class and sentence similarity trainers --- .../NasBert/SentenceSimilarityTrainer.cs | 21 ++++++++++--------- .../NasBert/TextClassificationTrainer.cs | 21 ++++++++++--------- 2 files changed, 22 insertions(+), 20 deletions(-) diff --git a/src/Microsoft.ML.TorchSharp/NasBert/SentenceSimilarityTrainer.cs b/src/Microsoft.ML.TorchSharp/NasBert/SentenceSimilarityTrainer.cs index 31f6ae2997..026c486a34 100644 --- a/src/Microsoft.ML.TorchSharp/NasBert/SentenceSimilarityTrainer.cs +++ b/src/Microsoft.ML.TorchSharp/NasBert/SentenceSimilarityTrainer.cs @@ -27,31 +27,32 @@ namespace Microsoft.ML.TorchSharp.NasBert { /// - /// The for training a Deep Neural Network(DNN) to classify text. + /// Represents the for training a Deep Neural Network (DNN) to determine sentence similarity. /// /// /// type and the sentence columns must be of type. + /// ### Input and output columns + /// The input label column data must be type and the sentence columns must be of type . /// /// This trainer outputs the following columns: /// - /// | Output Column Name | Column Type | Description| + /// | Output column name | Column type | Description| /// | -- | -- | -- | - /// | `Score` | | The degree of similarity between the 2 sentences. | - /// ### Trainer Characteristics - /// | | | + /// | `Score` | | The degree of similarity between the two sentences. | + /// + /// ### Trainer characteristics + /// | Characteristic | Value | /// | -- | -- | - /// | Machine learning task | Rregression | + /// | Machine learning task | Regression | /// | Is normalization required? | No | /// | Is caching required? 
| No | /// | Required NuGet in addition to Microsoft.ML | Microsoft.ML.TorchSharp and libtorch-cpu or libtorch-cuda-11.3 or any of the OS specific variants. | /// | Exportable to ONNX | No | /// - /// ### Training Algorithm Details - /// Trains a Deep Neural Network(DNN) by leveraging an existing pre-trained NAS-BERT roBERTa model for the purpose of determining sentence similarity. + /// ### Training algorithm details + /// Trains a Deep Neural Network (DNN) by leveraging an existing, pretrained NAS-BERT RoBERTa model for the purpose of determining sentence similarity. /// ]]> /// /// diff --git a/src/Microsoft.ML.TorchSharp/NasBert/TextClassificationTrainer.cs b/src/Microsoft.ML.TorchSharp/NasBert/TextClassificationTrainer.cs index 3120994054..a552dddc40 100644 --- a/src/Microsoft.ML.TorchSharp/NasBert/TextClassificationTrainer.cs +++ b/src/Microsoft.ML.TorchSharp/NasBert/TextClassificationTrainer.cs @@ -28,23 +28,24 @@ namespace Microsoft.ML.TorchSharp.NasBert { /// - /// The for training a Deep Neural Network(DNN) to classify text. + /// Represents the for training a Deep Neural Network (DNN) to classify text. /// /// /// . + /// ### Input and output columns + /// The input label column data must be [key](xref:Microsoft.ML.Data.KeyDataViewType) type and the sentence columns must be of type . /// /// This trainer outputs the following columns: /// - /// | Output Column Name | Column Type | Description| + /// | Output column name | Column type | Description| /// | -- | -- | -- | - /// | `PredictedLabel` | [key](xref:Microsoft.ML.Data.KeyDataViewType) type | The predicted label's index. If its value is i, the actual label would be the i-th category in the key-valued input label type. | - /// | `Score` | Vector of | The scores of all classes.Higher value means higher probability to fall into the associated class. If the i-th element has the largest value, the predicted label index would be i.Note that i is zero-based index. 
| - /// ### Trainer Characteristics - /// | | | + /// | `PredictedLabel` | [key](xref:Microsoft.ML.Data.KeyDataViewType) type | The predicted label's index. If its value is `i`, the actual label would be the `i`-th category in the key-valued input label type. | + /// | `Score` | Vector of | The scores of all classes. A higher value means a higher probability of falling into the associated class. If the `i`-th element has the largest value, the predicted label index would be `i`. Note that `i` is a zero-based index. | + /// + /// ### Trainer characteristics + /// | Characteristic | Value | /// | -- | -- | /// | Machine learning task | Multiclass classification | /// | Is normalization required? | No | @@ -52,8 +53,8 @@ namespace Microsoft.ML.TorchSharp.NasBert /// | Required NuGet in addition to Microsoft.ML | Microsoft.ML.TorchSharp and libtorch-cpu or libtorch-cuda-11.3 or any of the OS specific variants. | /// | Exportable to ONNX | No | /// - /// ### Training Algorithm Details - /// Trains a Deep Neural Network(DNN) by leveraging an existing pre-trained NAS-BERT roBERTa model for the purpose of classifying text. + /// ### Training algorithm details + /// Trains a Deep Neural Network (DNN) by leveraging an existing, pretrained NAS-BERT RoBERTa model for the purpose of classifying text. /// ]]> /// ///