Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix issue 4322, enable lda summary output #5260

Merged
merged 5 commits into from
Jul 1, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
83 changes: 71 additions & 12 deletions src/Microsoft.ML.Transforms/Text/LdaTransform.cs
Original file line number Diff line number Diff line change
Expand Up @@ -102,8 +102,8 @@ internal sealed class Options : TransformInputBase
[Argument(ArgumentType.AtMostOnce, HelpText = "Reset the random number generator for each document", ShortName = "reset")]
public bool ResetRandomGenerator = LatentDirichletAllocationEstimator.Defaults.ResetRandomGenerator;

[Argument(ArgumentType.AtMostOnce, HelpText = "Whether to output the topic-word summary in text format", ShortName = "summary")]
public bool OutputTopicWordSummary;
[Argument(ArgumentType.AtMostOnce, HelpText = "Whether to output the topic-word summary in text format when saving the model to disk", ShortName = "summary")]
public bool OutputTopicWordSummary = LatentDirichletAllocationEstimator.Defaults.OutputTopicWordSummary;
}

internal sealed class Column : OneToOneColumn
Expand Down Expand Up @@ -141,6 +141,9 @@ internal sealed class Column : OneToOneColumn
[Argument(ArgumentType.AtMostOnce, HelpText = "Reset the random number generator for each document", ShortName = "reset")]
public bool? ResetRandomGenerator;

[Argument(ArgumentType.AtMostOnce, HelpText = "Whether to output the topic-word summary in text format when saving the model to disk", ShortName = "summary")]
public bool? OutputTopicWordSummary;

internal static Column Parse(string str)
{
Contracts.AssertNonEmpty(str);
Expand Down Expand Up @@ -206,13 +209,17 @@ internal ModelParameters(IReadOnlyList<IReadOnlyList<WordItemScore>> wordScoresP
}
}

[BestFriend]
internal ModelParameters GetLdaDetails(int iinfo)
/// <summary>
/// Provides details about the topics discovered by LightLDA for one column.
/// </summary>
/// <param name="columnIndex">Index of the column/options pair whose topic summary is requested.</param>
/// <returns>The topic summary produced for the requested column.</returns>
public ModelParameters GetLdaDetails(int columnIndex)
{
Contracts.Assert(0 <= iinfo && iinfo < _ldas.Length);
Contracts.Assert(0 <= columnIndex && columnIndex < _ldas.Length);

var ldaState = _ldas[iinfo];
var mapping = _columnMappings[iinfo];
var ldaState = _ldas[columnIndex];
var mapping = _columnMappings[columnIndex];

return ldaState.GetLdaSummary(mapping);
}
Expand Down Expand Up @@ -630,7 +637,7 @@ private static VersionInfo GetVersionInfo()
private readonly List<VBuffer<ReadOnlyMemory<char>>> _columnMappings;

private const string RegistrationName = "LightLda";
private const string WordTopicModelFilename = "word_topic_summary.txt";
private const string WordTopicModelFilename = "word_topic_summary-{0}.txt";
internal const string Summary = "The LDA transform implements LightLDA, a state-of-the-art implementation of Latent Dirichlet Allocation.";
internal const string UserName = "Latent Dirichlet Allocation Transform";
internal const string ShortName = "LightLda";
Expand Down Expand Up @@ -760,9 +767,50 @@ private protected override void SaveModel(ModelSaveContext ctx)
for (int i = 0; i < _ldas.Length; i++)
{
_ldas[i].Save(ctx);

if(_columns[i].OutputTopicWordSummary)
SaveTopicWordSummary(ctx, i);
}
}

/// <summary>
/// Writes the topic summary of one column as a text stream into the model being saved.
/// The stream name is <c>WordTopicModelFilename</c> formatted with the column's name, and every
/// line has the form <c>Topic[topicId]: key&lt;TAB&gt;score</c>.
/// </summary>
/// <param name="ctx">The save context that receives the text stream.</param>
/// <param name="i">Index of the column; used to index <c>_columns</c> and passed to <c>GetLdaDetails</c>.</param>
private void SaveTopicWordSummary(ModelSaveContext ctx, int i)
{
    var summary = GetLdaDetails(i);

    var columnName = _columns[i].Name;

    ctx.SaveTextStream(String.Format(WordTopicModelFilename, columnName), writer =>
    {
        // Every line is formatted with the invariant culture so that the text stored in the
        // model does not depend on the current culture of the machine that saves it
        // (for example "0.5" versus "0,5" for a score).
        if (summary.WordScoresPerTopic != null)
        {
            int topId = 0;
            foreach (var wordScores in summary.WordScoresPerTopic)
            {
                foreach (var wordScore in wordScores)
                {
                    writer.WriteLine(FormattableString.Invariant($"Topic[{topId}]: {wordScore.Word}\t{wordScore.Score}"));
                }

                topId++;
            }
        }

        // The item-based summary is written after the word-based one, with topic ids restarting at 0.
        if (summary.ItemScoresPerTopic != null)
        {
            int topId = 0;
            foreach (var itemScores in summary.ItemScoresPerTopic)
            {
                foreach (var itemScore in itemScores)
                {
                    writer.WriteLine(FormattableString.Invariant($"Topic[{topId}]: {itemScore.Item}\t{itemScore.Score}"));
                }

                topId++;
            }
        }
    });
}

private static int GetFrequency(double value)
{
int result = (int)value;
Expand Down Expand Up @@ -994,6 +1042,7 @@ internal static class Defaults
public const int NumberOfSummaryTermsPerTopic = 10;
public const int NumberOfBurninIterations = 10;
public const bool ResetRandomGenerator = false;
public const bool OutputTopicWordSummary = false;
}

private readonly IHost _host;
Expand All @@ -1014,6 +1063,7 @@ internal static class Defaults
/// <param name="likelihoodInterval">Compute log likelihood over local dataset on this iteration interval.</param>
/// <param name="numberOfBurninIterations">The number of burn-in iterations.</param>
/// <param name="resetRandomGenerator">Reset the random number generator for each document.</param>
/// <param name="outputTopicWordSummary">Whether to output the topic-word summary in text format when saving the model to disk.</param>
internal LatentDirichletAllocationEstimator(IHostEnvironment env,
string outputColumnName, string inputColumnName = null,
int numberOfTopics = Defaults.NumberOfTopics,
Expand All @@ -1026,10 +1076,11 @@ internal LatentDirichletAllocationEstimator(IHostEnvironment env,
int numberOfSummaryTermsPerTopic = Defaults.NumberOfSummaryTermsPerTopic,
int likelihoodInterval = Defaults.LikelihoodInterval,
int numberOfBurninIterations = Defaults.NumberOfBurninIterations,
bool resetRandomGenerator = Defaults.ResetRandomGenerator)
bool resetRandomGenerator = Defaults.ResetRandomGenerator,
bool outputTopicWordSummary = Defaults.OutputTopicWordSummary)
: this(env, new[] { new ColumnOptions(outputColumnName, inputColumnName ?? outputColumnName,
numberOfTopics, alphaSum, beta, samplingStepCount, maximumNumberOfIterations, likelihoodInterval, numberOfThreads, maximumTokenCountPerDocument,
numberOfSummaryTermsPerTopic, numberOfBurninIterations, resetRandomGenerator) })
numberOfSummaryTermsPerTopic, numberOfBurninIterations, resetRandomGenerator, outputTopicWordSummary) })
{ }

/// <include file='doc.xml' path='doc/members/member[@name="LightLDA"]/*' />
Expand Down Expand Up @@ -1100,6 +1151,10 @@ internal sealed class ColumnOptions
/// Reset the random number generator for each document.
/// </summary>
public readonly bool ResetRandomGenerator;
/// <summary>
/// Whether to output the topic-word summary in text format when saving the model to disk.
/// </summary>
public readonly bool OutputTopicWordSummary;

/// <summary>
/// Describes how the transformer handles one column pair.
Expand All @@ -1117,6 +1172,7 @@ internal sealed class ColumnOptions
/// <param name="numberOfSummaryTermsPerTopic">The number of words to summarize the topic.</param>
/// <param name="numberOfBurninIterations">The number of burn-in iterations.</param>
/// <param name="resetRandomGenerator">Reset the random number generator for each document.</param>
/// <param name="outputTopicWordSummary">Whether to output the topic-word summary in text format when saving the model to disk.</param>
public ColumnOptions(string name,
string inputColumnName = null,
int numberOfTopics = LatentDirichletAllocationEstimator.Defaults.NumberOfTopics,
Expand All @@ -1129,7 +1185,8 @@ public ColumnOptions(string name,
int maximumTokenCountPerDocument = LatentDirichletAllocationEstimator.Defaults.MaximumTokenCountPerDocument,
int numberOfSummaryTermsPerTopic = LatentDirichletAllocationEstimator.Defaults.NumberOfSummaryTermsPerTopic,
int numberOfBurninIterations = LatentDirichletAllocationEstimator.Defaults.NumberOfBurninIterations,
bool resetRandomGenerator = LatentDirichletAllocationEstimator.Defaults.ResetRandomGenerator)
bool resetRandomGenerator = LatentDirichletAllocationEstimator.Defaults.ResetRandomGenerator,
bool outputTopicWordSummary = LatentDirichletAllocationEstimator.Defaults.OutputTopicWordSummary)
{
Contracts.CheckValue(name, nameof(name));
Contracts.CheckValueOrNull(inputColumnName);
Expand All @@ -1155,6 +1212,7 @@ public ColumnOptions(string name,
NumberOfSummaryTermsPerTopic = numberOfSummaryTermsPerTopic;
NumberOfBurninIterations = numberOfBurninIterations;
ResetRandomGenerator = resetRandomGenerator;
OutputTopicWordSummary = outputTopicWordSummary;
}

internal ColumnOptions(LatentDirichletAllocationTransformer.Column item, LatentDirichletAllocationTransformer.Options options) :
Expand All @@ -1170,7 +1228,8 @@ internal ColumnOptions(LatentDirichletAllocationTransformer.Column item, LatentD
item.NumMaxDocToken ?? options.NumMaxDocToken,
item.NumSummaryTermPerTopic ?? options.NumSummaryTermPerTopic,
item.NumBurninIterations ?? options.NumBurninIterations,
item.ResetRandomGenerator ?? options.ResetRandomGenerator)
item.ResetRandomGenerator ?? options.ResetRandomGenerator,
item.OutputTopicWordSummary ?? options.OutputTopicWordSummary)
{
}

Expand Down
14 changes: 13 additions & 1 deletion test/BaselineOutput/Common/EntryPoints/core_manifest.json
Original file line number Diff line number Diff line change
Expand Up @@ -20900,6 +20900,18 @@
"IsNullable": true,
"Default": null
},
{
"Name": "OutputTopicWordSummary",
"Type": "Bool",
"Desc": "Whether to output the topic-word summary in text format when saving the model to disk",
"Aliases": [
"summary"
],
"Required": false,
"SortOrder": 150.0,
"IsNullable": true,
"Default": null
},
{
"Name": "Name",
"Type": "String",
Expand Down Expand Up @@ -21112,7 +21124,7 @@
{
"Name": "OutputTopicWordSummary",
"Type": "Bool",
"Desc": "Whether to output the topic-word summary in text format",
"Desc": "Whether to output the topic-word summary in text format when saving the model to disk",
"Aliases": [
"summary"
],
Expand Down
13 changes: 11 additions & 2 deletions test/Microsoft.ML.Functional.Tests/IntrospectiveTraining.cs
Original file line number Diff line number Diff line change
Expand Up @@ -182,15 +182,24 @@ public void InspectLdaModelParameters()

// Define the pipeline.
var pipeline = mlContext.Transforms.Text.ProduceWordBags("SentimentBag", "SentimentText")
.Append(mlContext.Transforms.Text.LatentDirichletAllocation("Features", "SentimentBag", numberOfTopics: numTopics, maximumNumberOfIterations: 10));
.Append(mlContext.Transforms.Text.LatentDirichletAllocation("Features", "SentimentBag",
numberOfTopics: numTopics, maximumNumberOfIterations: 10));
Comment on lines +185 to +186
Copy link
Member

@antoniovs1029 antoniovs1029 Jul 1, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(appended to code to allow for threaded discussion)
Should we also add the outputTopicWordSummary option to the ML.NET public API found here on the TextCatalog?

https://github.com/frank-dong-ms/machinelearning/blob/aa75df98f549741893deb3a042a5dd287a56ab99/src/Microsoft.ML.Transforms/Text/TextCatalog.cs#L597-L614 #Resolved

Copy link
Contributor Author

@frank-dong-ms-zz frank-dong-ms-zz Jul 1, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That would break the public API, and I think we should avoid breaking the public API as much as possible. Users can still get the summary via the public interface "GetLdaDetails" from the LDA transformer, as this test shows.


In reply to: 448136744 [](ancestors = 448136744)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You're right. I don't think it's a good idea to add this option to this public API. No need for a new API only for this.


In reply to: 448522335 [](ancestors = 448522335,448512997,448136744)


// Fit the pipeline.
var model = pipeline.Fit(data);

// Get the trained LDA model.
// TODO #2197: Get the topics and summaries from the model.
var ldaTransform = model.LastTransformer;

// Get the topics and summaries from the model.
var ldaDetails = ldaTransform.GetLdaDetails(0);
Assert.False(ldaDetails.ItemScoresPerTopic == null && ldaDetails.WordScoresPerTopic == null);
if(ldaDetails.ItemScoresPerTopic != null)
Assert.Equal(numTopics, ldaDetails.ItemScoresPerTopic.Count);
if (ldaDetails.WordScoresPerTopic != null)
Assert.Equal(numTopics, ldaDetails.WordScoresPerTopic.Count);


// Transform the data.
var transformedData = model.Transform(data);

Expand Down
14 changes: 14 additions & 0 deletions test/Microsoft.ML.TestFramework/DataPipe/TestDataPipe.cs
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
using System;
using System.Collections.Generic;
using System.IO;
using System.IO.Compression;
using System.Linq;
using Microsoft.ML.CommandLine;
using Microsoft.ML.Data;
using Microsoft.ML.Internal.Utilities;
Expand Down Expand Up @@ -1047,6 +1049,18 @@ public void SavePipeLda()
"loader=Text{col=F1V:Num:0-2}",
"xf=Lda{col={name=Result src=F1V numtopic=3 alphasum=3 ns=3 reset=+ t=1} summary=+}",
}, forceDense: true);

// topic summary text file saved inside the model.zip file.
string name = TestName + ".zip";
string modelPath = GetOutputPath("SavePipe", name);
using (var file = Env.OpenInputFile(modelPath))
using (var strm = file.OpenReadStream())
using (var zip = new ZipArchive(strm, ZipArchiveMode.Read))
{
Comment on lines +1052 to +1059
Copy link
Member

@antoniovs1029 antoniovs1029 Jun 26, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(appended to code to allow for threaded comment)

Can you please also add a test to test your PR on ML.NET's API instead of MAML?
I think modifying a test such as InspectLdaModelParameters would be enough. Notice that in particular that test was waiting for this issue to be fixed in order to test the output summary:

// Get the trained LDA model.
// TODO #2197: Get the topics and summaries from the model.

Thanks! #Resolved

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Great idea, I will test both via MAML and from the transformer. Actually, testing via MAML and checking the summary is how TLC originally tested the topic summary.


In reply to: 446344620 [](ancestors = 446344620)

var entry = zip.Entries.First(source => source.Name == "word_topic_summary-Result.txt");
Assert.True(entry != null);
}

Done();
}

Expand Down