fix issue 4322, enable lda summary output #5260
First changed file (the LatentDirichletAllocationTransformer source):
@@ -103,7 +103,7 @@ internal sealed class Options : TransformInputBase
     public bool ResetRandomGenerator = LatentDirichletAllocationEstimator.Defaults.ResetRandomGenerator;

     [Argument(ArgumentType.AtMostOnce, HelpText = "Whether to output the topic-word summary in text format", ShortName = "summary")]
-    public bool OutputTopicWordSummary;
+    public bool OutputTopicWordSummary = LatentDirichletAllocationEstimator.Defaults.OutputTopicWordSummary;
 }

 internal sealed class Column : OneToOneColumn
@@ -141,6 +141,9 @@ internal sealed class Column : OneToOneColumn
     [Argument(ArgumentType.AtMostOnce, HelpText = "Reset the random number generator for each document", ShortName = "reset")]
     public bool? ResetRandomGenerator;

+    [Argument(ArgumentType.AtMostOnce, HelpText = "Whether to output the topic-word summary in text format", ShortName = "summary")]
+    public bool? OutputTopicWordSummary;
+
     internal static Column Parse(string str)
     {
         Contracts.AssertNonEmpty(str);
@@ -206,13 +209,17 @@ internal ModelParameters(IReadOnlyList<IReadOnlyList<WordItemScore>> wordScoresP
     }
 }

-[BestFriend]
-internal ModelParameters GetLdaDetails(int iinfo)
+/// <summary>
+/// Method to provide details about the topic discovered by LightLDA
+/// </summary>
+/// <param name="topicIndex">index of topic</param>
+/// <returns></returns>
+public ModelParameters GetLdaDetails(int topicIndex)
 {
-    Contracts.Assert(0 <= iinfo && iinfo < _ldas.Length);
+    Contracts.Assert(0 <= topicIndex && topicIndex < _ldas.Length);

-    var ldaState = _ldas[iinfo];
-    var mapping = _columnMappings[iinfo];
+    var ldaState = _ldas[topicIndex];
+    var mapping = _columnMappings[topicIndex];

     return ldaState.GetLdaSummary(mapping);
 }

Reviewer (on the new XML summary): nit: "about the topics discovered by LightLDA" #Resolved

Reviewer (on the parameter rename): The way I understand it …

Author: You are right! I confused this with topic index, thanks. (In reply to: 445984832)
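Since the method is now public, here is a rough usage sketch (not part of the diff) showing how the topic details could be read back after training through the ML.NET API. The data class, column names, and pipeline shape are assumptions, and, per the review thread above, the argument indexes the transformer's LDA columns rather than a topic.

```csharp
using System;
using Microsoft.ML;
using Microsoft.ML.Transforms.Text;

// Hypothetical input type for the sketch.
public class TextData
{
    public string Text { get; set; }
}

public static class LdaInspectionSketch
{
    public static void Run()
    {
        var mlContext = new MLContext(seed: 1);
        var samples = new[]
        {
            new TextData { Text = "machine learning builds models from data" },
            new TextData { Text = "topic models summarize large text corpora" },
        };
        var data = mlContext.Data.LoadFromEnumerable(samples);

        // Bag-of-words featurization followed by LDA.
        var pipeline = mlContext.Transforms.Text.ProduceWordBags("Tokens", "Text")
            .Append(mlContext.Transforms.Text.LatentDirichletAllocation(
                "Topics", "Tokens", numberOfTopics: 2));

        var model = pipeline.Fit(data);
        LatentDirichletAllocationTransformer ldaTransformer = model.LastTransformer;

        // Read back the learned word scores for the first (and only) LDA column.
        var details = ldaTransformer.GetLdaDetails(0);
        if (details.WordScoresPerTopic != null)
        {
            int topic = 0;
            foreach (var wordScores in details.WordScoresPerTopic)
            {
                foreach (var wordScore in wordScores)
                    Console.WriteLine($"Topic[{topic}]: {wordScore.Word}\t{wordScore.Score}");
                topic++;
            }
        }
    }
}
```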
@@ -760,9 +767,48 @@ private protected override void SaveModel(ModelSaveContext ctx)
 for (int i = 0; i < _ldas.Length; i++)
 {
     _ldas[i].Save(ctx);
+
+    if (_columns[i].OutputTopicWordSummary)
+        SaveTopicWordSummary(ctx, i);
 }
 }

+private void SaveTopicWordSummary(ModelSaveContext ctx, int i)
+{
+    var summary = GetLdaDetails(i);
+
+    ctx.SaveTextStream(WordTopicModelFilename, writer =>
+    {
+        if (summary.WordScoresPerTopic != null)
+        {
+            int topId = 0;
+            foreach (var wordScores in summary.WordScoresPerTopic)
+            {
+                foreach (var wordScore in wordScores)
+                {
+                    writer.WriteLine($"Top[{topId}]: {wordScore.Word}\t{wordScore.Score}");
+                }
+                topId++;
+            }
+        }
+
+        if (summary.ItemScoresPerTopic != null)
+        {
+            int topId = 0;
+            foreach (var itemScores in summary.ItemScoresPerTopic)
+            {
+                foreach (var itemScore in itemScores)
+                {
+                    writer.WriteLine($"Top[{topId}]: {itemScore.Item}\t{itemScore.Score}");
+                }
+                topId++;
+            }
+        }
+    });
+}

 private static int GetFrequency(double value)
 {
     int result = (int)value;

Reviewer (on the ctx.SaveTextStream call): The potential problem I see is that for every _columns[i] we save, this method will be called (when _columns[i].OutputTopicWordSummary == true, as seen in your code above), and all of them will try to save their word summary to the same WordTopicModelFilename entry. So if a given LdaTransformer has more than one column with the summary option enabled, their outputs would be mixed together. Adding text to identify where the output of one column begins would be useful. Another suggestion would be to actually use one file per column; for example, something like "word_topic_summary-col{i}.txt" might be sufficient. (PS: I don't know what the behavior was back in TLC, but maybe there we couldn't have one Lda transformer acting on several columns.) #Resolved

Author: Sounds good, I will add the column option name (which is the output column name from the options) as part of the summary text file. (In reply to: 445988583)

Reviewer (on the writer.WriteLine lines): nit: I think it might be clearer to use …
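Following the suggestion above, a per-column variant could look roughly like the sketch below. This is not the code in this commit; the entry-name pattern, the Name property on the column options, and the "Topic[...]" labels are assumptions or illustrative tweaks.

```csharp
// Hypothetical per-column variant of SaveTopicWordSummary: each column writes its
// summary to its own entry inside the model zip, named after the output column.
private void SaveTopicWordSummary(ModelSaveContext ctx, int i)
{
    var summary = GetLdaDetails(i);

    // "Name" on the column options is an assumption about the ColumnOptions shape.
    var entryName = $"word_topic_summary-{_columns[i].Name}.txt";

    ctx.SaveTextStream(entryName, writer =>
    {
        if (summary.WordScoresPerTopic != null)
        {
            int topicId = 0;
            foreach (var wordScores in summary.WordScoresPerTopic)
            {
                foreach (var wordScore in wordScores)
                    writer.WriteLine($"Topic[{topicId}]: {wordScore.Word}\t{wordScore.Score}");
                topicId++;
            }
        }

        if (summary.ItemScoresPerTopic != null)
        {
            int topicId = 0;
            foreach (var itemScores in summary.ItemScoresPerTopic)
            {
                foreach (var itemScore in itemScores)
                    writer.WriteLine($"Topic[{topicId}]: {itemScore.Item}\t{itemScore.Score}");
                topicId++;
            }
        }
    });
}
```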
@@ -994,6 +1040,7 @@ internal static class Defaults
     public const int NumberOfSummaryTermsPerTopic = 10;
     public const int NumberOfBurninIterations = 10;
     public const bool ResetRandomGenerator = false;
+    public const bool OutputTopicWordSummary = false;
 }

 private readonly IHost _host;
@@ -1100,6 +1147,10 @@ internal sealed class ColumnOptions
     /// Reset the random number generator for each document.
     /// </summary>
     public readonly bool ResetRandomGenerator;
+    /// <summary>
+    /// Whether to output the topic-word summary in text format.
+    /// </summary>
+    public readonly bool OutputTopicWordSummary;

     /// <summary>
     /// Describes how the transformer handles one column pair.
@@ -1117,6 +1168,7 @@ internal sealed class ColumnOptions
     /// <param name="numberOfSummaryTermsPerTopic">The number of words to summarize the topic.</param>
     /// <param name="numberOfBurninIterations">The number of burn-in iterations.</param>
     /// <param name="resetRandomGenerator">Reset the random number generator for each document.</param>
+    /// <param name="outputTopicWordSummary">Whether to output the topic-word summary in text format.</param>
     public ColumnOptions(string name,
         string inputColumnName = null,
         int numberOfTopics = LatentDirichletAllocationEstimator.Defaults.NumberOfTopics,
@@ -1129,7 +1181,8 @@ public ColumnOptions(string name,
         int maximumTokenCountPerDocument = LatentDirichletAllocationEstimator.Defaults.MaximumTokenCountPerDocument,
         int numberOfSummaryTermsPerTopic = LatentDirichletAllocationEstimator.Defaults.NumberOfSummaryTermsPerTopic,
         int numberOfBurninIterations = LatentDirichletAllocationEstimator.Defaults.NumberOfBurninIterations,
-        bool resetRandomGenerator = LatentDirichletAllocationEstimator.Defaults.ResetRandomGenerator)
+        bool resetRandomGenerator = LatentDirichletAllocationEstimator.Defaults.ResetRandomGenerator,
+        bool outputTopicWordSummary = LatentDirichletAllocationEstimator.Defaults.OutputTopicWordSummary)
     {
         Contracts.CheckValue(name, nameof(name));
         Contracts.CheckValueOrNull(inputColumnName);
@@ -1155,6 +1208,7 @@ public ColumnOptions(string name,
         NumberOfSummaryTermsPerTopic = numberOfSummaryTermsPerTopic;
         NumberOfBurninIterations = numberOfBurninIterations;
         ResetRandomGenerator = resetRandomGenerator;
+        OutputTopicWordSummary = outputTopicWordSummary;
     }

     internal ColumnOptions(LatentDirichletAllocationTransformer.Column item, LatentDirichletAllocationTransformer.Options options) :
@@ -1170,7 +1224,8 @@ internal ColumnOptions(LatentDirichletAllocationTransformer.Column item, LatentD
         item.NumMaxDocToken ?? options.NumMaxDocToken,
         item.NumSummaryTermPerTopic ?? options.NumSummaryTermPerTopic,
         item.NumBurninIterations ?? options.NumBurninIterations,
-        item.ResetRandomGenerator ?? options.ResetRandomGenerator)
+        item.ResetRandomGenerator ?? options.ResetRandomGenerator,
+        item.OutputTopicWordSummary ?? options.OutputTopicWordSummary)
     {
     }
Second changed file (the test code):
@@ -5,6 +5,8 @@
 using System;
 using System.Collections.Generic;
 using System.IO;
+using System.IO.Compression;
+using System.Linq;
 using Microsoft.ML.CommandLine;
 using Microsoft.ML.Data;
 using Microsoft.ML.Internal.Utilities;
@@ -1047,6 +1049,18 @@ public void SavePipeLda()
     "loader=Text{col=F1V:Num:0-2}",
     "xf=Lda{col={name=Result src=F1V numtopic=3 alphasum=3 ns=3 reset=+ t=1} summary=+}",
 }, forceDense: true);

+// topic summary text file saved inside the model.zip file.
+string name = TestName + ".zip";
+string modelPath = GetOutputPath("SavePipe", name);
+using (var file = Env.OpenInputFile(modelPath))
+using (var strm = file.OpenReadStream())
+using (var zip = new ZipArchive(strm, ZipArchiveMode.Read))
+{
+    var entry = zip.Entries.First(source => source.Name == "word_topic_summary.txt");
+    Assert.True(entry != null);
+}

 Done();
 }

Reviewer (on lines +1052 to +1059): Can you please also add a test that exercises your PR through ML.NET's API instead of MAML? (See machinelearning/test/Microsoft.ML.Functional.Tests/IntrospectiveTraining.cs, lines 190 to 191 in 45a16dc.) Thanks! #Resolved

Author: Great idea, I will test both from MAML and from the transformer. Testing from MAML and checking the summary is actually how TLC originally tested the topic summary. (In reply to: 446344620)
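For reference, a rough sketch of the API-level counterpart the reviewer asks for might look like the following (not part of the PR). It reuses the hypothetical TextData class from the earlier sketch, and it presumes the OutputTopicWordSummary option is enabled for the LDA column; how that switch is surfaced on the public catalog API is not shown in this diff, so the assertion is illustrative.

```csharp
using System.IO;
using System.IO.Compression;
using Microsoft.ML;
using Xunit;

public class LdaSummarySketchTests
{
    [Fact]
    public void SavedLdaModelContainsTopicWordSummary()
    {
        var mlContext = new MLContext(seed: 1);
        var data = mlContext.Data.LoadFromEnumerable(new[]
        {
            new TextData { Text = "machine learning builds models from data" },
            new TextData { Text = "topic models summarize large text corpora" },
        });

        var pipeline = mlContext.Transforms.Text.ProduceWordBags("Tokens", "Text")
            .Append(mlContext.Transforms.Text.LatentDirichletAllocation(
                "Topics", "Tokens", numberOfTopics: 2));
        var model = pipeline.Fit(data);

        // Save through the API and look for the summary entry inside the zip,
        // mirroring the MAML test above.
        string modelPath = Path.Combine(Path.GetTempPath(), "lda-model.zip");
        mlContext.Model.Save(model, data.Schema, modelPath);

        using (var stream = File.OpenRead(modelPath))
        using (var zip = new ZipArchive(stream, ZipArchiveMode.Read))
        {
            Assert.Contains(zip.Entries, e => e.Name == "word_topic_summary.txt");
        }
    }
}
```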
Reviewer (on the summary argument's HelpText): nit: Suggestion, change the HelpText of this argument and the one on the internal Options.OutputTopicWordSummary to "Whether to output the topic-word summary in text format when saving the model to disk"; otherwise it isn't clear that users actually need to save the model to disk in order to get that output (see microsoft/NimbusML#405, where the user expected to see the output on the console). Also update the docs elsewhere where you've added this option. #Resolved

Author: Sounds good, thanks. (In reply to: 445990136)
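For concreteness, the suggested wording applied to the argument from the diff above would read roughly as follows (a sketch of the reviewer's suggestion, not a change in this commit):

```csharp
// Revised HelpText per the review comment; attribute shape is taken from the diff above.
[Argument(ArgumentType.AtMostOnce,
    HelpText = "Whether to output the topic-word summary in text format when saving the model to disk",
    ShortName = "summary")]
public bool OutputTopicWordSummary = LatentDirichletAllocationEstimator.Defaults.OutputTopicWordSummary;
```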