From dbcb08f012a057d6edc276c96e59183d562c7f90 Mon Sep 17 00:00:00 2001 From: niemi Date: Wed, 5 Oct 2022 14:38:25 +0300 Subject: [PATCH] fixed slow preprocessing due to non-cached/compiled regexes, fixed crash when deleting model with running pipes, fixed languages for language group models with no yaml file, fixed bug with Tatoeba model list parsing --- OpusCatMTEngine/App.xaml.cs | 2 + OpusCatMTEngine/MTModel.cs | 63 +++-- OpusCatMTEngine/MTModelConfig.cs | 6 + OpusCatMTEngine/ModelManager.cs | 16 +- .../Preprocessing/MosesPreprocess.cs | 243 +++++++++++------- OpusCatMTEngine/UI/EditRulesView.xaml.cs | 2 +- OpusCatMTEngine/UI/OnlineModelView.xaml.cs | 6 +- 7 files changed, 218 insertions(+), 120 deletions(-) diff --git a/OpusCatMTEngine/App.xaml.cs b/OpusCatMTEngine/App.xaml.cs index 9789945..ca54ab5 100644 --- a/OpusCatMTEngine/App.xaml.cs +++ b/OpusCatMTEngine/App.xaml.cs @@ -11,6 +11,7 @@ using System.Net; using System.Reflection; using System.ServiceModel; +using System.Text.RegularExpressions; using System.Threading.Tasks; using System.Windows; using System.Windows.Threading; @@ -119,6 +120,7 @@ private void Application_Startup(object sender, StartupEventArgs e) this.InitializePythonEngine(); this.CheckForUpdatesAsync(); + } private async void CheckForUpdatesAsync() diff --git a/OpusCatMTEngine/MTModel.cs b/OpusCatMTEngine/MTModel.cs index fe08990..26de99b 100644 --- a/OpusCatMTEngine/MTModel.cs +++ b/OpusCatMTEngine/MTModel.cs @@ -160,7 +160,7 @@ internal void Shutdown() foreach (var langpair in this.marianProcesses.Keys) { this.marianProcesses[langpair].ShutdownMtPipe(); - this.marianProcesses[langpair] = null; + //this.marianProcesses[langpair] = null; } } this.marianProcesses = null; @@ -285,13 +285,7 @@ internal void DownloadProgressChanged(object sender, DownloadProgressChangedEven { this.InstallProgress = e.ProgressPercentage; } - - public List TargetLanguages - { - get => targetLanguages; - set => targetLanguages = value; - } - + public MTModelStatus Status { get => status; set { status = value; NotifyPropertyChanged(); } } //This creates a zip package of the model that can be moved to another computer @@ -460,7 +454,25 @@ internal void ExitHandler(object sender, EventArgs e) } } - public List SourceLanguages { get => sourceLanguages; set => sourceLanguages = value; } + public List SourceLanguages + { + get => sourceLanguages; + set + { + sourceLanguages = value; + NotifyPropertyChanged("SourceLanguageString"); + } + } + + public List TargetLanguages + { + get => targetLanguages; + set + { + targetLanguages = value; + NotifyPropertyChanged("TargetLanguageString"); + } + } public string Name { get => name; set => name = value; } @@ -633,14 +645,23 @@ public MTModel( this.UpdateModelYamlPath(); this.SupportsWordAlignment = this.decoderSettings.models[0].Contains("-align"); - - if (this.modelYaml == null) - { - this.ParseModelPathForLanguages(modelPath); - } this.ParseModelConfig(); + if (this.modelYaml == null) + { + if (this.modelConfig.SourceLanguageCodes == null || + this.modelConfig.TargetLanguageCodes == null) + { + this.ParseModelPathForLanguages(modelPath); + } + else + { + this.SourceLanguages = this.modelConfig.SourceLanguageCodes.Select(x => new IsoLanguage(x)).ToList(); + this.TargetLanguages = this.modelConfig.TargetLanguageCodes.Select(x => new IsoLanguage(x)).ToList(); + } + } + // this.AutoPreEditRuleCollections = new ObservableCollection( this.ModelConfig.AutoPreEditRuleCollectionGuids.Select(x => autoPreEditRuleCollections.SingleOrDefault( @@ -654,6 +675,17 @@ public MTModel( internal void SaveModelConfig() { + if (this.ModelConfig == null) + { + this.ModelConfig = new MTModelConfig(); + } + + if (this.SourceLanguages != null && this.TargetLanguages != null) + { + this.ModelConfig.SourceLanguageCodes = new ObservableCollection(this.SourceLanguages.Select(x => x.OriginalCode)); + this.ModelConfig.TargetLanguageCodes = new ObservableCollection(this.TargetLanguages.Select(x => x.OriginalCode)); + } + //The directory might not exists yet in case of customized models (i.e. copying of the base model //is not complete) if (Directory.Exists(this.InstallDir)) @@ -797,7 +829,6 @@ public MTModel( //This is used for online models, model uri is included for later download of models public MTModel(string modelPath, Uri modelUri, string yamlString = null) { - this.ModelPath = modelPath; this.modelYaml = yamlString; if (yamlString != null) @@ -1006,7 +1037,7 @@ public string TargetCodesString } public string ModelPath { get; internal set; } - public string InstallDir { get; } + public string InstallDir { get; set; } public bool Prioritized { get => _prioritized; set { _prioritized = value; NotifyPropertyChanged(); } } private MarianLog TrainingLog; diff --git a/OpusCatMTEngine/MTModelConfig.cs b/OpusCatMTEngine/MTModelConfig.cs index 35086d1..9456c48 100644 --- a/OpusCatMTEngine/MTModelConfig.cs +++ b/OpusCatMTEngine/MTModelConfig.cs @@ -52,6 +52,12 @@ private void NotifyPropertyChanged([CallerMemberName] string propertyName = "") [YamlMember(Alias = "auto-post-edit-rule-collection-guids", ApplyNamingConventions = false)] public ObservableCollection AutoPostEditRuleCollectionGuids { get; internal set; } + [YamlMember(Alias = "source-languages", ApplyNamingConventions = false)] + public ObservableCollection SourceLanguageCodes { get; internal set; } + + [YamlMember(Alias = "target-languages", ApplyNamingConventions = false)] + public ObservableCollection TargetLanguageCodes { get; internal set; } + private bool finetuningComplete; public MTModelConfig() diff --git a/OpusCatMTEngine/ModelManager.cs b/OpusCatMTEngine/ModelManager.cs index 105cfdd..95a568b 100644 --- a/OpusCatMTEngine/ModelManager.cs +++ b/OpusCatMTEngine/ModelManager.cs @@ -520,7 +520,7 @@ private void TatoebaModelListDownloaded(object sender, DownloadStringCompletedEv //Use distinct to remove duplicate entries foreach (var line in modelList.Split('\n').Distinct()) { - var split = line.Split('\t'); + var split = line.Split(new char[] { '\t' }); if (split.Length >= 4) { var modelPath = split[0]; @@ -529,17 +529,17 @@ private void TatoebaModelListDownloaded(object sender, DownloadStringCompletedEv IEnumerable sourceLangs = split[2].Split(','); IEnumerable targetLangs; - //Multilingual models have a use-target-labels field, which contains the target + //Multilingual models have a use-target-labels field (fifth column in tsv), which contains the target //labels of the model, use that instead of target languages, as it will include //script info (e.g. Latn or Cyrl). - if (split.Length > 4) + if (String.IsNullOrWhiteSpace(split[4])) { - //Remove the target code delimiters >>code<< -> code - targetLangs = split[4].Replace("<","").Replace(">","").Split(','); + targetLangs = split[3].Split(','); } else { - targetLangs = split[3].Split(','); + //Remove the target code delimiters >>code<< -> code + targetLangs = split[4].Replace("<", "").Replace(">", "").Split(','); } //Some entries might have empty source and target languages @@ -549,8 +549,10 @@ private void TatoebaModelListDownloaded(object sender, DownloadStringCompletedEv } var model = new MTModel(modelPath.Replace(".zip", ""), modelUri); model.ModelType = modelType; - model.SourceLanguages = sourceLangs.Select(x => new IsoLanguage(x)).ToList(); + model.TargetLanguages = targetLangs.Select(x => new IsoLanguage(x)).ToList(); + model.SourceLanguages = sourceLangs.Select(x => new IsoLanguage(x)).ToList(); + this.onlineModels.Add(model); } } diff --git a/OpusCatMTEngine/Preprocessing/MosesPreprocess.cs b/OpusCatMTEngine/Preprocessing/MosesPreprocess.cs index 7b00acd..5be517b 100644 --- a/OpusCatMTEngine/Preprocessing/MosesPreprocess.cs +++ b/OpusCatMTEngine/Preprocessing/MosesPreprocess.cs @@ -14,7 +14,7 @@ namespace OpusCatMTEngine /// public static class MosesPreprocessor { - + static MosesPreprocessor() { @@ -29,119 +29,173 @@ public static string PreprocessSpaces(string sourceSentence) public static string RunMosesPreprocessing(string input, string language) { - input = ReplaceUnicodePunctuation(input); - input = RemoveNonPrintingChar(input); - input = NormalizePunctuation(input,language); + var inputBuilder = new StringBuilder(input); + ReplaceUnicodePunctuation(inputBuilder); + input = RemoveNonPrintingChar(inputBuilder.ToString()); + //input = NormalizePunctuation(input, language); return input; } - public static string ReplaceUnicodePunctuation(string input) + public static void ReplaceUnicodePunctuation(StringBuilder input) { - input = Regex.Replace(input, ",", ","); - input = Regex.Replace(input, "。 *", ". "); - input = Regex.Replace(input, "、", ","); - input = Regex.Replace(input, "”", "\""); - input = Regex.Replace(input, "“", "\""); - input = Regex.Replace(input, "∶", ":"); - input = Regex.Replace(input, ":", ":"); - input = Regex.Replace(input, "?", "?"); - input = Regex.Replace(input, "《", "\""); - input = Regex.Replace(input, "》", "\""); - input = Regex.Replace(input, ")", ")"); - input = Regex.Replace(input, "!", "!"); - input = Regex.Replace(input, "(", "("); - input = Regex.Replace(input, ";", ";"); - input = Regex.Replace(input, "1", "\""); - input = Regex.Replace(input, "」", "\""); - input = Regex.Replace(input, "「", "\""); - input = Regex.Replace(input, "0", "0"); - input = Regex.Replace(input, "3", "3"); - input = Regex.Replace(input, "2", "2"); - input = Regex.Replace(input, "5", "5"); - input = Regex.Replace(input, "6", "6"); - input = Regex.Replace(input, "9", "9"); - input = Regex.Replace(input, "7", "7"); - input = Regex.Replace(input, "8", "8"); - input = Regex.Replace(input, "4", "4"); - input = Regex.Replace(input, ". *", ". "); - input = Regex.Replace(input, "~", "~"); - input = Regex.Replace(input, "’", "'"); - input = Regex.Replace(input, "…", "..."); - input = Regex.Replace(input, "━", "-"); - input = Regex.Replace(input, "〈", "<"); - input = Regex.Replace(input, "〉", ">"); - input = Regex.Replace(input, "【", "["); - input = Regex.Replace(input, "】", "]"); - input = Regex.Replace(input, "%", "%"); + input.Replace(",", ","); + input.Replace("。", ". "); + input.Replace("、", ","); + input.Replace("”", "\""); + input.Replace("“", "\""); + input.Replace("∶", ":"); + input.Replace(":", ":"); + input.Replace("?", "?"); + input.Replace("《", "\""); + input.Replace("》", "\""); + input.Replace(")", ")"); + input.Replace("!", "!"); + input.Replace("(", "("); + input.Replace(";", ";"); + input.Replace("1", "\""); + input.Replace("」", "\""); + input.Replace("「", "\""); + input.Replace("0", "0"); + input.Replace("3", "3"); + input.Replace("2", "2"); + input.Replace("5", "5"); + input.Replace("6", "6"); + input.Replace("9", "9"); + input.Replace("7", "7"); + input.Replace("8", "8"); + input.Replace("4", "4"); + input.Replace(". *", ". "); + input.Replace("~", "~"); + input.Replace("’", "'"); + input.Replace("…", "..."); + input.Replace("━", "-"); + input.Replace("〈", "<"); + input.Replace("〉", ">"); + input.Replace("【", "["); + input.Replace("】", "]"); + input.Replace("%", "%"); - return input; } + private static Regex NonPrintingCharRegex = new Regex(@"\p{C}", RegexOptions.Compiled); + public static string RemoveNonPrintingChar(string input) { - input = Regex.Replace(input,@"\p{C}"," "); + input = NonPrintingCharRegex.Replace(input, " "); + return input; + } + + private static List> PunctuationRegexes1 = new List>() + { + MosesPreprocessor.CreateRegexWithReplacement("\r",""), + // remove extra spaces + MosesPreprocessor.CreateRegexWithReplacement(@"\("," ("), + MosesPreprocessor.CreateRegexWithReplacement(@"\)",") "), + MosesPreprocessor.CreateRegexWithReplacement(@" +", " "), + MosesPreprocessor.CreateRegexWithReplacement(@"\) ([\.\!\:\?\;\,])", ")$1"), + MosesPreprocessor.CreateRegexWithReplacement(@"\( ", "("), + MosesPreprocessor.CreateRegexWithReplacement(@" \)", ")"), + MosesPreprocessor.CreateRegexWithReplacement(@"(\d) \%", "$1%"), + MosesPreprocessor.CreateRegexWithReplacement(@" :", ":"), + MosesPreprocessor.CreateRegexWithReplacement(@" ;", ";") + }; + + private static List> PunctuationRegexes2 = new List>() + { + MosesPreprocessor.CreateRegexWithReplacement("\r",""), + MosesPreprocessor.CreateRegexWithReplacement( @"„", "\""), + MosesPreprocessor.CreateRegexWithReplacement(@"“","\""), + MosesPreprocessor.CreateRegexWithReplacement(@"”", "\""), + MosesPreprocessor.CreateRegexWithReplacement(@"–"," - "), + MosesPreprocessor.CreateRegexWithReplacement(@"—", " - "), + MosesPreprocessor.CreateRegexWithReplacement(@" +", " "), + MosesPreprocessor.CreateRegexWithReplacement(@"´", "'"), + MosesPreprocessor.CreateRegexWithReplacement(@"([A-Za-z])‘([A-Za-z])", @"$1\'$2"), + MosesPreprocessor.CreateRegexWithReplacement(@"([A-Za-z])’([A-Za-z])", @"$1\'$2"), + MosesPreprocessor.CreateRegexWithReplacement(@"‘", "\""), + MosesPreprocessor.CreateRegexWithReplacement(@"‚","\""), + MosesPreprocessor.CreateRegexWithReplacement(@"’", "\""), + MosesPreprocessor.CreateRegexWithReplacement(@"''","\""), + MosesPreprocessor.CreateRegexWithReplacement(@"´´", "\""), + MosesPreprocessor.CreateRegexWithReplacement(@"…",@"..."), + // French quotes + MosesPreprocessor.CreateRegexWithReplacement(@" « ", " \""), + MosesPreprocessor.CreateRegexWithReplacement(@"« ","\""), + MosesPreprocessor.CreateRegexWithReplacement(@"«", "\""), + MosesPreprocessor.CreateRegexWithReplacement(@" » ","\" "), + MosesPreprocessor.CreateRegexWithReplacement(@" »", "\""), + MosesPreprocessor.CreateRegexWithReplacement(@"»","\""), + // handle pseudo-spaces + MosesPreprocessor.CreateRegexWithReplacement( @" \%", "%"), + MosesPreprocessor.CreateRegexWithReplacement( @"nº ", "nº "), + MosesPreprocessor.CreateRegexWithReplacement( @" :", ":"), + MosesPreprocessor.CreateRegexWithReplacement( @" ºC", " ºC"), + MosesPreprocessor.CreateRegexWithReplacement( @" cm", " cm"), + MosesPreprocessor.CreateRegexWithReplacement( @" \?", "?"), + MosesPreprocessor.CreateRegexWithReplacement( @" \!", "!"), + MosesPreprocessor.CreateRegexWithReplacement( @" ;", ";"), + MosesPreprocessor.CreateRegexWithReplacement( @", ", ", "), + MosesPreprocessor.CreateRegexWithReplacement( @" +", " ") + }; + + private static List> PennRegexes = new List>() + { + MosesPreprocessor.CreateRegexWithReplacement(@"\`","'"), + MosesPreprocessor.CreateRegexWithReplacement(@"\'\'"," \" ") + }; + + private static List> EngRegexes = new List>() + { + MosesPreprocessor.CreateRegexWithReplacement("\"([,\\.]+)", "$1\"") + }; + + private static List> GerSpaPunctRegexes = new List>() + { + MosesPreprocessor.CreateRegexWithReplacement(",\"","\","), + MosesPreprocessor.CreateRegexWithReplacement(" (\\.+)\"(\\s*[^<])","\"$1$2") // don't fix period at end of sentence + }; + + private static List> CommaNumberRegex = new List>() + { + MosesPreprocessor.CreateRegexWithReplacement(@"(\d) (\d)","$1,$2") + }; + + private static List> DotNumberRegex = new List>() + { + MosesPreprocessor.CreateRegexWithReplacement(@"(\d) (\d)","$1.$2") + }; + + private static Tuple CreateRegexWithReplacement(String pattern, String replacement) + { + return new Tuple(new Regex(pattern, RegexOptions.Compiled), replacement); + } + + private static String ApplyRegexReplacementCollection(String input, List> collection) + { + foreach (var regexReplacement in collection) + { + input = regexReplacement.Item1.Replace(input, regexReplacement.Item2); + } return input; } public static string NormalizePunctuation(string input,string language) { var penn = 0; - input = Regex.Replace(input, "\r", ""); - // remove extra spaces - input = Regex.Replace(input, @"\(", " ("); - input = Regex.Replace(input, @"\)", ") "); - input = Regex.Replace(input, @" +", " "); - input = Regex.Replace(input, @"\) ([\.\!\:\?\;\,])", ")$1"); - input = Regex.Replace(input, @"\( ", "("); - input = Regex.Replace(input, @" \)", ")"); - input = Regex.Replace(input, @"(\d) \%", "$1%"); - input = Regex.Replace(input, @" :", ":"); - input = Regex.Replace(input, @" ;", ";"); + + input = MosesPreprocessor.ApplyRegexReplacementCollection(input, MosesPreprocessor.PunctuationRegexes1); + // normalize unicode punctuation if (penn == 0) { - input = Regex.Replace(input, @"\`", "'"); - input = Regex.Replace(input, @"\'\'", " \" "); + input = MosesPreprocessor.ApplyRegexReplacementCollection(input, MosesPreprocessor.PennRegexes); } - input = Regex.Replace(input, @"„", "\""); - input = Regex.Replace(input,@"“","\""); - input = Regex.Replace(input, @"”", "\""); - input = Regex.Replace(input,@"–"," - "); - - input = Regex.Replace(input, @"—", " - "); - input = Regex.Replace(input, @" +", " "); - input = Regex.Replace(input, @"´", "'"); - input = Regex.Replace(input, @"([a-z])‘([a-z])", @"$1\'$2",RegexOptions.IgnoreCase); - input = Regex.Replace(input, @"([a-z])’([a-z])", @"$1\'$2", RegexOptions.IgnoreCase); - input = Regex.Replace(input, @"‘", "\""); - input = Regex.Replace(input,@"‚","\""); - input = Regex.Replace(input, @"’", "\""); - input = Regex.Replace(input,@"''","\""); - input = Regex.Replace(input, @"´´", "\""); - input = Regex.Replace(input,@"…",@"..."); - // French quotes - input = Regex.Replace(input, @" « ", " \""); - input = Regex.Replace(input,@"« ","\""); - input = Regex.Replace(input, @"«", "\""); - input = Regex.Replace(input,@" » ","\" "); - input = Regex.Replace(input, @" »", "\""); - input = Regex.Replace(input,@"»","\""); - // handle pseudo-spaces - input = Regex.Replace(input, @" \%", "%"); - input = Regex.Replace(input, @"nº ", "nº "); - input = Regex.Replace(input, @" :", ":"); - input = Regex.Replace(input, @" ºC", " ºC"); - input = Regex.Replace(input, @" cm", " cm"); - input = Regex.Replace(input, @" \?", "?"); - input = Regex.Replace(input, @" \!", "!"); - input = Regex.Replace(input, @" ;", ";"); - input = Regex.Replace(input, @", ", ", "); - input = Regex.Replace(input, @" +", " "); + input = MosesPreprocessor.ApplyRegexReplacementCollection(input, MosesPreprocessor.PunctuationRegexes2); // English "quotation," followed by comma, style if (language == "en") { - input = Regex.Replace(input, "\"([,\\.]+)", "$1\""); + input = MosesPreprocessor.ApplyRegexReplacementCollection(input, MosesPreprocessor.EngRegexes); } // Czech is confused else if(language == "cs" || language == "cz") { @@ -149,15 +203,14 @@ public static string NormalizePunctuation(string input,string language) // German/Spanish/French "quotation", followed by comma, style else { - input = Regex.Replace(input, ",\"","\","); - input = Regex.Replace(input, " (\\.+)\"(\\s*[^<])","\"$1$2"); // don't fix period at end of sentence + input = MosesPreprocessor.ApplyRegexReplacementCollection(input, MosesPreprocessor.GerSpaPunctRegexes); } if (language == "de" || language == "es" || language == "cz" || language == "cs" || language == "fr") { - input = Regex.Replace(input,@"(\d) (\d)","$1,$2"); + input = MosesPreprocessor.ApplyRegexReplacementCollection(input, MosesPreprocessor.CommaNumberRegex); } else { - input = Regex.Replace(input,@" (\d) (\d)","$1.$2"); + input = MosesPreprocessor.ApplyRegexReplacementCollection(input, MosesPreprocessor.DotNumberRegex); } return input; diff --git a/OpusCatMTEngine/UI/EditRulesView.xaml.cs b/OpusCatMTEngine/UI/EditRulesView.xaml.cs index 2199976..d3d49f9 100644 --- a/OpusCatMTEngine/UI/EditRulesView.xaml.cs +++ b/OpusCatMTEngine/UI/EditRulesView.xaml.cs @@ -361,7 +361,7 @@ private void TestRules_Click(object sender, RoutedEventArgs e) var mtResult = this.Model.Translate( previousTesterOutput, this.Model.SourceLanguages.First(), - this.Model.SourceLanguages.First(), + this.Model.TargetLanguages.First(), applyEditRules:false).Result; previousTesterOutput = mtResult.Translation; diff --git a/OpusCatMTEngine/UI/OnlineModelView.xaml.cs b/OpusCatMTEngine/UI/OnlineModelView.xaml.cs index ed8758b..c0862e9 100644 --- a/OpusCatMTEngine/UI/OnlineModelView.xaml.cs +++ b/OpusCatMTEngine/UI/OnlineModelView.xaml.cs @@ -118,7 +118,7 @@ internal void DownloadCompleted(MTModel model, object sender, AsyncCompletedEven try { var installPath = this.ModelManager.ExtractModel(model.ModelPath,true); - + model.InstallDir = installPath; //If model has yaml config, check whether it was included in the zip package (Tatoeba models) if (!String.IsNullOrEmpty(model.TatoebaConfigString)) { @@ -135,8 +135,12 @@ internal void DownloadCompleted(MTModel model, object sender, AsyncCompletedEven { writer.Write(model.TatoebaConfigString); } + + } + model.SaveModelConfig(); + model.InstallStatus = OpusCatMTEngine.Properties.Resources.Online_InstalledStatus; this.ModelManager.GetLocalModels(); }