diff --git a/Seq2SeqSharp/Applications/Options.cs b/Seq2SeqSharp/Applications/Options.cs
index 9d4db375..8ab2b70b 100644
--- a/Seq2SeqSharp/Applications/Options.cs
+++ b/Seq2SeqSharp/Applications/Options.cs
@@ -259,7 +259,6 @@ public class Options
         [Range(1, 9999999)]
         public int SaveModelEveryUpdates = 10000;
-
         [Arg("Valid corpus folder path", nameof(ValidCorpusPaths))]
         public string ValidCorpusPaths = null;
@@ -318,6 +317,9 @@ public class Options
         [Arg("The level of logs to be printed out. Supported Values: none = 0, err = 1, warn = 2, info = 4 and debug = 8. These values can be combined. For example: Value 15 means err, warn, info and debug will be outputted.", nameof(LogLevel))]
         public Logger.Level LogLevel = (Logger.Level.err | Logger.Level.warn | Logger.Level.info | Logger.Level.debug);
+
+        [Arg("It indicates whether checking for corrupted tensors is enabled. Default is enabled.", nameof(CheckTensorCorrupted))]
+        public bool CheckTensorCorrupted = true;

         public void ValidateOptions()
         {
             if (AMP == true && ProcessorType != ProcessorTypeEnums.GPU)
diff --git a/Seq2SeqSharp/Optimizer/AdamOptimizer.cs b/Seq2SeqSharp/Optimizer/AdamOptimizer.cs
index f7ce6ed9..72d7e950 100644
--- a/Seq2SeqSharp/Optimizer/AdamOptimizer.cs
+++ b/Seq2SeqSharp/Optimizer/AdamOptimizer.cs
@@ -10,6 +10,7 @@
 using AdvUtils;
 using Seq2SeqSharp.Tools;
+using Seq2SeqSharp.Utils;
 using System;
 using System.Collections.Concurrent;
 using System.Collections.Generic;
@@ -31,10 +32,11 @@ public class AdamOptimizer : IOptimizer
         private readonly ConcurrentDictionary<string, Tensor> m_cacheName2M;
         private readonly float m_clipval;
         private readonly bool m_saveGPUMemoryMode = false;
+        private readonly bool m_checkTensorCorrupted = true;

-        public AdamOptimizer(float clipval, float beta1 = 0.9f, float beta2 = 0.98f, bool saveGPUMemoryMode = false)
+        public AdamOptimizer(float clipval, float beta1 = 0.9f, float beta2 = 0.98f, bool saveGPUMemoryMode = false, bool checkTensorCorrupted = true)
         {
-            Logger.WriteLine(Logger.Level.debug, $"Creating Adam optimizer. GradClip = '{clipval}', Beta1 = '{beta1}', Beta2 = '{beta2}', SaveGPUMemoryMode = '{saveGPUMemoryMode}'");
+            Logger.WriteLine(Logger.Level.debug, $"Creating Adam optimizer. GradClip = '{clipval}', Beta1 = '{beta1}', Beta2 = '{beta2}', SaveGPUMemoryMode = '{saveGPUMemoryMode}', CheckTensorCorrupted = '{checkTensorCorrupted}'");

             m_cacheName2V = new ConcurrentDictionary<string, Tensor>();
             m_cacheName2M = new ConcurrentDictionary<string, Tensor>();
@@ -43,6 +45,7 @@ public AdamOptimizer(float clipval, float beta1 = 0.9f, float beta2 = 0.98f, boo
             m_beta1 = beta1;
             m_beta2 = beta2;
             m_saveGPUMemoryMode = saveGPUMemoryMode;
+            m_checkTensorCorrupted = checkTensorCorrupted;
         }

         public void UpdateWeights(List<IWeightTensor> model, int batchSize, float step_size, float regc, int iter)
@@ -57,6 +60,11 @@ public void UpdateWeights(List<IWeightTensor> model, int batchSize, float step_s
                 continue;
             }

+            if (m_checkTensorCorrupted == true && item.IsGradientCorrupted())
+            {
+                throw new GradientsCorruptedException($"The gradients of '{item.Name}' are corrupted.");
+            }
+
             if (name2tensor.ContainsKey(item.Name))
             {
                 throw new ArgumentException($"Found duplicated weights '{item.Name}'.");
diff --git a/Seq2SeqSharp/Tools/BaseSeq2SeqFramework.cs b/Seq2SeqSharp/Tools/BaseSeq2SeqFramework.cs
index ca5abd51..3004d5af 100644
--- a/Seq2SeqSharp/Tools/BaseSeq2SeqFramework.cs
+++ b/Seq2SeqSharp/Tools/BaseSeq2SeqFramework.cs
@@ -413,6 +413,24 @@ public void Train(int maxTrainingEpoch, ICorpus<IPairBatch> trainCorpus, ICorpus
             Train(maxTrainingEpoch, trainCorpus, validCorpusList, learningRate, taskId2metrics, optimizer, decodingOptions);
         }
+
+        private void DumpBatchToLogger(List<IPairBatch> batchs)
+        {
+            foreach (var batch in batchs)
+            {
+                var srcTokensList = batch.GetSrcTokens();
+                var tgtTokensList = batch.GetTgtTokens();
+
+                for (int i = 0; i < srcTokensList.Count; i++)
+                {
+                    var srcSent = String.Join(" ", srcTokensList[i]);
+                    var tgtSent = String.Join(" ", tgtTokensList[i]);
+
+                    Logger.WriteLine(Logger.Level.debug, $"Src = '{srcSent}', Tgt = '{tgtSent}'");
+                }
+            }
+        }
+
         internal void TrainOneEpoch(int ep, ICorpus<IPairBatch> trainCorpus, ICorpus<IPairBatch>[] validCorpusList, ILearningRate learningRate, IOptimizer solver, Dictionary<int, List<IMetric>> taskId2metrics, DecodingOptions decodingOptions, Func<IComputeGraph, IPairBatch, DecodingOptions, bool, List<NetworkResult>> forwardOnSingleDevice)
         {
@@ -551,8 +569,15 @@ internal void TrainOneEpoch(int ep, ICorpus<IPairBatch> trainCorpus, ICorpus
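
Note on the corruption check: the diff gates the new check behind CheckTensorCorrupted and throws GradientsCorruptedException before any weight is updated, so a bad gradient is reported by weight name instead of silently poisoning the Adam moment caches (m_cacheName2V / m_cacheName2M). The implementation of IsGradientCorrupted() is not shown in this diff; as a rough illustration only, a check of this kind typically scans the gradient buffer for non-finite values. The sketch below is a hypothetical stand-alone version, not Seq2SeqSharp's actual code; the ContainsNonFinite name and the host-side float[] buffer are assumptions:

```csharp
public static class GradientCheckSketch
{
    // Hypothetical stand-in for IsGradientCorrupted(): returns true if the
    // gradient buffer contains NaN or +/-Infinity. A single non-finite value
    // would propagate into the optimizer's moment caches on the next update.
    public static bool ContainsNonFinite(float[] gradients)
    {
        foreach (float g in gradients)
        {
            if (float.IsNaN(g) || float.IsInfinity(g))
            {
                return true;
            }
        }
        return false;
    }
}
```

Placing the check in UpdateWeights also pairs naturally with the new DumpBatchToLogger helper: judging from the (truncated) TrainOneEpoch hunk, the trainer can log the source/target sentences of the offending batch when the exception surfaces, which makes it easier to trace corrupted gradients back to specific training data.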