
Add gradients corruption check
zhongkaifu committed Nov 17, 2023
1 parent e1c034d commit 27e2109
Showing 12 changed files with 190 additions and 20 deletions.
4 changes: 3 additions & 1 deletion Seq2SeqSharp/Applications/Options.cs
@@ -259,7 +259,6 @@ public class Options
[Range(1, 9999999)]
public int SaveModelEveryUpdates = 10000;


[Arg("Valid corpus folder path", nameof(ValidCorpusPaths))]
public string ValidCorpusPaths = null;

@@ -318,6 +317,9 @@ public class Options
[Arg("The level of logs to be printed out. Supported Values: none = 0, err = 1, warn = 2, info = 4 and debug = 8. These values can be combined. For example: Value 15 means err, warn, info and debug will be outputted.", nameof(LogLevel))]
public Logger.Level LogLevel = (Logger.Level.err | Logger.Level.warn | Logger.Level.info | Logger.Level.debug);


[Arg("It indicates if checking tensor corrupted is enabled. Default is enabled", nameof(CheckTensorCorrupted))]
public bool CheckTensorCorrupted = true;
public void ValidateOptions()
{
if (AMP == true && ProcessorType != ProcessorTypeEnums.GPU)
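The new flag defaults to on, so corruption checking is opt-out. A minimal sketch of toggling it in code — the field name comes from the diff above, the surrounding setup is illustrative:

```csharp
// Hypothetical usage: disable the per-update corruption scan for maximum
// throughput once a model trains stably; keep the default (true) otherwise.
var opts = new Options
{
    CheckTensorCorrupted = false // field added in this commit
};
opts.ValidateOptions();
```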
12 changes: 10 additions & 2 deletions Seq2SeqSharp/Optimizer/AdamOptimizer.cs
@@ -10,6 +10,7 @@

using AdvUtils;
using Seq2SeqSharp.Tools;
using Seq2SeqSharp.Utils;
using System;
using System.Collections.Concurrent;
using System.Collections.Generic;
@@ -31,10 +32,11 @@ public class AdamOptimizer : IOptimizer
private readonly ConcurrentDictionary<string, Tensor> m_cacheName2M;
private readonly float m_clipval;
private readonly bool m_saveGPUMemoryMode = false;
private readonly bool m_checkTensorCorrupted = true;

public AdamOptimizer(float clipval, float beta1 = 0.9f, float beta2 = 0.98f, bool saveGPUMemoryMode = false)
public AdamOptimizer(float clipval, float beta1 = 0.9f, float beta2 = 0.98f, bool saveGPUMemoryMode = false, bool checkTensorCorrupted = true)
{
Logger.WriteLine(Logger.Level.debug, $"Creating Adam optimizer. GradClip = '{clipval}', Beta1 = '{beta1}', Beta2 = '{beta2}', SaveGPUMemoryMode = '{saveGPUMemoryMode}'");
Logger.WriteLine(Logger.Level.debug, $"Creating Adam optimizer. GradClip = '{clipval}', Beta1 = '{beta1}', Beta2 = '{beta2}', SaveGPUMemoryMode = '{saveGPUMemoryMode}', CheckTensorCorrupted = '{checkTensorCorrupted}'");

m_cacheName2V = new ConcurrentDictionary<string, Tensor>();
m_cacheName2M = new ConcurrentDictionary<string, Tensor>();
@@ -43,6 +45,7 @@ public AdamOptimizer(float clipval, float beta1 = 0.9f, float beta2 = 0.98f, boo
m_beta1 = beta1;
m_beta2 = beta2;
m_saveGPUMemoryMode = saveGPUMemoryMode;
m_checkTensorCorrupted = checkTensorCorrupted;
}

public void UpdateWeights(List<IWeightTensor> model, int batchSize, float step_size, float regc, int iter)
@@ -57,6 +60,11 @@ public void UpdateWeights(List<IWeightTensor> model, int batchSize, float step_s
continue;
}

if (m_checkTensorCorrupted && item.IsGradientCorrupted())
{
throw new GradientsCorruptedException($"The gradients of '{item.Name}' are corrupted.");
}

if (name2tensor.ContainsKey(item.Name))
{
throw new ArgumentException($"Found duplicated weights '{item.Name}'.");
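For callers constructing the optimizer directly rather than through Misc.CreateOptimizer (updated further down), the new parameter is the trailing optional argument. A sketch, with illustrative values for the other parameters:

```csharp
// Sketch: the flag defaults to true, so existing call sites keep the check.
var optimizer = new AdamOptimizer(
    clipval: 5.0f,               // illustrative gradient-clipping value
    beta1: 0.9f,
    beta2: 0.98f,
    saveGPUMemoryMode: false,
    checkTensorCorrupted: true); // UpdateWeights now throws
                                 // GradientsCorruptedException on NaN/Inf gradients
```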
25 changes: 25 additions & 0 deletions Seq2SeqSharp/Tools/BaseSeq2SeqFramework.cs
@@ -413,6 +413,24 @@ public void Train(int maxTrainingEpoch, ICorpus<IPairBatch> trainCorpus, ICorpus
Train(maxTrainingEpoch, trainCorpus, validCorpusList, learningRate, taskId2metrics, optimizer, decodingOptions);
}


private void DumpBatchToLogger(List<IPairBatch> batches)
{
foreach (var batch in batches)
{
var srcTokensList = batch.GetSrcTokens();
var tgtTokensList = batch.GetTgtTokens();

for (int i = 0; i < srcTokensList.Count; i++)
{
var srcSent = String.Join(" ", srcTokensList[i]);
var tgtSent = String.Join(" ", tgtTokensList[i]);

Logger.WriteLine(Logger.Level.debug, $"Src = '{srcSent}', Tgt = '{tgtSent}'");
}
}
}

internal void TrainOneEpoch(int ep, ICorpus<IPairBatch> trainCorpus, ICorpus<IPairBatch>[] validCorpusList, ILearningRate learningRate, IOptimizer solver, Dictionary<int, List<IMetric>> taskId2metrics, DecodingOptions decodingOptions,
Func<IComputeGraph, IPairBatch, DecodingOptions, bool, List<NetworkResult>> forwardOnSingleDevice)
{
@@ -551,8 +569,15 @@ internal void TrainOneEpoch(int ep, ICorpus<IPairBatch> trainCorpus, ICorpus<IPa
{
Logger.WriteLine(Logger.Level.err, ConsoleColor.Red, $"Exception: {err.Message}.");
Logger.WriteLine(Logger.Level.debug, ConsoleColor.Red, $"Call stack: {err.StackTrace}");
DumpBatchToLogger(sntPairBatchs);
throw;
}
catch (GradientsCorruptedException err)
{
Logger.WriteLine(Logger.Level.warn, ConsoleColor.Yellow, $"Gradients corruption detected; ignoring the current batch: {err.Message}");
DumpBatchToLogger(sntPairBatchs);
break;
}
catch (Exception err)
{
Logger.WriteLine(Logger.Level.err, ConsoleColor.Red, $"Exception: {err.Message}.");
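The two catch blocks encode different recovery policies: corrupted weights remain fatal (the model itself is damaged and is rethrown), while the new GradientsCorruptedException only invalidates the current update. A condensed restatement of the control flow added above (names from the diff; the enclosing batch loop and setup are omitted):

```csharp
try
{
    // forward pass, backward pass, then solver.UpdateWeights(...)
}
catch (GradientsCorruptedException err)
{
    // Recoverable: record the offending sentence pairs, then give up on this
    // batch only — 'break' leaves the enclosing loop and the epoch moves on.
    Logger.WriteLine(Logger.Level.warn, $"Skipping corrupted batch: {err.Message}");
    DumpBatchToLogger(sntPairBatchs);
    break;
}
```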
1 change: 1 addition & 0 deletions Seq2SeqSharp/Tools/IWeightTensor.cs
@@ -48,5 +48,6 @@ public interface IWeightTensor : INeuralUnit, IDisposable
long ElementCount { get; }
void PrintWeights();
bool IsWeightsCorrupted();
bool IsGradientCorrupted();
}
}
17 changes: 5 additions & 12 deletions Seq2SeqSharp/Tools/WeightTensor.cs
@@ -240,23 +240,16 @@ public INeuralUnit CloneToDeviceAt(int deviceId)
return new WeightTensor(Sizes, deviceId, Name, IsTrainable, initType: m_normType, fanIn: m_fanIn, fanOut: m_fanOut, needGradient: NeedGradient, dtype: m_elementType);
}

public bool IsGradientCorrupted()
{
return Ops.IsCorrupted(TGradient);
}

public bool IsWeightsCorrupted()
{
float[] weights = ToWeightArray();

for (int i = 0; i < weights.Length; i++)
{
if (float.IsNaN(weights[i]) || float.IsInfinity(weights[i]))
{
return true;
}
}

return false;
return Ops.IsCorrupted(TWeight);
}


public void ZeroGradient()
{
Ops.Fill(TGradient, 0.0f);
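Both methods now reduce to the same backend op, replacing the old CPU-side loop over ToWeightArray(), so weights and gradients share one definition of "corrupted". Stated as plain C#, the predicate Ops.IsCorrupted implements is simply:

```csharp
// Reference semantics only (the real implementation runs on the tensor's
// own device): a tensor is corrupted iff any element is NaN or ±Infinity.
static bool IsCorruptedReference(ReadOnlySpan<float> values)
{
    foreach (float v in values)
    {
        if (!float.IsFinite(v)) // fails for NaN and ±Infinity alike
        {
            return true;
        }
    }
    return false;
}
```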
2 changes: 1 addition & 1 deletion Seq2SeqSharp/Utils/Misc.cs
@@ -117,7 +117,7 @@ public static IOptimizer CreateOptimizer(Options opts)
IOptimizer optimizer = null;
if (string.Equals(opts.Optimizer, "Adam", StringComparison.InvariantCultureIgnoreCase))
{
optimizer = new AdamOptimizer(opts.GradClip, opts.Beta1, opts.Beta2, opts.SaveGPUMemoryMode);
optimizer = new AdamOptimizer(opts.GradClip, opts.Beta1, opts.Beta2, opts.SaveGPUMemoryMode, opts.CheckTensorCorrupted);
}
else
{
7 changes: 7 additions & 0 deletions Seq2SeqSharp/Utils/WeightsCorruptedException.cs
@@ -18,4 +18,11 @@ public WeightsCorruptedException() { }

public WeightsCorruptedException(string message) : base(message) { }
}

public class GradientsCorruptedException : Exception
{
public GradientsCorruptedException() { }

public GradientsCorruptedException(string message) : base(message) { }
}
}
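Keeping two distinct exception types is what lets the training loop above react differently to each failure. A sketch of the intended contract (handler bodies illustrative):

```csharp
try { /* one optimizer step */ }
catch (WeightsCorruptedException)   { throw; } // model is damaged: fatal
catch (GradientsCorruptedException) { /* drop this batch, keep training */ }
```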
4 changes: 4 additions & 0 deletions TensorSharp.CUDA/CudaBasicOps.cs
@@ -648,6 +648,10 @@ public Tensor BuildTriMask(Tensor result, float value, float maskedValue)
return advFuncKernels.BuildTriMask(result, value, maskedValue);
}


[RegisterOpStorageType("iscorrupted", typeof(CudaStorage))]
public bool IsCorrupted(Tensor src) { return advFuncKernels.IsCorrupted(src); }

[RegisterOpStorageType("softmax", typeof(CudaStorage))]
public Tensor Softmax(Tensor result, Tensor src) { return advFuncKernels.Softmax(result, src); }

99 changes: 99 additions & 0 deletions TensorSharp.CUDA/DeviceCode/AdvFuncKernels.cs
@@ -918,6 +918,28 @@ __global__ void gSoftmaxGrad(float* grad, float* adj, float* val, int rows, int
}
}
__global__ void IsCorrupted(float *in, unsigned rows, unsigned cols, int *result)
{
for(int bid = 0; bid < rows; bid += gridDim.x) {
int j = bid + blockIdx.x;
if(j < rows) {
const float* sp = in + j * cols;
for(int tid = 0; tid < cols; tid += blockDim.x) {
int i = tid + threadIdx.x;
if(i < cols) {
if (!isfinite(sp[i]))
{
*result = 1;
return;
}
}
}
}
}
}
__global__ void gSoftmax(float* out, float* in, unsigned rows, unsigned cols)
{
for(int bid = 0; bid < rows; bid += gridDim.x) {
@@ -1765,6 +1787,27 @@ __global__ void gSoftmaxGradHalf(__half* grad, __half* adj, __half* val, int row
}
}
__global__ void IsCorruptedHalf(__half *in, unsigned rows, unsigned cols, int *result)
{
for(int bid = 0; bid < rows; bid += gridDim.x) {
int j = bid + blockIdx.x;
if(j < rows) {
const __half* sp = in + j * cols;
for(int tid = 0; tid < cols; tid += blockDim.x) {
int i = tid + threadIdx.x;
if(i < cols) {
if (!isfinite(__half2float(sp[i])))
{
*result = 1;
return;
}
}
}
}
}
}
__global__ void gSoftmaxHalf(__half* out, __half* in, unsigned rows, unsigned cols)
{
for(int bid = 0; bid < rows; bid += gridDim.x) {
@@ -2508,6 +2551,57 @@ private void IndexSelectGrad(TSCudaContext context, Tensor grad, Tensor adj, Ten
}


private bool IsCorrupted(TSCudaContext context, Tensor src)
{
CudaContext cudaContext = context.CudaContextForTensor(src);
cudaContext.SetCurrent();

if (src.IsContiguous() == false)
{
throw new Exception($"Tensor {nameof(src)} is not contiguous.");
}

int ndim = src.DimensionCount;
long storageSize = TensorDimensionHelpers.GetStorageSize(src.Sizes, src.Strides);
long cols = src.Sizes[ndim - 1];

if (storageSize % cols != 0)
{
throw new Exception($"Invalid tensor storage size = '{storageSize}', and cols = '{cols}'");
}

long rows = storageSize / cols;


dim3 block = new dim3((uint)Math.Min(512, cols));
dim3 grid = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(rows, block.y)));

int[] rets = new int[1];
rets[0] = 0;
Tensor result = new Tensor(src.Allocator, DType.Int32, sizes: new long[] { 1, 1 });
result.SetElementsAsInt(rets);

CUdeviceptr resultPtr = CudaHelpers.GetBufferStart(result);
CUdeviceptr srcPtr = CudaHelpers.GetBufferStart(src);

string kernelName = "IsCorrupted";
if (src.ElementType == DType.Float16)
{
kernelName = "IsCorruptedHalf";
}

Invoke(context, cudaContext, kernelName, grid, block, block.x * sizeof(float), CUstream.NullStream, srcPtr, rows, cols, resultPtr);

rets = result.GetElementsAsInt(1);
return rets[0] != 0;
}

private void Softmax(TSCudaContext context, Tensor result, Tensor src)
{
@@ -2689,6 +2783,11 @@ private void SoftmaxGrad(TSCudaContext context, Tensor grad, Tensor adj, Tensor
Invoke(context, cudaContext, kernelName, grid, block, block.x * sizeof(float), CUstream.NullStream, gradPtr, adjPtr, valPtr, rows, cols, iAddGrad);
}

public bool IsCorrupted(Tensor src)
{
TSCudaContext context = CudaHelpers.TSContextForTensor(src);
return IsCorrupted(context, src);
}

public Tensor Softmax(Tensor result, Tensor src)
{
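Both the float and half kernels rely on a benign data race: any thread that encounters a non-finite element writes 1 to the shared result flag, and since every writer stores the same value, no atomic is needed. The host zero-initializes the flag tensor before launch (rets[0] = 0 above). A standalone CUDA sketch of that pattern, simplified to one dimension:

```cuda
// Illustrative 1-D version of the flag pattern used by the kernels above.
// *flag must be zero-initialized by the host before launch.
__global__ void anyNonFinite(const float* data, int n, int* flag)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n && !isfinite(data[i]))
    {
        *flag = 1; // benign race: every writer stores the same value
    }
}
```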
17 changes: 17 additions & 0 deletions TensorSharp/Cpu/CpuBasicOps.cs
@@ -10,6 +10,7 @@

using AdvUtils;
using System;
using System.Data;
using System.Reflection;
using TensorSharp.Core;

@@ -1064,6 +1065,22 @@ public Tensor RoPEGrad(Tensor grad_, Tensor adj_, int seqLen)
return writeTarget;
}

[RegisterOpStorageType("iscorrupted", typeof(CpuStorage))]
public bool IsCorrupted(Tensor src)
{
int ndim = src.DimensionCount;
long storageSize = TensorDimensionHelpers.GetStorageSize(src.Sizes, src.Strides);
long cols = src.Sizes[ndim - 1];

if (storageSize % cols != 0)
{
throw new Exception($"Invalid tensor storage size = '{storageSize}', and cols = '{cols}'");
}

long rows = storageSize / cols;
return TensorApplyCPU.IsCorrupted(src, (int)rows, (int)cols);
}


[RegisterOpStorageType("softmax", typeof(CpuStorage))]
public Tensor Softmax(Tensor result, Tensor src)
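The CPU entry point reuses the same (rows × cols) flattening as the CUDA path: a contiguous tensor is viewed as a matrix over its last dimension before TensorApplyCPU.IsCorrupted scans it. A small worked example of the arithmetic (illustrative values, not library code):

```csharp
// A contiguous [2, 3, 4] tensor is scanned as a 6 x 4 matrix.
long[] sizes = { 2, 3, 4 };
long cols = sizes[sizes.Length - 1]; // 4: the last dimension
long storageSize = 2 * 3 * 4;        // 24 elements when contiguous
long rows = storageSize / cols;      // 6
// storageSize % cols == 0 here; non-divisible layouts are rejected above.
```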
2 changes: 1 addition & 1 deletion TensorSharp/Ops.cs
@@ -181,7 +181,7 @@ public static Tensor Concat(Tensor result, int dimension, params Tensor[] inputs
public static Tensor Std(Tensor result, Tensor src, int dimension, bool normByN) { return (Tensor)OpRegistry.Invoke("std", result, src, dimension, normByN); }
public static Tensor Var(Tensor result, Tensor src, int dimension, bool normByN) { return (Tensor)OpRegistry.Invoke("var", result, src, dimension, normByN); }


public static bool IsCorrupted(Tensor src) { return (bool)OpRegistry.Invoke("iscorrupted", src); }
public static Tensor Softmax(Tensor result, Tensor src) { return (Tensor)OpRegistry.Invoke("softmax", result, src); }
public static Tensor SoftmaxGrad(Tensor grad, Tensor adj, Tensor val, bool addGrad = true) { return (Tensor)OpRegistry.Invoke("softmaxgrad", grad, adj, val, addGrad); }

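With the op registered for both storage types, Ops.IsCorrupted dispatches by backend through OpRegistry. A hypothetical smoke test on the CPU backend — the allocator, constructor, and element-setter names follow TensorSharp's usual API and should be read as a sketch, not a verified test:

```csharp
using System;
using TensorSharp;
using TensorSharp.Cpu;

var alloc = new CpuAllocator();
using (var t = new Tensor(alloc, DType.Float32, 2, 3))
{
    Ops.Fill(t, 1.0f);
    Console.WriteLine(Ops.IsCorrupted(t)); // expected: False
    t.SetElementAsFloat(float.NaN, 1, 2);  // poison one element
    Console.WriteLine(Ops.IsCorrupted(t)); // expected: True
}
```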
20 changes: 17 additions & 3 deletions TensorSharp/TensorApplyCPU.cs
@@ -1181,6 +1181,23 @@ unsafe static public void RoPEGrad(Tensor tOut, Tensor tIn, int rows, int cols,
}
}

unsafe static public bool IsCorrupted(Tensor tIn, int rows, int cols)
{
float* pIn = (float*)CpuNativeHelpers.GetBufferStart(tIn);

for (int j = 0; j < rows; ++j)
{
float* sp = pIn + j * cols;
for (int i = 0; i < cols; ++i)
{
if (!float.IsFinite(sp[i]))
{
return true;
}
}
}
return false;
}

unsafe static public void Softmax(Tensor tOut, Tensor tIn, int rows, int cols)
{
@@ -1219,9 +1236,6 @@ unsafe static public void Softmax(Tensor tOut, Tensor tIn, int rows, int cols)
{
so[k] /= sum;
}



}
}

