From fddf8f57004e151648f6ae5f47c9dfd540a453e6 Mon Sep 17 00:00:00 2001 From: John Lambert Date: Wed, 10 Jan 2024 16:36:32 -0500 Subject: [PATCH] Initial stgab at improved health checking * upgrade to net8.0 * Add health check endpoint to GRPC engine proto (from serval) * Combine health reports into rich data --- dockerfile | 4 +- dockerfile.development | 2 +- .../IMachineBuilderExtensions.cs | 39 +++++++++++++------ .../SIL.Machine.AspNetCore.csproj | 3 +- .../ServalTranslationEngineServiceV1.cs | 15 ++++++- .../Utils/WriteHealthCheckResponse.cs | 34 ++++++++++++++++ ....Machine.Morphology.HermitCrab.Tool.csproj | 2 +- .../SIL.Machine.Plugin.csproj | 2 +- .../Program.cs | 1 - .../SIL.Machine.Serval.EngineServer.csproj | 4 +- src/SIL.Machine.Serval.JobServer/Program.cs | 2 - .../SIL.Machine.Serval.JobServer.csproj | 4 +- src/SIL.Machine.Tool/SIL.Machine.Tool.csproj | 2 +- .../SIL.Machine.AspNetCore.Tests.csproj | 2 +- ...Machine.Morphology.HermitCrab.Tests.csproj | 2 +- .../SIL.Machine.Tests.csproj | 2 +- ...ne.Tokenization.SentencePiece.Tests.csproj | 2 +- .../SIL.Machine.Translation.Thot.Tests.csproj | 2 +- 18 files changed, 93 insertions(+), 31 deletions(-) create mode 100644 src/SIL.Machine.AspNetCore/Utils/WriteHealthCheckResponse.cs diff --git a/dockerfile b/dockerfile index d04eb9ef1..da052071b 100644 --- a/dockerfile +++ b/dockerfile @@ -1,4 +1,4 @@ -FROM mcr.microsoft.com/dotnet/sdk:6.0-jammy AS build-env +FROM mcr.microsoft.com/dotnet/sdk:8.0-jammy AS build-env WORKDIR /app RUN apt-get update && apt-get install -y g++ curl cmake @@ -12,7 +12,7 @@ RUN dotnet publish ./src/SIL.Machine.Serval.EngineServer/SIL.Machine.Serval.Engi RUN dotnet publish ./src/SIL.Machine.Serval.JobServer/SIL.Machine.Serval.JobServer.csproj -c Release -o out_job_server # Build runtime image -FROM mcr.microsoft.com/dotnet/aspnet:6.0-jammy as production +FROM mcr.microsoft.com/dotnet/aspnet:8.0-jammy as production # libgomp needed for thot RUN apt-get update && apt-get install -y libgomp1 WORKDIR /app diff --git a/dockerfile.development b/dockerfile.development index cd6847185..a395da6dd 100644 --- a/dockerfile.development +++ b/dockerfile.development @@ -1,4 +1,4 @@ -FROM mcr.microsoft.com/dotnet/sdk:6.0-jammy +FROM mcr.microsoft.com/dotnet/sdk:8.0-jammy # libgomp needed for thot RUN apt update && apt install -y unzip libgomp1 && \ curl -sSL https://aka.ms/getvsdbgsh | /bin/sh /dev/stdin -v latest -l /remote_debugger \ No newline at end of file diff --git a/src/SIL.Machine.AspNetCore/Configuration/IMachineBuilderExtensions.cs b/src/SIL.Machine.AspNetCore/Configuration/IMachineBuilderExtensions.cs index e5e948573..ffe7cb119 100644 --- a/src/SIL.Machine.AspNetCore/Configuration/IMachineBuilderExtensions.cs +++ b/src/SIL.Machine.AspNetCore/Configuration/IMachineBuilderExtensions.cs @@ -103,10 +103,10 @@ public static IMachineBuilder AddUnigramTruecaser(this IMachineBuilder builder) public static IMachineBuilder AddClearMLService(this IMachineBuilder builder, string? connectionString = null) { - connectionString ??= builder.Configuration.GetConnectionString("ClearML"); + connectionString ??= builder.Configuration!.GetConnectionString("ClearML"); builder.Services .AddHttpClient("ClearML") - .ConfigureHttpClient(httpClient => httpClient.BaseAddress = new Uri(connectionString)) + .ConfigureHttpClient(httpClient => httpClient.BaseAddress = new Uri(connectionString!)) // Add retry policy; fail after approx. 2 + 4 + 8 = 14 seconds .AddTransientHttpErrorPolicy( b => b.WaitAndRetryAsync(3, retryAttempt => TimeSpan.FromSeconds(Math.Pow(2, retryAttempt))) @@ -120,8 +120,9 @@ public static IMachineBuilder AddClearMLService(this IMachineBuilder builder, st builder.Services .AddHttpClient("ClearML-NoRetry") - .ConfigureHttpClient(httpClient => httpClient.BaseAddress = new Uri(connectionString)); + .ConfigureHttpClient(httpClient => httpClient.BaseAddress = new Uri(connectionString!)); builder.Services.AddSingleton(); + builder.Services.AddHealthChecks().AddCheck("ClearML Health Check"); return builder; @@ -158,7 +159,7 @@ public static IMachineBuilder AddMongoHangfireJobClient( .UseSimpleAssemblyNameTypeSerializer() .UseRecommendedSerializerSettings() .UseMongoStorage( - connectionString ?? builder.Configuration.GetConnectionString("Hangfire"), + connectionString ?? builder.Configuration!.GetConnectionString("Hangfire"), new MongoStorageOptions { MigrationOptions = new MongoMigrationOptions @@ -220,9 +221,9 @@ public static IMachineBuilder AddMemoryDataAccess(this IMachineBuilder builder) public static IMachineBuilder AddMongoDataAccess(this IMachineBuilder builder, string? connectionString = null) { - connectionString ??= builder.Configuration.GetConnectionString("Mongo"); + connectionString ??= builder.Configuration!.GetConnectionString("Mongo"); builder.Services.AddMongoDataAccess( - connectionString, + connectionString!, "SIL.Machine.AspNetCore.Models", o => { @@ -257,7 +258,7 @@ await c.Indexes.CreateOrUpdateAsync( ); } ); - builder.Services.AddHealthChecks().AddMongoDb(connectionString, name: "Mongo"); + builder.Services.AddHealthChecks().AddMongoDb(connectionString!, name: "Mongo"); return builder; } @@ -271,7 +272,7 @@ public static IMachineBuilder AddServalPlatformService( builder.Services .AddGrpcClient(o => { - o.Address = new Uri(connectionString ?? builder.Configuration.GetConnectionString("Serval")); + o.Address = new Uri(connectionString ?? builder.Configuration!.GetConnectionString("Serval")!); }) .ConfigureChannel(o => { @@ -321,7 +322,7 @@ public static IMachineBuilder AddServalTranslationEngineService( options.Interceptors.Add(); options.Interceptors.Add(); }); - builder.AddServalPlatformService(connectionString ?? builder.Configuration.GetConnectionString("Serval")); + builder.AddServalPlatformService(connectionString ?? builder.Configuration!.GetConnectionString("Serval")); engineTypes ??= builder.Configuration?.GetSection("TranslationEngines").Get() ?? new[] { TranslationEngineType.SmtTransfer, TranslationEngineType.Nmt }; @@ -340,7 +341,6 @@ public static IMachineBuilder AddServalTranslationEngineService( break; } } - builder.Services.AddGrpcHealthChecks(); return builder; } @@ -359,7 +359,7 @@ Action configureOptions public static IMachineBuilder AddBuildJobService(this IMachineBuilder builder, IConfiguration config) { builder.Services.Configure(config); - var options = config.Get(); + var options = config.Get()!; return builder.AddBuildJobService(options); } @@ -368,7 +368,24 @@ public static IMachineBuilder AddBuildJobService(this IMachineBuilder builder) if (builder.Configuration is null) builder.AddBuildJobService(o => { }); else + { builder.AddBuildJobService(builder.Configuration.GetSection(BuildJobOptions.Key)); + + string EnginesDir = builder.Configuration + .GetSection(SmtTransferEngineOptions.Key)! + .GetValue("EnginesDir")!; + + string driveLetter = Path.GetPathRoot(EnginesDir)![..1]; + // add health check for disk storage capacity + builder.Services + .AddHealthChecks() + .AddDiskStorageHealthCheck( + x => x.AddDrive(driveLetter, 2_000_000), + "SMT Engine Storage Capacity", + HealthStatus.Degraded + ); + } + return builder; } diff --git a/src/SIL.Machine.AspNetCore/SIL.Machine.AspNetCore.csproj b/src/SIL.Machine.AspNetCore/SIL.Machine.AspNetCore.csproj index ab401c602..549e009ff 100644 --- a/src/SIL.Machine.AspNetCore/SIL.Machine.AspNetCore.csproj +++ b/src/SIL.Machine.AspNetCore/SIL.Machine.AspNetCore.csproj @@ -1,7 +1,7 @@  - net6.0 + net8.0 An ASP.NET Core web API middleware for the Machine library. 1591 enable @@ -26,6 +26,7 @@ + diff --git a/src/SIL.Machine.AspNetCore/Services/ServalTranslationEngineServiceV1.cs b/src/SIL.Machine.AspNetCore/Services/ServalTranslationEngineServiceV1.cs index a0b010c61..20333359c 100644 --- a/src/SIL.Machine.AspNetCore/Services/ServalTranslationEngineServiceV1.cs +++ b/src/SIL.Machine.AspNetCore/Services/ServalTranslationEngineServiceV1.cs @@ -9,9 +9,15 @@ public class ServalTranslationEngineServiceV1 : TranslationEngineApi.Translation private readonly Dictionary _engineServices; - public ServalTranslationEngineServiceV1(IEnumerable engineServices) + private readonly HealthCheckService _healthCheckService; + + public ServalTranslationEngineServiceV1( + IEnumerable engineServices, + HealthCheckService healthCheckService + ) { _engineServices = engineServices.ToDictionary(es => es.Type); + _healthCheckService = healthCheckService; } public override async Task Create(CreateRequest request, ServerCallContext context) @@ -127,6 +133,13 @@ ServerCallContext context return new GetQueueSizeResponse { Size = await engineService.GetQueueSizeAsync(context.CancellationToken) }; } + public override async Task HealthCheck(Empty request, ServerCallContext context) + { + HealthReport healthReport = await _healthCheckService.CheckHealthAsync(); + HealthCheckResponse healthCheckResponse = WriteHealthCheckResponse.Generate(healthReport); + return healthCheckResponse; + } + private ITranslationEngineService GetEngineService(string engineTypeStr) { if (_engineServices.TryGetValue(GetEngineType(engineTypeStr), out ITranslationEngineService? service)) diff --git a/src/SIL.Machine.AspNetCore/Utils/WriteHealthCheckResponse.cs b/src/SIL.Machine.AspNetCore/Utils/WriteHealthCheckResponse.cs new file mode 100644 index 000000000..402deee4b --- /dev/null +++ b/src/SIL.Machine.AspNetCore/Utils/WriteHealthCheckResponse.cs @@ -0,0 +1,34 @@ +using Serval.Translation.V1; + +namespace SIL.Machine.AspNetCore.Utils; + +public class WriteHealthCheckResponse +{ + public static HealthCheckResponse Generate(HealthReport healthReport) + { + Dictionary healthCheckResultData = []; + string? healthCheckResultException = null; + + // Combine data and exceptions from all health checks + foreach (KeyValuePair entry in healthReport.Entries) + { + healthCheckResultData.Add(entry.Key, $"{entry.Value.Status}: {entry.Value.Description ?? ""}"); + if ((entry.Value.Exception?.ToString() ?? "") != "") + if(healthCheckResultException is null) + healthCheckResultException = $"{entry.Key}: {entry.Value.Exception}"; + else + healthCheckResultException += $"\n{entry.Key}: {entry.Value.Exception}"; + } + // Assemble response + HealthCheckResponse healthCheckReponse = new HealthCheckResponse{ + Status = (HealthCheckStatus)healthReport.Status, + Duration = healthReport.TotalDuration.ToString(), + Exception = healthCheckResultException + }; + foreach (KeyValuePair entry in healthCheckResultData) + { + healthCheckReponse.Data.Add(entry.Key, entry.Value); + } + return healthCheckReponse; + } +} \ No newline at end of file diff --git a/src/SIL.Machine.Morphology.HermitCrab.Tool/SIL.Machine.Morphology.HermitCrab.Tool.csproj b/src/SIL.Machine.Morphology.HermitCrab.Tool/SIL.Machine.Morphology.HermitCrab.Tool.csproj index 9517bc14e..4b8769225 100644 --- a/src/SIL.Machine.Morphology.HermitCrab.Tool/SIL.Machine.Morphology.HermitCrab.Tool.csproj +++ b/src/SIL.Machine.Morphology.HermitCrab.Tool/SIL.Machine.Morphology.HermitCrab.Tool.csproj @@ -2,7 +2,7 @@ Exe - net6.0 + net8.0 SIL.Machine.Morphology.HermitCrab true hc diff --git a/src/SIL.Machine.Plugin/SIL.Machine.Plugin.csproj b/src/SIL.Machine.Plugin/SIL.Machine.Plugin.csproj index edc7ad063..7b503e74d 100644 --- a/src/SIL.Machine.Plugin/SIL.Machine.Plugin.csproj +++ b/src/SIL.Machine.Plugin/SIL.Machine.Plugin.csproj @@ -1,7 +1,7 @@ - net6.0 + net8.0 A plugin framework for the Machine library. diff --git a/src/SIL.Machine.Serval.EngineServer/Program.cs b/src/SIL.Machine.Serval.EngineServer/Program.cs index fa843e3ad..50c389b71 100644 --- a/src/SIL.Machine.Serval.EngineServer/Program.cs +++ b/src/SIL.Machine.Serval.EngineServer/Program.cs @@ -29,7 +29,6 @@ app.UseHttpsRedirection(); app.MapServalTranslationEngineService(); -app.MapGrpcHealthChecksService(); app.MapHangfireDashboard(); app.Run(); diff --git a/src/SIL.Machine.Serval.EngineServer/SIL.Machine.Serval.EngineServer.csproj b/src/SIL.Machine.Serval.EngineServer/SIL.Machine.Serval.EngineServer.csproj index 9a6962da2..244245baa 100644 --- a/src/SIL.Machine.Serval.EngineServer/SIL.Machine.Serval.EngineServer.csproj +++ b/src/SIL.Machine.Serval.EngineServer/SIL.Machine.Serval.EngineServer.csproj @@ -1,7 +1,7 @@  - net6.0 + net8.0 enable enable 34e222a9-ef76-48f9-869e-338547f9bd25 @@ -23,7 +23,7 @@ - + icu.net.dll.config diff --git a/src/SIL.Machine.Serval.JobServer/Program.cs b/src/SIL.Machine.Serval.JobServer/Program.cs index 30c2fa593..38b147243 100644 --- a/src/SIL.Machine.Serval.JobServer/Program.cs +++ b/src/SIL.Machine.Serval.JobServer/Program.cs @@ -25,6 +25,4 @@ var app = builder.Build(); -app.MapHealthChecks("/health"); - app.Run(); diff --git a/src/SIL.Machine.Serval.JobServer/SIL.Machine.Serval.JobServer.csproj b/src/SIL.Machine.Serval.JobServer/SIL.Machine.Serval.JobServer.csproj index 0e6059cc0..949f3e0b2 100644 --- a/src/SIL.Machine.Serval.JobServer/SIL.Machine.Serval.JobServer.csproj +++ b/src/SIL.Machine.Serval.JobServer/SIL.Machine.Serval.JobServer.csproj @@ -1,7 +1,7 @@ - net6.0 + net8.0 enable enable aa9e7440-5a04-4de6-ba51-bab9ef4a62e1 @@ -25,7 +25,7 @@ - + icu.net.dll.config diff --git a/src/SIL.Machine.Tool/SIL.Machine.Tool.csproj b/src/SIL.Machine.Tool/SIL.Machine.Tool.csproj index 61a0d22ca..25e5634d7 100644 --- a/src/SIL.Machine.Tool/SIL.Machine.Tool.csproj +++ b/src/SIL.Machine.Tool/SIL.Machine.Tool.csproj @@ -2,7 +2,7 @@ Exe - net6.0 + net8.0 SIL.Machine true machine diff --git a/tests/SIL.Machine.AspNetCore.Tests/SIL.Machine.AspNetCore.Tests.csproj b/tests/SIL.Machine.AspNetCore.Tests/SIL.Machine.AspNetCore.Tests.csproj index dd09631e7..7a2923cf6 100644 --- a/tests/SIL.Machine.AspNetCore.Tests/SIL.Machine.AspNetCore.Tests.csproj +++ b/tests/SIL.Machine.AspNetCore.Tests/SIL.Machine.AspNetCore.Tests.csproj @@ -1,7 +1,7 @@  - net6.0 + net8.0 SIL.Machine.AspNetCore enable enable diff --git a/tests/SIL.Machine.Morphology.HermitCrab.Tests/SIL.Machine.Morphology.HermitCrab.Tests.csproj b/tests/SIL.Machine.Morphology.HermitCrab.Tests/SIL.Machine.Morphology.HermitCrab.Tests.csproj index dbea327e6..c419d0f9a 100644 --- a/tests/SIL.Machine.Morphology.HermitCrab.Tests/SIL.Machine.Morphology.HermitCrab.Tests.csproj +++ b/tests/SIL.Machine.Morphology.HermitCrab.Tests/SIL.Machine.Morphology.HermitCrab.Tests.csproj @@ -1,7 +1,7 @@  - net6.0 + net8.0 SIL.Machine.Morphology.HermitCrab enable enable diff --git a/tests/SIL.Machine.Tests/SIL.Machine.Tests.csproj b/tests/SIL.Machine.Tests/SIL.Machine.Tests.csproj index ed2c113fe..beb8a730b 100644 --- a/tests/SIL.Machine.Tests/SIL.Machine.Tests.csproj +++ b/tests/SIL.Machine.Tests/SIL.Machine.Tests.csproj @@ -1,7 +1,7 @@  - net6.0 + net8.0 SIL.Machine enable enable diff --git a/tests/SIL.Machine.Tokenization.SentencePiece.Tests/SIL.Machine.Tokenization.SentencePiece.Tests.csproj b/tests/SIL.Machine.Tokenization.SentencePiece.Tests/SIL.Machine.Tokenization.SentencePiece.Tests.csproj index 4e875a4b4..7f64c54bf 100644 --- a/tests/SIL.Machine.Tokenization.SentencePiece.Tests/SIL.Machine.Tokenization.SentencePiece.Tests.csproj +++ b/tests/SIL.Machine.Tokenization.SentencePiece.Tests/SIL.Machine.Tokenization.SentencePiece.Tests.csproj @@ -1,7 +1,7 @@ - net6.0 + net8.0 enable enable false diff --git a/tests/SIL.Machine.Translation.Thot.Tests/SIL.Machine.Translation.Thot.Tests.csproj b/tests/SIL.Machine.Translation.Thot.Tests/SIL.Machine.Translation.Thot.Tests.csproj index 4ebd167f9..a79528cc3 100644 --- a/tests/SIL.Machine.Translation.Thot.Tests/SIL.Machine.Translation.Thot.Tests.csproj +++ b/tests/SIL.Machine.Translation.Thot.Tests/SIL.Machine.Translation.Thot.Tests.csproj @@ -1,7 +1,7 @@  - net6.0 + net8.0 SIL.Machine.Translation.Thot enable enable