Skip to content

Commit

Permalink
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Initial stab at improved health checking
Browse files Browse the repository at this point in the history
* upgrade to net8.0
* Add health check endpoint to GRPC engine proto (from serval)
* Combine health reports into rich data
johnml1135 committed Jan 10, 2024
1 parent d941cd2 commit fddf8f5
Showing 18 changed files with 93 additions and 31 deletions.
4 changes: 2 additions & 2 deletions dockerfile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
FROM mcr.microsoft.com/dotnet/sdk:6.0-jammy AS build-env
FROM mcr.microsoft.com/dotnet/sdk:8.0-jammy AS build-env
WORKDIR /app

RUN apt-get update && apt-get install -y g++ curl cmake
@@ -12,7 +12,7 @@ RUN dotnet publish ./src/SIL.Machine.Serval.EngineServer/SIL.Machine.Serval.Engi
RUN dotnet publish ./src/SIL.Machine.Serval.JobServer/SIL.Machine.Serval.JobServer.csproj -c Release -o out_job_server

# Build runtime image
FROM mcr.microsoft.com/dotnet/aspnet:6.0-jammy as production
FROM mcr.microsoft.com/dotnet/aspnet:8.0-jammy as production
# libgomp needed for thot
RUN apt-get update && apt-get install -y libgomp1
WORKDIR /app
2 changes: 1 addition & 1 deletion dockerfile.development
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
FROM mcr.microsoft.com/dotnet/sdk:6.0-jammy
FROM mcr.microsoft.com/dotnet/sdk:8.0-jammy
# libgomp needed for thot
RUN apt update && apt install -y unzip libgomp1 && \
curl -sSL https://aka.ms/getvsdbgsh | /bin/sh /dev/stdin -v latest -l /remote_debugger
Original file line number Diff line number Diff line change
@@ -103,10 +103,10 @@ public static IMachineBuilder AddUnigramTruecaser(this IMachineBuilder builder)

public static IMachineBuilder AddClearMLService(this IMachineBuilder builder, string? connectionString = null)
{
connectionString ??= builder.Configuration.GetConnectionString("ClearML");
connectionString ??= builder.Configuration!.GetConnectionString("ClearML");
builder.Services
.AddHttpClient("ClearML")
.ConfigureHttpClient(httpClient => httpClient.BaseAddress = new Uri(connectionString))
.ConfigureHttpClient(httpClient => httpClient.BaseAddress = new Uri(connectionString!))
// Add retry policy; fail after approx. 2 + 4 + 8 = 14 seconds
.AddTransientHttpErrorPolicy(
b => b.WaitAndRetryAsync(3, retryAttempt => TimeSpan.FromSeconds(Math.Pow(2, retryAttempt)))
@@ -120,8 +120,9 @@ public static IMachineBuilder AddClearMLService(this IMachineBuilder builder, st

builder.Services
.AddHttpClient("ClearML-NoRetry")
.ConfigureHttpClient(httpClient => httpClient.BaseAddress = new Uri(connectionString));
.ConfigureHttpClient(httpClient => httpClient.BaseAddress = new Uri(connectionString!));
builder.Services.AddSingleton<ClearMLHealthCheck>();

builder.Services.AddHealthChecks().AddCheck<ClearMLHealthCheck>("ClearML Health Check");

return builder;
@@ -158,7 +159,7 @@ public static IMachineBuilder AddMongoHangfireJobClient(
.UseSimpleAssemblyNameTypeSerializer()
.UseRecommendedSerializerSettings()
.UseMongoStorage(
connectionString ?? builder.Configuration.GetConnectionString("Hangfire"),
connectionString ?? builder.Configuration!.GetConnectionString("Hangfire"),
new MongoStorageOptions
{
MigrationOptions = new MongoMigrationOptions
@@ -220,9 +221,9 @@ public static IMachineBuilder AddMemoryDataAccess(this IMachineBuilder builder)

public static IMachineBuilder AddMongoDataAccess(this IMachineBuilder builder, string? connectionString = null)
{
connectionString ??= builder.Configuration.GetConnectionString("Mongo");
connectionString ??= builder.Configuration!.GetConnectionString("Mongo");
builder.Services.AddMongoDataAccess(
connectionString,
connectionString!,
"SIL.Machine.AspNetCore.Models",
o =>
{
@@ -257,7 +258,7 @@ await c.Indexes.CreateOrUpdateAsync(
);
}
);
builder.Services.AddHealthChecks().AddMongoDb(connectionString, name: "Mongo");
builder.Services.AddHealthChecks().AddMongoDb(connectionString!, name: "Mongo");

return builder;
}
@@ -271,7 +272,7 @@ public static IMachineBuilder AddServalPlatformService(
builder.Services
.AddGrpcClient<TranslationPlatformApi.TranslationPlatformApiClient>(o =>
{
o.Address = new Uri(connectionString ?? builder.Configuration.GetConnectionString("Serval"));
o.Address = new Uri(connectionString ?? builder.Configuration!.GetConnectionString("Serval")!);
})
.ConfigureChannel(o =>
{
@@ -321,7 +322,7 @@ public static IMachineBuilder AddServalTranslationEngineService(
options.Interceptors.Add<CancellationInterceptor>();
options.Interceptors.Add<UnimplementedInterceptor>();
});
builder.AddServalPlatformService(connectionString ?? builder.Configuration.GetConnectionString("Serval"));
builder.AddServalPlatformService(connectionString ?? builder.Configuration!.GetConnectionString("Serval"));
engineTypes ??=
builder.Configuration?.GetSection("TranslationEngines").Get<TranslationEngineType[]?>()
?? new[] { TranslationEngineType.SmtTransfer, TranslationEngineType.Nmt };
@@ -340,7 +341,6 @@ public static IMachineBuilder AddServalTranslationEngineService(
break;
}
}
builder.Services.AddGrpcHealthChecks();

return builder;
}
@@ -359,7 +359,7 @@ Action<BuildJobOptions> configureOptions
public static IMachineBuilder AddBuildJobService(this IMachineBuilder builder, IConfiguration config)
{
builder.Services.Configure<BuildJobOptions>(config);
var options = config.Get<BuildJobOptions>();
var options = config.Get<BuildJobOptions>()!;
return builder.AddBuildJobService(options);
}

@@ -368,7 +368,24 @@ public static IMachineBuilder AddBuildJobService(this IMachineBuilder builder)
if (builder.Configuration is null)
builder.AddBuildJobService(o => { });
else
{
builder.AddBuildJobService(builder.Configuration.GetSection(BuildJobOptions.Key));

string EnginesDir = builder.Configuration
.GetSection(SmtTransferEngineOptions.Key)!
.GetValue<string>("EnginesDir")!;

string driveLetter = Path.GetPathRoot(EnginesDir)![..1];
// add health check for disk storage capacity
builder.Services
.AddHealthChecks()
.AddDiskStorageHealthCheck(
x => x.AddDrive(driveLetter, 2_000_000),
"SMT Engine Storage Capacity",
HealthStatus.Degraded
);
}

return builder;
}

3 changes: 2 additions & 1 deletion src/SIL.Machine.AspNetCore/SIL.Machine.AspNetCore.csproj
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
<Project Sdk="Microsoft.NET.Sdk">

<PropertyGroup>
<TargetFramework>net6.0</TargetFramework>
<TargetFramework>net8.0</TargetFramework>
<Description>An ASP.NET Core web API middleware for the Machine library.</Description>
<NoWarn>1591</NoWarn>
<ImplicitUsings>enable</ImplicitUsings>
@@ -26,6 +26,7 @@

<ItemGroup>
<PackageReference Include="AspNetCore.HealthChecks.MongoDb" Version="6.0.2" />
<PackageReference Include="AspNetCore.HealthChecks.System" Version="6.0.2" />
<PackageReference Include="AWSSDK.S3" Version="3.7.205.8" />
<PackageReference Include="Grpc.AspNetCore" Version="2.57.0" />
<PackageReference Include="Grpc.AspNetCore.HealthChecks" Version="2.57.0" />
Original file line number Diff line number Diff line change
@@ -9,9 +9,15 @@ public class ServalTranslationEngineServiceV1 : TranslationEngineApi.Translation

private readonly Dictionary<TranslationEngineType, ITranslationEngineService> _engineServices;

public ServalTranslationEngineServiceV1(IEnumerable<ITranslationEngineService> engineServices)
private readonly HealthCheckService _healthCheckService;

/// <summary>
/// Creates the service from the supplied engine services and health check service.
/// </summary>
/// <param name="engineServices">The translation engine services; each is indexed by its <c>Type</c>.</param>
/// <param name="healthCheckService">The health check service queried by the health check endpoint.</param>
public ServalTranslationEngineServiceV1(
    IEnumerable<ITranslationEngineService> engineServices,
    HealthCheckService healthCheckService
)
{
    _healthCheckService = healthCheckService;
    // Key every engine service by its engine type for later lookup.
    _engineServices = engineServices.ToDictionary(service => service.Type);
}

public override async Task<Empty> Create(CreateRequest request, ServerCallContext context)
@@ -127,6 +133,13 @@ ServerCallContext context
return new GetQueueSizeResponse { Size = await engineService.GetQueueSizeAsync(context.CancellationToken) };
}

public override async Task<HealthCheckResponse> HealthCheck(Empty request, ServerCallContext context)

Check failure on line 136 in src/SIL.Machine.AspNetCore/Services/ServalTranslationEngineServiceV1.cs

GitHub Actions / Build on ubuntu-20.04

The type or namespace name 'HealthCheckResponse' could not be found (are you missing a using directive or an assembly reference?)

Check failure on line 136 in src/SIL.Machine.AspNetCore/Services/ServalTranslationEngineServiceV1.cs

GitHub Actions / Build on ubuntu-20.04

'ServalTranslationEngineServiceV1.HealthCheck(Empty, ServerCallContext)': no suitable method found to override

Check failure on line 136 in src/SIL.Machine.AspNetCore/Services/ServalTranslationEngineServiceV1.cs

GitHub Actions / Build on ubuntu-20.04

The type or namespace name 'HealthCheckResponse' could not be found (are you missing a using directive or an assembly reference?)

Check failure on line 136 in src/SIL.Machine.AspNetCore/Services/ServalTranslationEngineServiceV1.cs

GitHub Actions / Build on ubuntu-20.04

'ServalTranslationEngineServiceV1.HealthCheck(Empty, ServerCallContext)': no suitable method found to override

Check failure on line 136 in src/SIL.Machine.AspNetCore/Services/ServalTranslationEngineServiceV1.cs

GitHub Actions / Build on windows-latest

The type or namespace name 'HealthCheckResponse' could not be found (are you missing a using directive or an assembly reference?)

Check failure on line 136 in src/SIL.Machine.AspNetCore/Services/ServalTranslationEngineServiceV1.cs

GitHub Actions / Build on windows-latest

'ServalTranslationEngineServiceV1.HealthCheck(Empty, ServerCallContext)': no suitable method found to override

Check failure on line 136 in src/SIL.Machine.AspNetCore/Services/ServalTranslationEngineServiceV1.cs

GitHub Actions / Build on windows-latest

The type or namespace name 'HealthCheckResponse' could not be found (are you missing a using directive or an assembly reference?)

Check failure on line 136 in src/SIL.Machine.AspNetCore/Services/ServalTranslationEngineServiceV1.cs

GitHub Actions / Build on windows-latest

'ServalTranslationEngineServiceV1.HealthCheck(Empty, ServerCallContext)': no suitable method found to override
{
    // Forward the call's cancellation token, as the other RPC handlers in this class do,
    // so the health checks are abandoned when the gRPC call is cancelled.
    HealthReport healthReport = await _healthCheckService.CheckHealthAsync(context.CancellationToken);

    // Convert the aggregated report into the gRPC response message.
    return WriteHealthCheckResponse.Generate(healthReport);
}

private ITranslationEngineService GetEngineService(string engineTypeStr)
{
if (_engineServices.TryGetValue(GetEngineType(engineTypeStr), out ITranslationEngineService? service))
34 changes: 34 additions & 0 deletions src/SIL.Machine.AspNetCore/Utils/WriteHealthCheckResponse.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
using Serval.Translation.V1;

namespace SIL.Machine.AspNetCore.Utils;

public class WriteHealthCheckResponse
{
public static HealthCheckResponse Generate(HealthReport healthReport)

Check failure on line 7 in src/SIL.Machine.AspNetCore/Utils/WriteHealthCheckResponse.cs

GitHub Actions / Build on ubuntu-20.04

The type or namespace name 'HealthCheckResponse' could not be found (are you missing a using directive or an assembly reference?)

Check failure on line 7 in src/SIL.Machine.AspNetCore/Utils/WriteHealthCheckResponse.cs

GitHub Actions / Build on ubuntu-20.04

The type or namespace name 'HealthCheckResponse' could not be found (are you missing a using directive or an assembly reference?)

Check failure on line 7 in src/SIL.Machine.AspNetCore/Utils/WriteHealthCheckResponse.cs

GitHub Actions / Build on windows-latest

The type or namespace name 'HealthCheckResponse' could not be found (are you missing a using directive or an assembly reference?)

Check failure on line 7 in src/SIL.Machine.AspNetCore/Utils/WriteHealthCheckResponse.cs

GitHub Actions / Build on windows-latest

The type or namespace name 'HealthCheckResponse' could not be found (are you missing a using directive or an assembly reference?)
{
    // Builds a single response from the aggregated health report: overall status and
    // duration, one "Status: Description" data entry per check, and the text of every
    // exception reported by the checks (one per line).
    // NOTE(review): the status is converted by numeric cast, which assumes HealthCheckStatus
    // uses the same underlying values as HealthStatus — confirm against the proto definition.
    HealthCheckResponse healthCheckResponse = new()
    {
        Status = (HealthCheckStatus)healthReport.Status,
        Duration = healthReport.TotalDuration.ToString()
    };
    List<string> exceptionLines = [];

    foreach (KeyValuePair<string, HealthReportEntry> entry in healthReport.Entries)
    {
        healthCheckResponse.Data.Add(entry.Key, $"{entry.Value.Status}: {entry.Value.Description ?? ""}");

        string exceptionText = entry.Value.Exception?.ToString() ?? "";
        if (exceptionText.Length > 0)
        {
            exceptionLines.Add($"{entry.Key}: {exceptionText}");
        }
    }

    // Only assign Exception when at least one check reported one. HealthCheckResponse appears
    // to be a protobuf-generated message, and generated string setters throw
    // ArgumentNullException on null, so the field is left at its default otherwise.
    if (exceptionLines.Count > 0)
    {
        healthCheckResponse.Exception = string.Join("\n", exceptionLines);
    }

    return healthCheckResponse;
}
}
Original file line number Diff line number Diff line change
@@ -2,7 +2,7 @@

<PropertyGroup>
<OutputType>Exe</OutputType>
<TargetFramework>net6.0</TargetFramework>
<TargetFramework>net8.0</TargetFramework>
<RootNamespace>SIL.Machine.Morphology.HermitCrab</RootNamespace>
<PackAsTool>true</PackAsTool>
<ToolCommandName>hc</ToolCommandName>
2 changes: 1 addition & 1 deletion src/SIL.Machine.Plugin/SIL.Machine.Plugin.csproj
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
<Project Sdk="Microsoft.NET.Sdk">

<PropertyGroup>
<TargetFramework>net6.0</TargetFramework>
<TargetFramework>net8.0</TargetFramework>
<Description>A plugin framework for the Machine library.</Description>
</PropertyGroup>

1 change: 0 additions & 1 deletion src/SIL.Machine.Serval.EngineServer/Program.cs
Original file line number Diff line number Diff line change
@@ -29,7 +29,6 @@
app.UseHttpsRedirection();

app.MapServalTranslationEngineService();
app.MapGrpcHealthChecksService();
app.MapHangfireDashboard();

app.Run();
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
<Project Sdk="Microsoft.NET.Sdk.Web">

<PropertyGroup>
<TargetFramework>net6.0</TargetFramework>
<TargetFramework>net8.0</TargetFramework>
<Nullable>enable</Nullable>
<ImplicitUsings>enable</ImplicitUsings>
<UserSecretsId>34e222a9-ef76-48f9-869e-338547f9bd25</UserSecretsId>
@@ -23,7 +23,7 @@

<!-- Include icu.net.dll.config - which is only available after the package is built -->
<ItemGroup>
<ResolvedFileToPublish Include=".\bin\Release\net6.0\icu.net.dll.config">
<ResolvedFileToPublish Include=".\bin\Release\net8.0\icu.net.dll.config">
<RelativePath>icu.net.dll.config</RelativePath>
</ResolvedFileToPublish>
</ItemGroup>
2 changes: 0 additions & 2 deletions src/SIL.Machine.Serval.JobServer/Program.cs
Original file line number Diff line number Diff line change
@@ -25,6 +25,4 @@

var app = builder.Build();

app.MapHealthChecks("/health");

app.Run();
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
<Project Sdk="Microsoft.NET.Sdk.Web">

<PropertyGroup>
<TargetFramework>net6.0</TargetFramework>
<TargetFramework>net8.0</TargetFramework>
<Nullable>enable</Nullable>
<ImplicitUsings>enable</ImplicitUsings>
<UserSecretsId>aa9e7440-5a04-4de6-ba51-bab9ef4a62e1</UserSecretsId>
@@ -25,7 +25,7 @@

<!-- Include icu.net.dll.config - which is only available after the package is built -->
<ItemGroup>
<ResolvedFileToPublish Include=".\bin\Release\net6.0\icu.net.dll.config">
<ResolvedFileToPublish Include=".\bin\Release\net8.0\icu.net.dll.config">
<RelativePath>icu.net.dll.config</RelativePath>
</ResolvedFileToPublish>
</ItemGroup>
2 changes: 1 addition & 1 deletion src/SIL.Machine.Tool/SIL.Machine.Tool.csproj
Original file line number Diff line number Diff line change
@@ -2,7 +2,7 @@

<PropertyGroup>
<OutputType>Exe</OutputType>
<TargetFramework>net6.0</TargetFramework>
<TargetFramework>net8.0</TargetFramework>
<RootNamespace>SIL.Machine</RootNamespace>
<PackAsTool>true</PackAsTool>
<ToolCommandName>machine</ToolCommandName>
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
<Project Sdk="Microsoft.NET.Sdk">

<PropertyGroup>
<TargetFramework>net6.0</TargetFramework>
<TargetFramework>net8.0</TargetFramework>
<RootNamespace>SIL.Machine.AspNetCore</RootNamespace>
<Nullable>enable</Nullable>
<ImplicitUsings>enable</ImplicitUsings>
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
<Project Sdk="Microsoft.NET.Sdk">

<PropertyGroup>
<TargetFramework>net6.0</TargetFramework>
<TargetFramework>net8.0</TargetFramework>
<RootNamespace>SIL.Machine.Morphology.HermitCrab</RootNamespace>
<Nullable>enable</Nullable>
<ImplicitUsings>enable</ImplicitUsings>
2 changes: 1 addition & 1 deletion tests/SIL.Machine.Tests/SIL.Machine.Tests.csproj
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
<Project Sdk="Microsoft.NET.Sdk">

<PropertyGroup>
<TargetFramework>net6.0</TargetFramework>
<TargetFramework>net8.0</TargetFramework>
<RootNamespace>SIL.Machine</RootNamespace>
<Nullable>enable</Nullable>
<ImplicitUsings>enable</ImplicitUsings>
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
<Project Sdk="Microsoft.NET.Sdk">

<PropertyGroup>
<TargetFramework>net6.0</TargetFramework>
<TargetFramework>net8.0</TargetFramework>
<Nullable>enable</Nullable>
<ImplicitUsings>enable</ImplicitUsings>
<IsPackable>false</IsPackable>
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
<Project Sdk="Microsoft.NET.Sdk">

<PropertyGroup>
<TargetFramework>net6.0</TargetFramework>
<TargetFramework>net8.0</TargetFramework>
<RootNamespace>SIL.Machine.Translation.Thot</RootNamespace>
<Nullable>enable</Nullable>
<ImplicitUsings>enable</ImplicitUsings>

0 comments on commit fddf8f5

Please sign in to comment.