Skip to content

Commit

Permalink
feat(infra): added metrics and pagerduty alert for scheduled tasks (#447)
Browse files Browse the repository at this point in the history
  • Loading branch information
lucianHymer authored Oct 27, 2023
1 parent 4dea465 commit fd99efb
Show file tree
Hide file tree
Showing 3 changed files with 93 additions and 17 deletions.
91 changes: 83 additions & 8 deletions infra/lib/scorer/scheduledTasks.ts
Original file line number Diff line number Diff line change
Expand Up @@ -17,20 +17,22 @@ export type ScheduledTaskConfig = Pick<
| "cluster"
| "subnets"
| "securityGroup"
> & {
command: string[];
scheduleExpression: string;
ephemeralStorageSizeInGiB?: number;
cpu?: number;
memory?: number;
};
| "cpu"
| "memory"
> &
Required<Pick<ScorerService, "alertTopic">> & {
command: string;
scheduleExpression: string;
ephemeralStorageSizeInGiB?: number;
};

export function createScheduledTask(
name: string,
config: ScheduledTaskConfig,
envConfig: ScorerEnvironmentConfig
) {
const {
alertTopic,
executionRole,
subnets,
dockerImageScorer,
Expand All @@ -43,6 +45,17 @@ export function createScheduledTask(
memory,
} = config;

const commandSuccessMessage = `SUCCESS <${name}>`;
const commandWithTest = [
"/bin/bash",
"-c",
command + ` && echo "${commandSuccessMessage}"`,
];

const logGroup = new aws.cloudwatch.LogGroup(`scheduled-${name}`, {
retentionInDays: 90,
});

const task = new awsx.ecs.FargateTaskDefinition(name, {
executionRole: {
roleArn: executionRole.arn,
Expand All @@ -52,6 +65,11 @@ export function createScheduledTask(
sizeInGib: ephemeralStorageSizeInGiB,
}
: undefined,
logGroup: {
existing: {
arn: logGroup.arn,
},
},
containers: {
web: {
name: `${name}-container`,
Expand All @@ -60,7 +78,7 @@ export function createScheduledTask(
memory: memory ? memory : 2048,
secrets,
environment: getEnvironment(envConfig),
command,
command: commandWithTest,
},
},
});
Expand Down Expand Up @@ -179,5 +197,62 @@ export function createScheduledTask(
},
});

const metricNamespace = "/scheduled-tasks/runs/success";
const metricName = `SuccessfulRun-${name}`;

new aws.cloudwatch.LogMetricFilter(metricName, {
logGroupName: logGroup.name,
metricTransformation: {
defaultValue: "0",
name: metricName,
namespace: metricNamespace,
unit: "Count",
value: "1",
},
name: metricName,
pattern: `"${commandSuccessMessage}"`,
});

const SIX_HOURS_IN_SECONDS = 6 * 60 * 60;

new aws.cloudwatch.MetricAlarm("UnsuccessfulRuns-" + name, {
alarmActions: [alertTopic.arn],
comparisonOperator: "GreaterThanOrEqualToThreshold",
datapointsToAlarm: 1,
evaluationPeriods: 1,
metricQueries: [
{
id: "m1",
metric: {
metricName,
namespace: metricNamespace,
period: SIX_HOURS_IN_SECONDS,
stat: "Sum",
},
},
{
id: "m2",
metric: {
dimensions: {
RuleName: scheduledEventRule.name,
},
metricName: "Invocations",
namespace: "AWS/Events",
period: SIX_HOURS_IN_SECONDS,
stat: "Sum",
},
},
{
expression: "m2 - m1",
id: "e1",
label: "UnsuccessfulRuns",
returnData: true,
},
],
threshold: 1,
name: "UnsuccessfulRuns-" + name,
treatMissingData: "notBreaching",
});

return task.taskDefinition.id;
}
15 changes: 8 additions & 7 deletions infra/prod/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -1282,8 +1282,9 @@ export const weeklyDataDumpTaskDefinition = createScheduledTask(
{
...baseScorerServiceConfig,
securityGroup: secgrp,
command: ["python", "manage.py", "dump_stamp_data"],
command: "python manage.py dump_stamp_data",
scheduleExpression: "cron(30 23 ? * FRI *)", // Run the task every friday at 23:30 UTC
alertTopic: pagerdutyTopic,
},
envConfig
);
Expand Down Expand Up @@ -1313,9 +1314,9 @@ export const dailyDataDumpTaskDefinition = createScheduledTask(
]),
"--s3-uri=s3://passport-scorer/daily_data_dumps/",
"--batch-size=20000",
],

].join(" "),
scheduleExpression: "cron(30 0 ? * * *)", // Run the task daily at 00:30 UTC
alertTopic: pagerdutyTopic,
},
envConfig
);
Expand All @@ -1335,9 +1336,9 @@ export const dailyDataDumpTaskDefinitionParquet = createScheduledTask(
"--apps=registry,ceramic_cache,account,scorer_weighted,trusta_labs",
"--s3-uri=s3://passport-scorer/daily_data_dumps/",
"--batch-size=20000",
],

].join(" "),
scheduleExpression: "cron(30 0 ? * * *)", // Run the task daily at 00:30 UTC
alertTopic: pagerdutyTopic,
},
envConfig
);
Expand All @@ -1364,9 +1365,9 @@ export const frequentAlloScorerDataDumpTaskDefinition = createScheduledTask(
`--s3-uri=s3://${publicDataDomain}/passport_scores/`,
// "--summary-extra-args",
// JSON.stringify({ ACL: "public-read" }),
],

].join(" "),
scheduleExpression: "cron(*/30 * ? * * *)", // Run the task every 30 min
alertTopic: pagerdutyTopic,
},
envConfig
);
Expand Down
4 changes: 2 additions & 2 deletions infra/staging/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -1297,7 +1297,7 @@ new aws.lb.TargetGroupAttachment("redashTargetAttachment", {
// {
// ...baseScorerServiceConfig,
// securityGroup: secgrp,
// command: ["python", "manage.py", "dump_stamp_data"],
// command: "python manage.py dump_stamp_data",
// scheduleExpression: "cron(30 23 ? * FRI *)", // Run the task every friday at 23:30 UTC
// },
// envConfig
Expand All @@ -1324,7 +1324,7 @@ new aws.lb.TargetGroupAttachment("redashTargetAttachment", {
// `--s3-uri=s3://public.${domain}`,
// "--summary-extra-args",
// JSON.stringify({ ACL: "public-read" }),
// ],
// ].join(" "),

// scheduleExpression: "cron(*/30 * ? * * *)", // Run the task every 30 min
// },
Expand Down

0 comments on commit fd99efb

Please sign in to comment.