diff --git a/.gitignore b/.gitignore index ef96d844..2f85bb1a 100644 --- a/.gitignore +++ b/.gitignore @@ -25,4 +25,6 @@ site .DS_STORE # Python virtual env directory -*.venv* \ No newline at end of file +*.venv* + +*otel-collector-config-new.yml \ No newline at end of file diff --git a/aws-quickstart-eks-blueprints-1.13.1.tgz b/aws-quickstart-eks-blueprints-1.13.1.tgz new file mode 100644 index 00000000..5b7fd7de Binary files /dev/null and b/aws-quickstart-eks-blueprints-1.13.1.tgz differ diff --git a/bin/multi-cluster-conformitron.ts b/bin/multi-cluster-conformitron.ts new file mode 100644 index 00000000..b77229e8 --- /dev/null +++ b/bin/multi-cluster-conformitron.ts @@ -0,0 +1,13 @@ +import { configureApp, errorHandler } from '../lib/common/construct-utils'; +import { PipelineMultiCluster } from '../lib/multi-cluster-construct/pipeline'; + + +const app = configureApp(); + +//------------------------------------------- +// Multiple clusters, multiple regions. +//------------------------------------------- + +new PipelineMultiCluster().buildAsync(app).catch((error) => { + errorHandler(app, "Multi cluster pattern is not setup. It may be due to missing secrets: ", error); +}); \ No newline at end of file diff --git a/cdk.json b/cdk.json index 3d667b3e..1f546e27 100644 --- a/cdk.json +++ b/cdk.json @@ -1,3 +1,32 @@ { - "app": "npx ts-node dist/lib/common/default-main.js" + "app": "npx ts-node dist/lib/common/default-main.js", + "context": { + "conformitron.amp.endpoint": "https://aps-workspaces.us-east-1.amazonaws.com/workspaces/ws-77b8828d-0985-49e0-9268-2e0e8f3ba758/", + "conformitron.amp.arn":"arn:aws:aps:us-east-1:975050283200:workspace/ws-77b8828d-0985-49e0-9268-2e0e8f3ba758", + "conformitron.amg.endpoint": "https://g-75bcfc519c.grafana-workspace.us-east-1.amazonaws.com", + "conformitron.version": ["1.28","1.29","1.30"], + "fluxRepository": { + "name": "grafana-dashboards", + "namespace": "grafana-operator", + "repository": { + "repoUrl": "https://github.com/aws-observability/aws-observability-accelerator", + "name": "grafana-dashboards", + "targetRevision": "main", + "path": "./artifacts/grafana-operator-manifests/eks/infrastructure" + }, + "values": { + "GRAFANA_CLUSTER_DASH_URL" : "https://raw.githubusercontent.com/aws-observability/aws-observability-accelerator/main/artifacts/grafana-dashboards/eks/infrastructure/cluster.json", + "GRAFANA_KUBELET_DASH_URL" : "https://raw.githubusercontent.com/aws-observability/aws-observability-accelerator/main/artifacts/grafana-dashboards/eks/infrastructure/kubelet.json", + "GRAFANA_NSWRKLDS_DASH_URL" : "https://raw.githubusercontent.com/aws-observability/aws-observability-accelerator/main/artifacts/grafana-dashboards/eks/infrastructure/namespace-workloads.json", + "GRAFANA_NODEEXP_DASH_URL" : "https://raw.githubusercontent.com/aws-observability/aws-observability-accelerator/main/artifacts/grafana-dashboards/eks/infrastructure/nodeexporter-nodes.json", + "GRAFANA_NODES_DASH_URL" : "https://raw.githubusercontent.com/aws-observability/aws-observability-accelerator/main/artifacts/grafana-dashboards/eks/infrastructure/nodes.json", + "GRAFANA_WORKLOADS_DASH_URL" : "https://raw.githubusercontent.com/aws-observability/aws-observability-accelerator/main/artifacts/grafana-dashboards/eks/infrastructure/workloads.json" + }, + "kustomizations": [ + { + "kustomizationPath": "./artifacts/grafana-operator-manifests/eks/infrastructure" + } + ] + } + } } diff --git a/docs/patterns/images/ConformitronDashboard1.png b/docs/patterns/images/ConformitronDashboard1.png 
new file mode 100644 index 00000000..ec1d2b58 Binary files /dev/null and b/docs/patterns/images/ConformitronDashboard1.png differ diff --git a/docs/patterns/images/ConformitronDashboard2.png b/docs/patterns/images/ConformitronDashboard2.png new file mode 100644 index 00000000..20608316 Binary files /dev/null and b/docs/patterns/images/ConformitronDashboard2.png differ diff --git a/docs/patterns/images/ConformitronDashboard3.png b/docs/patterns/images/ConformitronDashboard3.png new file mode 100644 index 00000000..3b4dc3b9 Binary files /dev/null and b/docs/patterns/images/ConformitronDashboard3.png differ diff --git a/docs/patterns/images/CostOptimizationEventBridge.png b/docs/patterns/images/CostOptimizationEventBridge.png new file mode 100644 index 00000000..ab45c15e Binary files /dev/null and b/docs/patterns/images/CostOptimizationEventBridge.png differ diff --git a/docs/patterns/images/CostOptimizationEventBridge2.png b/docs/patterns/images/CostOptimizationEventBridge2.png new file mode 100644 index 00000000..f757c167 Binary files /dev/null and b/docs/patterns/images/CostOptimizationEventBridge2.png differ diff --git a/docs/patterns/images/CostOptimizationEventBridge3.png b/docs/patterns/images/CostOptimizationEventBridge3.png new file mode 100644 index 00000000..a99c5d58 Binary files /dev/null and b/docs/patterns/images/CostOptimizationEventBridge3.png differ diff --git a/docs/patterns/images/CostOptimizationSSM1.png b/docs/patterns/images/CostOptimizationSSM1.png new file mode 100644 index 00000000..d6c7cfc9 Binary files /dev/null and b/docs/patterns/images/CostOptimizationSSM1.png differ diff --git a/docs/patterns/images/CostOptimizationSSM2.png b/docs/patterns/images/CostOptimizationSSM2.png new file mode 100644 index 00000000..ff7bca85 Binary files /dev/null and b/docs/patterns/images/CostOptimizationSSM2.png differ diff --git a/docs/patterns/images/conformitron.png b/docs/patterns/images/conformitron.png new file mode 100644 index 00000000..61c0592c Binary files /dev/null and b/docs/patterns/images/conformitron.png differ diff --git a/docs/patterns/kubeflow.md b/docs/patterns/kubeflow.md index 6f5b5663..0be800d4 100644 --- a/docs/patterns/kubeflow.md +++ b/docs/patterns/kubeflow.md @@ -70,7 +70,7 @@ kubectl port-forward svc/ml-pipeline-ui 9000:80 -n =kubeflow-pipelines ``` and open this browser: http://localhost:9000/#/pipelines -more pipeline examples can be found at https://www.kubeflow.org/docs/components/pipelines/tutorials/ +more pipeline examples can be found at https://www.kubeflow.org/docs/components/pipelines/legacy-v1/tutorials/ ## Cleanup diff --git a/docs/patterns/multi-cluster-conformitron.md b/docs/patterns/multi-cluster-conformitron.md new file mode 100644 index 00000000..c6c7dd79 --- /dev/null +++ b/docs/patterns/multi-cluster-conformitron.md @@ -0,0 +1,243 @@ +# Multi-cluster pattern with observability, cost optimizations and metrics aggregation + +## Objective + +This pattern was started to solve a problem faced at AWS. We often get third-party software for validation and need a consistent automated approach to run Kubernetes evaluator testing, deployment of containerized products, and validation in Kubernetes environments on a variety of Amazon EKS environments. + +In this pattern we: + +1. Automate deployment of multiple EKS cluster in a region, with a Continuous Deployment pipeline triggered upon a commit to the GitHub repository that hosts the pipeline configuration. + +1. 
Configure the EKS clusters to deploy with different architectures (x86 or ARM or Bottlerocket) and different Kubernetes versions (3 most recent by default). + +1. Automate testing of all the available [EKS Anywhere Addons](https://github.com/aws-samples/eks-anywhere-addons), on each of the clusters, essentially testing their compatibility across all the potential architecture/version combinations available today on AWS. + +1. Running this pattern 24x7, we observed high costs ($300 a day). By using AWS Systems Manager Automation and Auto Scaling groups we scale down to zero during non-business hours, resulting in 60% cost savings. We also borrowed optimized OTEL collector configurations from [CDK Observability Accelerator](https://github.com/aws-observability/cdk-aws-observability-accelerator) to further reduce Prometheus storage costs. + +To learn more about our EKS Addon validation, check out our [blog](https://aws.amazon.com/blogs/containers/conformitron-validate-third-party-software-with-amazon-eks-and-amazon-eks-anywhere/) + +![Architecture of multi-cluster deployment](images/conformitron.png) + +### GitOps configuration + +GitOps is a branch of DevOps that focuses on using Git code repositories to manage infrastructure and application code deployments. + +For this pattern there is a Git-driven deployment using GitHub and CodePipeline which automatically redeploys the EKS clusters when modifications are made to the GitHub repo. + +Secondly, for the deployment of workloads on the clusters we leverage FluxCD; this is a GitOps approach for the workloads, i.e. the third-party software we want to validate on our hardware. + +We require some additional secrets to be created in Secrets Manager for the pattern to function properly: + +1. AWS CodePipeline Bootstrap - The AWS CodePipeline points to the GitHub fork of this repository, i.e. [cdk-eks-blueprints-patterns](https://github.com/aws-samples/cdk-eks-blueprints-patterns). + +A `github-token` secret must be stored as plaintext in AWS Secrets Manager for the CodePipeline to access the webhooks on GitHub. For more information on how/why to set it up, please refer to the [docs](https://docs.aws.amazon.com/codepipeline/latest/userguide/GitHub-create-personal-token-CLI.html). The GitHub Personal Access Token should have these scopes: + 1. *repo* - to read your forked cdk-eks-blueprints-patterns repository + 1. *admin:repo_hook* - if you plan to use webhooks (enabled by default) + +1. FluxCD Bootstrap - FluxCD points to the [EKS Anywhere Addons](https://github.com/aws-samples/eks-anywhere-addons) repository. Since this is a public repository you will not need to add a GitHub token to read it. + + As part of the FluxCD configuration, it uses Kustomize to apply all the addons that are in the repository along with deploying their functional tests and a custom validator cronJob. + + +## Prerequisites +Start by setting the account and region environment variables: + +```sh +export ACCOUNT_ID=$(aws sts get-caller-identity --query 'Account' --output text) +export AWS_REGION=$(aws configure get region) +``` +1. In case you haven't done this before, bootstrap your AWS account for AWS CDK use: + + ```bash + cdk bootstrap + ``` + +1. Fork this repository (cdk-eks-blueprints-patterns) to your GitHub organisation/user +1. Git clone your forked repository onto your machine +1. Install the AWS CDK Toolkit globally on your machine using + + ```bash + npm install -g aws-cdk@2.133.0 + ``` + +1.
Increase AWS service quotas for the required resources; navigate to the [Service Quota Tutorial](https://aws.amazon.com/getting-started/hands-on/request-service-quota-increase/) to learn more: +``` + SERVICE | QUOTA NAME | REQUESTED QUOTA + Amazon Virtual Private Cloud (Amazon VPC) | NAT gateways per Availability Zone | 30 + Amazon Virtual Private Cloud (Amazon VPC) | VPCs per region | 30 + Amazon Elastic Compute Cloud (Amazon EC2) | EC2-VPC Elastic IPs | 30 +``` +We are using a separate VPC per cluster as a best practice, but you can use the default VPC if you prefer. Also, if you decide to use a different region for each cluster you don't need a quota increase; please reach out if you have a need for this use case. + +1. Amazon Managed Grafana Workspace: To visualize the metrics collected, you need an Amazon Managed Grafana workspace. If you have an existing workspace, create the environment variables `AMG_ENDPOINT_URL` and `AMG_WORKSPACE_ID` as described below. + +Otherwise, to create a new workspace, visit and run our [supporting example for Grafana Deployment](https://aws-observability.github.io/terraform-aws-observability-accelerator/helpers/managed-grafana/) + +```bash +export AMG_ENDPOINT_URL=https://g-xxx.grafana-workspace.region.amazonaws.com +export AMG_WORKSPACE_ID=g-xxx +``` + +1. Grafana API Key: Amazon Managed Grafana provides a control plane API for generating Grafana API keys or Service Account Tokens. This allows programmatic provisioning of Grafana dashboards using the Grafana Operator on EKS. + + ```bash + export AMG_API_KEY=$(aws grafana create-workspace-api-key \ + --key-name "grafana-operator-key" \ + --key-role "ADMIN" \ + --seconds-to-live 432000 \ + --workspace-id $AMG_WORKSPACE_ID \ + --query key \ + --output text) + ``` + +1. AWS SSM Parameter Store for the Grafana API key: Store the new Grafana API key created above in AWS SSM Parameter Store. It will be referenced by the Grafana Operator deployment of our solution to access and provision Grafana dashboards from the Amazon EKS monitoring cluster: + +```bash +aws ssm put-parameter --name "/grafana-api-key" \ + --type "SecureString" \ + --value $AMG_API_KEY \ + --region $AWS_REGION +``` + +1. Amazon Managed Prometheus Workspace: To store observability metrics from all clusters we will use Amazon Managed Prometheus due to its ease of setup and easy integration with other AWS services. We recommend setting up a new, separate Prometheus workspace using the CLI commands below. The provisioning of a new AMP workspace can be automated by leveraging the `.resourceProvider` in our CDK blueprints. See [Example](https://github.com/aws-observability/cdk-aws-observability-accelerator/blob/main/lib/existing-eks-opensource-observability-pattern/index.ts). We intentionally left this out to allow connecting to existing AMP deployments, but please reach out to us if you need guidance on automating this provisioning. + +```bash +aws amp create-workspace --alias conformitron +``` + +Copy the `workspaceID` from the output and export it as a variable: + +```bash +export AMP_WS_ID=ws-xxxxxxx-xxxx-xxxx-xxxx-xxxxxx +``` + + +1. Modify the code in your forked repo to point to your GitHub username/organisation. Open the [pattern file source code](../../lib/multi-cluster-construct/pipeline.ts) and look for the declared const of `gitOwner`. Change it to your GitHub username, as shown in the excerpt below.
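For reference, the relevant lines in `lib/multi-cluster-construct/pipeline.ts` look roughly like the sketch below. This is an abbreviated excerpt of the pipeline builder defined later in this diff; `YOUR_GITHUB_USERNAME` is a placeholder you substitute with your own fork owner.

```typescript
import * as blueprints from '@aws-quickstart/eks-blueprints';

// Abbreviated excerpt: point the pipeline at your fork of this repository.
const gitOwner = 'YOUR_GITHUB_USERNAME';                 // <-- change this
const gitRepositoryName = 'cdk-eks-blueprints-patterns';

blueprints.CodePipelineStack.builder()
    .application('npx ts-node bin/multi-cluster-conformitron.ts')
    .name('multi-cluster-central-pipeline')
    .owner(gitOwner)
    .codeBuildPolicies(blueprints.DEFAULT_BUILD_POLICIES)
    .repository({
        repoUrl: gitRepositoryName,
        credentialsSecretName: 'github-token',           // the Secrets Manager secret created earlier
        targetRevision: 'conformitronPipeline',          // branch the pipeline tracks
        trigger: blueprints.GitHubTrigger.POLL
    });
```

The full builder chain (including the stages for every cluster) is shown in the `pipeline.ts` file added by this change.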
+ + +## Deploying + +Clone the repository: + +```sh +git clone https://github.com/aws-samples/cdk-eks-blueprints-patterns.git +cd cdk-eks-blueprints-patterns +``` + +Set the pattern's parameters in the CDK context by overriding the _cdk.json_ file (the values are filled in from the environment variables you exported earlier): +```sh +cat << EOF > cdk.json +{ + "app": "npx ts-node dist/lib/common/default-main.js", + "context": { + "conformitron.amp.endpoint": "https://aps-workspaces.${AWS_REGION}.amazonaws.com/workspaces/${AMP_WS_ID}/", + "conformitron.amp.arn":"arn:aws:aps:${AWS_REGION}:${ACCOUNT_ID}:workspace/${AMP_WS_ID}", + "conformitron.amg.endpoint": "${AMG_ENDPOINT_URL}", + "conformitron.version": ["1.28","1.29","1.30"], + "fluxRepository": { + "name": "grafana-dashboards", + "namespace": "grafana-operator", + "repository": { + "repoUrl": "https://github.com/aws-observability/aws-observability-accelerator", + "name": "grafana-dashboards", + "targetRevision": "main", + "path": "./artifacts/grafana-operator-manifests/eks/infrastructure" + }, + "values": { + "GRAFANA_CLUSTER_DASH_URL" : "https://raw.githubusercontent.com/aws-observability/aws-observability-accelerator/main/artifacts/grafana-dashboards/eks/infrastructure/cluster.json", + "GRAFANA_KUBELET_DASH_URL" : "https://raw.githubusercontent.com/aws-observability/aws-observability-accelerator/main/artifacts/grafana-dashboards/eks/infrastructure/kubelet.json", + "GRAFANA_NSWRKLDS_DASH_URL" : "https://raw.githubusercontent.com/aws-observability/aws-observability-accelerator/main/artifacts/grafana-dashboards/eks/infrastructure/namespace-workloads.json", + "GRAFANA_NODEEXP_DASH_URL" : "https://raw.githubusercontent.com/aws-observability/aws-observability-accelerator/main/artifacts/grafana-dashboards/eks/infrastructure/nodeexporter-nodes.json", + "GRAFANA_NODES_DASH_URL" : "https://raw.githubusercontent.com/aws-observability/aws-observability-accelerator/main/artifacts/grafana-dashboards/eks/infrastructure/nodes.json", + "GRAFANA_WORKLOADS_DASH_URL" : "https://raw.githubusercontent.com/aws-observability/aws-observability-accelerator/main/artifacts/grafana-dashboards/eks/infrastructure/workloads.json" + }, + "kustomizations": [ + { + "kustomizationPath": "./artifacts/grafana-operator-manifests/eks/infrastructure" + } + ] + } + } +} + +EOF +``` + +You are now ready to deploy the pipeline. Run the following command from the root of this repository to deploy the pipeline stack: + +```bash +make pattern multi-cluster-conformitron deploy multi-cluster-central-pipeline +``` + +Now you can go to the [AWS CodePipeline console](https://eu-west-1.console.aws.amazon.com/codesuite/codepipeline/pipelines) and see how it was automatically created to deploy multiple Amazon EKS clusters to different environments. + +## Grafana Dashboards +![Dashboard 1](images/ConformitronDashboard1.png) + +![Dashboard 2](images/ConformitronDashboard2.png) + +![Dashboard 3](images/ConformitronDashboard3.png) + + +## SSM Cost Optimizations for conformitron clusters + +Running all the clusters by default for 24 hours results in a daily spend of $300+. + +To minimize these costs we have written a Systems Manager automation which automatically scales the Auto Scaling groups down to 0 desired nodes during off-business hours. + +On weekdays at 5 PM PST clusters are scaled to 0 -> CRON EXPRESSION: `0 17 ? * MON-FRI *` +On weekdays at 5 AM PST clusters are scaled back to 1 -> CRON EXPRESSION: `0 05 ? * MON-FRI *` +On weekends clusters stay scaled to 0.
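The pattern wires these schedules up through the console (steps below). If you would rather codify them, a minimal CDK sketch along the following lines could work. It is not part of this pattern; the stack, role, and schedule names are placeholders, and it assumes the scale-down runbook created later is named `scaleDownEksToZero`.

```typescript
import * as cdk from 'aws-cdk-lib';
import * as iam from 'aws-cdk-lib/aws-iam';
import * as scheduler from 'aws-cdk-lib/aws-scheduler';
import { Construct } from 'constructs';

// Sketch: an EventBridge Scheduler schedule that starts the scale-down SSM automation
// at 5 PM Pacific on weekdays. Adjust names and permissions to match your runbook.
export class ScaleDownScheduleStack extends cdk.Stack {
  constructor(scope: Construct, id: string, props?: cdk.StackProps) {
    super(scope, id, props);

    // Role assumed by EventBridge Scheduler to call StartAutomationExecution.
    const schedulerRole = new iam.Role(this, 'ConformitronSchedulerRole', {
      assumedBy: new iam.ServicePrincipal('scheduler.amazonaws.com'),
    });
    schedulerRole.addToPolicy(new iam.PolicyStatement({
      actions: ['ssm:StartAutomationExecution'],
      resources: ['*'],
    }));

    new scheduler.CfnSchedule(this, 'ScaleDownEksToZeroSchedule', {
      flexibleTimeWindow: { mode: 'OFF' },
      scheduleExpression: 'cron(0 17 ? * MON-FRI *)',   // 5 PM on weekdays
      scheduleExpressionTimezone: 'America/Los_Angeles',
      target: {
        // Universal target for the SSM StartAutomationExecution API.
        arn: 'arn:aws:scheduler:::aws-sdk:ssm:startAutomationExecution',
        roleArn: schedulerRole.roleArn,
        input: JSON.stringify({ DocumentName: 'scaleDownEksToZero' }),
      },
    });
  }
}
```

A second schedule with `cron(0 05 ? * MON-FRI *)` pointing at the scale-up runbook would mirror the morning scale-up; the console walkthrough below achieves the same result without extra code.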
+ +These optimizations bring the weekly cost down to less than $1,000, a more than 60% cost saving. + +Please find the SSM Automation documents at `lib/multi-cluster-construct/resources/cost-optimization/scaleDownEksToZero.yml` and `lib/multi-cluster-construct/resources/cost-optimization/scaleUpEksToOne.yml`. + +Let's take a look at one of the scripts, `scaleDownEksToZero.yml`: + +```yaml +schemaVersion: '0.3' +... +... +mainSteps: + ... + ... + inputs: + Service: eks + Api: UpdateNodegroupConfig <---- Update the managed node group + clusterName: arm-1-26-blueprint <---- Modify according to your naming convention + nodegroupName: eks-blueprints-mng + scalingConfig: + minSize: 0 <---- New Scaling Configuration + maxSize: 1 + desiredSize: 0 <---- Scale To zero +``` +By triggering this automation at 5 PM on weekdays we automatically scale down the clusters during off-hours. + +To run these scripts you will first have to update them with your own account ID. +We will use the `sed` command to automatically update the files: +```bash +sed "s/ACCOUNT_ID/$ACCOUNT_ID/g" scaleDownEksToZero.yml > scaleDownEksToZeroNew.yml +sed "s/ACCOUNT_ID/$ACCOUNT_ID/g" scaleUpEksToOne.yml > scaleUpEksToOneNew.yml +``` + +1. Then navigate to Systems Manager > Documents and create a new Automation. + +![Cost Optimization Step 1](images/CostOptimizationSSM1.png) + +1. Click on JSON and copy over the YAML content to create a new runbook. + +![Cost Optimization Step 2](images/CostOptimizationSSM2.png) + +1. Once saved, navigate to EventBridge > Scheduler > Schedules. + +![Cost Optimization Step 3](images/CostOptimizationEventBridge.png) + +1. Create a new schedule with the CRON expression specified above. + +![Cost Optimization Step 4](images/CostOptimizationEventBridge2.png) + +1.
For Target select "StartAutomationExecution" and type in the document name from step 2 + +![Cost Optimization Step 5](images/CostOptimizationEventBridge3.png) + diff --git a/lib/multi-cluster-construct/cluster-secret-store-addon.ts b/lib/multi-cluster-construct/cluster-secret-store-addon.ts new file mode 100644 index 00000000..a29f010f --- /dev/null +++ b/lib/multi-cluster-construct/cluster-secret-store-addon.ts @@ -0,0 +1,69 @@ +import * as blueprints from '@aws-quickstart/eks-blueprints'; +import * as eks from "aws-cdk-lib/aws-eks"; +import { Construct } from 'constructs'; +import { dependable } from '@aws-quickstart/eks-blueprints/dist/utils'; + +export class ClusterSecretStoreAddon implements blueprints.ClusterAddOn { + id?: string | undefined; + @dependable(blueprints.addons.ExternalsSecretsAddOn.name) + deploy(clusterInfo: blueprints.ClusterInfo): void | Promise { + const cluster = clusterInfo.cluster; + + const clusterSecretStore = new eks.KubernetesManifest(clusterInfo.cluster, "ClusterSecretStore", { + cluster: cluster, + manifest: [ + { + apiVersion: "external-secrets.io/v1beta1", + kind: "ClusterSecretStore", + metadata: {name: "eksa-secret-store"}, + spec: { + provider: { + aws: { + service: "SecretsManager", + region: clusterInfo.cluster.stack.region, + auth: { + jwt: { + serviceAccountRef: { + name: "external-secrets-sa", + namespace: "external-secrets", + }, + }, + }, + }, + }, + }, + }, + ], + }); + + const clusterConfigMapStore = new eks.KubernetesManifest(clusterInfo.cluster, "ClusterConfigMap", { + cluster: cluster, + manifest: [ + { + apiVersion: "external-secrets.io/v1beta1", + kind: "ClusterSecretStore", + metadata: {name: "eksa-configmap-store"}, + spec: { + provider: { + aws: { + service: "ParameterStore", + region: clusterInfo.cluster.stack.region, + auth: { + jwt: { + serviceAccountRef: { + name: "external-secrets-sa", + namespace: "external-secrets", + }, + }, + }, + }, + }, + }, + }, + ], + }); + + clusterConfigMapStore.node.addDependency(clusterSecretStore); + return Promise.resolve(clusterSecretStore); + } +} \ No newline at end of file diff --git a/lib/multi-cluster-construct/clusterMapping.ts b/lib/multi-cluster-construct/clusterMapping.ts new file mode 100644 index 00000000..173091e4 --- /dev/null +++ b/lib/multi-cluster-construct/clusterMapping.ts @@ -0,0 +1,44 @@ +import * as eks from 'aws-cdk-lib/aws-eks'; +import * as ec2 from 'aws-cdk-lib/aws-ec2'; + +/** + * Instance Mapping for fields such as chart, version, managed IAM policy. 
+ */ +export interface InstanceMapping { + amiType: eks.NodegroupAmiType, + instanceType: ec2.InstanceType, +} +/** + * List of all clusters deployed by conformitron + */ +export enum ClusterName { + ARM = "arm", + X86 = "x86", + BR_X86 = "br-x86", + BR_ARM = "br-arm", + MONITORING = "grafana-monitoring" +} + + +export const clusterMappings : {[key in ClusterName]?: InstanceMapping } = { + [ClusterName.ARM]: { + amiType: eks.NodegroupAmiType.AL2_ARM_64, + instanceType: ec2.InstanceType.of(ec2.InstanceClass.M7G, ec2.InstanceSize.XLARGE2) + }, + [ClusterName.X86]: { + amiType: eks.NodegroupAmiType.AL2_X86_64, + instanceType: ec2.InstanceType.of(ec2.InstanceClass.M5, ec2.InstanceSize.XLARGE2) + }, + [ClusterName.BR_ARM]: { + amiType: eks.NodegroupAmiType.BOTTLEROCKET_ARM_64, + instanceType: ec2.InstanceType.of(ec2.InstanceClass.M7G, ec2.InstanceSize.XLARGE2) + }, + [ClusterName.BR_X86]: { + amiType: eks.NodegroupAmiType.BOTTLEROCKET_X86_64, + instanceType: ec2.InstanceType.of(ec2.InstanceClass.M5, ec2.InstanceSize.XLARGE2) + }, + [ClusterName.MONITORING]: { + amiType: eks.NodegroupAmiType.AL2_X86_64, + instanceType: ec2.InstanceType.of(ec2.InstanceClass.M5, ec2.InstanceSize.LARGE) + } +}; diff --git a/lib/multi-cluster-construct/grafana-monitor-builder.ts b/lib/multi-cluster-construct/grafana-monitor-builder.ts new file mode 100644 index 00000000..506c3fc4 --- /dev/null +++ b/lib/multi-cluster-construct/grafana-monitor-builder.ts @@ -0,0 +1,126 @@ +import { Construct } from 'constructs'; +import * as blueprints from '@aws-quickstart/eks-blueprints'; +import * as eks from 'aws-cdk-lib/aws-eks'; +import { GrafanaOperatorSecretAddon } from './grafana-operator-secret-addon'; +import * as fs from 'fs'; + +export class GrafanaMonitoringConstruct { + + build(scope: Construct, id: string, contextAccount?: string, contextRegion?: string ) { + + const stackId = `${id}-grafana-monitor`; + + const account = contextAccount! || process.env.ACCOUNT_ID! || process.env.CDK_DEFAULT_ACCOUNT!; + const region = contextRegion! || process.env.AWS_REGION! || process.env.CDK_DEFAULT_REGION!; + + this.create(scope, account, region) + .build(scope, stackId); + } + + create(scope: Construct, contextAccount?: string, contextRegion?: string ) { + + const account = contextAccount! || process.env.ACCOUNT_ID! || process.env.CDK_DEFAULT_ACCOUNT!; + const region = contextRegion! || process.env.AWS_REGION! 
|| process.env.CDK_DEFAULT_REGION!; + + // TODO: CFN import https://docs.aws.amazon.com/cdk/api/v2/docs/aws-cdk-lib.Fn.html#static-importwbrvaluesharedvaluetoimport + const ampWorkspaceName = "conformitronWorkspace"; + const ampEndpoint = blueprints.utils.valueFromContext(scope, "conformitron.amp.endpoint", "https://aps-workspaces..amazonaws.com/workspaces//"); + const ampWorkspaceArn = blueprints.utils.valueFromContext(scope, "conformitron.amp.arn", "arn:aws:aps:::workspace/"); + + const ampAddOnProps: blueprints.AmpAddOnProps = { + ampPrometheusEndpoint: ampEndpoint, + ampRules: { + ampWorkspaceArn: ampWorkspaceArn, + ruleFilePaths: [ + __dirname + '/resources/amp-config/alerting-rules.yml', + __dirname + '/resources/amp-config/recording-rules.yml' + ] + } + }; + + let doc = blueprints.utils.readYamlDocument(__dirname + '/resources/otel-collector-config.yml'); + doc = blueprints.utils.changeTextBetweenTokens( + doc, + "{{ start enableJavaMonJob }}", + "{{ stop enableJavaMonJob }}", + false + ); + doc = blueprints.utils.changeTextBetweenTokens( + doc, + "{{ start enableNginxMonJob }}", + "{{ stop enableNginxMonJob }}", + false + ); + doc = blueprints.utils.changeTextBetweenTokens( + doc, + "{{ start enableIstioMonJob }}", + "{{ stop enableIstioMonJob }}", + false + ); + doc = blueprints.utils.changeTextBetweenTokens( + doc, + "{{ start enableAPIserverJob }}", + "{{ stop enableAPIserverJob }}", + false + ); + doc = blueprints.utils.changeTextBetweenTokens( + doc, + "{{ start enableAdotMetricsCollectionJob}}", + "{{ stop enableAdotMetricsCollectionJob }}", + false + ); + doc = blueprints.utils.changeTextBetweenTokens( + doc, + "{{ start enableAdotMetricsCollectionTelemetry }}", + "{{ stop enableAdotMetricsCollectionTelemetry }}", + true + ); + + doc = blueprints.utils.changeTextBetweenTokens( + doc, + "{{ start enableAdotContainerLogsReceiver }}", + "{{ stop enableAdotContainerLogsReceiver }}", + true + ); + doc = blueprints.utils.changeTextBetweenTokens( + doc, + "{{ start enableAdotContainerLogsExporter }}", + "{{ stop enableAdotContainerLogsExporter }}", + true + ); + + fs.writeFileSync(__dirname + '/resources/otel-collector-config-new.yml', doc); + + ampAddOnProps.openTelemetryCollector = { + manifestPath: __dirname + '/resources/otel-collector-config-new.yml', + manifestParameterMap: { + logGroupName: `/aws/eks/conformitron/workspace`, + logStreamName: `$NODE_NAME`, + logRetentionDays: 30, + awsRegion: region + } + }; + + const fluxRepository: blueprints.FluxGitRepo = blueprints.utils.valueFromContext(scope, "fluxRepository", undefined); + fluxRepository.values!.AMG_AWS_REGION = region; + fluxRepository.values!.AMG_ENDPOINT_URL = blueprints.utils.valueFromContext(scope, "conformitron.amg.endpoint","https://.grafana-workspace..amazonaws.com"); + + Reflect.defineMetadata("ordered", true, blueprints.addons.GrafanaOperatorAddon); //sets metadata ordered to true for GrafanaOperatorAddon + const addOns: Array = [ + new blueprints.addons.FluxCDAddOn({"repositories": [fluxRepository]}), + new GrafanaOperatorSecretAddon(), + new blueprints.addons.SSMAgentAddOn() + ]; + + return blueprints.ObservabilityBuilder.builder() + .account(account) + .region(region) + .version(eks.KubernetesVersion.V1_27) + .resourceProvider(ampWorkspaceName, new blueprints.CreateAmpProvider(ampWorkspaceName, ampWorkspaceName)) + .withAmpProps(ampAddOnProps) + .enableOpenSourcePatternAddOns() + .addOns( + ...addOns + ); + } +} \ No newline at end of file diff --git 
a/lib/multi-cluster-construct/grafana-operator-secret-addon.ts b/lib/multi-cluster-construct/grafana-operator-secret-addon.ts new file mode 100644 index 00000000..28ce3fe9 --- /dev/null +++ b/lib/multi-cluster-construct/grafana-operator-secret-addon.ts @@ -0,0 +1,75 @@ +import 'source-map-support/register'; // to get better stack traces and debugging +import * as blueprints from '@aws-quickstart/eks-blueprints'; +import * as eks from "aws-cdk-lib/aws-eks"; +import { Construct } from 'constructs'; +import { dependable } from '@aws-quickstart/eks-blueprints/dist/utils'; + +export class GrafanaOperatorSecretAddon implements blueprints.ClusterAddOn { + id?: string | undefined; + @dependable(blueprints.addons.ExternalsSecretsAddOn.name, blueprints.addons.GrafanaOperatorAddon.name) + deploy(clusterInfo: blueprints.ClusterInfo): void | Promise { + const cluster = clusterInfo.cluster; + const secretStore = new eks.KubernetesManifest(clusterInfo.cluster.stack, "ClusterSecretStore", { + cluster: cluster, + manifest: [ + { + apiVersion: "external-secrets.io/v1beta1", + kind: "ClusterSecretStore", + metadata: { + name: "ssm-parameter-store", + namespace: "default" + }, + spec: { + provider: { + aws: { + service: "ParameterStore", + region: clusterInfo.cluster.stack.region, + auth: { + jwt: { + serviceAccountRef: { + name: "external-secrets-sa", + namespace: "external-secrets", + }, + }, + }, + }, + }, + }, + }, + ], + }); + + const externalSecret = new eks.KubernetesManifest(clusterInfo.cluster.stack, "ExternalSecret", { + cluster: cluster, + manifest: [ + { + apiVersion: "external-secrets.io/v1beta1", + kind: "ExternalSecret", + metadata: { + name: "external-grafana-admin-credentials", + namespace: "grafana-operator" + }, + spec: { + secretStoreRef: { + name: "ssm-parameter-store", + kind: "ClusterSecretStore", + }, + target: { + name: "grafana-admin-credentials" + }, + data: [ + { + secretKey: "GF_SECURITY_ADMIN_APIKEY", + remoteRef: { + key: "/grafana-api-key" + }, + }, + ], + }, + }, + ], + }); + externalSecret.node.addDependency(secretStore); + return Promise.resolve(secretStore); + } +} \ No newline at end of file diff --git a/lib/multi-cluster-construct/multi-cluster-builder.ts b/lib/multi-cluster-construct/multi-cluster-builder.ts new file mode 100644 index 00000000..479ce49e --- /dev/null +++ b/lib/multi-cluster-construct/multi-cluster-builder.ts @@ -0,0 +1,76 @@ +import { Construct } from 'constructs'; + +// Blueprints Lib +import * as blueprints from '@aws-quickstart/eks-blueprints'; +import { ClusterSecretStoreAddon } from './cluster-secret-store-addon'; + + +export default class MultiClusterBuilderConstruct { + build(scope: Construct, id: string, account?: string, region?: string ) { + // Setup platform team + const accountID = account ?? process.env.CDK_DEFAULT_ACCOUNT! ; + const awsRegion = region ?? process.env.CDK_DEFAULT_REGION! ; + + const stackID = `${id}-blueprint`; + this.create(scope, accountID, awsRegion) + .build(scope, stackID); + } + + + create(scope: Construct, account?: string, region?: string ) { + // Setup platform team + const accountID = account ?? process.env.CDK_DEFAULT_ACCOUNT! ; + const awsRegion = region ?? process.env.CDK_DEFAULT_REGION! 
; + + const ampEndpoint = blueprints.utils.valueFromContext(scope, "conformitron.amp.endpoint", "https://aps-workspaces..amazonaws.com/workspaces//"); + + const ampAddOnProps: blueprints.AmpAddOnProps = { + ampPrometheusEndpoint: ampEndpoint, + }; + + ampAddOnProps.openTelemetryCollector = { + manifestPath: __dirname + '/resources/otel-collector-config-new.yml', + manifestParameterMap: { + logGroupName: `/aws/eks/conformitron/cluster`, + logStreamName: `$NODE_NAME`, + logRetentionDays: 30, + awsRegion: region + } + }; + + + + return blueprints.ObservabilityBuilder.builder() + .account(accountID) + .region(awsRegion) + .withAmpProps(ampAddOnProps) + .enableOpenSourcePatternAddOns() + .addOns( + new blueprints.addons.FluxCDAddOn({ + repositories:[{ + name: "eks-cloud-addons-conformance", + namespace: "flux-system", + repository: { + repoUrl: 'https://github.com/aws-samples/eks-anywhere-addons', + targetRevision: "main", + }, + values: { + }, + kustomizations: [ + {kustomizationPath: "./eks-anywhere-common/Addons/Core/Botkube"}, + {kustomizationPath: "./eks-anywhere-common/Addons/Core/Kube-Observer"}, + {kustomizationPath: "./eks-anywhere-common/Testers/"}, + {kustomizationPath: "./eks-cloud/Testers"}, + {kustomizationPath: "./eks-anywhere-common/Addons/Partner"}, + {kustomizationPath: "./eks-cloud/Partner"}, + ], + }], + }), + new ClusterSecretStoreAddon(), + new blueprints.addons.EbsCsiDriverAddOn(), + new blueprints.addons.ClusterAutoScalerAddOn() + ); + } +} + + diff --git a/lib/multi-cluster-construct/pipeline.ts b/lib/multi-cluster-construct/pipeline.ts new file mode 100644 index 00000000..7ced903d --- /dev/null +++ b/lib/multi-cluster-construct/pipeline.ts @@ -0,0 +1,149 @@ +import * as blueprints from '@aws-quickstart/eks-blueprints'; +import * as eks from 'aws-cdk-lib/aws-eks'; +import * as ec2 from 'aws-cdk-lib/aws-ec2'; +import { Construct } from 'constructs'; +import MultiClusterBuilderConstruct from './multi-cluster-builder'; +import { GrafanaMonitoringConstruct } from './grafana-monitor-builder'; +import { ClusterName, clusterMappings } from './clusterMapping'; + +/** + * Main multi-cluster deployment pipeline. + */ +export class PipelineMultiCluster { + + async buildAsync(scope: Construct) { + const accountID = process.env.ACCOUNT_ID! || process.env.CDK_DEFAULT_ACCOUNT! ; + const region = process.env.AWS_REGION! || process.env.CDK_DEFAULT_REGION!; + + const versions = blueprints.utils.valueFromContext(scope, "conformitron.versions", ["1.28","1.29","1.30"]); + + const CLUSTER_VERSIONS = versions.map((v: string) => eks.KubernetesVersion.of(v)); + + // Stages in codepipeline + const stages : blueprints.StackStage[] = []; + + const blueprintGrafanaConstruct = new GrafanaMonitoringConstruct(); + const blueprintGrafana = blueprintGrafanaConstruct.create(scope, accountID, region); + + stages.push({ + id: ClusterName.MONITORING, + stackBuilder: blueprintGrafana + .clone(region, accountID) + }); + + /* TODO: Seperate region for clusters than infra account region, + trust policy is created when pipeline is bootstrapped. + It will be helpful for enterprise customers. 
+ Similar to approach in multi-region-construct pattern + */ + + let clusterProps: blueprints.MngClusterProviderProps; + + for(const version of CLUSTER_VERSIONS) { + const blueprintBuilderX86 = new MultiClusterBuilderConstruct().create(scope, accountID, region); + + clusterProps = this.buildClusterProps( + clusterMappings[ClusterName.X86]!.amiType, + clusterMappings[ClusterName.X86]!.instanceType + ); + + const blueprintX86 = blueprintBuilderX86 + .version(version) + .clusterProvider(new blueprints.MngClusterProvider(clusterProps)) + .useDefaultSecretEncryption(true); + + stages.push({ + id: ClusterName.X86 + "-" + version.version.replace(".", "-"), + stackBuilder : blueprintX86.clone(region) + }); + + const blueprintBuilderArm = new MultiClusterBuilderConstruct().create(scope, accountID, region); + clusterProps = this.buildClusterProps( + clusterMappings[ClusterName.ARM]!.amiType, + clusterMappings[ClusterName.ARM]!.instanceType + ); + const blueprintARM = blueprintBuilderArm + .version(version) + .clusterProvider(new blueprints.MngClusterProvider(clusterProps)) + .useDefaultSecretEncryption(true); + + stages.push({ + id: ClusterName.ARM + "-" + version.version.replace(".", "-"), + stackBuilder : blueprintARM.clone(region) + }); + } + + // Only deploy lates kube version on BR Clusters + const LATEST_VERSION = CLUSTER_VERSIONS.at(CLUSTER_VERSIONS.length-1)!; + + const blueprintBuilderBrX86= new MultiClusterBuilderConstruct().create(scope, accountID, region); + + clusterProps = this.buildClusterProps( + clusterMappings[ClusterName.BR_X86]!.amiType, + clusterMappings[ClusterName.BR_X86]!.instanceType + ); + + const blueprintBrX86 = blueprintBuilderBrX86 + .version(LATEST_VERSION) + .clusterProvider(new blueprints.MngClusterProvider(clusterProps)) + .useDefaultSecretEncryption(true); + + stages.push({ + id: ClusterName.BR_X86 + "-" + LATEST_VERSION.version.replace(".", "-"), + stackBuilder : blueprintBrX86.clone(region) + }); + + const blueprintBuilderBrArm = new MultiClusterBuilderConstruct().create(scope, accountID, region); + + clusterProps = this.buildClusterProps( + clusterMappings[ClusterName.BR_ARM]!.amiType, + clusterMappings[ClusterName.BR_ARM]!.instanceType + ); + + const blueprintBottleRocketArm = blueprintBuilderBrArm + .version(LATEST_VERSION) + .clusterProvider(new blueprints.MngClusterProvider(clusterProps)) + .useDefaultSecretEncryption(true); + + stages.push({ + id: ClusterName.BR_ARM + "-" + LATEST_VERSION.version.replace(".", "-"), + stackBuilder : blueprintBottleRocketArm.clone(region) + }); + + const gitOwner = 'Howlla'; + const gitRepositoryName = 'cdk-eks-blueprints-patterns'; + + blueprints.CodePipelineStack.builder() + .application('npx ts-node bin/multi-cluster-conformitron.ts') + .name('multi-cluster-central-pipeline') + .owner(gitOwner) + .codeBuildPolicies(blueprints.DEFAULT_BUILD_POLICIES) + .repository({ + repoUrl: gitRepositoryName, + credentialsSecretName: 'github-token', + targetRevision: 'conformitronPipeline', + trigger: blueprints.GitHubTrigger.POLL + }) + .wave({ + id: "prod-test", + stages + }) + .build(scope, "multi-cluster-central-pipeline", { + env: { + account: process.env.CDK_DEFAULT_ACCOUNT, + region: region, + } + }); + } + buildClusterProps(amiType:eks.NodegroupAmiType,instanceType:ec2.InstanceType) : blueprints.MngClusterProviderProps{ + let clusterProps : blueprints.MngClusterProviderProps = { + maxSize : 2, + minSize : 1, + desiredSize: 1, + diskSize: 100, + amiType: amiType, + instanceTypes:[instanceType] + }; + return clusterProps; + } +} 
diff --git a/lib/multi-cluster-construct/resources/amp-config/alerting-rules.yml b/lib/multi-cluster-construct/resources/amp-config/alerting-rules.yml new file mode 100644 index 00000000..a5535e7c --- /dev/null +++ b/lib/multi-cluster-construct/resources/amp-config/alerting-rules.yml @@ -0,0 +1,719 @@ +groups: + - name: infra-alerts-01 + rules: + - alert: NodeNetworkInterfaceFlapping + expr: changes(node_network_up{device!~"veth.+",job="node-exporter"}[2m]) > 2 + for: 2m + labels: + severity: warning + annotations: + description: Network interface "{{ $labels.device }}" changing its up status often on node-exporter {{ $labels.namespace }}/{{ $labels.pod }} + summary: Network interface is often changing its status + - alert: NodeFilesystemSpaceFillingUp + expr: (node_filesystem_avail_bytes{fstype!="",job="node-exporter"} / node_filesystem_size_bytes{fstype!="",job="node-exporter"} * 100 < 15 and predict_linear(node_filesystem_avail_bytes{fstype!="",job="node-exporter"}[6h], 24 * 60 * 60) < 0 and node_filesystem_readonly{fstype!="",job="node-exporter"} == 0) + for: 1h + labels: + severity: warning + annotations: + description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up. + summary: Filesystem is predicted to run out of space within the next 24 hours. + - alert: NodeFilesystemSpaceFillingUp + expr: (node_filesystem_avail_bytes{fstype!="",job="node-exporter"} / node_filesystem_size_bytes{fstype!="",job="node-exporter"} * 100 < 10 and predict_linear(node_filesystem_avail_bytes{fstype!="",job="node-exporter"}[6h], 4 * 60 * 60) < 0 and node_filesystem_readonly{fstype!="",job="node-exporter"} == 0) + for: 1h + labels: + severity: critical + annotations: + description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up fast. + summary: Filesystem is predicted to run out of space within the next 4 hours. + - alert: NodeFilesystemAlmostOutOfSpace + expr: (node_filesystem_avail_bytes{fstype!="",job="node-exporter"} / node_filesystem_size_bytes{fstype!="",job="node-exporter"} * 100 < 3 and node_filesystem_readonly{fstype!="",job="node-exporter"} == 0) + for: 30m + labels: + severity: warning + annotations: + description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left. + summary: Filesystem has less than 3% space left. + - alert: NodeFilesystemAlmostOutOfSpace + expr: (node_filesystem_avail_bytes{fstype!="",job="node-exporter"} / node_filesystem_size_bytes{fstype!="",job="node-exporter"} * 100 < 5 and node_filesystem_readonly{fstype!="",job="node-exporter"} == 0) + for: 30m + labels: + severity: critical + annotations: + description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left. + summary: Filesystem has less than 5% space left. + - alert: NodeFilesystemFilesFillingUp + expr: (node_filesystem_files_free{fstype!="",job="node-exporter"} / node_filesystem_files{fstype!="",job="node-exporter"} * 100 < 40 and predict_linear(node_filesystem_files_free{fstype!="",job="node-exporter"}[6h], 24 * 60 * 60) < 0 and node_filesystem_readonly{fstype!="",job="node-exporter"} == 0) + for: 1h + labels: + severity: warning + annotations: + description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up. 
+ summary: Filesystem is predicted to run out of inodes within the next 24 hours. + - alert: NodeFilesystemFilesFillingUp + expr: (node_filesystem_files_free{fstype!="",job="node-exporter"} / node_filesystem_files{fstype!="",job="node-exporter"} * 100 < 20 and predict_linear(node_filesystem_files_free{fstype!="",job="node-exporter"}[6h], 4 * 60 * 60) < 0 and node_filesystem_readonly{fstype!="",job="node-exporter"} == 0) + for: 1h + labels: + severity: critical + annotations: + description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up fast. + summary: Filesystem is predicted to run out of inodes within the next 4 hours. + - alert: NodeFilesystemAlmostOutOfFiles + expr: (node_filesystem_files_free{fstype!="",job="node-exporter"} / node_filesystem_files{fstype!="",job="node-exporter"} * 100 < 5 and node_filesystem_readonly{fstype!="",job="node-exporter"} == 0) + for: 1h + labels: + severity: warning + annotations: + description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left. + summary: Filesystem has less than 5% inodes left. + - alert: NodeFilesystemAlmostOutOfFiles + expr: (node_filesystem_files_free{fstype!="",job="node-exporter"} / node_filesystem_files{fstype!="",job="node-exporter"} * 100 < 3 and node_filesystem_readonly{fstype!="",job="node-exporter"} == 0) + for: 1h + labels: + severity: critical + annotations: + description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left. + summary: Filesystem has less than 3% inodes left. + - alert: NodeNetworkReceiveErrs + expr: rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01 + for: 1h + labels: + severity: warning + annotations: + description: The {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} receive errors in the last two minutes. + summary: Network interface is reporting many receive errors. + - alert: NodeNetworkTransmitErrs + expr: rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01 + for: 1h + labels: + severity: warning + annotations: + description: The {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} transmit errors in the last two minutes. + summary: Network interface is reporting many transmit errors. + - alert: NodeHighNumberConntrackEntriesUsed + expr: (node_nf_conntrack_entries / node_nf_conntrack_entries_limit) > 0.75 + labels: + severity: warning + annotations: + description: The {{ $value | humanizePercentage }} of conntrack entries are used. + summary: Number of conntrack are getting close to the limit. + - alert: NodeTextFileCollectorScrapeError + expr: node_textfile_scrape_error{job="node-exporter"} == 1 + labels: + severity: warning + annotations: + description: Node Exporter text file collector failed to scrape. + summary: Node Exporter text file collector failed to scrape. + - alert: NodeClockSkewDetected + expr: (node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0) + for: 10m + labels: + severity: warning + annotations: + description: Clock on {{ $labels.instance }} is out of sync by more than 300s. Ensure NTP is configured correctly on this host. + summary: Clock skew detected. 
+ - alert: NodeClockNotSynchronising + expr: min_over_time(node_timex_sync_status[5m]) == 0 and node_timex_maxerror_seconds >= 16 + for: 10m + labels: + severity: warning + annotations: + description: Clock on {{ $labels.instance }} is not synchronising. Ensure NTP is configured on this host. + summary: Clock not synchronising. + - alert: NodeRAIDDegraded + expr: node_md_disks_required - ignoring(state) (node_md_disks{state="active"}) > 0 + for: 15m + labels: + severity: critical + annotations: + description: RAID array '{{ $labels.device }}' on {{ $labels.instance }} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically. + summary: RAID Array is degraded + - alert: NodeRAIDDiskFailure + expr: node_md_disks{state="failed"} > 0 + labels: + severity: warning + annotations: + description: At least one device in RAID array on {{ $labels.instance }} failed. Array '{{ $labels.device }}' needs attention and possibly a disk swap. + summary: Failed device in RAID array + - alert: NodeFileDescriptorLimit + expr: (node_filefd_allocated{job="node-exporter"} * 100 / node_filefd_maximum{job="node-exporter"} > 70) + for: 15m + labels: + severity: warning + annotations: + description: File descriptors limit at {{ $labels.instance }} is currently at {{ printf "%.2f" $value }}%. + summary: Kernel is predicted to exhaust file descriptors limit soon. + - alert: NodeFileDescriptorLimit + expr: (node_filefd_allocated{job="node-exporter"} * 100 / node_filefd_maximum{job="node-exporter"} > 90) + for: 15m + labels: + severity: critical + annotations: + description: File descriptors limit at {{ $labels.instance }} is currently at {{ printf "%.2f" $value }}%. + summary: Kernel is predicted to exhaust file descriptors limit soon. + - name: infra-alerts-02 + rules: + - alert: KubeNodeNotReady + expr: kube_node_status_condition{condition="Ready",job="kube-state-metrics",status="true"} == 0 + for: 15m + labels: + severity: warning + annotations: + description: The {{ $labels.node }} has been unready for more than 15 minutes. + summary: Node is not ready. + - alert: KubeNodeUnreachable + expr: (kube_node_spec_taint{effect="NoSchedule",job="kube-state-metrics",key="node.kubernetes.io/unreachable"} unless ignoring(key, value) kube_node_spec_taint{job="kube-state-metrics",key=~"ToBeDeletedByClusterAutoscaler|cloud.google.com/impending-node-termination|aws-node-termination-handler/spot-itn"}) == 1 + for: 15m + labels: + severity: warning + annotations: + description: The {{ $labels.node }} is unreachable and some workloads may be rescheduled. + summary: Node is unreachable. + - alert: KubeletTooManyPods + expr: count by(cluster, node) ((kube_pod_status_phase{job="kube-state-metrics",phase="Running"} == 1) * on(instance, pod, namespace, cluster) group_left(node) topk by(instance, pod, namespace, cluster) (1, kube_pod_info{job="kube-state-metrics"})) / max by(cluster, node) (kube_node_status_capacity{job="kube-state-metrics",resource="pods"} != 1) > 0.95 + for: 15m + labels: + severity: info + annotations: + description: Kubelet '{{ $labels.node }}' is running at {{ $value | humanizePercentage }} of its Pod capacity. + summary: Kubelet is running at capacity. 
+ - alert: KubeNodeReadinessFlapping + expr: sum by(cluster, node) (changes(kube_node_status_condition{condition="Ready",status="true"}[15m])) > 2 + for: 15m + labels: + severity: warning + annotations: + description: The readiness status of node {{ $labels.node }} has changed {{ $value }} times in the last 15 minutes. + summary: Node readiness status is flapping. + - alert: KubeletPlegDurationHigh + expr: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile{quantile="0.99"} >= 10 + for: 5m + labels: + severity: warning + annotations: + description: The Kubelet Pod Lifecycle Event Generator has a 99th percentile duration of {{ $value }} seconds on node {{ $labels.node }}. + summary: Kubelet Pod Lifecycle Event Generator is taking too long to relist. + - alert: KubeletPodStartUpLatencyHigh + expr: histogram_quantile(0.99, sum by(cluster, instance, le) (rate(kubelet_pod_worker_duration_seconds_bucket{job="kubelet"}[5m]))) * on(cluster, instance) group_left(node) kubelet_node_name{job="kubelet"} > 60 + for: 15m + labels: + severity: warning + annotations: + description: Kubelet Pod startup 99th percentile latency is {{ $value }} seconds on node {{ $labels.node }}. + summary: Kubelet Pod startup latency is too high. + - alert: KubeletClientCertificateExpiration + expr: kubelet_certificate_manager_client_ttl_seconds < 604800 + labels: + severity: warning + annotations: + description: Client certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}. + summary: Kubelet client certificate is about to expire. + - alert: KubeletClientCertificateExpiration + expr: kubelet_certificate_manager_client_ttl_seconds < 86400 + labels: + severity: critical + annotations: + description: Client certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}. + summary: Kubelet client certificate is about to expire. + - alert: KubeletServerCertificateExpiration + expr: kubelet_certificate_manager_server_ttl_seconds < 604800 + labels: + severity: warning + annotations: + description: Server certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}. + summary: Kubelet server certificate is about to expire. + - alert: KubeletServerCertificateExpiration + expr: kubelet_certificate_manager_server_ttl_seconds < 86400 + labels: + severity: critical + annotations: + description: Server certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}. + summary: Kubelet server certificate is about to expire. + - alert: KubeletClientCertificateRenewalErrors + expr: increase(kubelet_certificate_manager_client_expiration_renew_errors[5m]) > 0 + for: 15m + labels: + severity: warning + annotations: + description: Kubelet on node {{ $labels.node }} has failed to renew its client certificate ({{ $value | humanize }} errors in the last 5 minutes). + summary: Kubelet has failed to renew its client certificate. + - alert: KubeletServerCertificateRenewalErrors + expr: increase(kubelet_server_expiration_renew_errors[5m]) > 0 + for: 15m + labels: + severity: warning + annotations: + description: Kubelet on node {{ $labels.node }} has failed to renew its server certificate ({{ $value | humanize }} errors in the last 5 minutes). + summary: Kubelet has failed to renew its server certificate. + - alert: KubeletDown + expr: absent(up{job="kubelet"} == 1) + for: 15m + labels: + severity: critical + annotations: + description: Kubelet has disappeared from Prometheus target discovery. 
+ summary: Target disappeared from Prometheus target discovery. + - alert: KubeVersionMismatch + expr: count by(cluster) (count by(git_version, cluster) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"}, "git_version", "$1", "git_version", "(v[0-9]*.[0-9]*).*"))) > 1 + for: 15m + labels: + severity: warning + annotations: + description: There are {{ $value }} different semantic versions of Kubernetes components running. + summary: Different semantic versions of Kubernetes components running. + - alert: KubeClientErrors + expr: (sum by(cluster, instance, job, namespace) (rate(rest_client_requests_total{code=~"5.."}[5m])) / sum by(cluster, instance, job, namespace) (rate(rest_client_requests_total[5m]))) > 0.01 + for: 15m + labels: + severity: warning + annotations: + description: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance }}' is experiencing {{ $value | humanizePercentage }} errors.' + summary: Kubernetes API server client is experiencing errors. + - alert: KubeClientCertificateExpiration + expr: apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and on(job) histogram_quantile(0.01, sum by(job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 604800 + labels: + severity: warning + annotations: + description: A client certificate used to authenticate to kubernetes apiserver is expiring in less than 7.0 days. + summary: Client certificate is about to expire. + - alert: KubeClientCertificateExpiration + expr: apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and on(job) histogram_quantile(0.01, sum by(job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 86400 + labels: + severity: critical + annotations: + description: A client certificate used to authenticate to kubernetes apiserver is expiring in less than 24.0 hours. + summary: Client certificate is about to expire. + - alert: KubeAggregatedAPIErrors + expr: sum by(name, namespace, cluster) (increase(aggregator_unavailable_apiservice_total[10m])) > 4 + labels: + severity: warning + annotations: + description: Kubernetes aggregated API {{ $labels.name }}/{{ $labels.namespace }} has reported errors. It has appeared unavailable {{ $value | humanize }} times averaged over the past 10m. + summary: Kubernetes aggregated API has reported errors. + - name: infra-alerts-03 + rules: + - alert: KubeAggregatedAPIDown + expr: (1 - max by(name, namespace, cluster) (avg_over_time(aggregator_unavailable_apiservice[10m]))) * 100 < 85 + for: 5m + labels: + severity: warning + annotations: + description: Kubernetes aggregated API {{ $labels.name }}/{{ $labels.namespace }} has been only {{ $value | humanize }}% available over the last 10m. + summary: Kubernetes aggregated API is down. + - alert: KubeAPIDown + expr: absent(up{job="kube-admin"} == 1) + for: 15m + labels: + severity: critical + annotations: + description: KubeAPI has disappeared from Prometheus target discovery. + summary: Target disappeared from Prometheus target discovery. + - alert: KubeAPITerminatedRequests + expr: sum(rate(apiserver_request_terminations_total{job="apiserver"}[10m])) / (sum(rate(apiserver_request_total{job="apiserver"}[10m])) + sum(rate(apiserver_request_terminations_total{job="apiserver"}[10m]))) > 0.2 + for: 5m + labels: + severity: warning + annotations: + description: The kubernetes apiserver has terminated {{ $value | humanizePercentage }} of its incoming requests. 
+ summary: The kubernetes apiserver has terminated {{ $value | humanizePercentage }} of its incoming requests. + - alert: KubePersistentVolumeFillingUp + expr: (kubelet_volume_stats_available_bytes{job="kubelet",namespace=~".*"} / kubelet_volume_stats_capacity_bytes{job="kubelet",namespace=~".*"}) < 0.03 and kubelet_volume_stats_used_bytes{job="kubelet",namespace=~".*"} > 0 unless on(namespace, persistentvolumeclaim) kube_persistentvolumeclaim_access_mode{access_mode="ReadOnlyMany"} == 1 unless on(namespace, persistentvolumeclaim) kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1 + for: 1m + labels: + severity: critical + annotations: + description: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} is only {{ $value | humanizePercentage }} free. + summary: PersistentVolume is filling up. + - alert: KubePersistentVolumeFillingUp + expr: (kubelet_volume_stats_available_bytes{job="kubelet",namespace=~".*"} / kubelet_volume_stats_capacity_bytes{job="kubelet",namespace=~".*"}) < 0.15 and kubelet_volume_stats_used_bytes{job="kubelet",namespace=~".*"} > 0 and predict_linear(kubelet_volume_stats_available_bytes{job="kubelet",namespace=~".*"}[6h], 4 * 24 * 3600) < 0 unless on(namespace, persistentvolumeclaim) kube_persistentvolumeclaim_access_mode{access_mode="ReadOnlyMany"} == 1 unless on(namespace, persistentvolumeclaim) kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1 + for: 1h + labels: + severity: warning + annotations: + description: Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} is expected to fill up within four days. + summary: PersistentVolume is filling up. + - alert: KubePersistentVolumeInodesFillingUp + expr: (kubelet_volume_stats_inodes_free{job="kubelet",namespace=~".*"} / kubelet_volume_stats_inodes{job="kubelet",namespace=~".*"}) < 0.03 and kubelet_volume_stats_inodes_used{job="kubelet",namespace=~".*"} > 0 unless on(namespace, persistentvolumeclaim) kube_persistentvolumeclaim_access_mode{access_mode="ReadOnlyMany"} == 1 unless on(namespace, persistentvolumeclaim) kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1 + for: 1m + labels: + severity: critical + annotations: + description: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} only has {{ $value | humanizePercentage }} free inodes. + summary: PersistentVolumeInodes is filling up. + - alert: KubePersistentVolumeInodesFillingUp + expr: (kubelet_volume_stats_inodes_free{job="kubelet",namespace=~".*"} / kubelet_volume_stats_inodes{job="kubelet",namespace=~".*"}) < 0.15 and kubelet_volume_stats_inodes_used{job="kubelet",namespace=~".*"} > 0 and predict_linear(kubelet_volume_stats_inodes_free{job="kubelet",namespace=~".*"}[6h], 4 * 24 * 3600) < 0 unless on(namespace, persistentvolumeclaim) kube_persistentvolumeclaim_access_mode{access_mode="ReadOnlyMany"} == 1 unless on(namespace, persistentvolumeclaim) kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1 + for: 1h + labels: + severity: warning + annotations: + description: Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} is expected to run out of inodes within four days. Currently {{ $value | humanizePercentage }} of its inodes are free. + summary: PersistentVolumeInodes are filling up. 
+ - alert: KubePersistentVolumeErrors + expr: kube_persistentvolume_status_phase{job="kube-state-metrics",phase=~"Failed|Pending"} > 0 + for: 5m + labels: + severity: critical + annotations: + description: The persistent volume {{ $labels.persistentvolume }} has status {{ $labels.phase }}. + summary: PersistentVolume is having issues with provisioning. + - alert: KubeCPUOvercommit + expr: sum(namespace_cpu:kube_pod_container_resource_requests:sum) - (sum(kube_node_status_allocatable{resource="cpu"}) - max(kube_node_status_allocatable{resource="cpu"})) > 0 and (sum(kube_node_status_allocatable{resource="cpu"}) - max(kube_node_status_allocatable{resource="cpu"})) > 0 + for: 10m + labels: + severity: warning + annotations: + description: Cluster has overcommitted CPU resource requests for Pods by {{ $value }} CPU shares and cannot tolerate node failure. + summary: Cluster has overcommitted CPU resource requests. + - alert: KubeMemoryOvercommit + expr: sum(namespace_memory:kube_pod_container_resource_requests:sum) - (sum(kube_node_status_allocatable{resource="memory"}) - max(kube_node_status_allocatable{resource="memory"})) > 0 and (sum(kube_node_status_allocatable{resource="memory"}) - max(kube_node_status_allocatable{resource="memory"})) > 0 + for: 10m + labels: + severity: warning + annotations: + description: Cluster has overcommitted memory resource requests for Pods by {{ $value | humanize }} bytes and cannot tolerate node failure. + summary: Cluster has overcommitted memory resource requests. + - alert: KubeCPUQuotaOvercommit + expr: sum(min without(resource) (kube_resourcequota{job="kube-state-metrics",resource=~"(cpu|requests.cpu)",type="hard"})) / sum(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) > 1.5 + for: 5m + labels: + severity: warning + annotations: + description: Cluster has overcommitted CPU resource requests for Namespaces. + summary: Cluster has overcommitted CPU resource requests. + - alert: KubeMemoryQuotaOvercommit + expr: sum(min without(resource) (kube_resourcequota{job="kube-state-metrics",resource=~"(memory|requests.memory)",type="hard"})) / sum(kube_node_status_allocatable{job="kube-state-metrics",resource="memory"}) > 1.5 + for: 5m + labels: + severity: warning + annotations: + description: Cluster has overcommitted memory resource requests for Namespaces. + summary: Cluster has overcommitted memory resource requests. + - alert: KubeQuotaAlmostFull + expr: kube_resourcequota{job="kube-state-metrics",type="used"} / ignoring(instance, job, type) (kube_resourcequota{job="kube-state-metrics",type="hard"} > 0) > 0.9 < 1 + for: 15m + labels: + severity: info + annotations: + description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota. + summary: Namespace quota is going to be full. + - alert: KubeQuotaFullyUsed + expr: kube_resourcequota{job="kube-state-metrics",type="used"} / ignoring(instance, job, type) (kube_resourcequota{job="kube-state-metrics",type="hard"} > 0) == 1 + for: 15m + labels: + severity: info + annotations: + description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota. + summary: Namespace quota is fully used. 
+ - alert: KubeQuotaExceeded + expr: kube_resourcequota{job="kube-state-metrics",type="used"} / ignoring(instance, job, type) (kube_resourcequota{job="kube-state-metrics",type="hard"} > 0) > 1 + for: 15m + labels: + severity: warning + annotations: + description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota. + summary: Namespace quota has exceeded the limits. + - alert: CPUThrottlingHigh + expr: sum by(container, pod, namespace) (increase(container_cpu_cfs_throttled_periods_total{container!=""}[5m])) / sum by(container, pod, namespace) (increase(container_cpu_cfs_periods_total[5m])) > (25 / 100) + for: 15m + labels: + severity: info + annotations: + description: The {{ $value | humanizePercentage }} throttling of CPU in namespace {{ $labels.namespace }} for container {{ $labels.container }} in pod {{ $labels.pod }}. + summary: Processes experience elevated CPU throttling. + - alert: KubePodCrashLooping + expr: max_over_time(kube_pod_container_status_waiting_reason{job="kube-state-metrics",namespace=~".*",reason="CrashLoopBackOff"}[5m]) >= 1 + for: 15m + labels: + severity: warning + annotations: + description: Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) is in waiting state (reason:"CrashLoopBackOff"). + summary: Pod is crash looping. + - alert: KubePodNotReady + expr: sum by(namespace, pod, cluster) (max by(namespace, pod, cluster) (kube_pod_status_phase{job="kube-state-metrics",namespace=~".*",phase=~"Pending|Unknown"}) * on(namespace, pod, cluster) group_left(owner_kind) topk by(namespace, pod, cluster) (1, max by(namespace, pod, owner_kind, cluster) (kube_pod_owner{owner_kind!="Job"}))) > 0 + for: 15m + labels: + severity: warning + annotations: + description: Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) has been in a non-ready state for longer than 15 minutes. + summary: Pod has been in a non-ready state for more than 15 minutes. + - alert: KubeDeploymentGenerationMismatch + expr: kube_deployment_status_observed_generation{job="kube-state-metrics",namespace=~".*"} != kube_deployment_metadata_generation{job="kube-state-metrics",namespace=~".*"} + for: 15m + labels: + severity: warning + annotations: + description: Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment }} does not match, this indicates that the Deployment has failed but has not been rolled back. + summary: Deployment generation mismatch due to possible roll-back + - alert: KubeDeploymentReplicasMismatch + expr: (kube_deployment_spec_replicas{job="kube-state-metrics",namespace=~".*"} > kube_deployment_status_replicas_available{job="kube-state-metrics",namespace=~".*"}) and (changes(kube_deployment_status_replicas_updated{job="kube-state-metrics",namespace=~".*"}[10m]) == 0) + for: 15m + labels: + severity: warning + annotations: + description: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has not matched the expected number of replicas for longer than 15 minutes. + summary: Deployment has not matched the expected number of replicas. 
+ - name: infra-alerts-04 + rules: + - alert: KubeStatefulSetReplicasMismatch + expr: (kube_statefulset_status_replicas_ready{job="kube-state-metrics",namespace=~".*"} != kube_statefulset_status_replicas{job="kube-state-metrics",namespace=~".*"}) and (changes(kube_statefulset_status_replicas_updated{job="kube-state-metrics",namespace=~".*"}[10m]) == 0) + for: 15m + labels: + severity: warning + annotations: + description: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has not matched the expected number of replicas for longer than 15 minutes. + summary: StatefulSet has not matched the expected number of replicas. + - alert: KubeStatefulSetGenerationMismatch + expr: kube_statefulset_status_observed_generation{job="kube-state-metrics",namespace=~".*"} != kube_statefulset_metadata_generation{job="kube-state-metrics",namespace=~".*"} + for: 15m + labels: + severity: warning + annotations: + description: StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset }} does not match, this indicates that the StatefulSet has failed but has not been rolled back. + summary: StatefulSet generation mismatch due to possible roll-back + - alert: KubeStatefulSetUpdateNotRolledOut + expr: (max without(revision) (kube_statefulset_status_current_revision{job="kube-state-metrics",namespace=~".*"} unless kube_statefulset_status_update_revision{job="kube-state-metrics",namespace=~".*"}) * (kube_statefulset_replicas{job="kube-state-metrics",namespace=~".*"} != kube_statefulset_status_replicas_updated{job="kube-state-metrics",namespace=~".*"})) and (changes(kube_statefulset_status_replicas_updated{job="kube-state-metrics",namespace=~".*"}[5m]) == 0) + for: 15m + labels: + severity: warning + annotations: + description: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update has not been rolled out. + summary: StatefulSet update has not been rolled out. + - alert: KubeDaemonSetRolloutStuck + expr: ((kube_daemonset_status_current_number_scheduled{job="kube-state-metrics",namespace=~".*"} != kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics",namespace=~".*"}) or (kube_daemonset_status_number_misscheduled{job="kube-state-metrics",namespace=~".*"} != 0) or (kube_daemonset_status_updated_number_scheduled{job="kube-state-metrics",namespace=~".*"} != kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics",namespace=~".*"}) or (kube_daemonset_status_number_available{job="kube-state-metrics",namespace=~".*"} != kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics",namespace=~".*"})) and (changes(kube_daemonset_status_updated_number_scheduled{job="kube-state-metrics",namespace=~".*"}[5m]) == 0) + for: 15m + labels: + severity: warning + annotations: + description: DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} has not finished or progressed for at least 15 minutes. + summary: DaemonSet rollout is stuck. + - alert: KubeContainerWaiting + expr: sum by(namespace, pod, container, cluster) (kube_pod_container_status_waiting_reason{job="kube-state-metrics",namespace=~".*"}) > 0 + for: 1h + labels: + severity: warning + annotations: + description: Pod/{{ $labels.pod }} in namespace {{ $labels.namespace }} on container {{ $labels.container}} has been in waiting state for longer than 1 hour.
+ summary: Pod container waiting longer than 1 hour + - alert: KubeDaemonSetNotScheduled + expr: kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics",namespace=~".*"} - kube_daemonset_status_current_number_scheduled{job="kube-state-metrics",namespace=~".*"} > 0 + for: 10m + labels: + severity: warning + annotations: + description: The {{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are not scheduled. + summary: DaemonSet pods are not scheduled. + - alert: KubeDaemonSetMisScheduled + expr: kube_daemonset_status_number_misscheduled{job="kube-state-metrics",namespace=~".*"} > 0 + for: 15m + labels: + severity: warning + annotations: + description: The {{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are running where they are not supposed to run. + summary: DaemonSet pods are misscheduled. + - alert: KubeJobNotCompleted + expr: time() - max by(namespace, job_name, cluster) (kube_job_status_start_time{job="kube-state-metrics",namespace=~".*"} and kube_job_status_active{job="kube-state-metrics",namespace=~".*"} > 0) > 43200 + labels: + severity: warning + annotations: + description: Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more than {{ "43200" | humanizeDuration }} to complete. + summary: Job did not complete in time + - alert: KubeJobFailed + expr: kube_job_failed{job="kube-state-metrics",namespace=~".*"} > 0 + for: 15m + labels: + severity: warning + annotations: + description: Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete. Removing failed job after investigation should clear this alert. + summary: Job failed to complete. + - alert: KubeHpaReplicasMismatch + expr: (kube_horizontalpodautoscaler_status_desired_replicas{job="kube-state-metrics",namespace=~".*"} != kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics",namespace=~".*"}) and (kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics",namespace=~".*"} > kube_horizontalpodautoscaler_spec_min_replicas{job="kube-state-metrics",namespace=~".*"}) and (kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics",namespace=~".*"} < kube_horizontalpodautoscaler_spec_max_replicas{job="kube-state-metrics",namespace=~".*"}) and changes(kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics",namespace=~".*"}[15m]) == 0 + for: 15m + labels: + severity: warning + annotations: + description: HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} has not matched the desired number of replicas for longer than 15 minutes. + summary: HPA has not matched desired number of replicas. + - alert: KubeHpaMaxedOut + expr: kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics",namespace=~".*"} == kube_horizontalpodautoscaler_spec_max_replicas{job="kube-state-metrics",namespace=~".*"} + for: 15m + labels: + severity: warning + annotations: + description: HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} has been running at max replicas for longer than 15 minutes. + summary: HPA is running at max replicas + - alert: KubeStateMetricsListErrors + expr: (sum(rate(kube_state_metrics_list_total{job="kube-state-metrics",result="error"}[5m])) / sum(rate(kube_state_metrics_list_total{job="kube-state-metrics"}[5m]))) > 0.01 + for: 15m + labels: + severity: critical + annotations: + description: kube-state-metrics is experiencing errors at an elevated rate in list operations.
This is likely causing it to not be able to expose metrics about Kubernetes objects or at all. + summary: kube-state-metrics is experiencing errors in list operations. + - alert: KubeStateMetricsWatchErrors + expr: (sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics",result="error"}[5m])) / sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics"}[5m]))) > 0.01 + for: 15m + labels: + severity: critical + annotations: + description: kube-state-metrics is experiencing errors at an elevated rate in watch operations. This is likely causing it to not be able to expose metrics about Kubernetes objects or at all. + summary: kube-state-metrics is experiencing errors in watch operations. + - alert: KubeStateMetricsShardingMismatch + expr: stdvar(kube_state_metrics_total_shards{job="kube-state-metrics"}) != 0 + for: 15m + labels: + severity: critical + annotations: + description: kube-state-metrics pods are running with different --total-shards configuration, some Kubernetes objects may be exposed multiple times or not exposed at all. + summary: kube-state-metrics sharding is misconfigured. + - alert: KubeStateMetricsShardsMissing + expr: 2 ^ max(kube_state_metrics_total_shards{job="kube-state-metrics"}) - 1 - sum(2 ^ max by(shard_ordinal) (kube_state_metrics_shard_ordinal{job="kube-state-metrics"})) != 0 + for: 15m + labels: + severity: critical + annotations: + description: kube-state-metrics shards are missing, some Kubernetes objects are not being exposed. + summary: kube-state-metrics shards are missing. + - alert: KubeAPIErrorBudgetBurn + expr: sum(apiserver_request:burnrate1h) > (14.4 * 0.01) and sum(apiserver_request:burnrate5m) > (14.4 * 0.01) + for: 2m + labels: + long: 1h + severity: critical + short: 5m + annotations: + description: The API server is burning too much error budget. + summary: The API server is burning too much error budget. + - alert: KubeAPIErrorBudgetBurn + expr: sum(apiserver_request:burnrate6h) > (6 * 0.01) and sum(apiserver_request:burnrate30m) > (6 * 0.01) + for: 15m + labels: + long: 6h + severity: critical + short: 30m + annotations: + description: The API server is burning too much error budget. + summary: The API server is burning too much error budget. + - alert: KubeAPIErrorBudgetBurn + expr: sum(apiserver_request:burnrate1d) > (3 * 0.01) and sum(apiserver_request:burnrate2h) > (3 * 0.01) + for: 1d + labels: + long: 1d + severity: warning + short: 2h + annotations: + description: The API server is burning too much error budget. + summary: The API server is burning too much error budget. + - alert: KubeAPIErrorBudgetBurn + expr: sum(apiserver_request:burnrate3d) > (1 * 0.01) and sum(apiserver_request:burnrate6h) > (1 * 0.01) + for: 3h + labels: + long: 3d + severity: warning + short: 6h + annotations: + description: The API server is burning too much error budget. + summary: The API server is burning too much error budget. + - alert: TargetDown + expr: 100 * (count by(job, namespace, service) (up == 0) / count by(job, namespace, service) (up)) > 10 + for: 10m + labels: + severity: warning + annotations: + description: The {{ printf "%.4g" $value }}% of the {{ $labels.job }}/{{ $labels.service }} targets in {{ $labels.namespace }} namespace are down. + - name: infra-alerts-05 + rules: + - alert: Watchdog + expr: vector(1) + labels: + severity: none + annotations: + description: This is an alert meant to ensure that the entire alerting pipeline is functional.
This alert is always firing, therefore it should always be firing in Alertmanager and always fire against a receiver. There are integrations with various notification mechanisms that send a notification when this alert is not firing. For example the "DeadMansSnitch" integration in PagerDuty. + - alert: InfoInhibitor + expr: ALERTS{severity="info"} == 1 unless on(namespace) ALERTS{alertname!="InfoInhibitor",alertstate="firing",severity=~"warning|critical"} == 1 + labels: + severity: none + annotations: + description: This is an alert that is used to inhibit info alerts. By themselves, the info-level alerts are sometimes very noisy, but they are relevant when combined with other alerts. This alert fires whenever there's a severity="info" alert, and stops firing when another alert with a severity of 'warning' or 'critical' starts firing on the same namespace. This alert should be routed to a null receiver and configured to inhibit alerts with severity="info". + - alert: etcdInsufficientMembers + expr: sum by(job) (up{job=~".*etcd.*"} == bool 1) < ((count by(job) (up{job=~".*etcd.*"}) + 1) / 2) + for: 3m + labels: + severity: critical + annotations: + message: etcd cluster "{{ $labels.job }}":insufficient members ({{ $value }}). + - alert: etcdHighNumberOfLeaderChanges + expr: rate(etcd_server_leader_changes_seen_total{job=~".*etcd.*"}[15m]) > 3 + for: 15m + labels: + severity: warning + annotations: + message: etcd cluster "{{ $labels.job }}":instance {{ $labels.instance }} has seen {{ $value }} leader changes within the last hour. + - alert: etcdNoLeader + expr: etcd_server_has_leader{job=~".*etcd.*"} == 0 + for: 1m + labels: + severity: critical + annotations: + message: etcd cluster "{{ $labels.job }}":member {{ $labels.instance }} has no leader. + - alert: etcdHighNumberOfFailedGRPCRequests + expr: 100 * sum by(job, instance, grpc_service, grpc_method) (rate(grpc_server_handled_total{grpc_code!="OK",job=~".*etcd.*"}[5m])) / sum by(job, instance, grpc_service, grpc_method) (rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) > 1 + for: 10m + labels: + severity: warning + annotations: + message: etcd cluster "{{ $labels.job }}":{{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}. + - alert: etcdGRPCRequestsSlow + expr: histogram_quantile(0.99, sum by(job, instance, grpc_service, grpc_method, le) (rate(grpc_server_handling_seconds_bucket{grpc_type="unary",job=~".*etcd.*"}[5m]))) > 0.15 + for: 10m + labels: + severity: critical + annotations: + message: etcd cluster "{{ $labels.job }}":gRPC requests to {{ $labels.grpc_method }} are taking {{ $value }}s on etcd instance {{ $labels.instance }}. + - alert: etcdMemberCommunicationSlow + expr: histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{job=~".*etcd.*"}[5m])) > 0.15 + for: 10m + labels: + severity: warning + annotations: + message: etcd cluster "{{ $labels.job }}":member communication with {{ $labels.To }} is taking {{ $value }}s on etcd instance {{ $labels.instance }}. + - alert: etcdHighNumberOfFailedProposals + expr: rate(etcd_server_proposals_failed_total{job=~".*etcd.*"}[15m]) > 5 + for: 15m + labels: + severity: warning + annotations: + message: etcd cluster "{{ $labels.job }}":{{ $value }} proposal failures within the last hour on etcd instance {{ $labels.instance }}.
+ - alert: etcdHighFsyncDurations + expr: histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m])) > 0.5 + for: 10m + labels: + severity: warning + annotations: + message: etcd cluster "{{ $labels.job }}":99th percentile fsync durations are {{ $value }}s on etcd instance {{ $labels.instance }}. + - alert: etcdHighCommitDurations + expr: histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{job=~".*etcd.*"}[5m])) > 0.25 + for: 10m + labels: + severity: warning + annotations: + message: etcd cluster "{{ $labels.job }}":99th percentile commit durations {{ $value }}s on etcd instance {{ $labels.instance }}. + - alert: etcdHighNumberOfFailedHTTPRequests + expr: sum by(method) (rate(etcd_http_failed_total{code!="404",job=~".*etcd.*"}[5m])) / sum by(method) (rate(etcd_http_received_total{job=~".*etcd.*"}[5m])) > 0.01 + for: 10m + labels: + severity: warning + annotations: + message: The {{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}. + - alert: etcdHighNumberOfFailedHTTPRequests + expr: sum by(method) (rate(etcd_http_failed_total{code!="404",job=~".*etcd.*"}[5m])) / sum by(method) (rate(etcd_http_received_total{job=~".*etcd.*"}[5m])) > 0.05 + for: 10m + labels: + severity: warning + annotations: + message: The {{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}. + - alert: etcdHTTPRequestsSlow + expr: histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m])) > 0.15 + for: 10m + labels: + severity: warning + annotations: + message: etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method }} are slow. diff --git a/lib/multi-cluster-construct/resources/amp-config/apiserver/recording-rules.yml b/lib/multi-cluster-construct/resources/amp-config/apiserver/recording-rules.yml new file mode 100644 index 00000000..ccbb028a --- /dev/null +++ b/lib/multi-cluster-construct/resources/amp-config/apiserver/recording-rules.yml @@ -0,0 +1,115 @@ +groups: + - name: apiserver-monitoring + rules: + - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"2.."}[1h])) + record: code_verb:apiserver_request_total:increase1h + - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"3.."}[1h])) + record: code_verb:apiserver_request_total:increase1h + - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"4.."}[1h])) + record: code_verb:apiserver_request_total:increase1h + - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"5.."}[1h])) + record: code_verb:apiserver_request_total:increase1h + - expr: sum by (cluster,code,resource) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[5m])) + labels: + verb: read + record: code_resource:apiserver_request_total:rate5m + - expr: sum by (cluster,code,resource) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m])) + labels: + verb: write + record: code_resource:apiserver_request_total:rate5m + - expr: sum by (cluster, verb, scope, le) (increase(apiserver_request_slo_duration_seconds_bucket[1h])) + record: cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase1h + - expr: sum by (cluster, verb, scope, le)
(avg_over_time(cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase1h[30d]) + * 24 * 30) + record: cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase30d + - expr: |- + 1 - ( + ( + # write too slow + sum by (cluster) (cluster_verb_scope:apiserver_request_slo_duration_seconds_count:increase30d{verb=~"POST|PUT|PATCH|DELETE"}) + - + sum by (cluster) (cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase30d{verb=~"POST|PUT|PATCH|DELETE",le="1"}) + ) + + ( + # read too slow + sum by (cluster) (cluster_verb_scope:apiserver_request_slo_duration_seconds_count:increase30d{verb=~"LIST|GET"}) + - + ( + ( + sum by (cluster) (cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope=~"resource|",le="1"}) + or + vector(0) + ) + + + sum by (cluster) (cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope="namespace",le="5"}) + + + sum by (cluster) (cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope="cluster",le="30"}) + ) + ) + + # errors + sum by (cluster) (code:apiserver_request_total:increase30d{code=~"5.."} or vector(0)) + ) + / + sum by (cluster) (code:apiserver_request_total:increase30d) + labels: + verb: all + record: apiserver_request:availability30d + - expr: |- + 1 - ( + sum by (cluster) (cluster_verb_scope:apiserver_request_slo_duration_seconds_count:increase30d{verb=~"LIST|GET"}) + - + ( + # too slow + ( + sum by (cluster) (cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope=~"resource|",le="1"}) + or + vector(0) + ) + + + sum by (cluster) (cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope="namespace",le="5"}) + + + sum by (cluster) (cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope="cluster",le="30"}) + ) + + + # errors + sum by (cluster) (code:apiserver_request_total:increase30d{verb="read",code=~"5.."} or vector(0)) + ) + / + sum by (cluster) (code:apiserver_request_total:increase30d{verb="read"}) + labels: + verb: read + record: apiserver_request:availability30d + - expr: |- + 1 - ( + ( + # too slow + sum by (cluster) (cluster_verb_scope:apiserver_request_slo_duration_seconds_count:increase30d{verb=~"POST|PUT|PATCH|DELETE"}) + - + sum by (cluster) (cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase30d{verb=~"POST|PUT|PATCH|DELETE",le="1"}) + ) + + + # errors + sum by (cluster) (code:apiserver_request_total:increase30d{verb="write",code=~"5.."} or vector(0)) + ) + / + sum by (cluster) (code:apiserver_request_total:increase30d{verb="write"}) + labels: + verb: write + record: apiserver_request:availability30d + - expr: histogram_quantile(0.99, sum by (cluster, le, resource) (rate(apiserver_request_slo_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[5m]))) + > 0 + labels: + quantile: "0.99" + verb: read + record: cluster_quantile:apiserver_request_slo_duration_seconds:histogram_quantile + - expr: histogram_quantile(0.99, sum by (cluster, le, resource) (rate(apiserver_request_slo_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[5m]))) + > 0 + labels: + quantile: "0.99" + verb: write + record: cluster_quantile:apiserver_request_slo_duration_seconds:histogram_quantile + - 
expr: | + histogram_quantile(0.9, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod)) + labels: + quantile: "0.9" + record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile \ No newline at end of file diff --git a/lib/multi-cluster-construct/resources/amp-config/istio/alerting-rules.yml b/lib/multi-cluster-construct/resources/amp-config/istio/alerting-rules.yml new file mode 100644 index 00000000..ef9f7fcd --- /dev/null +++ b/lib/multi-cluster-construct/resources/amp-config/istio/alerting-rules.yml @@ -0,0 +1,113 @@ + groups: + - name: "istio.basic.alerting-rules" + rules: + - alert: IngressTrafficMissing + annotations: + summary: 'ingress gateway traffic missing' + description: '[Critical]: ingress gateway traffic missing, likely other monitors are misleading, check client logs' + expr: > + absent(istio_requests_total{destination_service_namespace=~"service-graph.*",reporter="source",source_workload="istio-ingressgateway"})==1 + for: 5m + - alert: IstioMetricsMissing + annotations: + summary: 'Istio Metrics missing' + description: '[Critical]: Check prometheus deployment or whether the prometheus filters are applied correctly' + expr: > + absent(istio_request_total)==1 or absent(istio_request_duration_milliseconds_bucket)==1 + for: 5m + - name: "istio.workload.alerting-rules" + rules: + - alert: HTTP5xxRateHigh + annotations: + summary: '5xx rate too high' + description: 'The HTTP 5xx errors rate higher than 0.05 in 5 mins' + expr: > + sum(irate(istio_requests_total{reporter="destination", response_code=~"5.*"}[5m])) / sum(irate(istio_requests_total{reporter="destination"}[5m])) > 0.05 + for: 5m + - alert: WorkloadLatencyP99High + expr: histogram_quantile(0.99, sum(irate(istio_request_duration_milliseconds_bucket{source_workload=~"svc.*"}[5m])) by (source_workload,namespace, le)) > 160 + for: 10m + annotations: + description: 'The workload request latency P99 > 160ms ' + message: "Request duration has slowed down for workload: {{`{{$labels.source_workload}}`}} in namespace: {{`{{$labels.namespace}}`}}. Response duration is {{`{{$value}}`}} milliseconds" + - alert: IngressLatencyP99High + expr: histogram_quantile(0.99, sum(irate(istio_request_duration_milliseconds_bucket{source_workload=~"istio.*"}[5m])) by (source_workload,namespace, le)) > 250 + for: 10m + annotations: + description: 'The ingress latency P99 > 250ms ' + message: "Request duration has slowed down for ingress: {{`{{$labels.source_workload}}`}} in namespace: {{`{{$labels.namespace}}`}}. 
Response duration is {{`{{$value}}`}} milliseconds" + - name: "istio.infra.alerting-rules" + rules: + - alert: ProxyContainerCPUUsageHigh + expr: (sum(rate(container_cpu_usage_seconds_total{namespace!="kube-system", container=~"istio-proxy", namespace!=""}[5m])) BY (namespace, pod, container) * 100) > 80 + for: 5m + annotations: + summary: "Proxy Container CPU usage (namespace {{ $labels.namespace }}) (pod {{ $labels.pod }}) (container {{ $labels.container }}) VALUE = {{ $value }}\n" + description: "Proxy Container CPU usage is above 80%" + - alert: ProxyContainerMemoryUsageHigh + expr: (sum(container_memory_working_set_bytes{namespace!="kube-system", container=~"istio-proxy", namespace!=""}) BY (container, pod, namespace) / (sum(container_spec_memory_limit_bytes{namespace!="kube-system", container!="POD"}) BY (container, pod, namespace) > 0)* 100) > 80 + for: 5m + annotations: + summary: "Proxy Container Memory usage (namespace {{ $labels.namespace }}) (pod {{ $labels.pod }}) (container {{ $labels.container }}) VALUE = {{ $value }}\n" + description: "Proxy Container Memory usage is above 80%" + - alert: IngressMemoryUsageIncreaseRateHigh + expr: avg(deriv(container_memory_working_set_bytes{container=~"istio-proxy",namespace="istio-system"}[60m])) > 200 + for: 180m + annotations: + summary: "Ingress proxy Memory change rate, VALUE = {{ $value }}\n" + description: "Ingress proxy Memory Usage increases more than 200 Bytes/sec" + - alert: IstiodContainerCPUUsageHigh + expr: (sum(rate(container_cpu_usage_seconds_total{namespace="istio-system", container="discovery"}[5m])) BY (pod) * 100) > 80 + for: 5m + annotations: + summary: "Istiod Container CPU usage (namespace {{ $labels.namespace }}) (pod {{ $labels.pod }}) (container {{ $labels.container }}) VALUE = {{ $value }}\n" + description: "Istiod Container CPU usage is above 80%" + - alert: IstiodMemoryUsageHigh + expr: (sum(container_memory_working_set_bytes{namespace="istio-system", container="discovery"}) BY (pod) / (sum(container_spec_memory_limit_bytes{namespace="istio-system", container="discovery"}) BY (pod) > 0)* 100) > 80 + for: 5m + annotations: + summary: "Istiod Container Memory usage (namespace {{ $labels.namespace }}) (pod {{ $labels.pod }}) (container {{ $labels.container }}) VALUE = {{ $value }}\n" + description: "Istiod Container Memory usage is above 80%" + - alert: IstiodMemoryUsageIncreaseRateHigh + expr: sum(deriv(container_memory_working_set_bytes{namespace="istio-system",pod=~"istiod-.*"}[60m])) > 1000 + for: 300m + annotations: + summary: "Istiod Container Memory usage increase rate high, VALUE = {{ $value }}\n" + description: "Istiod Container Memory usage increases more than 1k Bytes/sec" + - name: "istio.controlplane.alerting-rules" + rules: + - alert: IstiodxdsPushErrorsHigh + annotations: + summary: 'istiod push errors are too high' + description: 'istiod push error rate is higher than 0.05' + expr: > + sum(irate(pilot_xds_push_errors{app="istiod"}[5m])) / sum(irate(pilot_xds_pushes{app="istiod"}[5m])) > 0.05 + for: 5m + - alert: IstiodxdsRejectHigh + annotations: + summary: 'istiod rejects rate is too high' + description: 'istiod rejects rate is higher than 0.05' + expr: > + sum(irate(pilot_total_xds_rejects{app="istiod"}[5m])) / sum(irate(pilot_xds_pushes{app="istiod"}[5m])) > 0.05 + for: 5m + - alert: IstiodContainerNotReady + annotations: + summary: 'istiod container not ready' + description: 'container: discovery not running' + expr: > + kube_pod_container_status_running{namespace="istio-system",
container="discovery", component=""} == 0 + for: 5m + - alert: IstiodUnavailableReplica + annotations: + summary: 'Istiod unavailable pod' + description: 'Istiod unavailable replica > 0' + expr: > + kube_deployment_status_replicas_unavailable{deployment="istiod", component=""} > 0 + for: 5m + - alert: Ingress200RateLow + annotations: + summary: 'ingress gateway 200 rate drops' + description: 'The expected rate is 100 per ns, the limit is set based on 15ns' + expr: > + sum(rate(istio_requests_total{reporter="source", source_workload="istio-ingressgateway",response_code="200",destination_service_namespace=~"service-graph.*"}[5m])) < 1490 + for: 30m \ No newline at end of file diff --git a/lib/multi-cluster-construct/resources/amp-config/istio/recording-rules.yml b/lib/multi-cluster-construct/resources/amp-config/istio/recording-rules.yml new file mode 100644 index 00000000..c2908934 --- /dev/null +++ b/lib/multi-cluster-construct/resources/amp-config/istio/recording-rules.yml @@ -0,0 +1,59 @@ + groups: + - name: "istio.recording-rules" + interval: 5s + rules: + - record: "workload:istio_requests_total" + expr: | + sum without(instance, kubernetes_namespace, kubernetes_pod_name) (istio_requests_total) + + - record: "workload:istio_request_duration_milliseconds_count" + expr: | + sum without(instance, kubernetes_namespace, kubernetes_pod_name) (istio_request_duration_milliseconds_count) + + - record: "workload:istio_request_duration_milliseconds_sum" + expr: | + sum without(instance, kubernetes_namespace, kubernetes_pod_name) (istio_request_duration_milliseconds_sum) + + - record: "workload:istio_request_duration_milliseconds_bucket" + expr: | + sum without(instance, kubernetes_namespace, kubernetes_pod_name) (istio_request_duration_milliseconds_bucket) + + - record: "workload:istio_request_bytes_count" + expr: | + sum without(instance, kubernetes_namespace, kubernetes_pod_name) (istio_request_bytes_count) + + - record: "workload:istio_request_bytes_sum" + expr: | + sum without(instance, kubernetes_namespace, kubernetes_pod_name) (istio_request_bytes_sum) + + - record: "workload:istio_request_bytes_bucket" + expr: | + sum without(instance, kubernetes_namespace, kubernetes_pod_name) (istio_request_bytes_bucket) + + - record: "workload:istio_response_bytes_count" + expr: | + sum without(instance, kubernetes_namespace, kubernetes_pod_name) (istio_response_bytes_count) + + - record: "workload:istio_response_bytes_sum" + expr: | + sum without(instance, kubernetes_namespace, kubernetes_pod_name) (istio_response_bytes_sum) + + - record: "workload:istio_response_bytes_bucket" + expr: | + sum without(instance, kubernetes_namespace, kubernetes_pod_name) (istio_response_bytes_bucket) + + - record: "workload:istio_tcp_sent_bytes_total" + expr: | + sum without(instance, kubernetes_namespace, kubernetes_pod_name) (istio_tcp_sent_bytes_total) + + - record: "workload:istio_tcp_received_bytes_total" + expr: | + sum without(instance, kubernetes_namespace, kubernetes_pod_name) (istio_tcp_received_bytes_total) + + - record: "workload:istio_tcp_connections_opened_total" + expr: | + sum without(instance, kubernetes_namespace, kubernetes_pod_name) (istio_tcp_connections_opened_total) + + - record: "workload:istio_tcp_connections_closed_total" + expr: | + sum without(instance, kubernetes_namespace, kubernetes_pod_name) (istio_tcp_connections_closed_total) \ No newline at end of file diff --git a/lib/multi-cluster-construct/resources/amp-config/java/alerting-rules.yml 
b/lib/multi-cluster-construct/resources/amp-config/java/alerting-rules.yml new file mode 100644 index 00000000..10c1c8e0 --- /dev/null +++ b/lib/multi-cluster-construct/resources/amp-config/java/alerting-rules.yml @@ -0,0 +1,11 @@ +groups: + - name: default-alert + rules: + - alert: metric:alerting_rule + expr: jvm_memory_bytes_used{job="java", area="heap"} / jvm_memory_bytes_max * 100 > 80 + for: 1m + labels: + severity: warning + annotations: + summary: "JVM heap warning" + description: "JVM heap of instance `{{$labels.instance}}` from application `{{$labels.application}}` is above 80% for one minute. (current=`{{$value}}%`)" diff --git a/lib/multi-cluster-construct/resources/amp-config/java/recording-rules.yml b/lib/multi-cluster-construct/resources/amp-config/java/recording-rules.yml new file mode 100644 index 00000000..fd2feddf --- /dev/null +++ b/lib/multi-cluster-construct/resources/amp-config/java/recording-rules.yml @@ -0,0 +1,5 @@ +groups: + - name: default-metric + rules: + - record: metric:recording_rule + expr: avg(rate(container_cpu_usage_seconds_total[5m])) diff --git a/lib/multi-cluster-construct/resources/amp-config/nginx/alerting-rules.yml b/lib/multi-cluster-construct/resources/amp-config/nginx/alerting-rules.yml new file mode 100644 index 00000000..aa03da81 --- /dev/null +++ b/lib/multi-cluster-construct/resources/amp-config/nginx/alerting-rules.yml @@ -0,0 +1,31 @@ +groups: + - name: Nginx-HTTP-4xx-error-rate + rules: + - alert: metric:alerting_rule + expr: sum(rate(nginx_http_requests_total{status=~"^4.."}[1m])) / sum(rate(nginx_http_requests_total[1m])) * 100 > 5 + for: 1m + labels: + severity: critical + annotations: + summary: Nginx high HTTP 4xx error rate (instance {{ $labels.instance }}) + description: "Too many HTTP requests with status 4xx (> 5%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + - name: Nginx-HTTP-5xx-error-rate + rules: + - alert: metric:alerting_rule + expr: sum(rate(nginx_http_requests_total{status=~"^5.."}[1m])) / sum(rate(nginx_http_requests_total[1m])) * 100 > 5 + for: 1m + labels: + severity: critical + annotations: + summary: Nginx high HTTP 5xx error rate (instance {{ $labels.instance }}) + description: "Too many HTTP requests with status 5xx (> 5%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + - name: Nginx-high-latency + rules: + - alert: metric:alerting_rule + expr: histogram_quantile(0.99, sum(rate(nginx_http_request_duration_seconds_bucket[2m])) by (host, node)) > 3 + for: 2m + labels: + severity: warning + annotations: + summary: Nginx latency high (instance {{ $labels.instance }}) + description: "Nginx p99 latency is higher than 3 seconds\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" \ No newline at end of file diff --git a/lib/multi-cluster-construct/resources/amp-config/recording-rules.yml b/lib/multi-cluster-construct/resources/amp-config/recording-rules.yml new file mode 100644 index 00000000..bd74619b --- /dev/null +++ b/lib/multi-cluster-construct/resources/amp-config/recording-rules.yml @@ -0,0 +1,229 @@ +groups: + - name: infra-rules-01 + rules: + - record: "node_namespace_pod:kube_pod_info:" + expr: topk by(cluster, namespace, pod) (1, max by(cluster, node, namespace, pod) (label_replace(kube_pod_info{job="kube-state-metrics",node!=""}, "pod", "$1", "pod", "(.*)"))) + - record: node:node_num_cpu:sum + expr: count by(cluster, node) (sum by(node, cpu) (node_cpu_seconds_total{job="node-exporter"} * on(namespace, pod) group_left(node) topk by(namespace, pod) (1, node_namespace_pod:kube_pod_info:))) + - record: 
:node_memory_MemAvailable_bytes:sum + expr: sum by(cluster) (node_memory_MemAvailable_bytes{job="node-exporter"} or (node_memory_Buffers_bytes{job="node-exporter"} + node_memory_Cached_bytes{job="node-exporter"} + node_memory_MemFree_bytes{job="node-exporter"} + node_memory_Slab_bytes{job="node-exporter"})) + - record: cluster:node_cpu:ratio_rate5m + expr: sum by (cluster) (rate(node_cpu_seconds_total{job="node-exporter",mode!="idle",mode!="iowait",mode!="steal"}[5m])) / count by (cluster) (sum by(cluster, instance, cpu) (node_cpu_seconds_total{job="node-exporter"})) + - record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile + expr: histogram_quantile(0.99, sum by(cluster, instance, le) (rate(kubelet_pleg_relist_duration_seconds_bucket[5m])) * on(cluster, instance) group_left(node) kubelet_node_name{job="kubelet"}) + labels: + quantile: 0.99 + - record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile + expr: histogram_quantile(0.9, sum by(cluster, instance, le) (rate(kubelet_pleg_relist_duration_seconds_bucket[5m])) * on(cluster, instance) group_left(node) kubelet_node_name{job="kubelet"}) + labels: + quantile: 0.9 + - record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile + expr: histogram_quantile(0.5, sum by(cluster, instance, le) (rate(kubelet_pleg_relist_duration_seconds_bucket[5m])) * on(cluster, instance) group_left(node) kubelet_node_name{job="kubelet"}) + labels: + quantile: 0.5 + - record: instance:node_num_cpu:sum + expr: count without(cpu, mode) (node_cpu_seconds_total{job="node-exporter",mode="idle"}) + - record: instance:node_cpu_utilisation:rate5m + expr: 1 - avg without(cpu) (sum without(mode) (rate(node_cpu_seconds_total{job="node-exporter",mode=~"idle|iowait|steal"}[5m]))) + - record: instance:node_load1_per_cpu:ratio + expr: (node_load1{job="node-exporter"} / instance:node_num_cpu:sum{job="node-exporter"}) + - record: instance:node_memory_utilisation:ratio + expr: 1 - ((node_memory_MemAvailable_bytes{job="node-exporter"} or (node_memory_Buffers_bytes{job="node-exporter"} + node_memory_Cached_bytes{job="node-exporter"} + node_memory_MemFree_bytes{job="node-exporter"} + node_memory_Slab_bytes{job="node-exporter"})) / node_memory_MemTotal_bytes{job="node-exporter"}) + - record: instance:node_vmstat_pgmajfault:rate5m + expr: rate(node_vmstat_pgmajfault{job="node-exporter"}[5m]) + - record: instance_device:node_disk_io_time_seconds:rate5m + expr: rate(node_disk_io_time_seconds_total{device=~"mmcblk.p.+|.*nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+",job="node-exporter"}[5m]) + - record: instance_device:node_disk_io_time_weighted_seconds:rate5m + expr: rate(node_disk_io_time_weighted_seconds_total{device=~"mmcblk.p.+|.*nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+",job="node-exporter"}[5m]) + - record: instance:node_network_receive_bytes_excluding_lo:rate5m + expr: sum without(device) (rate(node_network_receive_bytes_total{device!="lo",job="node-exporter"}[5m])) + - record: instance:node_network_transmit_bytes_excluding_lo:rate5m + expr: sum without(device) (rate(node_network_transmit_bytes_total{device!="lo",job="node-exporter"}[5m])) + - record: instance:node_network_receive_drop_excluding_lo:rate5m + expr: sum without(device) (rate(node_network_receive_drop_total{device!="lo",job="node-exporter"}[5m])) + - record: instance:node_network_transmit_drop_excluding_lo:rate5m + expr: sum without(device) (rate(node_network_transmit_drop_total{device!="lo",job="node-exporter"}[5m])) + - record: 
cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile + expr: histogram_quantile(0.99, sum without(instance, pod) (rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m]))) + labels: + quantile: 0.99 + - record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile + expr: histogram_quantile(0.99, sum without(instance, pod) (rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m]))) + labels: + quantile: 0.99 + - name: infra-rules-02 + rules: + - record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile + expr: histogram_quantile(0.99, sum without(instance, pod) (rate(scheduler_binding_duration_seconds_bucket{job="kube-scheduler"}[5m]))) + labels: + quantile: 0.99 + - record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile + expr: histogram_quantile(0.9, sum without(instance, pod) (rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m]))) + labels: + quantile: 0.9 + - record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile + expr: histogram_quantile(0.9, sum without(instance, pod) (rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m]))) + labels: + quantile: 0.9 + - record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile + expr: histogram_quantile(0.9, sum without(instance, pod) (rate(scheduler_binding_duration_seconds_bucket{job="kube-scheduler"}[5m]))) + labels: + quantile: 0.9 + - record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile + expr: histogram_quantile(0.5, sum without(instance, pod) (rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m]))) + labels: + quantile: 0.5 + - record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile + expr: histogram_quantile(0.5, sum without(instance, pod) (rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m]))) + labels: + quantile: 0.5 + - record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile + expr: histogram_quantile(0.5, sum without(instance, pod) (rate(scheduler_binding_duration_seconds_bucket{job="kube-scheduler"}[5m]))) + labels: + quantile: 0.5 + - record: instance:node_cpu:rate:sum + expr: sum by(instance) (rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[3m])) + - record: instance:node_network_receive_bytes:rate:sum + expr: sum by(instance) (rate(node_network_receive_bytes_total[3m])) + - record: instance:node_network_transmit_bytes:rate:sum + expr: sum by(instance) (rate(node_network_transmit_bytes_total[3m])) + - record: instance:node_cpu:ratio + expr: sum without(cpu, mode) (rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[5m])) / on(instance) group_left() count by(instance) (sum by(instance, cpu) (node_cpu_seconds_total)) + - record: cluster:node_cpu:sum_rate5m + expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[5m])) + - record: cluster:node_cpu:ratio + expr: cluster:node_cpu:sum_rate5m / count(sum by(instance, cpu) (node_cpu_seconds_total)) + - record: count:up1 + expr: count without(instance, pod, node) (up == 1) + - record: count:up0 + expr: count without(instance, pod, node) (up == 0) + - record: cluster_quantile:apiserver_request_slo_duration_seconds:histogram_quantile + expr: histogram_quantile(0.99, sum by(cluster, le, resource) 
(rate(apiserver_request_slo_duration_seconds_bucket{job="apiserver",subresource!~"proxy|attach|log|exec|portforward",verb=~"LIST|GET"}[5m]))) > 0 + labels: + quantile: 0.99 + verb: read + - record: cluster_quantile:apiserver_request_slo_duration_seconds:histogram_quantile + expr: histogram_quantile(0.99, sum by(cluster, le, resource) (rate(apiserver_request_slo_duration_seconds_bucket{job="apiserver",subresource!~"proxy|attach|log|exec|portforward",verb=~"POST|PUT|PATCH|DELETE"}[5m]))) > 0 + labels: + quantile: 0.99 + verb: write + - record: apiserver_request:burnrate1d + expr: ((sum by(cluster) (rate(apiserver_request_slo_duration_seconds_count{job="apiserver",subresource!~"proxy|attach|log|exec|portforward",verb=~"LIST|GET"}[1d])) - ((sum by(cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="apiserver",le="1",scope=~"resource|",subresource!~"proxy|attach|log|exec|portforward",verb=~"LIST|GET"}[1d])) or vector(0)) + sum by(cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="apiserver",le="5",scope="namespace",subresource!~"proxy|attach|log|exec|portforward",verb=~"LIST|GET"}[1d])) + sum by(cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="apiserver",le="30",scope="cluster",subresource!~"proxy|attach|log|exec|portforward",verb=~"LIST|GET"}[1d])))) + sum by(cluster) (rate(apiserver_request_total{code=~"5..",job="apiserver",verb=~"LIST|GET"}[1d]))) / sum by(cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[1d])) + labels: + verb: read + - record: apiserver_request:burnrate1h + expr: ((sum by(cluster) (rate(apiserver_request_slo_duration_seconds_count{job="apiserver",subresource!~"proxy|attach|log|exec|portforward",verb=~"LIST|GET"}[1h])) - ((sum by(cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="apiserver",le="1",scope=~"resource|",subresource!~"proxy|attach|log|exec|portforward",verb=~"LIST|GET"}[1h])) or vector(0)) + sum by(cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="apiserver",le="5",scope="namespace",subresource!~"proxy|attach|log|exec|portforward",verb=~"LIST|GET"}[1h])) + sum by(cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="apiserver",le="30",scope="cluster",subresource!~"proxy|attach|log|exec|portforward",verb=~"LIST|GET"}[1h])))) + sum by(cluster) (rate(apiserver_request_total{code=~"5..",job="apiserver",verb=~"LIST|GET"}[1h]))) / sum by(cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[1h])) + labels: + verb: read + - record: apiserver_request:burnrate2h + expr: ((sum by(cluster) (rate(apiserver_request_slo_duration_seconds_count{job="apiserver",subresource!~"proxy|attach|log|exec|portforward",verb=~"LIST|GET"}[2h])) - ((sum by(cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="apiserver",le="1",scope=~"resource|",subresource!~"proxy|attach|log|exec|portforward",verb=~"LIST|GET"}[2h])) or vector(0)) + sum by(cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="apiserver",le="5",scope="namespace",subresource!~"proxy|attach|log|exec|portforward",verb=~"LIST|GET"}[2h])) + sum by(cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="apiserver",le="30",scope="cluster",subresource!~"proxy|attach|log|exec|portforward",verb=~"LIST|GET"}[2h])))) + sum by(cluster) (rate(apiserver_request_total{code=~"5..",job="apiserver",verb=~"LIST|GET"}[2h]))) / sum by(cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[2h])) + labels: + verb: read + - name: infra-rules-03 + rules: + - record: 
apiserver_request:burnrate30m + expr: ((sum by(cluster) (rate(apiserver_request_slo_duration_seconds_count{job="apiserver",subresource!~"proxy|attach|log|exec|portforward",verb=~"LIST|GET"}[30m])) - ((sum by(cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="apiserver",le="1",scope=~"resource|",subresource!~"proxy|attach|log|exec|portforward",verb=~"LIST|GET"}[30m])) or vector(0)) + sum by(cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="apiserver",le="5",scope="namespace",subresource!~"proxy|attach|log|exec|portforward",verb=~"LIST|GET"}[30m])) + sum by(cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="apiserver",le="30",scope="cluster",subresource!~"proxy|attach|log|exec|portforward",verb=~"LIST|GET"}[30m])))) + sum by(cluster) (rate(apiserver_request_total{code=~"5..",job="apiserver",verb=~"LIST|GET"}[30m]))) / sum by(cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[30m])) + labels: + verb: read + - record: apiserver_request:burnrate3d + expr: ((sum by(cluster) (rate(apiserver_request_slo_duration_seconds_count{job="apiserver",subresource!~"proxy|attach|log|exec|portforward",verb=~"LIST|GET"}[3d])) - ((sum by(cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="apiserver",le="1",scope=~"resource|",subresource!~"proxy|attach|log|exec|portforward",verb=~"LIST|GET"}[3d])) or vector(0)) + sum by(cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="apiserver",le="5",scope="namespace",subresource!~"proxy|attach|log|exec|portforward",verb=~"LIST|GET"}[3d])) + sum by(cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="apiserver",le="30",scope="cluster",subresource!~"proxy|attach|log|exec|portforward",verb=~"LIST|GET"}[3d])))) + sum by(cluster) (rate(apiserver_request_total{code=~"5..",job="apiserver",verb=~"LIST|GET"}[3d]))) / sum by(cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[3d])) + labels: + verb: read + - record: apiserver_request:burnrate5m + expr: ((sum by(cluster) (rate(apiserver_request_slo_duration_seconds_count{job="apiserver",subresource!~"proxy|attach|log|exec|portforward",verb=~"LIST|GET"}[5m])) - ((sum by(cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="apiserver",le="1",scope=~"resource|",subresource!~"proxy|attach|log|exec|portforward",verb=~"LIST|GET"}[5m])) or vector(0)) + sum by(cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="apiserver",le="5",scope="namespace",subresource!~"proxy|attach|log|exec|portforward",verb=~"LIST|GET"}[5m])) + sum by(cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="apiserver",le="30",scope="cluster",subresource!~"proxy|attach|log|exec|portforward",verb=~"LIST|GET"}[5m])))) + sum by(cluster) (rate(apiserver_request_total{code=~"5..",job="apiserver",verb=~"LIST|GET"}[5m]))) / sum by(cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[5m])) + labels: + verb: read + - record: apiserver_request:burnrate6h + expr: ((sum by(cluster) (rate(apiserver_request_slo_duration_seconds_count{job="apiserver",subresource!~"proxy|attach|log|exec|portforward",verb=~"LIST|GET"}[6h])) - ((sum by(cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="apiserver",le="1",scope=~"resource|",subresource!~"proxy|attach|log|exec|portforward",verb=~"LIST|GET"}[6h])) or vector(0)) + sum by(cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="apiserver",le="5",scope="namespace",subresource!~"proxy|attach|log|exec|portforward",verb=~"LIST|GET"}[6h])) 
+ sum by(cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="apiserver",le="30",scope="cluster",subresource!~"proxy|attach|log|exec|portforward",verb=~"LIST|GET"}[6h])))) + sum by(cluster) (rate(apiserver_request_total{code=~"5..",job="apiserver",verb=~"LIST|GET"}[6h]))) / sum by(cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[6h])) + labels: + verb: read + - record: apiserver_request:burnrate1d + expr: ((sum by(cluster) (rate(apiserver_request_slo_duration_seconds_count{job="apiserver",subresource!~"proxy|attach|log|exec|portforward",verb=~"POST|PUT|PATCH|DELETE"}[1d])) - sum by(cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="apiserver",le="1",subresource!~"proxy|attach|log|exec|portforward",verb=~"POST|PUT|PATCH|DELETE"}[1d]))) + sum by(cluster) (rate(apiserver_request_total{code=~"5..",job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1d]))) / sum by(cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1d])) + labels: + verb: read + - record: apiserver_request:burnrate1d + expr: ((sum by(cluster) (rate(apiserver_request_slo_duration_seconds_count{job="apiserver",subresource!~"proxy|attach|log|exec|portforward",verb=~"LIST|GET"}[1d])) - ((sum by(cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="apiserver",le="1",scope=~"resource|",subresource!~"proxy|attach|log|exec|portforward",verb=~"LIST|GET"}[1d])) or vector(0)) + sum by(cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="apiserver",le="5",scope="namespace",subresource!~"proxy|attach|log|exec|portforward",verb=~"LIST|GET"}[1d])) + sum by(cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="apiserver",le="30",scope="cluster",subresource!~"proxy|attach|log|exec|portforward",verb=~"LIST|GET"}[1d])))) + sum by(cluster) (rate(apiserver_request_total{code=~"5..",job="apiserver",verb=~"LIST|GET"}[1d]))) / sum by(cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[1d])) + labels: + verb: write + - record: apiserver_request:burnrate1h + expr: ((sum by(cluster) (rate(apiserver_request_slo_duration_seconds_count{job="apiserver",subresource!~"proxy|attach|log|exec|portforward",verb=~"POST|PUT|PATCH|DELETE"}[1h])) - sum by(cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="apiserver",le="1",subresource!~"proxy|attach|log|exec|portforward",verb=~"POST|PUT|PATCH|DELETE"}[1h]))) + sum by(cluster) (rate(apiserver_request_total{code=~"5..",job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1h]))) / sum by(cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1h])) + labels: + verb: write + - record: apiserver_request:burnrate2h + expr: ((sum by(cluster) (rate(apiserver_request_slo_duration_seconds_count{job="apiserver",subresource!~"proxy|attach|log|exec|portforward",verb=~"POST|PUT|PATCH|DELETE"}[2h])) - sum by(cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="apiserver",le="1",subresource!~"proxy|attach|log|exec|portforward",verb=~"POST|PUT|PATCH|DELETE"}[2h]))) + sum by(cluster) (rate(apiserver_request_total{code=~"5..",job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[2h]))) / sum by(cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[2h])) + labels: + verb: write + - record: apiserver_request:burnrate30m + expr: ((sum by(cluster) (rate(apiserver_request_slo_duration_seconds_count{job="apiserver",subresource!~"proxy|attach|log|exec|portforward",verb=~"POST|PUT|PATCH|DELETE"}[30m])) - sum by(cluster) 
(rate(apiserver_request_slo_duration_seconds_bucket{job="apiserver",le="1",subresource!~"proxy|attach|log|exec|portforward",verb=~"POST|PUT|PATCH|DELETE"}[30m]))) + sum by(cluster) (rate(apiserver_request_total{code=~"5..",job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[30m]))) / sum by(cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[30m])) + labels: + verb: write + - record: apiserver_request:burnrate3d + expr: ((sum by(cluster) (rate(apiserver_request_slo_duration_seconds_count{job="apiserver",subresource!~"proxy|attach|log|exec|portforward",verb=~"POST|PUT|PATCH|DELETE"}[3d])) - sum by(cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="apiserver",le="1",subresource!~"proxy|attach|log|exec|portforward",verb=~"POST|PUT|PATCH|DELETE"}[3d]))) + sum by(cluster) (rate(apiserver_request_total{code=~"5..",job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[3d]))) / sum by(cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[3d])) + labels: + verb: write + - record: apiserver_request:burnrate5m + expr: ((sum by(cluster) (rate(apiserver_request_slo_duration_seconds_count{job="apiserver",subresource!~"proxy|attach|log|exec|portforward",verb=~"POST|PUT|PATCH|DELETE"}[5m])) - sum by(cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="apiserver",le="1",subresource!~"proxy|attach|log|exec|portforward",verb=~"POST|PUT|PATCH|DELETE"}[5m]))) + sum by(cluster) (rate(apiserver_request_total{code=~"5..",job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m]))) / sum by(cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m])) + labels: + verb: write + - record: apiserver_request:burnrate6h + expr: ((sum by(cluster) (rate(apiserver_request_slo_duration_seconds_count{job="apiserver",subresource!~"proxy|attach|log|exec|portforward",verb=~"POST|PUT|PATCH|DELETE"}[6h])) - sum by(cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="apiserver",le="1",subresource!~"proxy|attach|log|exec|portforward",verb=~"POST|PUT|PATCH|DELETE"}[6h]))) + sum by(cluster) (rate(apiserver_request_total{code=~"5..",job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[6h]))) / sum by(cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[6h])) + labels: + verb: write + - record: code_verb:apiserver_request_total:increase30d + expr: avg_over_time(code_verb:apiserver_request_total:increase1h[30d]) * 24 * 30 + - record: code:apiserver_request_total:increase30d + expr: sum by(cluster, code) (code_verb:apiserver_request_total:increase30d{verb=~"LIST|GET"}) + labels: + verb: read + - record: code:apiserver_request_total:increase30d + expr: sum by(cluster, code) (code_verb:apiserver_request_total:increase30d{verb=~"POST|PUT|PATCH|DELETE"}) + labels: + verb: write + - record: cluster_verb_scope:apiserver_request_slo_duration_seconds_count:increase1h + expr: sum by(cluster, verb, scope) (increase(apiserver_request_slo_duration_seconds_count[1h])) + - record: cluster_verb_scope:apiserver_request_slo_duration_seconds_count:increase30d + expr: sum by(cluster, verb, scope) (avg_over_time(cluster_verb_scope:apiserver_request_slo_duration_seconds_count:increase1h[30d]) * 24 * 30) + - record: node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate + expr: sum by(cluster, namespace, pod, container) (irate(container_cpu_usage_seconds_total{image!="",job="kubelet"}[5m])) * on(cluster, namespace, pod) group_left(node) topk by(cluster, namespace, pod) (1, max by(cluster, 
namespace, pod, node) (kube_pod_info{node!=""})) + - record: node_namespace_pod_container:container_memory_working_set_bytes + expr: container_memory_working_set_bytes{image!="",job="kubelet"} * on(namespace, pod) group_left(node) topk by(namespace, pod) (1, max by(namespace, pod, node) (kube_pod_info{node!=""})) + - record: node_namespace_pod_container:container_memory_rss + expr: container_memory_rss{image!="",job="kubelet"} * on(namespace, pod) group_left(node) topk by(namespace, pod) (1, max by(namespace, pod, node) (kube_pod_info{node!=""})) + - name: infra-rules-04 + rules: + - record: node_namespace_pod_container:container_memory_cache + expr: container_memory_cache{image!="",job="kubelet"} * on(namespace, pod) group_left(node) topk by(namespace, pod) (1, max by(namespace, pod, node) (kube_pod_info{node!=""})) + - record: node_namespace_pod_container:container_memory_swap + expr: container_memory_swap{image!="",job="kubelet"} * on(namespace, pod) group_left(node) topk by(namespace, pod) (1, max by(namespace, pod, node) (kube_pod_info{node!=""})) + - record: cluster:namespace:pod_memory:active:kube_pod_container_resource_requests + expr: kube_pod_container_resource_requests{job="kube-state-metrics",resource="memory"} * on(namespace, pod, cluster) group_left() max by(namespace, pod, cluster) ((kube_pod_status_phase{phase=~"Pending|Running"} == 1)) + - record: namespace_memory:kube_pod_container_resource_requests:sum + expr: sum by(namespace, cluster) (sum by(namespace, pod, cluster) (max by(namespace, pod, container, cluster) (kube_pod_container_resource_requests{job="kube-state-metrics",resource="memory"}) * on(namespace, pod, cluster) group_left() max by(namespace, pod, cluster) (kube_pod_status_phase{phase=~"Pending|Running"} == 1))) + - record: cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests + expr: kube_pod_container_resource_requests{job="kube-state-metrics",resource="cpu"} * on(namespace, pod, cluster) group_left() max by(namespace, pod, cluster) ((kube_pod_status_phase{phase=~"Pending|Running"} == 1)) + - record: namespace_cpu:kube_pod_container_resource_requests:sum + expr: sum by(namespace, cluster) (sum by(namespace, pod, cluster) (max by(namespace, pod, container, cluster) (kube_pod_container_resource_requests{job="kube-state-metrics",resource="cpu"}) * on(namespace, pod, cluster) group_left() max by(namespace, pod, cluster) (kube_pod_status_phase{phase=~"Pending|Running"} == 1))) + - record: cluster:namespace:pod_memory:active:kube_pod_container_resource_limits + expr: kube_pod_container_resource_limits{job="kube-state-metrics",resource="memory"} * on(namespace, pod, cluster) group_left() max by(namespace, pod, cluster) ((kube_pod_status_phase{phase=~"Pending|Running"} == 1)) + - record: namespace_memory:kube_pod_container_resource_limits:sum + expr: sum by(namespace, cluster) (sum by(namespace, pod, cluster) (max by(namespace, pod, container, cluster) (kube_pod_container_resource_limits{job="kube-state-metrics",resource="memory"}) * on(namespace, pod, cluster) group_left() max by(namespace, pod, cluster) (kube_pod_status_phase{phase=~"Pending|Running"} == 1))) + - record: cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits + expr: kube_pod_container_resource_limits{job="kube-state-metrics",resource="cpu"} * on(namespace, pod, cluster) group_left() max by(namespace, pod, cluster) ((kube_pod_status_phase{phase=~"Pending|Running"} == 1)) + - record: namespace_cpu:kube_pod_container_resource_limits:sum + expr: sum by(namespace, cluster) 
(sum by(namespace, pod, cluster) (max by(namespace, pod, container, cluster) (kube_pod_container_resource_limits{job="kube-state-metrics",resource="cpu"}) * on(namespace, pod, cluster) group_left() max by(namespace, pod, cluster) (kube_pod_status_phase{phase=~"Pending|Running"} == 1))) + - record: namespace_workload_pod:kube_pod_owner:relabel + expr: max by(cluster, namespace, workload, pod) (label_replace(label_replace(kube_pod_owner{job="kube-state-metrics",owner_kind="ReplicaSet"}, "replicaset", "$1", "owner_name", "(.*)") * on(replicaset, namespace) group_left(owner_name) topk by(replicaset, namespace) (1, max by(replicaset, namespace, owner_name) (kube_replicaset_owner{job="kube-state-metrics"})), "workload", "$1", "owner_name", "(.*)")) + labels: + workload_type: deployment + - record: namespace_workload_pod:kube_pod_owner:relabel + expr: max by(cluster, namespace, workload, pod) (label_replace(kube_pod_owner{job="kube-state-metrics",owner_kind="DaemonSet"}, "workload", "$1", "owner_name", "(.*)")) + labels: + workload_type: daemonset + - record: namespace_workload_pod:kube_pod_owner:relabel + expr: max by(cluster, namespace, workload, pod) (label_replace(kube_pod_owner{job="kube-state-metrics",owner_kind="StatefulSet"}, "workload", "$1", "owner_name", "(.*)")) + labels: + workload_type: statefulset + - record: namespace_workload_pod:kube_pod_owner:relabel + expr: max by(cluster, namespace, workload, pod) (label_replace(kube_pod_owner{job="kube-state-metrics",owner_kind="Job"}, "workload", "$1", "owner_name", "(.*)")) + labels: + workload_type: job \ No newline at end of file diff --git a/lib/multi-cluster-construct/resources/cost-optimization/scaleDownEksToZero.yml b/lib/multi-cluster-construct/resources/cost-optimization/scaleDownEksToZero.yml new file mode 100644 index 00000000..fd9d76dd --- /dev/null +++ b/lib/multi-cluster-construct/resources/cost-optimization/scaleDownEksToZero.yml @@ -0,0 +1,109 @@ +schemaVersion: '0.3' +description: |- + --- + # Scale down all conformitron EKS cluster to 0 +assumeRole: arn:aws:iam::ACCOUNT_ID:role/SsmEksRole +mainSteps: + - name: scaleEKSClusterToZero + action: aws:executeAwsApi + nextStep: scaleEKSClusterToZero_1 + isEnd: false + inputs: + Service: eks + Api: UpdateNodegroupConfig + clusterName: arm-1-26-blueprint + nodegroupName: eks-blueprints-mng + scalingConfig: + minSize: 0 + maxSize: 1 + desiredSize: 0 + - name: scaleEKSClusterToZero_1 + action: aws:executeAwsApi + nextStep: scaleEKSClusterToZero_2 + isEnd: false + inputs: + Service: eks + Api: UpdateNodegroupConfig + clusterName: arm-1-27-blueprint + nodegroupName: eks-blueprints-mng + scalingConfig: + minSize: 0 + maxSize: 1 + desiredSize: 0 + - name: scaleEKSClusterToZero_2 + action: aws:executeAwsApi + nextStep: scaleEKSClusterToZero_3 + isEnd: false + inputs: + Service: eks + Api: UpdateNodegroupConfig + clusterName: arm-1-28-blueprint + nodegroupName: eks-blueprints-mng + scalingConfig: + minSize: 0 + maxSize: 1 + desiredSize: 0 + - name: scaleEKSClusterToZero_3 + action: aws:executeAwsApi + nextStep: scaleEKSClusterToZero_4 + isEnd: false + inputs: + Service: eks + Api: UpdateNodegroupConfig + clusterName: br-ARM1-28-blueprint + nodegroupName: eks-blueprints-mng + scalingConfig: + minSize: 0 + maxSize: 1 + desiredSize: 0 + - name: scaleEKSClusterToZero_4 + action: aws:executeAwsApi + nextStep: scaleEKSClusterToZero_5 + isEnd: false + inputs: + Service: eks + Api: UpdateNodegroupConfig + clusterName: br-X861-28-blueprint + nodegroupName: eks-blueprints-mng + scalingConfig: + 
minSize: 0 + maxSize: 1 + desiredSize: 0 + - name: scaleEKSClusterToZero_5 + action: aws:executeAwsApi + nextStep: scaleEKSClusterToZero_6 + isEnd: false + inputs: + Service: eks + Api: UpdateNodegroupConfig + clusterName: x86-1-26-blueprint + nodegroupName: eks-blueprints-mng + scalingConfig: + minSize: 0 + maxSize: 1 + desiredSize: 0 + - name: scaleEKSClusterToZero_6 + action: aws:executeAwsApi + nextStep: scaleEKSClusterToZero_7 + isEnd: false + inputs: + Service: eks + Api: UpdateNodegroupConfig + clusterName: x86-1-27-blueprint + nodegroupName: eks-blueprints-mng + scalingConfig: + minSize: 0 + maxSize: 1 + desiredSize: 0 + - name: scaleEKSClusterToZero_7 + action: aws:executeAwsApi + isEnd: true + inputs: + Service: eks + Api: UpdateNodegroupConfig + clusterName: x86-1-28-blueprint + nodegroupName: eks-blueprints-mng + scalingConfig: + minSize: 0 + maxSize: 1 + desiredSize: 0 diff --git a/lib/multi-cluster-construct/resources/cost-optimization/scaleUpEksToOne.yml b/lib/multi-cluster-construct/resources/cost-optimization/scaleUpEksToOne.yml new file mode 100644 index 00000000..0fd8ef6e --- /dev/null +++ b/lib/multi-cluster-construct/resources/cost-optimization/scaleUpEksToOne.yml @@ -0,0 +1,109 @@ +schemaVersion: '0.3' +description: |- + --- + # Scale up all conformitron EKS clusters to 1 +assumeRole: arn:aws:iam::ACCOUNT_ID:role/SsmEksRole +mainSteps: + - name: scaleEKSClusterToOne + action: aws:executeAwsApi + nextStep: scaleEKSClusterToOne_1 + isEnd: false + inputs: + Service: eks + Api: UpdateNodegroupConfig + clusterName: arm-1-26-blueprint + nodegroupName: eks-blueprints-mng + scalingConfig: + minSize: 1 + maxSize: 1 + desiredSize: 1 + - name: scaleEKSClusterToOne_1 + action: aws:executeAwsApi + nextStep: scaleEKSClusterToOne_2 + isEnd: false + inputs: + Service: eks + Api: UpdateNodegroupConfig + clusterName: arm-1-27-blueprint + nodegroupName: eks-blueprints-mng + scalingConfig: + minSize: 1 + maxSize: 1 + desiredSize: 1 + - name: scaleEKSClusterToOne_2 + action: aws:executeAwsApi + nextStep: scaleEKSClusterToOne_3 + isEnd: false + inputs: + Service: eks + Api: UpdateNodegroupConfig + clusterName: arm-1-28-blueprint + nodegroupName: eks-blueprints-mng + scalingConfig: + minSize: 1 + maxSize: 1 + desiredSize: 1 + - name: scaleEKSClusterToOne_3 + action: aws:executeAwsApi + nextStep: scaleEKSClusterToOne_4 + isEnd: false + inputs: + Service: eks + Api: UpdateNodegroupConfig + clusterName: br-arm-1-28-blueprint + nodegroupName: eks-blueprints-mng + scalingConfig: + minSize: 1 + maxSize: 1 + desiredSize: 1 + - name: scaleEKSClusterToOne_4 + action: aws:executeAwsApi + nextStep: scaleEKSClusterToOne_5 + isEnd: false + inputs: + Service: eks + Api: UpdateNodegroupConfig + clusterName: br-x86-1-28-blueprint + nodegroupName: eks-blueprints-mng + scalingConfig: + minSize: 1 + maxSize: 1 + desiredSize: 1 + - name: scaleEKSClusterToOne_5 + action: aws:executeAwsApi + nextStep: scaleEKSClusterToOne_6 + isEnd: false + inputs: + Service: eks + Api: UpdateNodegroupConfig + clusterName: x86-1-26-blueprint + nodegroupName: eks-blueprints-mng + scalingConfig: + minSize: 1 + maxSize: 1 + desiredSize: 1 + - name: scaleEKSClusterToOne_6 + action: aws:executeAwsApi + nextStep: scaleEKSClusterToOne_7 + isEnd: false + inputs: + Service: eks + Api: UpdateNodegroupConfig + clusterName: x86-1-27-blueprint + nodegroupName: eks-blueprints-mng + scalingConfig: + minSize: 1 + maxSize: 1 + desiredSize: 1 + - name: scaleEKSClusterToOne_7 + action: aws:executeAwsApi + isEnd: true + inputs: + Service: eks +
Api: UpdateNodegroupConfig + clusterName: x86-1-28-blueprint + nodegroupName: eks-blueprints-mng + scalingConfig: + minSize: 1 + maxSize: 1 + desiredSize: 1 diff --git a/lib/multi-cluster-construct/resources/otel-collector-config.yml b/lib/multi-cluster-construct/resources/otel-collector-config.yml new file mode 100644 index 00000000..14b76d04 --- /dev/null +++ b/lib/multi-cluster-construct/resources/otel-collector-config.yml @@ -0,0 +1,1878 @@ +# +# OpenTelemetry Collector configuration +# Metrics pipeline with Prometheus Receiver and AWS Remote Write Exporter sending metrics to Amazon Managed Prometheus +# +apiVersion: opentelemetry.io/v1alpha1 +kind: OpenTelemetryCollector +metadata: + name: otel-collector-amp + namespace: "{{namespace}}" +spec: + mode: "{{deploymentMode}}" + image: public.ecr.aws/aws-observability/aws-otel-collector:v0.37.0 + resources: + limits: + cpu: "1" + memory: "2Gi" + requests: + cpu: "1" + memory: "2Gi" + serviceAccount: adot-collector + podSecurityContext: + runAsGroup: 0 + runAsUser: 0 + volumeMounts: + - name: varlogpods + mountPath: /var/log/pods + readOnly: true + volumes: + - name: varlogpods + hostPath: + path: /var/log/pods + env: + - name: NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName + config: | + receivers: + prometheus: + config: + global: + scrape_interval: 15s + scrape_timeout: 10s + external_labels: + cluster: "{{clusterName}}" + scrape_configs: + {{ start enableAdotMetricsCollectionJob}} + - job_name: otel-collector-metrics + scrape_interval: 10s + static_configs: + - targets: ['localhost:8888'] + {{ stop enableAdotMetricsCollectionJob }} + - job_name: 'kubernetes-kubelet' + scheme: https + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + insecure_skip_verify: true + bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + kubernetes_sd_configs: + - role: node + relabel_configs: + - action: labelmap + regex: __meta_kubernetes_node_label_(.+) + - target_label: __address__ + replacement: kubernetes.default.svc.cluster.local:443 + - source_labels: [__meta_kubernetes_node_name] + regex: (.+) + target_label: __metrics_path__ + replacement: /api/v1/nodes/$${1}/proxy/metrics + - job_name: 'kubelet' + scheme: https + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + insecure_skip_verify: true + bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + kubernetes_sd_configs: + - role: node + relabel_configs: + - action: labelmap + regex: __meta_kubernetes_node_label_(.+) + - target_label: __address__ + replacement: kubernetes.default.svc.cluster.local:443 + - source_labels: [__meta_kubernetes_node_name] + regex: (.+) + target_label: __metrics_path__ + replacement: /api/v1/nodes/$${1}/proxy/metrics/cadvisor + {{ start enableAPIserverJob }} + - job_name: 'apiserver' + scheme: https + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + insecure_skip_verify: true + bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + kubernetes_sd_configs: + - role: endpoints + relabel_configs: + - source_labels: + [ + __meta_kubernetes_namespace, + __meta_kubernetes_service_name, + __meta_kubernetes_endpoint_port_name, + ] + action: keep + regex: default;kubernetes;https + metric_relabel_configs: + - action: keep + source_labels: [__name__] + - source_labels: [__name__, le] + separator: ; + regex: 
apiserver_request_duration_seconds_bucket;(0.15|0.2|0.3|0.35|0.4|0.45|0.6|0.7|0.8|0.9|1.25|1.5|1.75|2|3|3.5|4|4.5|6|7|8|9|15|25|40|50) + replacement: $1 + action: drop + {{ stop enableAPIserverJob }} + - job_name: serviceMonitor/default/kube-prometheus-stack-prometheus-node-exporter/0 + honor_timestamps: true + scrape_interval: 30s + scrape_timeout: 10s + metrics_path: /metrics + scheme: http + follow_redirects: true + enable_http2: true + relabel_configs: + - source_labels: [job] + separator: ; + regex: (.*) + target_label: __tmp_prometheus_job_name + replacement: $$1 + action: replace + - source_labels: [__meta_kubernetes_service_label_app, __meta_kubernetes_service_labelpresent_app] + separator: ; + regex: (prometheus-node-exporter);true + replacement: $$1 + action: keep + - source_labels: [__meta_kubernetes_service_label_release, __meta_kubernetes_service_labelpresent_release] + separator: ; + regex: (kube-prometheus-stack);true + replacement: $$1 + action: keep + - source_labels: [__meta_kubernetes_endpoint_port_name] + separator: ; + regex: http-metrics + replacement: $$1 + action: keep + - source_labels: [__meta_kubernetes_endpoint_address_target_kind, __meta_kubernetes_endpoint_address_target_name] + separator: ; + regex: Node;(.*) + target_label: node + replacement: $$1 + action: replace + - source_labels: [__meta_kubernetes_endpoint_address_target_kind, __meta_kubernetes_endpoint_address_target_name] + separator: ; + regex: Pod;(.*) + target_label: pod + replacement: $$1 + action: replace + - source_labels: [__meta_kubernetes_namespace] + separator: ; + regex: (.*) + target_label: namespace + replacement: $$1 + action: replace + - source_labels: [__meta_kubernetes_service_name] + separator: ; + regex: (.*) + target_label: service + replacement: $$1 + action: replace + - source_labels: [__meta_kubernetes_pod_name] + separator: ; + regex: (.*) + target_label: pod + replacement: $$1 + action: replace + - source_labels: [__meta_kubernetes_pod_container_name] + separator: ; + regex: (.*) + target_label: container + replacement: $$1 + action: replace + - source_labels: [__meta_kubernetes_service_name] + separator: ; + regex: (.*) + target_label: job + replacement: $$1 + action: replace + - source_labels: [__meta_kubernetes_service_label_jobLabel] + separator: ; + regex: (.+) + target_label: job + replacement: $$1 + action: replace + - separator: ; + regex: (.*) + target_label: endpoint + replacement: http-metrics + action: replace + - source_labels: [__address__] + separator: ; + regex: (.*) + modulus: 1 + target_label: __tmp_hash + replacement: $$1 + action: hashmod + - source_labels: [__tmp_hash] + separator: ; + regex: "0" + replacement: $$1 + action: keep + kubernetes_sd_configs: + - role: endpoints + kubeconfig_file: "" + follow_redirects: true + enable_http2: true + namespaces: + own_namespace: false + names: + - default + - job_name: serviceMonitor/default/kube-prometheus-stack-prometheus/0 + honor_timestamps: true + scrape_interval: 30s + scrape_timeout: 10s + metrics_path: /metrics + scheme: http + follow_redirects: true + enable_http2: true + relabel_configs: + - source_labels: [job] + separator: ; + regex: (.*) + target_label: __tmp_prometheus_job_name + replacement: $$1 + action: replace + - source_labels: [__meta_kubernetes_service_label_app, __meta_kubernetes_service_labelpresent_app] + separator: ; + regex: (kube-prometheus-stack-prometheus);true + replacement: $$1 + action: keep + - source_labels: [__meta_kubernetes_service_label_release, 
__meta_kubernetes_service_labelpresent_release] + separator: ; + regex: (kube-prometheus-stack);true + replacement: $$1 + action: keep + - source_labels: [__meta_kubernetes_service_label_self_monitor, __meta_kubernetes_service_labelpresent_self_monitor] + separator: ; + regex: (true);true + replacement: $$1 + action: keep + - source_labels: [__meta_kubernetes_endpoint_port_name] + separator: ; + regex: http-web + replacement: $$1 + action: keep + - source_labels: [__meta_kubernetes_endpoint_address_target_kind, __meta_kubernetes_endpoint_address_target_name] + separator: ; + regex: Node;(.*) + target_label: node + replacement: $$1 + action: replace + - source_labels: [__meta_kubernetes_endpoint_address_target_kind, __meta_kubernetes_endpoint_address_target_name] + separator: ; + regex: Pod;(.*) + target_label: pod + replacement: $$1 + action: replace + - source_labels: [__meta_kubernetes_namespace] + separator: ; + regex: (.*) + target_label: namespace + replacement: $$1 + action: replace + - source_labels: [__meta_kubernetes_service_name] + separator: ; + regex: (.*) + target_label: service + replacement: $$1 + action: replace + - source_labels: [__meta_kubernetes_pod_name] + separator: ; + regex: (.*) + target_label: pod + replacement: $$1 + action: replace + - source_labels: [__meta_kubernetes_pod_container_name] + separator: ; + regex: (.*) + target_label: container + replacement: $$1 + action: replace + - source_labels: [__meta_kubernetes_service_name] + separator: ; + regex: (.*) + target_label: job + replacement: $$1 + action: replace + - separator: ; + regex: (.*) + target_label: endpoint + replacement: http-web + action: replace + - source_labels: [__address__] + separator: ; + regex: (.*) + modulus: 1 + target_label: __tmp_hash + replacement: $$1 + action: hashmod + - source_labels: [__tmp_hash] + separator: ; + regex: "0" + replacement: $$1 + action: keep + kubernetes_sd_configs: + - role: endpoints + kubeconfig_file: "" + follow_redirects: true + enable_http2: true + namespaces: + own_namespace: false + names: + - default + - job_name: serviceMonitor/default/kube-prometheus-stack-operator/0 + honor_labels: true + honor_timestamps: true + scrape_interval: 30s + scrape_timeout: 10s + metrics_path: /metrics + scheme: https + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + insecure_skip_verify: true + follow_redirects: true + enable_http2: true + relabel_configs: + - source_labels: [job] + separator: ; + regex: (.*) + target_label: __tmp_prometheus_job_name + replacement: $$1 + action: replace + - source_labels: [__meta_kubernetes_service_label_app, __meta_kubernetes_service_labelpresent_app] + separator: ; + regex: (kube-prometheus-stack-operator);true + replacement: $$1 + action: keep + - source_labels: [__meta_kubernetes_service_label_release, __meta_kubernetes_service_labelpresent_release] + separator: ; + regex: (kube-prometheus-stack);true + replacement: $$1 + action: keep + - source_labels: [__meta_kubernetes_endpoint_port_name] + separator: ; + regex: https + replacement: $$1 + action: keep + - source_labels: [__meta_kubernetes_endpoint_address_target_kind, __meta_kubernetes_endpoint_address_target_name] + separator: ; + regex: Node;(.*) + target_label: node + replacement: $$1 + action: replace + - source_labels: [__meta_kubernetes_endpoint_address_target_kind, __meta_kubernetes_endpoint_address_target_name] + separator: ; + regex: Pod;(.*) + target_label: pod + replacement: $$1 + action: replace + - source_labels: [__meta_kubernetes_namespace] 
+ separator: ; + regex: (.*) + target_label: namespace + replacement: $$1 + action: replace + - source_labels: [__meta_kubernetes_service_name] + separator: ; + regex: (.*) + target_label: service + replacement: $$1 + action: replace + - source_labels: [__meta_kubernetes_pod_name] + separator: ; + regex: (.*) + target_label: pod + replacement: $$1 + action: replace + - source_labels: [__meta_kubernetes_pod_container_name] + separator: ; + regex: (.*) + target_label: container + replacement: $$1 + action: replace + - source_labels: [__meta_kubernetes_service_name] + separator: ; + regex: (.*) + target_label: job + replacement: $$1 + action: replace + - separator: ; + regex: (.*) + target_label: endpoint + replacement: https + action: replace + - source_labels: [__address__] + separator: ; + regex: (.*) + modulus: 1 + target_label: __tmp_hash + replacement: $$1 + action: hashmod + - source_labels: [__tmp_hash] + separator: ; + regex: "0" + replacement: $$1 + action: keep + kubernetes_sd_configs: + - role: endpoints + kubeconfig_file: "" + follow_redirects: true + enable_http2: true + namespaces: + own_namespace: false + names: + - default + - job_name: serviceMonitor/default/kube-prometheus-stack-kubelet/2 + honor_labels: true + honor_timestamps: true + scrape_interval: 30s + scrape_timeout: 10s + metrics_path: /metrics/probes + scheme: https + authorization: + type: Bearer + credentials_file: /var/run/secrets/kubernetes.io/serviceaccount/token + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + insecure_skip_verify: true + follow_redirects: true + enable_http2: true + relabel_configs: + - source_labels: [job] + separator: ; + regex: (.*) + target_label: __tmp_prometheus_job_name + replacement: $$1 + action: replace + - source_labels: [__meta_kubernetes_service_label_app_kubernetes_io_name, __meta_kubernetes_service_labelpresent_app_kubernetes_io_name] + separator: ; + regex: (kubelet);true + replacement: $$1 + action: keep + - source_labels: [__meta_kubernetes_service_label_k8s_app, __meta_kubernetes_service_labelpresent_k8s_app] + separator: ; + regex: (kubelet);true + replacement: $$1 + action: keep + - source_labels: [__meta_kubernetes_endpoint_port_name] + separator: ; + regex: https-metrics + replacement: $$1 + action: keep + - source_labels: [__meta_kubernetes_endpoint_address_target_kind, __meta_kubernetes_endpoint_address_target_name] + separator: ; + regex: Node;(.*) + target_label: node + replacement: $$1 + action: replace + - source_labels: [__meta_kubernetes_endpoint_address_target_kind, __meta_kubernetes_endpoint_address_target_name] + separator: ; + regex: Pod;(.*) + target_label: pod + replacement: $$1 + action: replace + - source_labels: [__meta_kubernetes_namespace] + separator: ; + regex: (.*) + target_label: namespace + replacement: $$1 + action: replace + - source_labels: [__meta_kubernetes_service_name] + separator: ; + regex: (.*) + target_label: service + replacement: $$1 + action: replace + - source_labels: [__meta_kubernetes_pod_name] + separator: ; + regex: (.*) + target_label: pod + replacement: $$1 + action: replace + - source_labels: [__meta_kubernetes_pod_container_name] + separator: ; + regex: (.*) + target_label: container + replacement: $$1 + action: replace + - source_labels: [__meta_kubernetes_service_name] + separator: ; + regex: (.*) + target_label: job + replacement: $$1 + action: replace + - source_labels: [__meta_kubernetes_service_label_k8s_app] + separator: ; + regex: (.+) + target_label: job + replacement: $$1 + action: 
replace + - separator: ; + regex: (.*) + target_label: endpoint + replacement: https-metrics + action: replace + - source_labels: [__metrics_path__] + separator: ; + regex: (.*) + target_label: metrics_path + replacement: $$1 + action: replace + - source_labels: [__address__] + separator: ; + regex: (.*) + modulus: 1 + target_label: __tmp_hash + replacement: $$1 + action: hashmod + - source_labels: [__tmp_hash] + separator: ; + regex: "0" + replacement: $$1 + action: keep + kubernetes_sd_configs: + - role: endpoints + kubeconfig_file: "" + follow_redirects: true + enable_http2: true + namespaces: + own_namespace: false + names: + - kube-system + - job_name: serviceMonitor/default/kube-prometheus-stack-kubelet/1 + honor_labels: true + honor_timestamps: true + scrape_interval: 30s + scrape_timeout: 10s + metrics_path: /metrics/cadvisor + scheme: https + authorization: + type: Bearer + credentials_file: /var/run/secrets/kubernetes.io/serviceaccount/token + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + insecure_skip_verify: true + follow_redirects: true + enable_http2: true + relabel_configs: + - source_labels: [job] + separator: ; + regex: (.*) + target_label: __tmp_prometheus_job_name + replacement: $$1 + action: replace + - source_labels: [__meta_kubernetes_service_label_app_kubernetes_io_name, __meta_kubernetes_service_labelpresent_app_kubernetes_io_name] + separator: ; + regex: (kubelet);true + replacement: $$1 + action: keep + - source_labels: [__meta_kubernetes_service_label_k8s_app, __meta_kubernetes_service_labelpresent_k8s_app] + separator: ; + regex: (kubelet);true + replacement: $$1 + action: keep + - source_labels: [__meta_kubernetes_endpoint_port_name] + separator: ; + regex: https-metrics + replacement: $$1 + action: keep + - source_labels: [__meta_kubernetes_endpoint_address_target_kind, __meta_kubernetes_endpoint_address_target_name] + separator: ; + regex: Node;(.*) + target_label: node + replacement: $$1 + action: replace + - source_labels: [__meta_kubernetes_endpoint_address_target_kind, __meta_kubernetes_endpoint_address_target_name] + separator: ; + regex: Pod;(.*) + target_label: pod + replacement: $$1 + action: replace + - source_labels: [__meta_kubernetes_namespace] + separator: ; + regex: (.*) + target_label: namespace + replacement: $$1 + action: replace + - source_labels: [__meta_kubernetes_service_name] + separator: ; + regex: (.*) + target_label: service + replacement: $$1 + action: replace + - source_labels: [__meta_kubernetes_pod_name] + separator: ; + regex: (.*) + target_label: pod + replacement: $$1 + action: replace + - source_labels: [__meta_kubernetes_pod_container_name] + separator: ; + regex: (.*) + target_label: container + replacement: $$1 + action: replace + - source_labels: [__meta_kubernetes_service_name] + separator: ; + regex: (.*) + target_label: job + replacement: $$1 + action: replace + - source_labels: [__meta_kubernetes_service_label_k8s_app] + separator: ; + regex: (.+) + target_label: job + replacement: $$1 + action: replace + - separator: ; + regex: (.*) + target_label: endpoint + replacement: https-metrics + action: replace + - source_labels: [__metrics_path__] + separator: ; + regex: (.*) + target_label: metrics_path + replacement: $$1 + action: replace + - source_labels: [__address__] + separator: ; + regex: (.*) + modulus: 1 + target_label: __tmp_hash + replacement: $$1 + action: hashmod + - source_labels: [__tmp_hash] + separator: ; + regex: "0" + replacement: $$1 + action: keep + kubernetes_sd_configs: 
+ - role: endpoints + kubeconfig_file: "" + follow_redirects: true + enable_http2: true + namespaces: + own_namespace: false + names: + - kube-system + - job_name: serviceMonitor/default/kube-prometheus-stack-kubelet/0 + honor_labels: true + honor_timestamps: true + scrape_interval: 30s + scrape_timeout: 10s + metrics_path: /metrics + scheme: https + authorization: + type: Bearer + credentials_file: /var/run/secrets/kubernetes.io/serviceaccount/token + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + insecure_skip_verify: true + follow_redirects: true + enable_http2: true + relabel_configs: + - source_labels: [job] + separator: ; + regex: (.*) + target_label: __tmp_prometheus_job_name + replacement: $$1 + action: replace + - source_labels: [__meta_kubernetes_service_label_app_kubernetes_io_name, __meta_kubernetes_service_labelpresent_app_kubernetes_io_name] + separator: ; + regex: (kubelet);true + replacement: $$1 + action: keep + - source_labels: [__meta_kubernetes_service_label_k8s_app, __meta_kubernetes_service_labelpresent_k8s_app] + separator: ; + regex: (kubelet);true + replacement: $$1 + action: keep + - source_labels: [__meta_kubernetes_endpoint_port_name] + separator: ; + regex: https-metrics + replacement: $$1 + action: keep + - source_labels: [__meta_kubernetes_endpoint_address_target_kind, __meta_kubernetes_endpoint_address_target_name] + separator: ; + regex: Node;(.*) + target_label: node + replacement: $$1 + action: replace + - source_labels: [__meta_kubernetes_endpoint_address_target_kind, __meta_kubernetes_endpoint_address_target_name] + separator: ; + regex: Pod;(.*) + target_label: pod + replacement: $$1 + action: replace + - source_labels: [__meta_kubernetes_namespace] + separator: ; + regex: (.*) + target_label: namespace + replacement: $$1 + action: replace + - source_labels: [__meta_kubernetes_service_name] + separator: ; + regex: (.*) + target_label: service + replacement: $$1 + action: replace + - source_labels: [__meta_kubernetes_pod_name] + separator: ; + regex: (.*) + target_label: pod + replacement: $$1 + action: replace + - source_labels: [__meta_kubernetes_pod_container_name] + separator: ; + regex: (.*) + target_label: container + replacement: $$1 + action: replace + - source_labels: [__meta_kubernetes_service_name] + separator: ; + regex: (.*) + target_label: job + replacement: $$1 + action: replace + - source_labels: [__meta_kubernetes_service_label_k8s_app] + separator: ; + regex: (.+) + target_label: job + replacement: $$1 + action: replace + - separator: ; + regex: (.*) + target_label: endpoint + replacement: https-metrics + action: replace + - source_labels: [__metrics_path__] + separator: ; + regex: (.*) + target_label: metrics_path + replacement: $$1 + action: replace + - source_labels: [__address__] + separator: ; + regex: (.*) + modulus: 1 + target_label: __tmp_hash + replacement: $$1 + action: hashmod + - source_labels: [__tmp_hash] + separator: ; + regex: "0" + replacement: $$1 + action: keep + kubernetes_sd_configs: + - role: endpoints + kubeconfig_file: "" + follow_redirects: true + enable_http2: true + namespaces: + own_namespace: false + names: + - kube-system + - job_name: serviceMonitor/default/kube-prometheus-stack-kube-state-metrics/0 + honor_labels: true + honor_timestamps: true + scrape_interval: 30s + scrape_timeout: 10s + metrics_path: /metrics + scheme: http + follow_redirects: true + enable_http2: true + relabel_configs: + - source_labels: [job] + separator: ; + regex: (.*) + target_label: 
__tmp_prometheus_job_name + replacement: $$1 + action: replace + - source_labels: [__meta_kubernetes_service_label_app_kubernetes_io_instance, __meta_kubernetes_service_labelpresent_app_kubernetes_io_instance] + separator: ; + regex: (kube-prometheus-stack);true + replacement: $$1 + action: keep + - source_labels: [__meta_kubernetes_service_label_app_kubernetes_io_name, __meta_kubernetes_service_labelpresent_app_kubernetes_io_name] + separator: ; + regex: (kube-state-metrics);true + replacement: $$1 + action: keep + - source_labels: [__meta_kubernetes_endpoint_port_name] + separator: ; + regex: http + replacement: $$1 + action: keep + - source_labels: [__meta_kubernetes_endpoint_address_target_kind, __meta_kubernetes_endpoint_address_target_name] + separator: ; + regex: Node;(.*) + target_label: node + replacement: $$1 + action: replace + - source_labels: [__meta_kubernetes_endpoint_address_target_kind, __meta_kubernetes_endpoint_address_target_name] + separator: ; + regex: Pod;(.*) + target_label: pod + replacement: $$1 + action: replace + - source_labels: [__meta_kubernetes_namespace] + separator: ; + regex: (.*) + target_label: namespace + replacement: $$1 + action: replace + - source_labels: [__meta_kubernetes_service_name] + separator: ; + regex: (.*) + target_label: service + replacement: $$1 + action: replace + - source_labels: [__meta_kubernetes_pod_name] + separator: ; + regex: (.*) + target_label: pod + replacement: $$1 + action: replace + - source_labels: [__meta_kubernetes_pod_container_name] + separator: ; + regex: (.*) + target_label: container + replacement: $$1 + action: replace + - source_labels: [__meta_kubernetes_service_name] + separator: ; + regex: (.*) + target_label: job + replacement: $$1 + action: replace + - source_labels: [__meta_kubernetes_service_label_app_kubernetes_io_name] + separator: ; + regex: (.+) + target_label: job + replacement: $$1 + action: replace + - separator: ; + regex: (.*) + target_label: endpoint + replacement: http + action: replace + - source_labels: [__address__] + separator: ; + regex: (.*) + modulus: 1 + target_label: __tmp_hash + replacement: $$1 + action: hashmod + - source_labels: [__tmp_hash] + separator: ; + regex: "0" + replacement: $$1 + action: keep + kubernetes_sd_configs: + - role: endpoints + kubeconfig_file: "" + follow_redirects: true + enable_http2: true + namespaces: + own_namespace: false + names: + - default + - job_name: serviceMonitor/default/kube-prometheus-stack-kube-scheduler/0 + honor_timestamps: true + scrape_interval: 30s + scrape_timeout: 10s + metrics_path: /metrics + scheme: http + authorization: + type: Bearer + credentials_file: /var/run/secrets/kubernetes.io/serviceaccount/token + follow_redirects: true + enable_http2: true + relabel_configs: + - source_labels: [job] + separator: ; + regex: (.*) + target_label: __tmp_prometheus_job_name + replacement: $$1 + action: replace + - source_labels: [__meta_kubernetes_service_label_app, __meta_kubernetes_service_labelpresent_app] + separator: ; + regex: (kube-prometheus-stack-kube-scheduler);true + replacement: $$1 + action: keep + - source_labels: [__meta_kubernetes_service_label_release, __meta_kubernetes_service_labelpresent_release] + separator: ; + regex: (kube-prometheus-stack);true + replacement: $$1 + action: keep + - source_labels: [__meta_kubernetes_endpoint_port_name] + separator: ; + regex: http-metrics + replacement: $$1 + action: keep + - source_labels: [__meta_kubernetes_endpoint_address_target_kind, __meta_kubernetes_endpoint_address_target_name] + 
separator: ; + regex: Node;(.*) + target_label: node + replacement: $$1 + action: replace + - source_labels: [__meta_kubernetes_endpoint_address_target_kind, __meta_kubernetes_endpoint_address_target_name] + separator: ; + regex: Pod;(.*) + target_label: pod + replacement: $$1 + action: replace + - source_labels: [__meta_kubernetes_namespace] + separator: ; + regex: (.*) + target_label: namespace + replacement: $$1 + action: replace + - source_labels: [__meta_kubernetes_service_name] + separator: ; + regex: (.*) + target_label: service + replacement: $$1 + action: replace + - source_labels: [__meta_kubernetes_pod_name] + separator: ; + regex: (.*) + target_label: pod + replacement: $$1 + action: replace + - source_labels: [__meta_kubernetes_pod_container_name] + separator: ; + regex: (.*) + target_label: container + replacement: $$1 + action: replace + - source_labels: [__meta_kubernetes_service_name] + separator: ; + regex: (.*) + target_label: job + replacement: $$1 + action: replace + - source_labels: [__meta_kubernetes_service_label_jobLabel] + separator: ; + regex: (.+) + target_label: job + replacement: $$1 + action: replace + - separator: ; + regex: (.*) + target_label: endpoint + replacement: http-metrics + action: replace + - source_labels: [__address__] + separator: ; + regex: (.*) + modulus: 1 + target_label: __tmp_hash + replacement: $$1 + action: hashmod + - source_labels: [__tmp_hash] + separator: ; + regex: "0" + replacement: $$1 + action: keep + kubernetes_sd_configs: + - role: endpoints + kubeconfig_file: "" + follow_redirects: true + enable_http2: true + namespaces: + own_namespace: false + names: + - kube-system + - job_name: serviceMonitor/default/kube-prometheus-stack-kube-proxy/0 + honor_timestamps: true + scrape_interval: 30s + scrape_timeout: 10s + metrics_path: /metrics + scheme: http + authorization: + type: Bearer + credentials_file: /var/run/secrets/kubernetes.io/serviceaccount/token + follow_redirects: true + enable_http2: true + relabel_configs: + - source_labels: [job] + separator: ; + regex: (.*) + target_label: __tmp_prometheus_job_name + replacement: $$1 + action: replace + - source_labels: [__meta_kubernetes_service_label_app, __meta_kubernetes_service_labelpresent_app] + separator: ; + regex: (kube-prometheus-stack-kube-proxy);true + replacement: $$1 + action: keep + - source_labels: [__meta_kubernetes_service_label_release, __meta_kubernetes_service_labelpresent_release] + separator: ; + regex: (kube-prometheus-stack);true + replacement: $$1 + action: keep + - source_labels: [__meta_kubernetes_endpoint_port_name] + separator: ; + regex: http-metrics + replacement: $$1 + action: keep + - source_labels: [__meta_kubernetes_endpoint_address_target_kind, __meta_kubernetes_endpoint_address_target_name] + separator: ; + regex: Node;(.*) + target_label: node + replacement: $$1 + action: replace + - source_labels: [__meta_kubernetes_endpoint_address_target_kind, __meta_kubernetes_endpoint_address_target_name] + separator: ; + regex: Pod;(.*) + target_label: pod + replacement: $$1 + action: replace + - source_labels: [__meta_kubernetes_namespace] + separator: ; + regex: (.*) + target_label: namespace + replacement: $$1 + action: replace + - source_labels: [__meta_kubernetes_service_name] + separator: ; + regex: (.*) + target_label: service + replacement: $$1 + action: replace + - source_labels: [__meta_kubernetes_pod_name] + separator: ; + regex: (.*) + target_label: pod + replacement: $$1 + action: replace + - source_labels: 
[__meta_kubernetes_pod_container_name] + separator: ; + regex: (.*) + target_label: container + replacement: $$1 + action: replace + - source_labels: [__meta_kubernetes_service_name] + separator: ; + regex: (.*) + target_label: job + replacement: $$1 + action: replace + - source_labels: [__meta_kubernetes_service_label_jobLabel] + separator: ; + regex: (.+) + target_label: job + replacement: $$1 + action: replace + - separator: ; + regex: (.*) + target_label: endpoint + replacement: http-metrics + action: replace + - source_labels: [__address__] + separator: ; + regex: (.*) + modulus: 1 + target_label: __tmp_hash + replacement: $$1 + action: hashmod + - source_labels: [__tmp_hash] + separator: ; + regex: "0" + replacement: $$1 + action: keep + kubernetes_sd_configs: + - role: endpoints + kubeconfig_file: "" + follow_redirects: true + enable_http2: true + namespaces: + own_namespace: false + names: + - kube-system + - job_name: serviceMonitor/default/kube-prometheus-stack-kube-etcd/0 + honor_timestamps: true + scrape_interval: 30s + scrape_timeout: 10s + metrics_path: /metrics + scheme: http + authorization: + type: Bearer + credentials_file: /var/run/secrets/kubernetes.io/serviceaccount/token + follow_redirects: true + enable_http2: true + relabel_configs: + - source_labels: [job] + separator: ; + regex: (.*) + target_label: __tmp_prometheus_job_name + replacement: $$1 + action: replace + - source_labels: [__meta_kubernetes_service_label_app, __meta_kubernetes_service_labelpresent_app] + separator: ; + regex: (kube-prometheus-stack-kube-etcd);true + replacement: $$1 + action: keep + - source_labels: [__meta_kubernetes_service_label_release, __meta_kubernetes_service_labelpresent_release] + separator: ; + regex: (kube-prometheus-stack);true + replacement: $$1 + action: keep + - source_labels: [__meta_kubernetes_endpoint_port_name] + separator: ; + regex: http-metrics + replacement: $$1 + action: keep + - source_labels: [__meta_kubernetes_endpoint_address_target_kind, __meta_kubernetes_endpoint_address_target_name] + separator: ; + regex: Node;(.*) + target_label: node + replacement: $$1 + action: replace + - source_labels: [__meta_kubernetes_endpoint_address_target_kind, __meta_kubernetes_endpoint_address_target_name] + separator: ; + regex: Pod;(.*) + target_label: pod + replacement: $$1 + action: replace + - source_labels: [__meta_kubernetes_namespace] + separator: ; + regex: (.*) + target_label: namespace + replacement: $$1 + action: replace + - source_labels: [__meta_kubernetes_service_name] + separator: ; + regex: (.*) + target_label: service + replacement: $$1 + action: replace + - source_labels: [__meta_kubernetes_pod_name] + separator: ; + regex: (.*) + target_label: pod + replacement: $$1 + action: replace + - source_labels: [__meta_kubernetes_pod_container_name] + separator: ; + regex: (.*) + target_label: container + replacement: $$1 + action: replace + - source_labels: [__meta_kubernetes_service_name] + separator: ; + regex: (.*) + target_label: job + replacement: $$1 + action: replace + - source_labels: [__meta_kubernetes_service_label_jobLabel] + separator: ; + regex: (.+) + target_label: job + replacement: $$1 + action: replace + - separator: ; + regex: (.*) + target_label: endpoint + replacement: http-metrics + action: replace + - source_labels: [__address__] + separator: ; + regex: (.*) + modulus: 1 + target_label: __tmp_hash + replacement: $$1 + action: hashmod + - source_labels: [__tmp_hash] + separator: ; + regex: "0" + replacement: $$1 + action: keep + 
kubernetes_sd_configs: + - role: endpoints + kubeconfig_file: "" + follow_redirects: true + enable_http2: true + namespaces: + own_namespace: false + names: + - kube-system + - job_name: serviceMonitor/default/kube-prometheus-stack-kube-controller-manager/0 + honor_timestamps: true + scrape_interval: 30s + scrape_timeout: 10s + metrics_path: /metrics + scheme: http + authorization: + type: Bearer + credentials_file: /var/run/secrets/kubernetes.io/serviceaccount/token + follow_redirects: true + enable_http2: true + relabel_configs: + - source_labels: [job] + separator: ; + regex: (.*) + target_label: __tmp_prometheus_job_name + replacement: $$1 + action: replace + - source_labels: [__meta_kubernetes_service_label_app, __meta_kubernetes_service_labelpresent_app] + separator: ; + regex: (kube-prometheus-stack-kube-controller-manager);true + replacement: $$1 + action: keep + - source_labels: [__meta_kubernetes_service_label_release, __meta_kubernetes_service_labelpresent_release] + separator: ; + regex: (kube-prometheus-stack);true + replacement: $$1 + action: keep + - source_labels: [__meta_kubernetes_endpoint_port_name] + separator: ; + regex: http-metrics + replacement: $$1 + action: keep + - source_labels: [__meta_kubernetes_endpoint_address_target_kind, __meta_kubernetes_endpoint_address_target_name] + separator: ; + regex: Node;(.*) + target_label: node + replacement: $$1 + action: replace + - source_labels: [__meta_kubernetes_endpoint_address_target_kind, __meta_kubernetes_endpoint_address_target_name] + separator: ; + regex: Pod;(.*) + target_label: pod + replacement: $$1 + action: replace + - source_labels: [__meta_kubernetes_namespace] + separator: ; + regex: (.*) + target_label: namespace + replacement: $$1 + action: replace + - source_labels: [__meta_kubernetes_service_name] + separator: ; + regex: (.*) + target_label: service + replacement: $$1 + action: replace + - source_labels: [__meta_kubernetes_pod_name] + separator: ; + regex: (.*) + target_label: pod + replacement: $$1 + action: replace + - source_labels: [__meta_kubernetes_pod_container_name] + separator: ; + regex: (.*) + target_label: container + replacement: $$1 + action: replace + - source_labels: [__meta_kubernetes_service_name] + separator: ; + regex: (.*) + target_label: job + replacement: $$1 + action: replace + - source_labels: [__meta_kubernetes_service_label_jobLabel] + separator: ; + regex: (.+) + target_label: job + replacement: $$1 + action: replace + - separator: ; + regex: (.*) + target_label: endpoint + replacement: http-metrics + action: replace + - source_labels: [__address__] + separator: ; + regex: (.*) + modulus: 1 + target_label: __tmp_hash + replacement: $$1 + action: hashmod + - source_labels: [__tmp_hash] + separator: ; + regex: "0" + replacement: $$1 + action: keep + kubernetes_sd_configs: + - role: endpoints + kubeconfig_file: "" + follow_redirects: true + enable_http2: true + namespaces: + own_namespace: false + names: + - kube-system + - job_name: serviceMonitor/default/kube-prometheus-stack-coredns/0 + honor_timestamps: true + scrape_interval: 30s + scrape_timeout: 10s + metrics_path: /metrics + scheme: http + authorization: + type: Bearer + credentials_file: /var/run/secrets/kubernetes.io/serviceaccount/token + follow_redirects: true + enable_http2: true + relabel_configs: + - source_labels: [job] + separator: ; + regex: (.*) + target_label: __tmp_prometheus_job_name + replacement: $$1 + action: replace + - source_labels: [__meta_kubernetes_service_label_app, 
__meta_kubernetes_service_labelpresent_app] + separator: ; + regex: (kube-prometheus-stack-coredns);true + replacement: $$1 + action: keep + - source_labels: [__meta_kubernetes_service_label_release, __meta_kubernetes_service_labelpresent_release] + separator: ; + regex: (kube-prometheus-stack);true + replacement: $$1 + action: keep + - source_labels: [__meta_kubernetes_endpoint_port_name] + separator: ; + regex: http-metrics + replacement: $$1 + action: keep + - source_labels: [__meta_kubernetes_endpoint_address_target_kind, __meta_kubernetes_endpoint_address_target_name] + separator: ; + regex: Node;(.*) + target_label: node + replacement: $$1 + action: replace + - source_labels: [__meta_kubernetes_endpoint_address_target_kind, __meta_kubernetes_endpoint_address_target_name] + separator: ; + regex: Pod;(.*) + target_label: pod + replacement: $$1 + action: replace + - source_labels: [__meta_kubernetes_namespace] + separator: ; + regex: (.*) + target_label: namespace + replacement: $$1 + action: replace + - source_labels: [__meta_kubernetes_service_name] + separator: ; + regex: (.*) + target_label: service + replacement: $$1 + action: replace + - source_labels: [__meta_kubernetes_pod_name] + separator: ; + regex: (.*) + target_label: pod + replacement: $$1 + action: replace + - source_labels: [__meta_kubernetes_pod_container_name] + separator: ; + regex: (.*) + target_label: container + replacement: $$1 + action: replace + - source_labels: [__meta_kubernetes_service_name] + separator: ; + regex: (.*) + target_label: job + replacement: $$1 + action: replace + - source_labels: [__meta_kubernetes_service_label_jobLabel] + separator: ; + regex: (.+) + target_label: job + replacement: $$1 + action: replace + - separator: ; + regex: (.*) + target_label: endpoint + replacement: http-metrics + action: replace + - source_labels: [__address__] + separator: ; + regex: (.*) + modulus: 1 + target_label: __tmp_hash + replacement: $$1 + action: hashmod + - source_labels: [__tmp_hash] + separator: ; + regex: "0" + replacement: $$1 + action: keep + kubernetes_sd_configs: + - role: endpoints + kubeconfig_file: "" + namespaces: + own_namespace: false + names: + - kube-system + - job_name: serviceMonitor/default/kube-prometheus-stack-apiserver/0 + honor_timestamps: true + scrape_interval: 30s + scrape_timeout: 10s + metrics_path: /metrics + scheme: https + authorization: + type: Bearer + credentials_file: /var/run/secrets/kubernetes.io/serviceaccount/token + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + server_name: kubernetes + follow_redirects: true + enable_http2: true + relabel_configs: + - source_labels: [job] + separator: ; + regex: (.*) + target_label: __tmp_prometheus_job_name + replacement: $$1 + action: replace + - source_labels: [__meta_kubernetes_service_label_component, __meta_kubernetes_service_labelpresent_component] + separator: ; + regex: (kubernetes);true + replacement: $$1 + action: keep + - source_labels: [__meta_kubernetes_endpoint_port_name] + separator: ; + regex: https + replacement: $$1 + action: keep + - source_labels: [__meta_kubernetes_endpoint_address_target_kind, __meta_kubernetes_endpoint_address_target_name] + separator: ; + regex: Node;(.*) + target_label: node + replacement: $$1 + action: replace + - source_labels: [__meta_kubernetes_endpoint_address_target_kind, __meta_kubernetes_endpoint_address_target_name] + separator: ; + regex: Pod;(.*) + target_label: pod + replacement: $$1 + action: replace + - source_labels: 
[__meta_kubernetes_namespace] + separator: ; + regex: (.*) + target_label: namespace + replacement: $$1 + action: replace + - source_labels: [__meta_kubernetes_service_name] + separator: ; + regex: (.*) + target_label: service + replacement: $$1 + action: replace + - source_labels: [__meta_kubernetes_pod_name] + separator: ; + regex: (.*) + target_label: pod + replacement: $$1 + action: replace + - source_labels: [__meta_kubernetes_pod_container_name] + separator: ; + regex: (.*) + target_label: container + replacement: $$1 + action: replace + - source_labels: [__meta_kubernetes_service_name] + separator: ; + regex: (.*) + target_label: job + replacement: $$1 + action: replace + - source_labels: [__meta_kubernetes_service_label_component] + separator: ; + regex: (.+) + target_label: job + replacement: $$1 + action: replace + - separator: ; + regex: (.*) + target_label: endpoint + replacement: https + action: replace + - source_labels: [__address__] + separator: ; + regex: (.*) + modulus: 1 + target_label: __tmp_hash + replacement: $$1 + action: hashmod + - source_labels: [__tmp_hash] + separator: ; + regex: "0" + replacement: $$1 + action: keep + kubernetes_sd_configs: + - role: endpoints + kubeconfig_file: "" + follow_redirects: true + enable_http2: true + namespaces: + own_namespace: false + names: + - default + - job_name: serviceMonitor/default/kube-prometheus-stack-alertmanager/0 + honor_timestamps: true + scrape_interval: 30s + scrape_timeout: 10s + metrics_path: /metrics + scheme: http + follow_redirects: true + enable_http2: true + relabel_configs: + - source_labels: [job] + separator: ; + regex: (.*) + target_label: __tmp_prometheus_job_name + replacement: $$1 + action: replace + - source_labels: [__meta_kubernetes_service_label_app, __meta_kubernetes_service_labelpresent_app] + separator: ; + regex: (kube-prometheus-stack-alertmanager);true + replacement: $$1 + action: keep + - source_labels: [__meta_kubernetes_service_label_release, __meta_kubernetes_service_labelpresent_release] + separator: ; + regex: (kube-prometheus-stack);true + replacement: $$1 + action: keep + - source_labels: [__meta_kubernetes_service_label_self_monitor, __meta_kubernetes_service_labelpresent_self_monitor] + separator: ; + regex: (true);true + replacement: $$1 + action: keep + - source_labels: [__meta_kubernetes_endpoint_port_name] + separator: ; + regex: http-web + replacement: $$1 + action: keep + - source_labels: [__meta_kubernetes_endpoint_address_target_kind, __meta_kubernetes_endpoint_address_target_name] + separator: ; + regex: Node;(.*) + target_label: node + replacement: $$1 + action: replace + - source_labels: [__meta_kubernetes_endpoint_address_target_kind, __meta_kubernetes_endpoint_address_target_name] + separator: ; + regex: Pod;(.*) + target_label: pod + replacement: $$1 + action: replace + - source_labels: [__meta_kubernetes_namespace] + separator: ; + regex: (.*) + target_label: namespace + replacement: $$1 + action: replace + - source_labels: [__meta_kubernetes_service_name] + separator: ; + regex: (.*) + target_label: service + replacement: $$1 + action: replace + - source_labels: [__meta_kubernetes_pod_name] + separator: ; + regex: (.*) + target_label: pod + replacement: $$1 + action: replace + - source_labels: [__meta_kubernetes_pod_container_name] + separator: ; + regex: (.*) + target_label: container + replacement: $$1 + action: replace + - source_labels: [__meta_kubernetes_service_name] + separator: ; + regex: (.*) + target_label: job + replacement: $$1 + action: replace + - 
separator: ; + regex: (.*) + target_label: endpoint + replacement: http-web + action: replace + - source_labels: [__address__] + separator: ; + regex: (.*) + modulus: 1 + target_label: __tmp_hash + replacement: $$1 + action: hashmod + - source_labels: [__tmp_hash] + separator: ; + regex: "0" + replacement: $$1 + action: keep + kubernetes_sd_configs: + - role: endpoints + kubeconfig_file: "" + follow_redirects: true + enable_http2: true + namespaces: + own_namespace: false + names: + - default + - job_name: 'kube-state-metrics' + static_configs: + - targets: ['kube-state-metrics.kube-system.svc.cluster.local:8080'] + - job_name: 'node-exporter' + kubernetes_sd_configs: + - role: endpoints + ec2_sd_configs: + relabel_configs: + - source_labels: [ __address__ ] + action: keep + regex: '.*:9100$' + - action: replace + source_labels: [__meta_kubernetes_endpoint_node_name] + target_label: nodename + {{ start enableJavaMonJob }} + - job_name: 'kubernetes-java-jmx' + sample_limit: {{javaScrapeSampleLimit}} + metrics_path: {{javaPrometheusMetricsEndpoint}} + kubernetes_sd_configs: + - role: pod + relabel_configs: + - source_labels: [ __address__ ] + action: keep + regex: '.*:9404$' + - action: labelmap + regex: __meta_kubernetes_pod_label_(.+) + - action: replace + source_labels: [ __meta_kubernetes_namespace ] + target_label: Namespace + - source_labels: [ __meta_kubernetes_pod_name ] + action: replace + target_label: pod_name + - action: replace + source_labels: [ __meta_kubernetes_pod_container_name ] + target_label: container_name + - action: replace + source_labels: [ __meta_kubernetes_pod_controller_kind ] + target_label: pod_controller_kind + - action: replace + source_labels: [ __meta_kubernetes_pod_phase ] + target_label: pod_controller_phase + metric_relabel_configs: + - source_labels: [ __name__ ] + regex: 'jvm_gc_collection_seconds.*' + action: drop + {{ stop enableJavaMonJob }} + + {{ start enableNginxMonJob }} + - job_name: 'kubernetes-nginx' + sample_limit: {{nginxScrapeSampleLimit}} + metrics_path: {{nginxPrometheusMetricsEndpoint}} + kubernetes_sd_configs: + - role: pod + relabel_configs: + - source_labels: [ __address__ ] + action: keep + regex: '.*:10254$' + - source_labels: [__meta_kubernetes_pod_container_name] + target_label: container + action: replace + - source_labels: [__meta_kubernetes_pod_node_name] + target_label: host + action: replace + - source_labels: [__meta_kubernetes_namespace] + target_label: namespace + action: replace + metric_relabel_configs: + - source_labels: [__name__] + regex: 'go_memstats.*' + action: drop + - source_labels: [__name__] + regex: 'go_gc.*' + action: drop + - source_labels: [__name__] + regex: 'go_threads' + action: drop + - regex: exported_host + action: labeldrop + {{ stop enableNginxMonJob }} + + {{ start enableIstioMonJob }} + - honor_labels: true + job_name: kubernetes-istio + kubernetes_sd_configs: + - role: pod + relabel_configs: + - action: keep + regex: true + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_scrape + - action: drop + regex: true + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_scrape_slow + - action: replace + regex: (https?) 
+ source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_scheme + target_label: __scheme__ + - action: replace + regex: (.+) + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_path + target_label: __metrics_path__ + - action: replace + regex: (\d+);(([A-Fa-f0-9]{1,4}::?){1,7}[A-Fa-f0-9]{1,4}) + replacement: '[$$2]:$$1' + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_port + - __meta_kubernetes_pod_ip + target_label: __address__ + - action: replace + regex: (\d+);((([0-9]+?)(\.|$)){4}) + replacement: $$2:$$1 + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_port + - __meta_kubernetes_pod_ip + target_label: __address__ + - action: labelmap + regex: __meta_kubernetes_pod_annotation_prometheus_io_param_(.+) + replacement: __param_$1 + - action: labelmap + regex: __meta_kubernetes_pod_label_(.+) + - action: replace + source_labels: + - __meta_kubernetes_namespace + target_label: namespace + - action: replace + source_labels: + - __meta_kubernetes_pod_name + target_label: pod + - action: keep + source_labels: [ __address__ ] + regex: '.*:15020$$' + - action: drop + regex: Pending|Succeeded|Failed|Completed + source_labels: + - __meta_kubernetes_pod_phase + {{ stop enableIstioMonJob }} + {{ start enableAdotContainerLogsReceiver }} + filelog: + include: [ /var/log/pods/*/*/*.log ] + include_file_name: false + include_file_path: true + start_at: end + operators: + # Find out which format is used by kubernetes + - type: router + id: get-format + routes: + - output: parser-docker + expr: 'body matches "^\\{"' + - output: parser-crio + expr: 'body matches "^[^ Z]+ "' + - output: parser-containerd + expr: 'body matches "^[^ Z]+Z"' + # Parse CRI-O format + - type: regex_parser + id: parser-crio + regex: + '^(?P