From 53ffe752d0304d4e3e856a06b9ec93134ae3804c Mon Sep 17 00:00:00 2001 From: Paramadon Date: Fri, 17 May 2024 14:47:42 -0400 Subject: [PATCH 01/60] adding e2e --- generator/test_case_generator.go | 3 +- terraform/eks/daemon/gpu/main.tf | 4 +- terraform/eks/daemon/gpu/variables.tf | 2 +- terraform/gpu/main.tf | 146 +++++++++++++++++++++++++ terraform/gpu/providers.tf | 20 ++++ terraform/gpu/variables.tf | 58 ++++++++++ test/metric/container_insights_util.go | 11 +- 7 files changed, 238 insertions(+), 6 deletions(-) create mode 100644 terraform/gpu/main.tf create mode 100644 terraform/gpu/providers.tf create mode 100644 terraform/gpu/variables.tf diff --git a/generator/test_case_generator.go b/generator/test_case_generator.go index 3283404eb..017de5af8 100644 --- a/generator/test_case_generator.go +++ b/generator/test_case_generator.go @@ -217,8 +217,7 @@ var testTypeToTestConfig = map[string][]testConfig{ {testDir: "./test/fluent", terraformDir: "terraform/eks/daemon/fluent/windows/2022"}, { testDir: "./test/gpu", terraformDir: "terraform/eks/daemon/gpu", - targets: map[string]map[string]struct{}{"arc": {"amd64": {}}}, - instanceType: "g4dn.xlarge", + targets: map[string]map[string]struct{}{"arc": {"amd64": {}}}, }, }, "eks_deployment": { diff --git a/terraform/eks/daemon/gpu/main.tf b/terraform/eks/daemon/gpu/main.tf index 658b78269..86026fb72 100644 --- a/terraform/eks/daemon/gpu/main.tf +++ b/terraform/eks/daemon/gpu/main.tf @@ -2,13 +2,13 @@ // SPDX-License-Identifier: MIT module "common" { - source = "../../../common" + source = "../common" cwagent_image_repo = var.cwagent_image_repo cwagent_image_tag = var.cwagent_image_tag } module "basic_components" { - source = "../../../basic_components" + source = "../basic_components" region = var.region } diff --git a/terraform/eks/daemon/gpu/variables.tf b/terraform/eks/daemon/gpu/variables.tf index 26a0e6cd0..15602011e 100644 --- a/terraform/eks/daemon/gpu/variables.tf +++ b/terraform/eks/daemon/gpu/variables.tf @@ -8,7 +8,7 @@ variable "region" { variable "test_dir" { type = string - default = "./test/gpu" + default = "../test/gpu" } variable "cwagent_image_repo" { diff --git a/terraform/gpu/main.tf b/terraform/gpu/main.tf new file mode 100644 index 000000000..92db23eb8 --- /dev/null +++ b/terraform/gpu/main.tf @@ -0,0 +1,146 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: MIT + +module "common" { + source = "../common" + cwagent_image_repo = var.cwagent_image_repo + cwagent_image_tag = var.cwagent_image_tag +} + +module "basic_components" { + source = "../basic_components" + + region = var.region +} + + +data "aws_eks_cluster_auth" "this" { + name = aws_eks_cluster.this.name +} + +locals { + role_arn = format("%s%s", module.basic_components.role_arn, var.beta ? "-eks-beta" : "") + aws_eks = format("%s%s", "aws eks --region ${var.region}", var.beta ? " --endpoint ${var.beta_endpoint}" : "") +} + +resource "aws_eks_cluster" "this" { + name = "cwagent-operator-eks-integ-${module.common.testing_id}" + role_arn = local.role_arn + version = var.k8s_version + vpc_config { + subnet_ids = module.basic_components.public_subnet_ids + security_group_ids = [module.basic_components.security_group] + } +} + +# EKS Node Groups +resource "aws_eks_node_group" "this" { + cluster_name = aws_eks_cluster.this.name + node_group_name = "cwagent-operator-eks-integ-node" + node_role_arn = aws_iam_role.node_role.arn + subnet_ids = module.basic_components.public_subnet_ids + + scaling_config { + desired_size = 1 + max_size = 1 + min_size = 1 + } + + ami_type = "AL2_x86_64_GPU" + capacity_type = "ON_DEMAND" + disk_size = 20 + instance_types = ["g4dn.xlarge"] + + depends_on = [ + aws_iam_role_policy_attachment.node_AmazonEC2ContainerRegistryReadOnly, + aws_iam_role_policy_attachment.node_AmazonEKS_CNI_Policy, + aws_iam_role_policy_attachment.node_AmazonEKSWorkerNodePolicy, + aws_iam_role_policy_attachment.node_CloudWatchAgentServerPolicy + ] +} + +# EKS Node IAM Role +resource "aws_iam_role" "node_role" { + name = "cwagent-operator-eks-Worker-Role-${module.common.testing_id}" + + assume_role_policy = < pods.txt + kubectl describe pods --all-namespaces > pods_describe.txt + + # Log the contents of the files + cat pods.txt + cat pods_describe.txt + EOT + } +} + diff --git a/terraform/gpu/providers.tf b/terraform/gpu/providers.tf new file mode 100644 index 000000000..205375027 --- /dev/null +++ b/terraform/gpu/providers.tf @@ -0,0 +1,20 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: MIT + +provider "aws" { + region = var.region + endpoints { + eks = var.beta ? var.beta_endpoint : null + } +} + +provider "kubernetes" { + exec { + api_version = "client.authentication.k8s.io/v1beta1" + command = "aws" + args = ["eks", "get-token", "--cluster-name", aws_eks_cluster.this.name] + } + host = aws_eks_cluster.this.endpoint + cluster_ca_certificate = base64decode(aws_eks_cluster.this.certificate_authority.0.data) + token = data.aws_eks_cluster_auth.this.token +} \ No newline at end of file diff --git a/terraform/gpu/variables.tf b/terraform/gpu/variables.tf new file mode 100644 index 000000000..2bc2bec38 --- /dev/null +++ b/terraform/gpu/variables.tf @@ -0,0 +1,58 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: MIT + +variable "region" { + type = string + default = "us-west-2" +} + +variable "test_dir" { + type = string + default = "../../test/gpu" +} + +variable "addon_name" { + type = string + default = "amazon-cloudwatch-observability" +} + +variable "addon_version" { + type = string + default = "v1.6.0-eksbuild.1" +} + + +variable "cwagent_image_repo" { + type = string + default = "public.ecr.aws/cloudwatch-agent/cloudwatch-agent" +} + +variable "cwagent_image_tag" { + type = string + default = "latest" +} + +variable "k8s_version" { + type = string + default = "1.28" +} + +variable "ami_type" { + type = string + default = "AL2_x86_64_GPU" +} + +variable "instance_type" { + type = string + default = "g4dn.xlarge" +} + +variable "beta" { + type = bool + default = true +} + +variable "beta_endpoint" { + type = string + default = "https://api.beta.us-west-2.wesley.amazonaws.com" +} \ No newline at end of file diff --git a/test/metric/container_insights_util.go b/test/metric/container_insights_util.go index 140c839c1..ff0e2862d 100644 --- a/test/metric/container_insights_util.go +++ b/test/metric/container_insights_util.go @@ -58,6 +58,15 @@ func ValidateMetrics(env *environment.MetaData, metricFilter string, expectedDim } results = append(results, validateMetricsAvailability(dims, metrics, actual)) for _, m := range metrics { + // this is to prevent panic with rand.Intn when metrics are not yet ready in a cluster + if _, ok := actual[m]; !ok { + results = append(results, status.TestResult{ + Name: dims, + Status: status.FAILED, + }) + log.Printf("ValidateMetrics failed with missing metric: %s", m) + continue + } // pick a random dimension set to test metric data OR test all dimension sets which might be overkill randIdx := rand.Intn(len(actual[m])) results = append(results, validateMetricValue(m, actual[m][randIdx])) @@ -123,7 +132,7 @@ func validateMetricsAvailability(dims string, expected []string, actual map[stri Name: dims, Status: status.FAILED, } - log.Printf("expected metrics: %d, actual metrics: %d", len(expected), len(actual)) + log.Printf("expected metrics: %d, actual metrics: %d", len(expected), 3*len(actual)) if compareMetrics(expected, actual) { testResult.Status = status.SUCCESSFUL } else { From 8523e45475b5fef309e4352bd14e95af36d4d88b Mon Sep 17 00:00:00 2001 From: Paramadon Date: Mon, 20 May 2024 10:07:26 -0400 Subject: [PATCH 02/60] adding if statments to go test so failure can occur --- terraform/gpu/main.tf | 27 +++++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/terraform/gpu/main.tf b/terraform/gpu/main.tf index 92db23eb8..92f0b4382 100644 --- a/terraform/gpu/main.tf +++ b/terraform/gpu/main.tf @@ -131,16 +131,31 @@ resource "null_resource" "validator" { provisioner "local-exec" { command = < pods.txt + kubectl describe pods --all-namespaces > pods_describe.txt + # Log the contents of the files + cat pods.txt + cat pods_describe.txt + echo "Tests passed" + + else # Get all pods and describe them - kubectl get pods --all-namespaces -o wide > pods.txt - kubectl describe pods --all-namespaces > pods_describe.txt + kubectl get pods --all-namespaces -o wide > pods.txt + kubectl describe pods --all-namespaces > pods_describe.txt + + # Log the contents of the files + cat pods.txt + cat pods_describe.txt + echo "Tests failed" + exit 1 + fi + - # Log the contents of the files - cat pods.txt - cat pods_describe.txt EOT } } + From 3dcc4caf8d5d62982479024a49ba65b76d5dcb59 Mon Sep 17 00:00:00 2001 From: Paramadon Date: Mon, 20 May 2024 12:15:22 -0400 Subject: [PATCH 03/60] fixing tests --- generator/test_case_generator.go | 5 +++-- terraform/eks/daemon/gpu/main.tf | 4 ++-- terraform/eks/daemon/gpu/variables.tf | 2 +- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/generator/test_case_generator.go b/generator/test_case_generator.go index 017de5af8..c71be7c79 100644 --- a/generator/test_case_generator.go +++ b/generator/test_case_generator.go @@ -185,8 +185,9 @@ var testTypeToTestConfig = map[string][]testConfig{ }, "eks_daemon": { { - testDir: "./test/metric_value_benchmark", - targets: map[string]map[string]struct{}{"arc": {"amd64": {}}}, + testDir: "./test/metric_value_benchmark", + targets: map[string]map[string]struct{}{"arc": {"amd64": {}}}, + instanceType: "g4dn.xlarge", }, { testDir: "./test/metric_value_benchmark", diff --git a/terraform/eks/daemon/gpu/main.tf b/terraform/eks/daemon/gpu/main.tf index 86026fb72..658b78269 100644 --- a/terraform/eks/daemon/gpu/main.tf +++ b/terraform/eks/daemon/gpu/main.tf @@ -2,13 +2,13 @@ // SPDX-License-Identifier: MIT module "common" { - source = "../common" + source = "../../../common" cwagent_image_repo = var.cwagent_image_repo cwagent_image_tag = var.cwagent_image_tag } module "basic_components" { - source = "../basic_components" + source = "../../../basic_components" region = var.region } diff --git a/terraform/eks/daemon/gpu/variables.tf b/terraform/eks/daemon/gpu/variables.tf index 15602011e..26a0e6cd0 100644 --- a/terraform/eks/daemon/gpu/variables.tf +++ b/terraform/eks/daemon/gpu/variables.tf @@ -8,7 +8,7 @@ variable "region" { variable "test_dir" { type = string - default = "../test/gpu" + default = "./test/gpu" } variable "cwagent_image_repo" { From 7d84ad63e60a330bde7ef98f6d737d933c57b136 Mon Sep 17 00:00:00 2001 From: Paramadon Date: Mon, 20 May 2024 14:58:27 -0400 Subject: [PATCH 04/60] adding security --- terraform/gpu/main.tf | 152 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 152 insertions(+) diff --git a/terraform/gpu/main.tf b/terraform/gpu/main.tf index 92f0b4382..231e914a2 100644 --- a/terraform/gpu/main.tf +++ b/terraform/gpu/main.tf @@ -99,6 +99,158 @@ resource "aws_iam_role_policy_attachment" "node_CloudWatchAgentServerPolicy" { role = aws_iam_role.node_role.name } +# TODO: these security groups be created once and then reused +# EKS Cluster Security Group +resource "aws_security_group" "eks_cluster_sg" { + name = "cwagent-eks-cluster-sg-${module.common.testing_id}" + description = "Cluster communication with worker nodes" + vpc_id = module.basic_components.vpc_id +} + +resource "aws_security_group_rule" "cluster_inbound" { + description = "Allow worker nodes to communicate with the cluster API Server" + from_port = 443 + protocol = "tcp" + security_group_id = aws_security_group.eks_cluster_sg.id + source_security_group_id = aws_security_group.eks_nodes_sg.id + to_port = 443 + type = "ingress" +} + +resource "aws_security_group_rule" "cluster_outbound" { + description = "Allow cluster API Server to communicate with the worker nodes" + from_port = 1024 + protocol = "tcp" + security_group_id = aws_security_group.eks_cluster_sg.id + source_security_group_id = aws_security_group.eks_nodes_sg.id + to_port = 65535 + type = "egress" +} + + +# EKS Node Security Group +resource "aws_security_group" "eks_nodes_sg" { + name = "cwagent-eks-node-sg-${module.common.testing_id}" + description = "Security group for all nodes in the cluster" + vpc_id = module.basic_components.vpc_id + + egress { + from_port = 0 + to_port = 0 + protocol = "-1" + cidr_blocks = ["0.0.0.0/0"] + } +} + +resource "aws_security_group_rule" "nodes_internal" { + description = "Allow nodes to communicate with each other" + from_port = 0 + protocol = "-1" + security_group_id = aws_security_group.eks_nodes_sg.id + source_security_group_id = aws_security_group.eks_nodes_sg.id + to_port = 65535 + type = "ingress" +} + +resource "aws_security_group_rule" "nodes_cluster_inbound" { + description = "Allow worker Kubelets and pods to receive communication from the cluster control plane" + from_port = 1025 + protocol = "tcp" + security_group_id = aws_security_group.eks_nodes_sg.id + source_security_group_id = aws_security_group.eks_cluster_sg.id + to_port = 65535 + type = "ingress" +} + + +# create cert for communication between agent and dcgm +resource "tls_private_key" "private_key" { + algorithm = "RSA" +} + +resource "local_file" "ca_key" { + content = tls_private_key.private_key.private_key_pem + filename = "${path.module}/certs/ca.key" +} + +resource "tls_self_signed_cert" "ca_cert" { + private_key_pem = tls_private_key.private_key.private_key_pem + is_ca_certificate = true + subject { + common_name = "dcgm-exporter-service.amazon-cloudwatch.svc" + organization = "Amazon CloudWatch Agent" + } + validity_period_hours = 24 + allowed_uses = [ + "digital_signature", + "key_encipherment", + "cert_signing", + "crl_signing", + "server_auth", + "client_auth", + ] +} + +resource "local_file" "ca_cert_file" { + content = tls_self_signed_cert.ca_cert.cert_pem + filename = "${path.module}/certs/ca.cert" +} + +resource "tls_private_key" "server_private_key" { + algorithm = "RSA" +} + +resource "local_file" "server_key" { + content = tls_private_key.server_private_key.private_key_pem + filename = "${path.module}/certs/server.key" +} + +resource "tls_cert_request" "local_csr" { + private_key_pem = tls_private_key.server_private_key.private_key_pem + dns_names = ["localhost", "127.0.0.1", "dcgm-exporter-service.amazon-cloudwatch.svc"] + subject { + common_name = "dcgm-exporter-service.amazon-cloudwatch.svc" + organization = "Amazon CloudWatch Agent" + } +} + +resource "tls_locally_signed_cert" "server_cert" { + cert_request_pem = tls_cert_request.local_csr.cert_request_pem + ca_private_key_pem = tls_private_key.private_key.private_key_pem + ca_cert_pem = tls_self_signed_cert.ca_cert.cert_pem + validity_period_hours = 12 + allowed_uses = [ + "digital_signature", + "key_encipherment", + "server_auth", + "client_auth", + ] +} + +resource "local_file" "server_cert_file" { + content = tls_locally_signed_cert.server_cert.cert_pem + filename = "${path.module}/certs/server.cert" +} + +resource "kubernetes_secret" "agent_cert" { + metadata { + name = "amazon-cloudwatch-observability-agent-cert" + namespace = "amazon-cloudwatch" + } + data = { + "ca.crt" = tls_self_signed_cert.ca_cert.cert_pem #filebase64(local_file.ca_cert_file.filename) + "tls.crt" = tls_locally_signed_cert.server_cert.cert_pem #filebase64(local_file.server_cert_file.filename) + "tls.key" = tls_private_key.server_private_key.private_key_pem #filebase64(local_file.server_key.filename) + } +} + + +resource "kubernetes_namespace" "namespace" { + metadata { + name = "amazon-cloudwatch" + } +} + resource "null_resource" "kubectl" { depends_on = [ From 1c366e659e9fe9e82c4922b20ba2db2d33d04b12 Mon Sep 17 00:00:00 2001 From: Paramadon Date: Mon, 20 May 2024 15:44:29 -0400 Subject: [PATCH 05/60] fixing test --- terraform/eks/daemon/gpu2/main.tf | 378 +++++++++++++++++++++++++ terraform/eks/daemon/gpu2/providers.tf | 20 ++ terraform/eks/daemon/gpu2/variables.tf | 55 ++++ terraform/gpu/main.tf | 157 +--------- 4 files changed, 463 insertions(+), 147 deletions(-) create mode 100644 terraform/eks/daemon/gpu2/main.tf create mode 100644 terraform/eks/daemon/gpu2/providers.tf create mode 100644 terraform/eks/daemon/gpu2/variables.tf diff --git a/terraform/eks/daemon/gpu2/main.tf b/terraform/eks/daemon/gpu2/main.tf new file mode 100644 index 000000000..b9471fcf8 --- /dev/null +++ b/terraform/eks/daemon/gpu2/main.tf @@ -0,0 +1,378 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: MIT + +module "common" { + source = "../../../common" + cwagent_image_repo = var.cwagent_image_repo + cwagent_image_tag = var.cwagent_image_tag +} + +module "basic_components" { + source = "../../../basic_components" + + region = var.region +} + +data "aws_eks_cluster_auth" "this" { + name = aws_eks_cluster.this.name +} + +resource "aws_eks_cluster" "this" { + name = "cwagent-eks-integ-${module.common.testing_id}" + role_arn = module.basic_components.role_arn + version = var.k8s_version + enabled_cluster_log_types = [ + "api", + "audit", + "authenticator", + "controllerManager", + "scheduler" + ] + vpc_config { + subnet_ids = module.basic_components.public_subnet_ids + security_group_ids = [module.basic_components.security_group] + } +} + +# EKS Node Groups +resource "aws_eks_node_group" "this" { + cluster_name = aws_eks_cluster.this.name + node_group_name = "cwagent-eks-integ-node" + node_role_arn = aws_iam_role.node_role.arn + subnet_ids = module.basic_components.public_subnet_ids + + scaling_config { + desired_size = 1 + max_size = 1 + min_size = 1 + } + + ami_type = var.ami_type + capacity_type = "ON_DEMAND" + disk_size = 20 + instance_types = [var.instance_type] + + depends_on = [ + aws_iam_role_policy_attachment.node_AmazonEC2ContainerRegistryReadOnly, + aws_iam_role_policy_attachment.node_AmazonEKS_CNI_Policy, + aws_iam_role_policy_attachment.node_AmazonEKSWorkerNodePolicy, + aws_iam_role_policy_attachment.node_CloudWatchAgentServerPolicy + ] +} + +# EKS Node IAM Role +resource "aws_iam_role" "node_role" { + name = "cwagent-eks-Worker-Role-${module.common.testing_id}" + assume_role_policy = jsonencode({ + Version = "2012-10-17", + Statement = [ + { + Effect = "Allow", + Principal = { + Service = "ec2.amazonaws.com" + }, + Action = "sts:AssumeRole" + } + ] + }) + +} + +resource "aws_iam_role_policy_attachment" "node_AmazonEKSWorkerNodePolicy" { + policy_arn = "arn:aws:iam::aws:policy/AmazonEKSWorkerNodePolicy" + role = aws_iam_role.node_role.name +} + +resource "aws_iam_role_policy_attachment" "node_AmazonEKS_CNI_Policy" { + policy_arn = "arn:aws:iam::aws:policy/AmazonEKS_CNI_Policy" + role = aws_iam_role.node_role.name +} + +resource "aws_iam_role_policy_attachment" "node_AmazonEC2ContainerRegistryReadOnly" { + policy_arn = "arn:aws:iam::aws:policy/AmazonEC2ContainerRegistryReadOnly" + role = aws_iam_role.node_role.name +} + +resource "aws_iam_role_policy_attachment" "node_CloudWatchAgentServerPolicy" { + policy_arn = "arn:aws:iam::aws:policy/CloudWatchAgentServerPolicy" + role = aws_iam_role.node_role.name +} + +# TODO: these security groups be created once and then reused +# EKS Cluster Security Group +resource "aws_security_group" "eks_cluster_sg" { + name = "cwagent-eks-cluster-sg-${module.common.testing_id}" + description = "Cluster communication with worker nodes" + vpc_id = module.basic_components.vpc_id +} + +resource "aws_security_group_rule" "cluster_inbound" { + description = "Allow worker nodes to communicate with the cluster API Server" + from_port = 443 + protocol = "tcp" + security_group_id = aws_security_group.eks_cluster_sg.id + source_security_group_id = aws_security_group.eks_nodes_sg.id + to_port = 443 + type = "ingress" +} + +resource "aws_security_group_rule" "cluster_outbound" { + description = "Allow cluster API Server to communicate with the worker nodes" + from_port = 1024 + protocol = "tcp" + security_group_id = aws_security_group.eks_cluster_sg.id + source_security_group_id = aws_security_group.eks_nodes_sg.id + to_port = 65535 + type = "egress" +} + + +# EKS Node Security Group +resource "aws_security_group" "eks_nodes_sg" { + name = "cwagent-eks-node-sg-${module.common.testing_id}" + description = "Security group for all nodes in the cluster" + vpc_id = module.basic_components.vpc_id + + egress { + from_port = 0 + to_port = 0 + protocol = "-1" + cidr_blocks = ["0.0.0.0/0"] + } +} + +resource "aws_security_group_rule" "nodes_internal" { + description = "Allow nodes to communicate with each other" + from_port = 0 + protocol = "-1" + security_group_id = aws_security_group.eks_nodes_sg.id + source_security_group_id = aws_security_group.eks_nodes_sg.id + to_port = 65535 + type = "ingress" +} + +resource "aws_security_group_rule" "nodes_cluster_inbound" { + description = "Allow worker Kubelets and pods to receive communication from the cluster control plane" + from_port = 1025 + protocol = "tcp" + security_group_id = aws_security_group.eks_nodes_sg.id + source_security_group_id = aws_security_group.eks_cluster_sg.id + to_port = 65535 + type = "ingress" +} + + +# create cert for communication between agent and dcgm +resource "tls_private_key" "private_key" { + algorithm = "RSA" +} + +resource "local_file" "ca_key" { + content = tls_private_key.private_key.private_key_pem + filename = "${path.module}/certs/ca.key" +} + +resource "tls_self_signed_cert" "ca_cert" { + private_key_pem = tls_private_key.private_key.private_key_pem + is_ca_certificate = true + subject { + common_name = "dcgm-exporter-service.amazon-cloudwatch.svc" + organization = "Amazon CloudWatch Agent" + } + validity_period_hours = 24 + allowed_uses = [ + "digital_signature", + "key_encipherment", + "cert_signing", + "crl_signing", + "server_auth", + "client_auth", + ] +} + +resource "local_file" "ca_cert_file" { + content = tls_self_signed_cert.ca_cert.cert_pem + filename = "${path.module}/certs/ca.cert" +} + +resource "tls_private_key" "server_private_key" { + algorithm = "RSA" +} + +resource "local_file" "server_key" { + content = tls_private_key.server_private_key.private_key_pem + filename = "${path.module}/certs/server.key" +} + +resource "tls_cert_request" "local_csr" { + private_key_pem = tls_private_key.server_private_key.private_key_pem + dns_names = ["localhost", "127.0.0.1", "dcgm-exporter-service.amazon-cloudwatch.svc"] + subject { + common_name = "dcgm-exporter-service.amazon-cloudwatch.svc" + organization = "Amazon CloudWatch Agent" + } +} + +resource "tls_locally_signed_cert" "server_cert" { + cert_request_pem = tls_cert_request.local_csr.cert_request_pem + ca_private_key_pem = tls_private_key.private_key.private_key_pem + ca_cert_pem = tls_self_signed_cert.ca_cert.cert_pem + validity_period_hours = 12 + allowed_uses = [ + "digital_signature", + "key_encipherment", + "server_auth", + "client_auth", + ] +} + +resource "local_file" "server_cert_file" { + content = tls_locally_signed_cert.server_cert.cert_pem + filename = "${path.module}/certs/server.cert" +} + +resource "kubernetes_secret" "agent_cert" { + metadata { + name = "amazon-cloudwatch-observability-agent-cert" + namespace = "amazon-cloudwatch" + } + data = { + "ca.crt" = tls_self_signed_cert.ca_cert.cert_pem #filebase64(local_file.ca_cert_file.filename) + "tls.crt" = tls_locally_signed_cert.server_cert.cert_pem #filebase64(local_file.server_cert_file.filename) + "tls.key" = tls_private_key.server_private_key.private_key_pem #filebase64(local_file.server_key.filename) + } +} + + +resource "kubernetes_namespace" "namespace" { + metadata { + name = "amazon-cloudwatch" + } +} + + + + + + +########################################## +# Template Files +########################################## +locals { + httpd_config = "../../../../${var.test_dir}/resources/httpd.conf" + httpd_ssl_config = "../../../../${var.test_dir}/resources/httpd-ssl.conf" + cwagent_config = fileexists("../../../../${var.test_dir}/resources/config.json") ? "../../../../${var.test_dir}/resources/config.json" : "../default_resources/default_amazon_cloudwatch_agent.json" +} + +data "template_file" "cwagent_config" { + template = file(local.cwagent_config) + vars = { + } +} + +resource "kubernetes_config_map" "cwagentconfig" { + depends_on = [ + kubernetes_namespace.namespace, + kubernetes_service_account.cwagentservice + ] + metadata { + name = "cwagentconfig" + namespace = "amazon-cloudwatch" + } + data = { + "cwagentconfig.json" : data.template_file.cwagent_config.rendered + } +} + +data "template_file" "httpd_config" { + template = file(local.httpd_config) + vars = {} +} +data "template_file" "httpd_ssl_config" { + template = file(local.httpd_ssl_config) + vars = {} +} + + + + + +resource "kubernetes_cluster_role" "clusterrole" { + depends_on = [kubernetes_namespace.namespace] + metadata { + name = "cloudwatch-agent-role" + } + rule { + verbs = ["get", "list", "watch"] + resources = ["pods", "pods/logs", "nodes", "nodes/proxy", "namespaces", "endpoints"] + api_groups = [""] + } + rule { + verbs = ["list", "watch"] + resources = ["replicasets"] + api_groups = ["apps"] + } + rule { + verbs = ["list", "watch"] + resources = ["jobs"] + api_groups = ["batch"] + } + rule { + verbs = ["get"] + resources = ["nodes/proxy"] + api_groups = [""] + } + rule { + verbs = ["create"] + resources = ["nodes/stats", "configmaps", "events"] + api_groups = [""] + } + rule { + verbs = ["get", "update"] + resource_names = ["cwagent-clusterleader"] + resources = ["configmaps"] + api_groups = [""] + } + rule { + verbs = ["list", "watch"] + resources = ["services"] + api_groups = [""] + } + rule { + non_resource_urls = ["/metrics"] + verbs = ["get", "list", "watch"] + } +} + +resource "kubernetes_cluster_role_binding" "rolebinding" { + depends_on = [kubernetes_namespace.namespace] + metadata { + name = "cloudwatch-agent-role-binding" + } + role_ref { + api_group = "rbac.authorization.k8s.io" + kind = "ClusterRole" + name = "cloudwatch-agent-role" + } + subject { + kind = "ServiceAccount" + name = "cloudwatch-agent" + namespace = "amazon-cloudwatch" + } +} + +resource "null_resource" "validator" { + depends_on = [ + aws_eks_node_group.this, + kubernetes_daemonset.service, + kubernetes_cluster_role_binding.rolebinding, + kubernetes_service_account.cwagentservice, + ] + provisioner "local-exec" { + command = <<-EOT + echo "Validating EKS metrics/logs for EMF" + cd ../../../.. + go test ${var.test_dir} -eksClusterName=${aws_eks_cluster.this.name} -computeType=EKS -v -eksDeploymentStrategy=DAEMON -eksGpuType=nvidia + EOT + } +} diff --git a/terraform/eks/daemon/gpu2/providers.tf b/terraform/eks/daemon/gpu2/providers.tf new file mode 100644 index 000000000..205375027 --- /dev/null +++ b/terraform/eks/daemon/gpu2/providers.tf @@ -0,0 +1,20 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: MIT + +provider "aws" { + region = var.region + endpoints { + eks = var.beta ? var.beta_endpoint : null + } +} + +provider "kubernetes" { + exec { + api_version = "client.authentication.k8s.io/v1beta1" + command = "aws" + args = ["eks", "get-token", "--cluster-name", aws_eks_cluster.this.name] + } + host = aws_eks_cluster.this.endpoint + cluster_ca_certificate = base64decode(aws_eks_cluster.this.certificate_authority.0.data) + token = data.aws_eks_cluster_auth.this.token +} \ No newline at end of file diff --git a/terraform/eks/daemon/gpu2/variables.tf b/terraform/eks/daemon/gpu2/variables.tf new file mode 100644 index 000000000..a5b789e8d --- /dev/null +++ b/terraform/eks/daemon/gpu2/variables.tf @@ -0,0 +1,55 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: MIT + +variable "region" { + type = string + default = "us-west-2" +} + +variable "test_dir" { + type = string + default = "./test/gpu" +} + +variable "cwagent_image_repo" { + type = string + default = "public.ecr.aws/cloudwatch-agent/cloudwatch-agent" +} + +variable "cwagent_image_tag" { + type = string + default = "latest" +} + +variable "k8s_version" { + type = string + default = "1.28" +} + +variable "ami_type" { + type = string + default = "AL2_x86_64_GPU" +} + +variable "instance_type" { + type = string + default = "g4dn.xlarge" +} + +variable "beta" { + type = bool + default = true +} + +variable "beta_endpoint" { + type = string + default = "https://api.beta.us-west-2.wesley.amazonaws.com" +} +variable "addon_name" { + type = string + default = "amazon-cloudwatch-observability" +} +variable "addon_version" { + type = string + default = "v1.6.0-eksbuild.1" +} \ No newline at end of file diff --git a/terraform/gpu/main.tf b/terraform/gpu/main.tf index 231e914a2..e37749d82 100644 --- a/terraform/gpu/main.tf +++ b/terraform/gpu/main.tf @@ -99,155 +99,18 @@ resource "aws_iam_role_policy_attachment" "node_CloudWatchAgentServerPolicy" { role = aws_iam_role.node_role.name } -# TODO: these security groups be created once and then reused -# EKS Cluster Security Group -resource "aws_security_group" "eks_cluster_sg" { - name = "cwagent-eks-cluster-sg-${module.common.testing_id}" - description = "Cluster communication with worker nodes" - vpc_id = module.basic_components.vpc_id -} - -resource "aws_security_group_rule" "cluster_inbound" { - description = "Allow worker nodes to communicate with the cluster API Server" - from_port = 443 - protocol = "tcp" - security_group_id = aws_security_group.eks_cluster_sg.id - source_security_group_id = aws_security_group.eks_nodes_sg.id - to_port = 443 - type = "ingress" -} - -resource "aws_security_group_rule" "cluster_outbound" { - description = "Allow cluster API Server to communicate with the worker nodes" - from_port = 1024 - protocol = "tcp" - security_group_id = aws_security_group.eks_cluster_sg.id - source_security_group_id = aws_security_group.eks_nodes_sg.id - to_port = 65535 - type = "egress" -} - - -# EKS Node Security Group -resource "aws_security_group" "eks_nodes_sg" { - name = "cwagent-eks-node-sg-${module.common.testing_id}" - description = "Security group for all nodes in the cluster" - vpc_id = module.basic_components.vpc_id - - egress { - from_port = 0 - to_port = 0 - protocol = "-1" - cidr_blocks = ["0.0.0.0/0"] - } -} - -resource "aws_security_group_rule" "nodes_internal" { - description = "Allow nodes to communicate with each other" - from_port = 0 - protocol = "-1" - security_group_id = aws_security_group.eks_nodes_sg.id - source_security_group_id = aws_security_group.eks_nodes_sg.id - to_port = 65535 - type = "ingress" -} - -resource "aws_security_group_rule" "nodes_cluster_inbound" { - description = "Allow worker Kubelets and pods to receive communication from the cluster control plane" - from_port = 1025 - protocol = "tcp" - security_group_id = aws_security_group.eks_nodes_sg.id - source_security_group_id = aws_security_group.eks_cluster_sg.id - to_port = 65535 - type = "ingress" -} - - -# create cert for communication between agent and dcgm -resource "tls_private_key" "private_key" { - algorithm = "RSA" -} -resource "local_file" "ca_key" { - content = tls_private_key.private_key.private_key_pem - filename = "${path.module}/certs/ca.key" -} - -resource "tls_self_signed_cert" "ca_cert" { - private_key_pem = tls_private_key.private_key.private_key_pem - is_ca_certificate = true - subject { - common_name = "dcgm-exporter-service.amazon-cloudwatch.svc" - organization = "Amazon CloudWatch Agent" - } - validity_period_hours = 24 - allowed_uses = [ - "digital_signature", - "key_encipherment", - "cert_signing", - "crl_signing", - "server_auth", - "client_auth", - ] -} - -resource "local_file" "ca_cert_file" { - content = tls_self_signed_cert.ca_cert.cert_pem - filename = "${path.module}/certs/ca.cert" -} - -resource "tls_private_key" "server_private_key" { - algorithm = "RSA" -} - -resource "local_file" "server_key" { - content = tls_private_key.server_private_key.private_key_pem - filename = "${path.module}/certs/server.key" -} - -resource "tls_cert_request" "local_csr" { - private_key_pem = tls_private_key.server_private_key.private_key_pem - dns_names = ["localhost", "127.0.0.1", "dcgm-exporter-service.amazon-cloudwatch.svc"] - subject { - common_name = "dcgm-exporter-service.amazon-cloudwatch.svc" - organization = "Amazon CloudWatch Agent" - } -} - -resource "tls_locally_signed_cert" "server_cert" { - cert_request_pem = tls_cert_request.local_csr.cert_request_pem - ca_private_key_pem = tls_private_key.private_key.private_key_pem - ca_cert_pem = tls_self_signed_cert.ca_cert.cert_pem - validity_period_hours = 12 - allowed_uses = [ - "digital_signature", - "key_encipherment", - "server_auth", - "client_auth", +resource "null_resource" "kubectl" { + depends_on = [ + aws_eks_cluster.this, + aws_eks_node_group.this ] -} - -resource "local_file" "server_cert_file" { - content = tls_locally_signed_cert.server_cert.cert_pem - filename = "${path.module}/certs/server.cert" -} - -resource "kubernetes_secret" "agent_cert" { - metadata { - name = "amazon-cloudwatch-observability-agent-cert" - namespace = "amazon-cloudwatch" - } - data = { - "ca.crt" = tls_self_signed_cert.ca_cert.cert_pem #filebase64(local_file.ca_cert_file.filename) - "tls.crt" = tls_locally_signed_cert.server_cert.cert_pem #filebase64(local_file.server_cert_file.filename) - "tls.key" = tls_private_key.server_private_key.private_key_pem #filebase64(local_file.server_key.filename) - } -} - - -resource "kubernetes_namespace" "namespace" { - metadata { - name = "amazon-cloudwatch" + provisioner "local-exec" { + command = <<-EOT + ${local.aws_eks} update-kubeconfig --name ${aws_eks_cluster.this.name} + ${local.aws_eks} list-clusters --output text + ${local.aws_eks} describe-cluster --name ${aws_eks_cluster.this.name} --output text + EOT } } From 3dbcb3ac1d31998ccc885eca2f7a455ad6d3c7ee Mon Sep 17 00:00:00 2001 From: Paramadon Date: Mon, 20 May 2024 15:49:43 -0400 Subject: [PATCH 06/60] adding locals --- terraform/eks/daemon/gpu2/main.tf | 47 +++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/terraform/eks/daemon/gpu2/main.tf b/terraform/eks/daemon/gpu2/main.tf index b9471fcf8..6ba09d866 100644 --- a/terraform/eks/daemon/gpu2/main.tf +++ b/terraform/eks/daemon/gpu2/main.tf @@ -262,6 +262,8 @@ locals { httpd_config = "../../../../${var.test_dir}/resources/httpd.conf" httpd_ssl_config = "../../../../${var.test_dir}/resources/httpd-ssl.conf" cwagent_config = fileexists("../../../../${var.test_dir}/resources/config.json") ? "../../../../${var.test_dir}/resources/config.json" : "../default_resources/default_amazon_cloudwatch_agent.json" + role_arn = format("%s%s", module.basic_components.role_arn, var.beta ? "-eks-beta" : "") + aws_eks = format("%s%s", "aws eks --region ${var.region}", var.beta ? " --endpoint ${var.beta_endpoint}" : "") } data "template_file" "cwagent_config" { @@ -361,6 +363,51 @@ resource "kubernetes_cluster_role_binding" "rolebinding" { } } + +resource "null_resource" "kubectl" { + depends_on = [ + aws_eks_cluster.this, + aws_eks_node_group.this + ] + provisioner "local-exec" { + command = <<-EOT + ${local.aws_eks} update-kubeconfig --name ${aws_eks_cluster.this.name} + ${local.aws_eks} list-clusters --output text + ${local.aws_eks} describe-cluster --name ${aws_eks_cluster.this.name} --output text + EOT + } +} + + +resource "null_resource" "kubectl" { + depends_on = [ + aws_eks_cluster.this, + aws_eks_node_group.this + ] + provisioner "local-exec" { + command = <<-EOT + ${local.aws_eks} update-kubeconfig --name ${aws_eks_cluster.this.name} + ${local.aws_eks} list-clusters --output text + ${local.aws_eks} describe-cluster --name ${aws_eks_cluster.this.name} --output text + EOT + } +} + +resource "aws_eks_addon" "this" { + depends_on = [ + null_resource.kubectl + ] + addon_name = var.addon_name + cluster_name = aws_eks_cluster.this.name + addon_version = var.addon_version +} + +resource "null_resource" "validator" { + depends_on = [ + aws_eks_node_group.this, + aws_eks_addon.this + ] + resource "null_resource" "validator" { depends_on = [ aws_eks_node_group.this, From 8c30284fa8e3c1df7d78561610bb8bf0bfc6e1fe Mon Sep 17 00:00:00 2001 From: Paramadon Date: Mon, 20 May 2024 15:51:07 -0400 Subject: [PATCH 07/60] making test --- terraform/eks/daemon/gpu3/main.tf | 444 +++++++++++++++++++++++++ terraform/eks/daemon/gpu3/providers.tf | 20 ++ terraform/eks/daemon/gpu3/variables.tf | 55 +++ 3 files changed, 519 insertions(+) create mode 100644 terraform/eks/daemon/gpu3/main.tf create mode 100644 terraform/eks/daemon/gpu3/providers.tf create mode 100644 terraform/eks/daemon/gpu3/variables.tf diff --git a/terraform/eks/daemon/gpu3/main.tf b/terraform/eks/daemon/gpu3/main.tf new file mode 100644 index 000000000..b0381a8ac --- /dev/null +++ b/terraform/eks/daemon/gpu3/main.tf @@ -0,0 +1,444 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: MIT + +module "common" { + source = "../../../common" + cwagent_image_repo = var.cwagent_image_repo + cwagent_image_tag = var.cwagent_image_tag +} + +module "basic_components" { + source = "../../../basic_components" + + region = var.region +} + +data "aws_eks_cluster_auth" "this" { + name = aws_eks_cluster.this.name +} + +resource "aws_eks_cluster" "this" { + name = "cwagent-eks-integ-${module.common.testing_id}" + role_arn = module.basic_components.role_arn + version = var.k8s_version + enabled_cluster_log_types = [ + "api", + "audit", + "authenticator", + "controllerManager", + "scheduler" + ] + vpc_config { + subnet_ids = module.basic_components.public_subnet_ids + security_group_ids = [module.basic_components.security_group] + } +} + +# EKS Node Groups +resource "aws_eks_node_group" "this" { + cluster_name = aws_eks_cluster.this.name + node_group_name = "cwagent-eks-integ-node" + node_role_arn = aws_iam_role.node_role.arn + subnet_ids = module.basic_components.public_subnet_ids + + scaling_config { + desired_size = 1 + max_size = 1 + min_size = 1 + } + + ami_type = var.ami_type + capacity_type = "ON_DEMAND" + disk_size = 20 + instance_types = [var.instance_type] + + depends_on = [ + aws_iam_role_policy_attachment.node_AmazonEC2ContainerRegistryReadOnly, + aws_iam_role_policy_attachment.node_AmazonEKS_CNI_Policy, + aws_iam_role_policy_attachment.node_AmazonEKSWorkerNodePolicy, + aws_iam_role_policy_attachment.node_CloudWatchAgentServerPolicy + ] +} + +# EKS Node IAM Role +resource "aws_iam_role" "node_role" { + name = "cwagent-eks-Worker-Role-${module.common.testing_id}" + assume_role_policy = jsonencode({ + Version = "2012-10-17", + Statement = [ + { + Effect = "Allow", + Principal = { + Service = "ec2.amazonaws.com" + }, + Action = "sts:AssumeRole" + } + ] + }) + +} + +resource "aws_iam_role_policy_attachment" "node_AmazonEKSWorkerNodePolicy" { + policy_arn = "arn:aws:iam::aws:policy/AmazonEKSWorkerNodePolicy" + role = aws_iam_role.node_role.name +} + +resource "aws_iam_role_policy_attachment" "node_AmazonEKS_CNI_Policy" { + policy_arn = "arn:aws:iam::aws:policy/AmazonEKS_CNI_Policy" + role = aws_iam_role.node_role.name +} + +resource "aws_iam_role_policy_attachment" "node_AmazonEC2ContainerRegistryReadOnly" { + policy_arn = "arn:aws:iam::aws:policy/AmazonEC2ContainerRegistryReadOnly" + role = aws_iam_role.node_role.name +} + +resource "aws_iam_role_policy_attachment" "node_CloudWatchAgentServerPolicy" { + policy_arn = "arn:aws:iam::aws:policy/CloudWatchAgentServerPolicy" + role = aws_iam_role.node_role.name +} + +# TODO: these security groups be created once and then reused +# EKS Cluster Security Group +resource "aws_security_group" "eks_cluster_sg" { + name = "cwagent-eks-cluster-sg-${module.common.testing_id}" + description = "Cluster communication with worker nodes" + vpc_id = module.basic_components.vpc_id +} + +resource "aws_security_group_rule" "cluster_inbound" { + description = "Allow worker nodes to communicate with the cluster API Server" + from_port = 443 + protocol = "tcp" + security_group_id = aws_security_group.eks_cluster_sg.id + source_security_group_id = aws_security_group.eks_nodes_sg.id + to_port = 443 + type = "ingress" +} + +resource "aws_security_group_rule" "cluster_outbound" { + description = "Allow cluster API Server to communicate with the worker nodes" + from_port = 1024 + protocol = "tcp" + security_group_id = aws_security_group.eks_cluster_sg.id + source_security_group_id = aws_security_group.eks_nodes_sg.id + to_port = 65535 + type = "egress" +} + + +# EKS Node Security Group +resource "aws_security_group" "eks_nodes_sg" { + name = "cwagent-eks-node-sg-${module.common.testing_id}" + description = "Security group for all nodes in the cluster" + vpc_id = module.basic_components.vpc_id + + egress { + from_port = 0 + to_port = 0 + protocol = "-1" + cidr_blocks = ["0.0.0.0/0"] + } +} + +resource "aws_security_group_rule" "nodes_internal" { + description = "Allow nodes to communicate with each other" + from_port = 0 + protocol = "-1" + security_group_id = aws_security_group.eks_nodes_sg.id + source_security_group_id = aws_security_group.eks_nodes_sg.id + to_port = 65535 + type = "ingress" +} + +resource "aws_security_group_rule" "nodes_cluster_inbound" { + description = "Allow worker Kubelets and pods to receive communication from the cluster control plane" + from_port = 1025 + protocol = "tcp" + security_group_id = aws_security_group.eks_nodes_sg.id + source_security_group_id = aws_security_group.eks_cluster_sg.id + to_port = 65535 + type = "ingress" +} + + +# create cert for communication between agent and dcgm +resource "tls_private_key" "private_key" { + algorithm = "RSA" +} + +resource "local_file" "ca_key" { + content = tls_private_key.private_key.private_key_pem + filename = "${path.module}/certs/ca.key" +} + +resource "tls_self_signed_cert" "ca_cert" { + private_key_pem = tls_private_key.private_key.private_key_pem + is_ca_certificate = true + subject { + common_name = "dcgm-exporter-service.amazon-cloudwatch.svc" + organization = "Amazon CloudWatch Agent" + } + validity_period_hours = 24 + allowed_uses = [ + "digital_signature", + "key_encipherment", + "cert_signing", + "crl_signing", + "server_auth", + "client_auth", + ] +} + +resource "local_file" "ca_cert_file" { + content = tls_self_signed_cert.ca_cert.cert_pem + filename = "${path.module}/certs/ca.cert" +} + +resource "tls_private_key" "server_private_key" { + algorithm = "RSA" +} + +resource "local_file" "server_key" { + content = tls_private_key.server_private_key.private_key_pem + filename = "${path.module}/certs/server.key" +} + +resource "tls_cert_request" "local_csr" { + private_key_pem = tls_private_key.server_private_key.private_key_pem + dns_names = ["localhost", "127.0.0.1", "dcgm-exporter-service.amazon-cloudwatch.svc"] + subject { + common_name = "dcgm-exporter-service.amazon-cloudwatch.svc" + organization = "Amazon CloudWatch Agent" + } +} + +resource "tls_locally_signed_cert" "server_cert" { + cert_request_pem = tls_cert_request.local_csr.cert_request_pem + ca_private_key_pem = tls_private_key.private_key.private_key_pem + ca_cert_pem = tls_self_signed_cert.ca_cert.cert_pem + validity_period_hours = 12 + allowed_uses = [ + "digital_signature", + "key_encipherment", + "server_auth", + "client_auth", + ] +} + +resource "local_file" "server_cert_file" { + content = tls_locally_signed_cert.server_cert.cert_pem + filename = "${path.module}/certs/server.cert" +} + +resource "kubernetes_secret" "agent_cert" { + metadata { + name = "amazon-cloudwatch-observability-agent-cert" + namespace = "amazon-cloudwatch" + } + data = { + "ca.crt" = tls_self_signed_cert.ca_cert.cert_pem #filebase64(local_file.ca_cert_file.filename) + "tls.crt" = tls_locally_signed_cert.server_cert.cert_pem #filebase64(local_file.server_cert_file.filename) + "tls.key" = tls_private_key.server_private_key.private_key_pem #filebase64(local_file.server_key.filename) + } +} + + +resource "kubernetes_namespace" "namespace" { + metadata { + name = "amazon-cloudwatch" + } +} + + + + + + +########################################## +# Template Files +########################################## +locals { + httpd_config = "../../../../${var.test_dir}/resources/httpd.conf" + httpd_ssl_config = "../../../../${var.test_dir}/resources/httpd-ssl.conf" + cwagent_config = fileexists("../../../../${var.test_dir}/resources/config.json") ? "../../../../${var.test_dir}/resources/config.json" : "../default_resources/default_amazon_cloudwatch_agent.json" + role_arn = format("%s%s", module.basic_components.role_arn, var.beta ? "-eks-beta" : "") + aws_eks = format("%s%s", "aws eks --region ${var.region}", var.beta ? " --endpoint ${var.beta_endpoint}" : "") +} + +data "template_file" "cwagent_config" { + template = file(local.cwagent_config) + vars = { + } +} + +resource "kubernetes_config_map" "cwagentconfig" { + depends_on = [ + kubernetes_namespace.namespace, + kubernetes_service_account.cwagentservice + ] + metadata { + name = "cwagentconfig" + namespace = "amazon-cloudwatch" + } + data = { + "cwagentconfig.json" : data.template_file.cwagent_config.rendered + } +} + +data "template_file" "httpd_config" { + template = file(local.httpd_config) + vars = {} +} +data "template_file" "httpd_ssl_config" { + template = file(local.httpd_ssl_config) + vars = {} +} + + + + + +resource "kubernetes_cluster_role" "clusterrole" { + depends_on = [kubernetes_namespace.namespace] + metadata { + name = "cloudwatch-agent-role" + } + rule { + verbs = ["get", "list", "watch"] + resources = ["pods", "pods/logs", "nodes", "nodes/proxy", "namespaces", "endpoints"] + api_groups = [""] + } + rule { + verbs = ["list", "watch"] + resources = ["replicasets"] + api_groups = ["apps"] + } + rule { + verbs = ["list", "watch"] + resources = ["jobs"] + api_groups = ["batch"] + } + rule { + verbs = ["get"] + resources = ["nodes/proxy"] + api_groups = [""] + } + rule { + verbs = ["create"] + resources = ["nodes/stats", "configmaps", "events"] + api_groups = [""] + } + rule { + verbs = ["get", "update"] + resource_names = ["cwagent-clusterleader"] + resources = ["configmaps"] + api_groups = [""] + } + rule { + verbs = ["list", "watch"] + resources = ["services"] + api_groups = [""] + } + rule { + non_resource_urls = ["/metrics"] + verbs = ["get", "list", "watch"] + } +} + +resource "kubernetes_cluster_role_binding" "rolebinding" { + depends_on = [kubernetes_namespace.namespace] + metadata { + name = "cloudwatch-agent-role-binding" + } + role_ref { + api_group = "rbac.authorization.k8s.io" + kind = "ClusterRole" + name = "cloudwatch-agent-role" + } + subject { + kind = "ServiceAccount" + name = "cloudwatch-agent" + namespace = "amazon-cloudwatch" + } +} + + +resource "null_resource" "kubectl" { + depends_on = [ + aws_eks_cluster.this, + aws_eks_node_group.this + ] + provisioner "local-exec" { + command = <<-EOT + ${local.aws_eks} update-kubeconfig --name ${aws_eks_cluster.this.name} + ${local.aws_eks} list-clusters --output text + ${local.aws_eks} describe-cluster --name ${aws_eks_cluster.this.name} --output text + EOT + } +} + + +resource "null_resource" "kubectl" { + depends_on = [ + aws_eks_cluster.this, + aws_eks_node_group.this + ] + provisioner "local-exec" { + command = <<-EOT + ${local.aws_eks} update-kubeconfig --name ${aws_eks_cluster.this.name} + ${local.aws_eks} list-clusters --output text + ${local.aws_eks} describe-cluster --name ${aws_eks_cluster.this.name} --output text + EOT + } +} + +resource "aws_eks_addon" "this" { + depends_on = [ + null_resource.kubectl + ] + addon_name = var.addon_name + cluster_name = aws_eks_cluster.this.name + addon_version = var.addon_version +} + +resource "null_resource" "validator" { + depends_on = [ + aws_eks_node_group.this, + aws_eks_addon.this + ] + +resource "null_resource" "validator" { + depends_on = [ + aws_eks_node_group.this, + aws_eks_addon.this + ] + + provisioner "local-exec" { + command = < pods.txt + kubectl describe pods --all-namespaces > pods_describe.txt + + # Log the contents of the files + cat pods.txt + cat pods_describe.txt + echo "Tests passed" + + else + # Get all pods and describe them + kubectl get pods --all-namespaces -o wide > pods.txt + kubectl describe pods --all-namespaces > pods_describe.txt + + # Log the contents of the files + cat pods.txt + cat pods_describe.txt + echo "Tests failed" + exit 1 + fi + + + EOT + } +} diff --git a/terraform/eks/daemon/gpu3/providers.tf b/terraform/eks/daemon/gpu3/providers.tf new file mode 100644 index 000000000..205375027 --- /dev/null +++ b/terraform/eks/daemon/gpu3/providers.tf @@ -0,0 +1,20 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: MIT + +provider "aws" { + region = var.region + endpoints { + eks = var.beta ? var.beta_endpoint : null + } +} + +provider "kubernetes" { + exec { + api_version = "client.authentication.k8s.io/v1beta1" + command = "aws" + args = ["eks", "get-token", "--cluster-name", aws_eks_cluster.this.name] + } + host = aws_eks_cluster.this.endpoint + cluster_ca_certificate = base64decode(aws_eks_cluster.this.certificate_authority.0.data) + token = data.aws_eks_cluster_auth.this.token +} \ No newline at end of file diff --git a/terraform/eks/daemon/gpu3/variables.tf b/terraform/eks/daemon/gpu3/variables.tf new file mode 100644 index 000000000..a5b789e8d --- /dev/null +++ b/terraform/eks/daemon/gpu3/variables.tf @@ -0,0 +1,55 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: MIT + +variable "region" { + type = string + default = "us-west-2" +} + +variable "test_dir" { + type = string + default = "./test/gpu" +} + +variable "cwagent_image_repo" { + type = string + default = "public.ecr.aws/cloudwatch-agent/cloudwatch-agent" +} + +variable "cwagent_image_tag" { + type = string + default = "latest" +} + +variable "k8s_version" { + type = string + default = "1.28" +} + +variable "ami_type" { + type = string + default = "AL2_x86_64_GPU" +} + +variable "instance_type" { + type = string + default = "g4dn.xlarge" +} + +variable "beta" { + type = bool + default = true +} + +variable "beta_endpoint" { + type = string + default = "https://api.beta.us-west-2.wesley.amazonaws.com" +} +variable "addon_name" { + type = string + default = "amazon-cloudwatch-observability" +} +variable "addon_version" { + type = string + default = "v1.6.0-eksbuild.1" +} \ No newline at end of file From ad3663d68661a979eb31bc8c01fc7d587d0e6e03 Mon Sep 17 00:00:00 2001 From: Paramadon Date: Tue, 21 May 2024 11:24:04 -0400 Subject: [PATCH 08/60] adding to test --- terraform/eks/daemon/gpu/main.tf | 1 + terraform/eks/daemon/gpu2/main.tf | 50 ++++++------------------------- terraform/eks/daemon/gpu3/main.tf | 40 ++++--------------------- terraform/eks/daemon/script.sh | 28 +++++++++++++++++ terraform/gpu/main.tf | 14 --------- 5 files changed, 44 insertions(+), 89 deletions(-) create mode 100644 terraform/eks/daemon/script.sh diff --git a/terraform/eks/daemon/gpu/main.tf b/terraform/eks/daemon/gpu/main.tf index 658b78269..146029c46 100644 --- a/terraform/eks/daemon/gpu/main.tf +++ b/terraform/eks/daemon/gpu/main.tf @@ -712,6 +712,7 @@ resource "null_resource" "validator" { provisioner "local-exec" { command = <<-EOT echo "Validating EKS metrics/logs for EMF" + kubectl apply -f cd ../../../.. go test ${var.test_dir} -eksClusterName=${aws_eks_cluster.this.name} -computeType=EKS -v -eksDeploymentStrategy=DAEMON -eksGpuType=nvidia EOT diff --git a/terraform/eks/daemon/gpu2/main.tf b/terraform/eks/daemon/gpu2/main.tf index 6ba09d866..db8e5f960 100644 --- a/terraform/eks/daemon/gpu2/main.tf +++ b/terraform/eks/daemon/gpu2/main.tf @@ -272,19 +272,6 @@ data "template_file" "cwagent_config" { } } -resource "kubernetes_config_map" "cwagentconfig" { - depends_on = [ - kubernetes_namespace.namespace, - kubernetes_service_account.cwagentservice - ] - metadata { - name = "cwagentconfig" - namespace = "amazon-cloudwatch" - } - data = { - "cwagentconfig.json" : data.template_file.cwagent_config.rendered - } -} data "template_file" "httpd_config" { template = file(local.httpd_config) @@ -364,19 +351,6 @@ resource "kubernetes_cluster_role_binding" "rolebinding" { } -resource "null_resource" "kubectl" { - depends_on = [ - aws_eks_cluster.this, - aws_eks_node_group.this - ] - provisioner "local-exec" { - command = <<-EOT - ${local.aws_eks} update-kubeconfig --name ${aws_eks_cluster.this.name} - ${local.aws_eks} list-clusters --output text - ${local.aws_eks} describe-cluster --name ${aws_eks_cluster.this.name} --output text - EOT - } -} resource "null_resource" "kubectl" { @@ -402,24 +376,18 @@ resource "aws_eks_addon" "this" { addon_version = var.addon_version } -resource "null_resource" "validator" { - depends_on = [ - aws_eks_node_group.this, - aws_eks_addon.this - ] -resource "null_resource" "validator" { - depends_on = [ - aws_eks_node_group.this, - kubernetes_daemonset.service, - kubernetes_cluster_role_binding.rolebinding, - kubernetes_service_account.cwagentservice, - ] - provisioner "local-exec" { - command = <<-EOT + resource "null_resource" "validator" { + depends_on = [ + aws_eks_node_group.this, + kubernetes_cluster_role_binding.rolebinding, + ] + provisioner "local-exec" { + command = <<-EOT echo "Validating EKS metrics/logs for EMF" + kubectl apply -f ../script.sh cd ../../../.. go test ${var.test_dir} -eksClusterName=${aws_eks_cluster.this.name} -computeType=EKS -v -eksDeploymentStrategy=DAEMON -eksGpuType=nvidia EOT + } } -} diff --git a/terraform/eks/daemon/gpu3/main.tf b/terraform/eks/daemon/gpu3/main.tf index b0381a8ac..a52e7f0c0 100644 --- a/terraform/eks/daemon/gpu3/main.tf +++ b/terraform/eks/daemon/gpu3/main.tf @@ -272,19 +272,7 @@ data "template_file" "cwagent_config" { } } -resource "kubernetes_config_map" "cwagentconfig" { - depends_on = [ - kubernetes_namespace.namespace, - kubernetes_service_account.cwagentservice - ] - metadata { - name = "cwagentconfig" - namespace = "amazon-cloudwatch" - } - data = { - "cwagentconfig.json" : data.template_file.cwagent_config.rendered - } -} + data "template_file" "httpd_config" { template = file(local.httpd_config) @@ -364,19 +352,6 @@ resource "kubernetes_cluster_role_binding" "rolebinding" { } -resource "null_resource" "kubectl" { - depends_on = [ - aws_eks_cluster.this, - aws_eks_node_group.this - ] - provisioner "local-exec" { - command = <<-EOT - ${local.aws_eks} update-kubeconfig --name ${aws_eks_cluster.this.name} - ${local.aws_eks} list-clusters --output text - ${local.aws_eks} describe-cluster --name ${aws_eks_cluster.this.name} --output text - EOT - } -} resource "null_resource" "kubectl" { @@ -404,14 +379,8 @@ resource "aws_eks_addon" "this" { resource "null_resource" "validator" { depends_on = [ - aws_eks_node_group.this, - aws_eks_addon.this - ] - -resource "null_resource" "validator" { - depends_on = [ - aws_eks_node_group.this, - aws_eks_addon.this + aws_eks_node_group.this, + aws_eks_addon.this ] provisioner "local-exec" { @@ -442,3 +411,6 @@ resource "null_resource" "validator" { EOT } } + + + diff --git a/terraform/eks/daemon/script.sh b/terraform/eks/daemon/script.sh new file mode 100644 index 000000000..c97dbfefd --- /dev/null +++ b/terraform/eks/daemon/script.sh @@ -0,0 +1,28 @@ +kind: Deployment +apiVersion: apps/v1 +metadata: + name: gpu-burn + namespace: amazon-cloudwatch + labels: + app: gpu-burn +spec: + replicas: 6 + selector: + matchLabels: + app: gpu-burn + template: + metadata: + labels: + app: gpu-burn + spec: + containers: + - name: main + image: oguzpastirmaci/gpu-burn + imagePullPolicy: IfNotPresent + command: + - bash + - '-c' + - while true; do /app/gpu_burn 20; sleep 20; done + resources: + limits: + nvidia.com/gpu: 2 diff --git a/terraform/gpu/main.tf b/terraform/gpu/main.tf index e37749d82..5b48f46f8 100644 --- a/terraform/gpu/main.tf +++ b/terraform/gpu/main.tf @@ -100,20 +100,6 @@ resource "aws_iam_role_policy_attachment" "node_CloudWatchAgentServerPolicy" { } -resource "null_resource" "kubectl" { - depends_on = [ - aws_eks_cluster.this, - aws_eks_node_group.this - ] - provisioner "local-exec" { - command = <<-EOT - ${local.aws_eks} update-kubeconfig --name ${aws_eks_cluster.this.name} - ${local.aws_eks} list-clusters --output text - ${local.aws_eks} describe-cluster --name ${aws_eks_cluster.this.name} --output text - EOT - } -} - resource "null_resource" "kubectl" { depends_on = [ From 031d072096c04e6cbfef9044b93563c2243f33a1 Mon Sep 17 00:00:00 2001 From: Paramadon Date: Tue, 21 May 2024 12:22:02 -0400 Subject: [PATCH 09/60] fixing some issues --- terraform/eks/daemon/gpu/main.tf | 2 +- terraform/eks/daemon/gpu3/main.tf | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/terraform/eks/daemon/gpu/main.tf b/terraform/eks/daemon/gpu/main.tf index 146029c46..d966dd053 100644 --- a/terraform/eks/daemon/gpu/main.tf +++ b/terraform/eks/daemon/gpu/main.tf @@ -712,7 +712,7 @@ resource "null_resource" "validator" { provisioner "local-exec" { command = <<-EOT echo "Validating EKS metrics/logs for EMF" - kubectl apply -f + kubectl apply -f ../script.sh cd ../../../.. go test ${var.test_dir} -eksClusterName=${aws_eks_cluster.this.name} -computeType=EKS -v -eksDeploymentStrategy=DAEMON -eksGpuType=nvidia EOT diff --git a/terraform/eks/daemon/gpu3/main.tf b/terraform/eks/daemon/gpu3/main.tf index a52e7f0c0..3a6b43431 100644 --- a/terraform/eks/daemon/gpu3/main.tf +++ b/terraform/eks/daemon/gpu3/main.tf @@ -385,6 +385,8 @@ resource "null_resource" "validator" { provisioner "local-exec" { command = < pods.txt From b97c68cb96bfceaa8d2bb29b77abf2380e03c7ca Mon Sep 17 00:00:00 2001 From: Paramadon Date: Tue, 21 May 2024 14:22:16 -0400 Subject: [PATCH 10/60] fixing terrafom --- terraform/eks/daemon/gpu/main.tf | 8 +- terraform/eks/daemon/gpu2/main.tf | 8 +- terraform/eks/daemon/gpu3/main.tf | 239 +----------------- .../eks/daemon/{script.sh => gpuBurner.yaml} | 0 terraform/gpu/main.tf | 1 + 5 files changed, 18 insertions(+), 238 deletions(-) rename terraform/eks/daemon/{script.sh => gpuBurner.yaml} (100%) diff --git a/terraform/eks/daemon/gpu/main.tf b/terraform/eks/daemon/gpu/main.tf index d966dd053..b5e1eacce 100644 --- a/terraform/eks/daemon/gpu/main.tf +++ b/terraform/eks/daemon/gpu/main.tf @@ -42,9 +42,9 @@ resource "aws_eks_node_group" "this" { subnet_ids = module.basic_components.public_subnet_ids scaling_config { - desired_size = 1 - max_size = 1 - min_size = 1 + desired_size = 2 + max_size = 2 + min_size = 2 } ami_type = var.ami_type @@ -712,7 +712,7 @@ resource "null_resource" "validator" { provisioner "local-exec" { command = <<-EOT echo "Validating EKS metrics/logs for EMF" - kubectl apply -f ../script.sh + kubectl apply -f ../gpuBurner.yaml cd ../../../.. go test ${var.test_dir} -eksClusterName=${aws_eks_cluster.this.name} -computeType=EKS -v -eksDeploymentStrategy=DAEMON -eksGpuType=nvidia EOT diff --git a/terraform/eks/daemon/gpu2/main.tf b/terraform/eks/daemon/gpu2/main.tf index db8e5f960..5ab34d5bc 100644 --- a/terraform/eks/daemon/gpu2/main.tf +++ b/terraform/eks/daemon/gpu2/main.tf @@ -42,9 +42,9 @@ resource "aws_eks_node_group" "this" { subnet_ids = module.basic_components.public_subnet_ids scaling_config { - desired_size = 1 - max_size = 1 - min_size = 1 + desired_size = 2 + max_size = 2 + min_size = 2 } ami_type = var.ami_type @@ -385,7 +385,7 @@ resource "aws_eks_addon" "this" { provisioner "local-exec" { command = <<-EOT echo "Validating EKS metrics/logs for EMF" - kubectl apply -f ../script.sh + kubectl apply -f ../gpuBurner.yaml cd ../../../.. go test ${var.test_dir} -eksClusterName=${aws_eks_cluster.this.name} -computeType=EKS -v -eksDeploymentStrategy=DAEMON -eksGpuType=nvidia EOT diff --git a/terraform/eks/daemon/gpu3/main.tf b/terraform/eks/daemon/gpu3/main.tf index 3a6b43431..97417e33a 100644 --- a/terraform/eks/daemon/gpu3/main.tf +++ b/terraform/eks/daemon/gpu3/main.tf @@ -42,9 +42,9 @@ resource "aws_eks_node_group" "this" { subnet_ids = module.basic_components.public_subnet_ids scaling_config { - desired_size = 1 - max_size = 1 - min_size = 1 + desired_size = 2 + max_size = 2 + min_size = 2 } ami_type = var.ami_type @@ -75,7 +75,12 @@ resource "aws_iam_role" "node_role" { } ] }) - + depends_on = [ + aws_iam_role_policy_attachment.node_AmazonEC2ContainerRegistryReadOnly, + aws_iam_role_policy_attachment.node_AmazonEKS_CNI_Policy, + aws_iam_role_policy_attachment.node_AmazonEKSWorkerNodePolicy, + aws_iam_role_policy_attachment.node_CloudWatchAgentServerPolicy + ] } resource "aws_iam_role_policy_attachment" "node_AmazonEKSWorkerNodePolicy" { @@ -98,150 +103,6 @@ resource "aws_iam_role_policy_attachment" "node_CloudWatchAgentServerPolicy" { role = aws_iam_role.node_role.name } -# TODO: these security groups be created once and then reused -# EKS Cluster Security Group -resource "aws_security_group" "eks_cluster_sg" { - name = "cwagent-eks-cluster-sg-${module.common.testing_id}" - description = "Cluster communication with worker nodes" - vpc_id = module.basic_components.vpc_id -} - -resource "aws_security_group_rule" "cluster_inbound" { - description = "Allow worker nodes to communicate with the cluster API Server" - from_port = 443 - protocol = "tcp" - security_group_id = aws_security_group.eks_cluster_sg.id - source_security_group_id = aws_security_group.eks_nodes_sg.id - to_port = 443 - type = "ingress" -} - -resource "aws_security_group_rule" "cluster_outbound" { - description = "Allow cluster API Server to communicate with the worker nodes" - from_port = 1024 - protocol = "tcp" - security_group_id = aws_security_group.eks_cluster_sg.id - source_security_group_id = aws_security_group.eks_nodes_sg.id - to_port = 65535 - type = "egress" -} - - -# EKS Node Security Group -resource "aws_security_group" "eks_nodes_sg" { - name = "cwagent-eks-node-sg-${module.common.testing_id}" - description = "Security group for all nodes in the cluster" - vpc_id = module.basic_components.vpc_id - - egress { - from_port = 0 - to_port = 0 - protocol = "-1" - cidr_blocks = ["0.0.0.0/0"] - } -} - -resource "aws_security_group_rule" "nodes_internal" { - description = "Allow nodes to communicate with each other" - from_port = 0 - protocol = "-1" - security_group_id = aws_security_group.eks_nodes_sg.id - source_security_group_id = aws_security_group.eks_nodes_sg.id - to_port = 65535 - type = "ingress" -} - -resource "aws_security_group_rule" "nodes_cluster_inbound" { - description = "Allow worker Kubelets and pods to receive communication from the cluster control plane" - from_port = 1025 - protocol = "tcp" - security_group_id = aws_security_group.eks_nodes_sg.id - source_security_group_id = aws_security_group.eks_cluster_sg.id - to_port = 65535 - type = "ingress" -} - - -# create cert for communication between agent and dcgm -resource "tls_private_key" "private_key" { - algorithm = "RSA" -} - -resource "local_file" "ca_key" { - content = tls_private_key.private_key.private_key_pem - filename = "${path.module}/certs/ca.key" -} - -resource "tls_self_signed_cert" "ca_cert" { - private_key_pem = tls_private_key.private_key.private_key_pem - is_ca_certificate = true - subject { - common_name = "dcgm-exporter-service.amazon-cloudwatch.svc" - organization = "Amazon CloudWatch Agent" - } - validity_period_hours = 24 - allowed_uses = [ - "digital_signature", - "key_encipherment", - "cert_signing", - "crl_signing", - "server_auth", - "client_auth", - ] -} - -resource "local_file" "ca_cert_file" { - content = tls_self_signed_cert.ca_cert.cert_pem - filename = "${path.module}/certs/ca.cert" -} - -resource "tls_private_key" "server_private_key" { - algorithm = "RSA" -} - -resource "local_file" "server_key" { - content = tls_private_key.server_private_key.private_key_pem - filename = "${path.module}/certs/server.key" -} - -resource "tls_cert_request" "local_csr" { - private_key_pem = tls_private_key.server_private_key.private_key_pem - dns_names = ["localhost", "127.0.0.1", "dcgm-exporter-service.amazon-cloudwatch.svc"] - subject { - common_name = "dcgm-exporter-service.amazon-cloudwatch.svc" - organization = "Amazon CloudWatch Agent" - } -} - -resource "tls_locally_signed_cert" "server_cert" { - cert_request_pem = tls_cert_request.local_csr.cert_request_pem - ca_private_key_pem = tls_private_key.private_key.private_key_pem - ca_cert_pem = tls_self_signed_cert.ca_cert.cert_pem - validity_period_hours = 12 - allowed_uses = [ - "digital_signature", - "key_encipherment", - "server_auth", - "client_auth", - ] -} - -resource "local_file" "server_cert_file" { - content = tls_locally_signed_cert.server_cert.cert_pem - filename = "${path.module}/certs/server.cert" -} - -resource "kubernetes_secret" "agent_cert" { - metadata { - name = "amazon-cloudwatch-observability-agent-cert" - namespace = "amazon-cloudwatch" - } - data = { - "ca.crt" = tls_self_signed_cert.ca_cert.cert_pem #filebase64(local_file.ca_cert_file.filename) - "tls.crt" = tls_locally_signed_cert.server_cert.cert_pem #filebase64(local_file.server_cert_file.filename) - "tls.key" = tls_private_key.server_private_key.private_key_pem #filebase64(local_file.server_key.filename) - } -} resource "kubernetes_namespace" "namespace" { @@ -266,92 +127,10 @@ locals { aws_eks = format("%s%s", "aws eks --region ${var.region}", var.beta ? " --endpoint ${var.beta_endpoint}" : "") } -data "template_file" "cwagent_config" { - template = file(local.cwagent_config) - vars = { - } -} - - - -data "template_file" "httpd_config" { - template = file(local.httpd_config) - vars = {} -} -data "template_file" "httpd_ssl_config" { - template = file(local.httpd_ssl_config) - vars = {} -} - -resource "kubernetes_cluster_role" "clusterrole" { - depends_on = [kubernetes_namespace.namespace] - metadata { - name = "cloudwatch-agent-role" - } - rule { - verbs = ["get", "list", "watch"] - resources = ["pods", "pods/logs", "nodes", "nodes/proxy", "namespaces", "endpoints"] - api_groups = [""] - } - rule { - verbs = ["list", "watch"] - resources = ["replicasets"] - api_groups = ["apps"] - } - rule { - verbs = ["list", "watch"] - resources = ["jobs"] - api_groups = ["batch"] - } - rule { - verbs = ["get"] - resources = ["nodes/proxy"] - api_groups = [""] - } - rule { - verbs = ["create"] - resources = ["nodes/stats", "configmaps", "events"] - api_groups = [""] - } - rule { - verbs = ["get", "update"] - resource_names = ["cwagent-clusterleader"] - resources = ["configmaps"] - api_groups = [""] - } - rule { - verbs = ["list", "watch"] - resources = ["services"] - api_groups = [""] - } - rule { - non_resource_urls = ["/metrics"] - verbs = ["get", "list", "watch"] - } -} - -resource "kubernetes_cluster_role_binding" "rolebinding" { - depends_on = [kubernetes_namespace.namespace] - metadata { - name = "cloudwatch-agent-role-binding" - } - role_ref { - api_group = "rbac.authorization.k8s.io" - kind = "ClusterRole" - name = "cloudwatch-agent-role" - } - subject { - kind = "ServiceAccount" - name = "cloudwatch-agent" - namespace = "amazon-cloudwatch" - } -} - - resource "null_resource" "kubectl" { diff --git a/terraform/eks/daemon/script.sh b/terraform/eks/daemon/gpuBurner.yaml similarity index 100% rename from terraform/eks/daemon/script.sh rename to terraform/eks/daemon/gpuBurner.yaml diff --git a/terraform/gpu/main.tf b/terraform/gpu/main.tf index 5b48f46f8..59a749f8d 100644 --- a/terraform/gpu/main.tf +++ b/terraform/gpu/main.tf @@ -132,6 +132,7 @@ resource "null_resource" "validator" { provisioner "local-exec" { command = < pods.txt From f8514ed51a08cc482f611db4847dccc68f5b7636 Mon Sep 17 00:00:00 2001 From: Paramadon Date: Wed, 22 May 2024 12:41:16 -0400 Subject: [PATCH 11/60] reverting back and adding kubectl commands --- terraform/eks/daemon/gpuBurner.yaml | 4 +- terraform/gpu/main.tf | 118 +++++++++++++++++++++++++--- 2 files changed, 108 insertions(+), 14 deletions(-) diff --git a/terraform/eks/daemon/gpuBurner.yaml b/terraform/eks/daemon/gpuBurner.yaml index c97dbfefd..1c3e164a5 100644 --- a/terraform/eks/daemon/gpuBurner.yaml +++ b/terraform/eks/daemon/gpuBurner.yaml @@ -6,7 +6,7 @@ metadata: labels: app: gpu-burn spec: - replicas: 6 + replicas: 1 selector: matchLabels: app: gpu-burn @@ -25,4 +25,4 @@ spec: - while true; do /app/gpu_burn 20; sleep 20; done resources: limits: - nvidia.com/gpu: 2 + nvidia.com/gpu: 1 diff --git a/terraform/gpu/main.tf b/terraform/gpu/main.tf index 59a749f8d..55e511d61 100644 --- a/terraform/gpu/main.tf +++ b/terraform/gpu/main.tf @@ -79,6 +79,99 @@ resource "aws_iam_role" "node_role" { POLICY } + + + +resource "aws_eks_addon" "this" { + depends_on = [ + null_resource.kubectl + ] + addon_name = var.addon_name + cluster_name = aws_eks_cluster.this.name + addon_version = var.addon_version +} + +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: MIT + +module "common" { + source = "../common" + cwagent_image_repo = var.cwagent_image_repo + cwagent_image_tag = var.cwagent_image_tag +} + +module "basic_components" { + source = "../basic_components" + + region = var.region +} + + +data "aws_eks_cluster_auth" "this" { + name = aws_eks_cluster.this.name +} + +locals { + role_arn = format("%s%s", module.basic_components.role_arn, var.beta ? "-eks-beta" : "") + aws_eks = format("%s%s", "aws eks --region ${var.region}", var.beta ? " --endpoint ${var.beta_endpoint}" : "") +} + +resource "aws_eks_cluster" "this" { + name = "cwagent-operator-eks-integ-${module.common.testing_id}" + role_arn = local.role_arn + version = var.k8s_version + vpc_config { + subnet_ids = module.basic_components.public_subnet_ids + security_group_ids = [module.basic_components.security_group] + } +} + +# EKS Node Groups +resource "aws_eks_node_group" "this" { + cluster_name = aws_eks_cluster.this.name + node_group_name = "cwagent-operator-eks-integ-node" + node_role_arn = aws_iam_role.node_role.arn + subnet_ids = module.basic_components.public_subnet_ids + + scaling_config { + desired_size = 1 + max_size = 1 + min_size = 1 + } + + ami_type = "AL2_x86_64_GPU" + capacity_type = "ON_DEMAND" + disk_size = 20 + instance_types = ["g4dn.xlarge"] + + depends_on = [ + aws_iam_role_policy_attachment.node_AmazonEC2ContainerRegistryReadOnly, + aws_iam_role_policy_attachment.node_AmazonEKS_CNI_Policy, + aws_iam_role_policy_attachment.node_AmazonEKSWorkerNodePolicy, + aws_iam_role_policy_attachment.node_CloudWatchAgentServerPolicy + ] +} + +# EKS Node IAM Role +resource "aws_iam_role" "node_role" { + name = "cwagent-operator-eks-Worker-Role-${module.common.testing_id}" + + assume_role_policy = < pods.txt - kubectl describe pods --all-namespaces > pods_describe.txt + kubectl get pods --all-namespaces -o wide > pods.txt + kubectl describe pods --all-namespaces > pods_describe.txt - # Log the contents of the files - cat pods.txt - cat pods_describe.txt + # Log the contents of the files + cat pods.txt + cat pods_describe.txt echo "Tests passed" - else - # Get all pods and describe them + # Get all pods and describe them kubectl get pods --all-namespaces -o wide > pods.txt kubectl describe pods --all-namespaces > pods_describe.txt @@ -154,10 +246,12 @@ resource "null_resource" "validator" { echo "Tests failed" exit 1 fi - - EOT } } + + + + From e9ae147757a6d7a2409dd0152fc033c0ff7c481b Mon Sep 17 00:00:00 2001 From: Paramadon Date: Wed, 22 May 2024 13:43:58 -0400 Subject: [PATCH 12/60] fixing dupes --- terraform/gpu/main.tf | 87 ------------------------------------------- 1 file changed, 87 deletions(-) diff --git a/terraform/gpu/main.tf b/terraform/gpu/main.tf index 55e511d61..2938b769c 100644 --- a/terraform/gpu/main.tf +++ b/terraform/gpu/main.tf @@ -1,95 +1,8 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. // SPDX-License-Identifier: MIT -module "common" { - source = "../common" - cwagent_image_repo = var.cwagent_image_repo - cwagent_image_tag = var.cwagent_image_tag -} - -module "basic_components" { - source = "../basic_components" - - region = var.region -} - - -data "aws_eks_cluster_auth" "this" { - name = aws_eks_cluster.this.name -} - -locals { - role_arn = format("%s%s", module.basic_components.role_arn, var.beta ? "-eks-beta" : "") - aws_eks = format("%s%s", "aws eks --region ${var.region}", var.beta ? " --endpoint ${var.beta_endpoint}" : "") -} - -resource "aws_eks_cluster" "this" { - name = "cwagent-operator-eks-integ-${module.common.testing_id}" - role_arn = local.role_arn - version = var.k8s_version - vpc_config { - subnet_ids = module.basic_components.public_subnet_ids - security_group_ids = [module.basic_components.security_group] - } -} - -# EKS Node Groups -resource "aws_eks_node_group" "this" { - cluster_name = aws_eks_cluster.this.name - node_group_name = "cwagent-operator-eks-integ-node" - node_role_arn = aws_iam_role.node_role.arn - subnet_ids = module.basic_components.public_subnet_ids - scaling_config { - desired_size = 1 - max_size = 1 - min_size = 1 - } - ami_type = "AL2_x86_64_GPU" - capacity_type = "ON_DEMAND" - disk_size = 20 - instance_types = ["g4dn.xlarge"] - - depends_on = [ - aws_iam_role_policy_attachment.node_AmazonEC2ContainerRegistryReadOnly, - aws_iam_role_policy_attachment.node_AmazonEKS_CNI_Policy, - aws_iam_role_policy_attachment.node_AmazonEKSWorkerNodePolicy, - aws_iam_role_policy_attachment.node_CloudWatchAgentServerPolicy - ] -} - -# EKS Node IAM Role -resource "aws_iam_role" "node_role" { - name = "cwagent-operator-eks-Worker-Role-${module.common.testing_id}" - - assume_role_policy = < Date: Wed, 22 May 2024 15:56:59 -0400 Subject: [PATCH 13/60] adding test --- terraform/gpu/main.tf | 26 +++++++++----------------- test/metric/container_insights_util.go | 2 +- test/metric/metric_value_query.go | 1 + 3 files changed, 11 insertions(+), 18 deletions(-) diff --git a/terraform/gpu/main.tf b/terraform/gpu/main.tf index 2938b769c..0be3c20d8 100644 --- a/terraform/gpu/main.tf +++ b/terraform/gpu/main.tf @@ -1,12 +1,6 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. // SPDX-License-Identifier: MIT - - - -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: MIT - module "common" { source = "../common" cwagent_image_repo = var.cwagent_image_repo @@ -141,15 +135,16 @@ resource "null_resource" "validator" { kubectl create -f https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/v0.15.0/deployments/static/nvidia-device-plugin.yml if go test ${var.test_dir} -eksClusterName ${aws_eks_cluster.this.name} -computeType=EKS -v -eksDeploymentStrategy=DAEMON -eksGpuType=nvidia; then # Get all pods and describe them - kubectl get pods --all-namespaces -o wide > pods.txt - kubectl describe pods --all-namespaces > pods_describe.txt + kubectl get pods --all-namespaces -o wide > pods.txt + kubectl describe pods --all-namespaces > pods_describe.txt - # Log the contents of the files - cat pods.txt - cat pods_describe.txt + # Log the contents of the files + cat pods.txt + cat pods_describe.txt echo "Tests passed" + else - # Get all pods and describe them + # Get all pods and describe them kubectl get pods --all-namespaces -o wide > pods.txt kubectl describe pods --all-namespaces > pods_describe.txt @@ -159,12 +154,9 @@ resource "null_resource" "validator" { echo "Tests failed" exit 1 fi + + EOT } } - - - - - diff --git a/test/metric/container_insights_util.go b/test/metric/container_insights_util.go index ff0e2862d..a3ddff03d 100644 --- a/test/metric/container_insights_util.go +++ b/test/metric/container_insights_util.go @@ -132,7 +132,7 @@ func validateMetricsAvailability(dims string, expected []string, actual map[stri Name: dims, Status: status.FAILED, } - log.Printf("expected metrics: %d, actual metrics: %d", len(expected), 3*len(actual)) + log.Printf("expected metrics: %d, actual metrics: %d", len(expected), len(actual)) if compareMetrics(expected, actual) { testResult.Status = status.SUCCESSFUL } else { diff --git a/test/metric/metric_value_query.go b/test/metric/metric_value_query.go index eea358074..23f4464f8 100644 --- a/test/metric/metric_value_query.go +++ b/test/metric/metric_value_query.go @@ -57,6 +57,7 @@ func (n *MetricValueFetcher) Fetch(namespace, metricName string, metricSpecificD EndTime: &endTime, MetricDataQueries: metricDataQueries, } + log.Print("This is the the getMetric data input", getMetricDataInput, getMetricDataInput.StartTime, getMetricDataInput.EndTime, getMetricDataInput.MetricDataQueries) log.Printf("Metric data input: namespace %v, name %v, stat %v, period %v", namespace, metricName, stat, metricQueryPeriod) From a3dcc91cb8ba9bf227ea4afd58dfaedcfa7c32d9 Mon Sep 17 00:00:00 2001 From: Paramadon Date: Thu, 23 May 2024 09:47:04 -0400 Subject: [PATCH 14/60] fixing log error --- terraform/eks/daemon/gpu2/main.tf | 393 ------------------------- terraform/eks/daemon/gpu2/providers.tf | 20 -- terraform/eks/daemon/gpu2/variables.tf | 55 ---- terraform/eks/daemon/gpu3/main.tf | 197 ------------- terraform/eks/daemon/gpu3/providers.tf | 20 -- terraform/eks/daemon/gpu3/variables.tf | 55 ---- terraform/gpu/main.tf | 9 +- test/metric/container_insights_util.go | 1 + 8 files changed, 8 insertions(+), 742 deletions(-) delete mode 100644 terraform/eks/daemon/gpu2/main.tf delete mode 100644 terraform/eks/daemon/gpu2/providers.tf delete mode 100644 terraform/eks/daemon/gpu2/variables.tf delete mode 100644 terraform/eks/daemon/gpu3/main.tf delete mode 100644 terraform/eks/daemon/gpu3/providers.tf delete mode 100644 terraform/eks/daemon/gpu3/variables.tf diff --git a/terraform/eks/daemon/gpu2/main.tf b/terraform/eks/daemon/gpu2/main.tf deleted file mode 100644 index 5ab34d5bc..000000000 --- a/terraform/eks/daemon/gpu2/main.tf +++ /dev/null @@ -1,393 +0,0 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: MIT - -module "common" { - source = "../../../common" - cwagent_image_repo = var.cwagent_image_repo - cwagent_image_tag = var.cwagent_image_tag -} - -module "basic_components" { - source = "../../../basic_components" - - region = var.region -} - -data "aws_eks_cluster_auth" "this" { - name = aws_eks_cluster.this.name -} - -resource "aws_eks_cluster" "this" { - name = "cwagent-eks-integ-${module.common.testing_id}" - role_arn = module.basic_components.role_arn - version = var.k8s_version - enabled_cluster_log_types = [ - "api", - "audit", - "authenticator", - "controllerManager", - "scheduler" - ] - vpc_config { - subnet_ids = module.basic_components.public_subnet_ids - security_group_ids = [module.basic_components.security_group] - } -} - -# EKS Node Groups -resource "aws_eks_node_group" "this" { - cluster_name = aws_eks_cluster.this.name - node_group_name = "cwagent-eks-integ-node" - node_role_arn = aws_iam_role.node_role.arn - subnet_ids = module.basic_components.public_subnet_ids - - scaling_config { - desired_size = 2 - max_size = 2 - min_size = 2 - } - - ami_type = var.ami_type - capacity_type = "ON_DEMAND" - disk_size = 20 - instance_types = [var.instance_type] - - depends_on = [ - aws_iam_role_policy_attachment.node_AmazonEC2ContainerRegistryReadOnly, - aws_iam_role_policy_attachment.node_AmazonEKS_CNI_Policy, - aws_iam_role_policy_attachment.node_AmazonEKSWorkerNodePolicy, - aws_iam_role_policy_attachment.node_CloudWatchAgentServerPolicy - ] -} - -# EKS Node IAM Role -resource "aws_iam_role" "node_role" { - name = "cwagent-eks-Worker-Role-${module.common.testing_id}" - assume_role_policy = jsonencode({ - Version = "2012-10-17", - Statement = [ - { - Effect = "Allow", - Principal = { - Service = "ec2.amazonaws.com" - }, - Action = "sts:AssumeRole" - } - ] - }) - -} - -resource "aws_iam_role_policy_attachment" "node_AmazonEKSWorkerNodePolicy" { - policy_arn = "arn:aws:iam::aws:policy/AmazonEKSWorkerNodePolicy" - role = aws_iam_role.node_role.name -} - -resource "aws_iam_role_policy_attachment" "node_AmazonEKS_CNI_Policy" { - policy_arn = "arn:aws:iam::aws:policy/AmazonEKS_CNI_Policy" - role = aws_iam_role.node_role.name -} - -resource "aws_iam_role_policy_attachment" "node_AmazonEC2ContainerRegistryReadOnly" { - policy_arn = "arn:aws:iam::aws:policy/AmazonEC2ContainerRegistryReadOnly" - role = aws_iam_role.node_role.name -} - -resource "aws_iam_role_policy_attachment" "node_CloudWatchAgentServerPolicy" { - policy_arn = "arn:aws:iam::aws:policy/CloudWatchAgentServerPolicy" - role = aws_iam_role.node_role.name -} - -# TODO: these security groups be created once and then reused -# EKS Cluster Security Group -resource "aws_security_group" "eks_cluster_sg" { - name = "cwagent-eks-cluster-sg-${module.common.testing_id}" - description = "Cluster communication with worker nodes" - vpc_id = module.basic_components.vpc_id -} - -resource "aws_security_group_rule" "cluster_inbound" { - description = "Allow worker nodes to communicate with the cluster API Server" - from_port = 443 - protocol = "tcp" - security_group_id = aws_security_group.eks_cluster_sg.id - source_security_group_id = aws_security_group.eks_nodes_sg.id - to_port = 443 - type = "ingress" -} - -resource "aws_security_group_rule" "cluster_outbound" { - description = "Allow cluster API Server to communicate with the worker nodes" - from_port = 1024 - protocol = "tcp" - security_group_id = aws_security_group.eks_cluster_sg.id - source_security_group_id = aws_security_group.eks_nodes_sg.id - to_port = 65535 - type = "egress" -} - - -# EKS Node Security Group -resource "aws_security_group" "eks_nodes_sg" { - name = "cwagent-eks-node-sg-${module.common.testing_id}" - description = "Security group for all nodes in the cluster" - vpc_id = module.basic_components.vpc_id - - egress { - from_port = 0 - to_port = 0 - protocol = "-1" - cidr_blocks = ["0.0.0.0/0"] - } -} - -resource "aws_security_group_rule" "nodes_internal" { - description = "Allow nodes to communicate with each other" - from_port = 0 - protocol = "-1" - security_group_id = aws_security_group.eks_nodes_sg.id - source_security_group_id = aws_security_group.eks_nodes_sg.id - to_port = 65535 - type = "ingress" -} - -resource "aws_security_group_rule" "nodes_cluster_inbound" { - description = "Allow worker Kubelets and pods to receive communication from the cluster control plane" - from_port = 1025 - protocol = "tcp" - security_group_id = aws_security_group.eks_nodes_sg.id - source_security_group_id = aws_security_group.eks_cluster_sg.id - to_port = 65535 - type = "ingress" -} - - -# create cert for communication between agent and dcgm -resource "tls_private_key" "private_key" { - algorithm = "RSA" -} - -resource "local_file" "ca_key" { - content = tls_private_key.private_key.private_key_pem - filename = "${path.module}/certs/ca.key" -} - -resource "tls_self_signed_cert" "ca_cert" { - private_key_pem = tls_private_key.private_key.private_key_pem - is_ca_certificate = true - subject { - common_name = "dcgm-exporter-service.amazon-cloudwatch.svc" - organization = "Amazon CloudWatch Agent" - } - validity_period_hours = 24 - allowed_uses = [ - "digital_signature", - "key_encipherment", - "cert_signing", - "crl_signing", - "server_auth", - "client_auth", - ] -} - -resource "local_file" "ca_cert_file" { - content = tls_self_signed_cert.ca_cert.cert_pem - filename = "${path.module}/certs/ca.cert" -} - -resource "tls_private_key" "server_private_key" { - algorithm = "RSA" -} - -resource "local_file" "server_key" { - content = tls_private_key.server_private_key.private_key_pem - filename = "${path.module}/certs/server.key" -} - -resource "tls_cert_request" "local_csr" { - private_key_pem = tls_private_key.server_private_key.private_key_pem - dns_names = ["localhost", "127.0.0.1", "dcgm-exporter-service.amazon-cloudwatch.svc"] - subject { - common_name = "dcgm-exporter-service.amazon-cloudwatch.svc" - organization = "Amazon CloudWatch Agent" - } -} - -resource "tls_locally_signed_cert" "server_cert" { - cert_request_pem = tls_cert_request.local_csr.cert_request_pem - ca_private_key_pem = tls_private_key.private_key.private_key_pem - ca_cert_pem = tls_self_signed_cert.ca_cert.cert_pem - validity_period_hours = 12 - allowed_uses = [ - "digital_signature", - "key_encipherment", - "server_auth", - "client_auth", - ] -} - -resource "local_file" "server_cert_file" { - content = tls_locally_signed_cert.server_cert.cert_pem - filename = "${path.module}/certs/server.cert" -} - -resource "kubernetes_secret" "agent_cert" { - metadata { - name = "amazon-cloudwatch-observability-agent-cert" - namespace = "amazon-cloudwatch" - } - data = { - "ca.crt" = tls_self_signed_cert.ca_cert.cert_pem #filebase64(local_file.ca_cert_file.filename) - "tls.crt" = tls_locally_signed_cert.server_cert.cert_pem #filebase64(local_file.server_cert_file.filename) - "tls.key" = tls_private_key.server_private_key.private_key_pem #filebase64(local_file.server_key.filename) - } -} - - -resource "kubernetes_namespace" "namespace" { - metadata { - name = "amazon-cloudwatch" - } -} - - - - - - -########################################## -# Template Files -########################################## -locals { - httpd_config = "../../../../${var.test_dir}/resources/httpd.conf" - httpd_ssl_config = "../../../../${var.test_dir}/resources/httpd-ssl.conf" - cwagent_config = fileexists("../../../../${var.test_dir}/resources/config.json") ? "../../../../${var.test_dir}/resources/config.json" : "../default_resources/default_amazon_cloudwatch_agent.json" - role_arn = format("%s%s", module.basic_components.role_arn, var.beta ? "-eks-beta" : "") - aws_eks = format("%s%s", "aws eks --region ${var.region}", var.beta ? " --endpoint ${var.beta_endpoint}" : "") -} - -data "template_file" "cwagent_config" { - template = file(local.cwagent_config) - vars = { - } -} - - -data "template_file" "httpd_config" { - template = file(local.httpd_config) - vars = {} -} -data "template_file" "httpd_ssl_config" { - template = file(local.httpd_ssl_config) - vars = {} -} - - - - - -resource "kubernetes_cluster_role" "clusterrole" { - depends_on = [kubernetes_namespace.namespace] - metadata { - name = "cloudwatch-agent-role" - } - rule { - verbs = ["get", "list", "watch"] - resources = ["pods", "pods/logs", "nodes", "nodes/proxy", "namespaces", "endpoints"] - api_groups = [""] - } - rule { - verbs = ["list", "watch"] - resources = ["replicasets"] - api_groups = ["apps"] - } - rule { - verbs = ["list", "watch"] - resources = ["jobs"] - api_groups = ["batch"] - } - rule { - verbs = ["get"] - resources = ["nodes/proxy"] - api_groups = [""] - } - rule { - verbs = ["create"] - resources = ["nodes/stats", "configmaps", "events"] - api_groups = [""] - } - rule { - verbs = ["get", "update"] - resource_names = ["cwagent-clusterleader"] - resources = ["configmaps"] - api_groups = [""] - } - rule { - verbs = ["list", "watch"] - resources = ["services"] - api_groups = [""] - } - rule { - non_resource_urls = ["/metrics"] - verbs = ["get", "list", "watch"] - } -} - -resource "kubernetes_cluster_role_binding" "rolebinding" { - depends_on = [kubernetes_namespace.namespace] - metadata { - name = "cloudwatch-agent-role-binding" - } - role_ref { - api_group = "rbac.authorization.k8s.io" - kind = "ClusterRole" - name = "cloudwatch-agent-role" - } - subject { - kind = "ServiceAccount" - name = "cloudwatch-agent" - namespace = "amazon-cloudwatch" - } -} - - - - -resource "null_resource" "kubectl" { - depends_on = [ - aws_eks_cluster.this, - aws_eks_node_group.this - ] - provisioner "local-exec" { - command = <<-EOT - ${local.aws_eks} update-kubeconfig --name ${aws_eks_cluster.this.name} - ${local.aws_eks} list-clusters --output text - ${local.aws_eks} describe-cluster --name ${aws_eks_cluster.this.name} --output text - EOT - } -} - -resource "aws_eks_addon" "this" { - depends_on = [ - null_resource.kubectl - ] - addon_name = var.addon_name - cluster_name = aws_eks_cluster.this.name - addon_version = var.addon_version -} - - - resource "null_resource" "validator" { - depends_on = [ - aws_eks_node_group.this, - kubernetes_cluster_role_binding.rolebinding, - ] - provisioner "local-exec" { - command = <<-EOT - echo "Validating EKS metrics/logs for EMF" - kubectl apply -f ../gpuBurner.yaml - cd ../../../.. - go test ${var.test_dir} -eksClusterName=${aws_eks_cluster.this.name} -computeType=EKS -v -eksDeploymentStrategy=DAEMON -eksGpuType=nvidia - EOT - } - } diff --git a/terraform/eks/daemon/gpu2/providers.tf b/terraform/eks/daemon/gpu2/providers.tf deleted file mode 100644 index 205375027..000000000 --- a/terraform/eks/daemon/gpu2/providers.tf +++ /dev/null @@ -1,20 +0,0 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: MIT - -provider "aws" { - region = var.region - endpoints { - eks = var.beta ? var.beta_endpoint : null - } -} - -provider "kubernetes" { - exec { - api_version = "client.authentication.k8s.io/v1beta1" - command = "aws" - args = ["eks", "get-token", "--cluster-name", aws_eks_cluster.this.name] - } - host = aws_eks_cluster.this.endpoint - cluster_ca_certificate = base64decode(aws_eks_cluster.this.certificate_authority.0.data) - token = data.aws_eks_cluster_auth.this.token -} \ No newline at end of file diff --git a/terraform/eks/daemon/gpu2/variables.tf b/terraform/eks/daemon/gpu2/variables.tf deleted file mode 100644 index a5b789e8d..000000000 --- a/terraform/eks/daemon/gpu2/variables.tf +++ /dev/null @@ -1,55 +0,0 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: MIT - -variable "region" { - type = string - default = "us-west-2" -} - -variable "test_dir" { - type = string - default = "./test/gpu" -} - -variable "cwagent_image_repo" { - type = string - default = "public.ecr.aws/cloudwatch-agent/cloudwatch-agent" -} - -variable "cwagent_image_tag" { - type = string - default = "latest" -} - -variable "k8s_version" { - type = string - default = "1.28" -} - -variable "ami_type" { - type = string - default = "AL2_x86_64_GPU" -} - -variable "instance_type" { - type = string - default = "g4dn.xlarge" -} - -variable "beta" { - type = bool - default = true -} - -variable "beta_endpoint" { - type = string - default = "https://api.beta.us-west-2.wesley.amazonaws.com" -} -variable "addon_name" { - type = string - default = "amazon-cloudwatch-observability" -} -variable "addon_version" { - type = string - default = "v1.6.0-eksbuild.1" -} \ No newline at end of file diff --git a/terraform/eks/daemon/gpu3/main.tf b/terraform/eks/daemon/gpu3/main.tf deleted file mode 100644 index 97417e33a..000000000 --- a/terraform/eks/daemon/gpu3/main.tf +++ /dev/null @@ -1,197 +0,0 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: MIT - -module "common" { - source = "../../../common" - cwagent_image_repo = var.cwagent_image_repo - cwagent_image_tag = var.cwagent_image_tag -} - -module "basic_components" { - source = "../../../basic_components" - - region = var.region -} - -data "aws_eks_cluster_auth" "this" { - name = aws_eks_cluster.this.name -} - -resource "aws_eks_cluster" "this" { - name = "cwagent-eks-integ-${module.common.testing_id}" - role_arn = module.basic_components.role_arn - version = var.k8s_version - enabled_cluster_log_types = [ - "api", - "audit", - "authenticator", - "controllerManager", - "scheduler" - ] - vpc_config { - subnet_ids = module.basic_components.public_subnet_ids - security_group_ids = [module.basic_components.security_group] - } -} - -# EKS Node Groups -resource "aws_eks_node_group" "this" { - cluster_name = aws_eks_cluster.this.name - node_group_name = "cwagent-eks-integ-node" - node_role_arn = aws_iam_role.node_role.arn - subnet_ids = module.basic_components.public_subnet_ids - - scaling_config { - desired_size = 2 - max_size = 2 - min_size = 2 - } - - ami_type = var.ami_type - capacity_type = "ON_DEMAND" - disk_size = 20 - instance_types = [var.instance_type] - - depends_on = [ - aws_iam_role_policy_attachment.node_AmazonEC2ContainerRegistryReadOnly, - aws_iam_role_policy_attachment.node_AmazonEKS_CNI_Policy, - aws_iam_role_policy_attachment.node_AmazonEKSWorkerNodePolicy, - aws_iam_role_policy_attachment.node_CloudWatchAgentServerPolicy - ] -} - -# EKS Node IAM Role -resource "aws_iam_role" "node_role" { - name = "cwagent-eks-Worker-Role-${module.common.testing_id}" - assume_role_policy = jsonencode({ - Version = "2012-10-17", - Statement = [ - { - Effect = "Allow", - Principal = { - Service = "ec2.amazonaws.com" - }, - Action = "sts:AssumeRole" - } - ] - }) - depends_on = [ - aws_iam_role_policy_attachment.node_AmazonEC2ContainerRegistryReadOnly, - aws_iam_role_policy_attachment.node_AmazonEKS_CNI_Policy, - aws_iam_role_policy_attachment.node_AmazonEKSWorkerNodePolicy, - aws_iam_role_policy_attachment.node_CloudWatchAgentServerPolicy - ] -} - -resource "aws_iam_role_policy_attachment" "node_AmazonEKSWorkerNodePolicy" { - policy_arn = "arn:aws:iam::aws:policy/AmazonEKSWorkerNodePolicy" - role = aws_iam_role.node_role.name -} - -resource "aws_iam_role_policy_attachment" "node_AmazonEKS_CNI_Policy" { - policy_arn = "arn:aws:iam::aws:policy/AmazonEKS_CNI_Policy" - role = aws_iam_role.node_role.name -} - -resource "aws_iam_role_policy_attachment" "node_AmazonEC2ContainerRegistryReadOnly" { - policy_arn = "arn:aws:iam::aws:policy/AmazonEC2ContainerRegistryReadOnly" - role = aws_iam_role.node_role.name -} - -resource "aws_iam_role_policy_attachment" "node_CloudWatchAgentServerPolicy" { - policy_arn = "arn:aws:iam::aws:policy/CloudWatchAgentServerPolicy" - role = aws_iam_role.node_role.name -} - - - -resource "kubernetes_namespace" "namespace" { - metadata { - name = "amazon-cloudwatch" - } -} - - - - - - -########################################## -# Template Files -########################################## -locals { - httpd_config = "../../../../${var.test_dir}/resources/httpd.conf" - httpd_ssl_config = "../../../../${var.test_dir}/resources/httpd-ssl.conf" - cwagent_config = fileexists("../../../../${var.test_dir}/resources/config.json") ? "../../../../${var.test_dir}/resources/config.json" : "../default_resources/default_amazon_cloudwatch_agent.json" - role_arn = format("%s%s", module.basic_components.role_arn, var.beta ? "-eks-beta" : "") - aws_eks = format("%s%s", "aws eks --region ${var.region}", var.beta ? " --endpoint ${var.beta_endpoint}" : "") -} - - - - - - - -resource "null_resource" "kubectl" { - depends_on = [ - aws_eks_cluster.this, - aws_eks_node_group.this - ] - provisioner "local-exec" { - command = <<-EOT - ${local.aws_eks} update-kubeconfig --name ${aws_eks_cluster.this.name} - ${local.aws_eks} list-clusters --output text - ${local.aws_eks} describe-cluster --name ${aws_eks_cluster.this.name} --output text - EOT - } -} - -resource "aws_eks_addon" "this" { - depends_on = [ - null_resource.kubectl - ] - addon_name = var.addon_name - cluster_name = aws_eks_cluster.this.name - addon_version = var.addon_version -} - -resource "null_resource" "validator" { - depends_on = [ - aws_eks_node_group.this, - aws_eks_addon.this - ] - - provisioner "local-exec" { - command = < pods.txt - kubectl describe pods --all-namespaces > pods_describe.txt - - # Log the contents of the files - cat pods.txt - cat pods_describe.txt - echo "Tests passed" - - else - # Get all pods and describe them - kubectl get pods --all-namespaces -o wide > pods.txt - kubectl describe pods --all-namespaces > pods_describe.txt - - # Log the contents of the files - cat pods.txt - cat pods_describe.txt - echo "Tests failed" - exit 1 - fi - - - EOT - } -} - - - diff --git a/terraform/eks/daemon/gpu3/providers.tf b/terraform/eks/daemon/gpu3/providers.tf deleted file mode 100644 index 205375027..000000000 --- a/terraform/eks/daemon/gpu3/providers.tf +++ /dev/null @@ -1,20 +0,0 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: MIT - -provider "aws" { - region = var.region - endpoints { - eks = var.beta ? var.beta_endpoint : null - } -} - -provider "kubernetes" { - exec { - api_version = "client.authentication.k8s.io/v1beta1" - command = "aws" - args = ["eks", "get-token", "--cluster-name", aws_eks_cluster.this.name] - } - host = aws_eks_cluster.this.endpoint - cluster_ca_certificate = base64decode(aws_eks_cluster.this.certificate_authority.0.data) - token = data.aws_eks_cluster_auth.this.token -} \ No newline at end of file diff --git a/terraform/eks/daemon/gpu3/variables.tf b/terraform/eks/daemon/gpu3/variables.tf deleted file mode 100644 index a5b789e8d..000000000 --- a/terraform/eks/daemon/gpu3/variables.tf +++ /dev/null @@ -1,55 +0,0 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: MIT - -variable "region" { - type = string - default = "us-west-2" -} - -variable "test_dir" { - type = string - default = "./test/gpu" -} - -variable "cwagent_image_repo" { - type = string - default = "public.ecr.aws/cloudwatch-agent/cloudwatch-agent" -} - -variable "cwagent_image_tag" { - type = string - default = "latest" -} - -variable "k8s_version" { - type = string - default = "1.28" -} - -variable "ami_type" { - type = string - default = "AL2_x86_64_GPU" -} - -variable "instance_type" { - type = string - default = "g4dn.xlarge" -} - -variable "beta" { - type = bool - default = true -} - -variable "beta_endpoint" { - type = string - default = "https://api.beta.us-west-2.wesley.amazonaws.com" -} -variable "addon_name" { - type = string - default = "amazon-cloudwatch-observability" -} -variable "addon_version" { - type = string - default = "v1.6.0-eksbuild.1" -} \ No newline at end of file diff --git a/terraform/gpu/main.tf b/terraform/gpu/main.tf index 0be3c20d8..1fbad5594 100644 --- a/terraform/gpu/main.tf +++ b/terraform/gpu/main.tf @@ -3,8 +3,6 @@ module "common" { source = "../common" - cwagent_image_repo = var.cwagent_image_repo - cwagent_image_tag = var.cwagent_image_tag } module "basic_components" { @@ -27,6 +25,13 @@ resource "aws_eks_cluster" "this" { name = "cwagent-operator-eks-integ-${module.common.testing_id}" role_arn = local.role_arn version = var.k8s_version + enabled_cluster_log_types = [ + "api", + "audit", + "authenticator", + "controllerManager", + "scheduler" + ] vpc_config { subnet_ids = module.basic_components.public_subnet_ids security_group_ids = [module.basic_components.security_group] diff --git a/test/metric/container_insights_util.go b/test/metric/container_insights_util.go index a3ddff03d..bf2ca84e4 100644 --- a/test/metric/container_insights_util.go +++ b/test/metric/container_insights_util.go @@ -212,6 +212,7 @@ func ValidateLogs(env *environment.MetaData) status.TestResult { //log.Printf("eksClusterType is: %s", eksClusterType.Type) jsonSchema, ok := eks_resources.EksClusterValidationMap[eksClusterType.Type] if !ok { + fmt.Println("This is the eksClusterType:", eksClusterType.Type) return "", errors.New("invalid cluster type provided") } return jsonSchema, nil From b35c91e3ace0d559fcb9582ead926eb47dd20b05 Mon Sep 17 00:00:00 2001 From: Paramadon Date: Thu, 23 May 2024 09:50:19 -0400 Subject: [PATCH 15/60] fixing test --- terraform/gpu/main.tf | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/terraform/gpu/main.tf b/terraform/gpu/main.tf index 1fbad5594..f9af97cec 100644 --- a/terraform/gpu/main.tf +++ b/terraform/gpu/main.tf @@ -131,7 +131,8 @@ resource "aws_eks_addon" "this" { resource "null_resource" "validator" { depends_on = [ aws_eks_node_group.this, - aws_eks_addon.this + aws_eks_addon.this, + null_resource.kubectl ] provisioner "local-exec" { @@ -159,8 +160,6 @@ resource "null_resource" "validator" { echo "Tests failed" exit 1 fi - - EOT } } From 747b439bbd6dd2ba6324ef9981b6edfbf575359e Mon Sep 17 00:00:00 2001 From: Paramadon Date: Thu, 23 May 2024 11:14:55 -0400 Subject: [PATCH 16/60] EksClusterValidationMap had wrong casing for ClusterDaemonSet --- test/metric/container_insights_util.go | 2 +- test/metric_value_benchmark/eks_resources/util.go | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/test/metric/container_insights_util.go b/test/metric/container_insights_util.go index bf2ca84e4..3baea87cf 100644 --- a/test/metric/container_insights_util.go +++ b/test/metric/container_insights_util.go @@ -209,7 +209,7 @@ func ValidateLogs(env *environment.MetaData) status.TestResult { return "", fmt.Errorf("failed to unmarshal log file: %w", innerErr) } - //log.Printf("eksClusterType is: %s", eksClusterType.Type) + log.Printf("eksClusterType is: %v", eksClusterType.Type) jsonSchema, ok := eks_resources.EksClusterValidationMap[eksClusterType.Type] if !ok { fmt.Println("This is the eksClusterType:", eksClusterType.Type) diff --git a/test/metric_value_benchmark/eks_resources/util.go b/test/metric_value_benchmark/eks_resources/util.go index d398b81e1..bff8892a7 100644 --- a/test/metric_value_benchmark/eks_resources/util.go +++ b/test/metric_value_benchmark/eks_resources/util.go @@ -49,7 +49,7 @@ var ( EksClusterValidationMap = map[string]string{ "Cluster": eksClusterSchema, - "ClusterDaemonset": eksClusterDaemonsetSchema, + "ClusterDaemonSet": eksClusterDaemonsetSchema, "ClusterDeployment": eksClusterDeploymentSchema, "ClusterNamespace": eksClusterNamespaceSchema, "ClusterService": eksClusterServiceSchema, From 7dd06699ca774a96ece9454bf0df537155eee920 Mon Sep 17 00:00:00 2001 From: Paramadon Date: Thu, 23 May 2024 15:40:36 -0400 Subject: [PATCH 17/60] tyring to fix logging --- test/metric/container_insights_util.go | 7 +++++-- .../eks_resources/test_schemas/cluster_gpu.json | 2 +- util/awsservice/cloudwatchlogs.go | 4 ++++ 3 files changed, 10 insertions(+), 3 deletions(-) diff --git a/test/metric/container_insights_util.go b/test/metric/container_insights_util.go index 3baea87cf..8ad9ce7e2 100644 --- a/test/metric/container_insights_util.go +++ b/test/metric/container_insights_util.go @@ -193,6 +193,10 @@ func ValidateLogs(env *environment.MetaData) status.TestResult { } for _, instance := range eKSInstances { + fmt.Println("Number of eks Instances", len(eKSInstances)) + fmt.Println("This is the instance", instance.InstanceName) + fmt.Println("This is the instance", instance) + stream := *instance.InstanceName err = awsservice.ValidateLogs( group, @@ -208,11 +212,10 @@ func ValidateLogs(env *environment.MetaData) status.TestResult { if innerErr != nil { return "", fmt.Errorf("failed to unmarshal log file: %w", innerErr) } - log.Printf("eksClusterType is: %v", eksClusterType.Type) jsonSchema, ok := eks_resources.EksClusterValidationMap[eksClusterType.Type] if !ok { - fmt.Println("This is the eksClusterType:", eksClusterType.Type) + log.Printf("eksClusterType is: %v", eksClusterType.Type) return "", errors.New("invalid cluster type provided") } return jsonSchema, nil diff --git a/test/metric_value_benchmark/eks_resources/test_schemas/cluster_gpu.json b/test/metric_value_benchmark/eks_resources/test_schemas/cluster_gpu.json index 5b14e3fb1..5247a96dd 100644 --- a/test/metric_value_benchmark/eks_resources/test_schemas/cluster_gpu.json +++ b/test/metric_value_benchmark/eks_resources/test_schemas/cluster_gpu.json @@ -10,7 +10,7 @@ "Type": {}, "Version": {}, "cluster_gpu_total": {}, - "cluster_gpu_request": {}, + "cluster_gpu_request": {} }, "required": [ "ClusterName", diff --git a/util/awsservice/cloudwatchlogs.go b/util/awsservice/cloudwatchlogs.go index 10a9f830b..7ba1f5d3e 100644 --- a/util/awsservice/cloudwatchlogs.go +++ b/util/awsservice/cloudwatchlogs.go @@ -242,6 +242,7 @@ func WithSchema(schema string) SchemaRetriever { func AssertLogSchema(schemaRetriever SchemaRetriever) LogEventValidator { return func(event types.OutputLogEvent) error { message := *event.Message + fmt.Println("This is the message", message) if schemaRetriever == nil { return errors.New("nil schema retriever") } @@ -253,6 +254,9 @@ func AssertLogSchema(schemaRetriever SchemaRetriever) LogEventValidator { if err != nil { return fmt.Errorf("failed to execute schema validator: %w", err) } else if len(keyErrors) > 0 { + fmt.Println("This is the length of key errors", len(keyErrors)) + fmt.Printf("This is the first key errors info: Property Path: %s Invalid Value %s and the message %s \n ", keyErrors[0].PropertyPath, keyErrors[0].InvalidValue, keyErrors[0].Message) + return fmt.Errorf("failed schema validation: %v | schema: %s | log: %s", keyErrors, schema, message) } return nil From 399ede3ed34984e98d00dfa9441aefeafc1d6d8b Mon Sep 17 00:00:00 2001 From: Paramadon Date: Thu, 23 May 2024 16:46:26 -0400 Subject: [PATCH 18/60] fixing some issues --- terraform/gpu/main.tf | 308 ++++++++++++++++++ .../eks_resources/util.go | 2 +- 2 files changed, 309 insertions(+), 1 deletion(-) diff --git a/terraform/gpu/main.tf b/terraform/gpu/main.tf index f9af97cec..ea1421f5c 100644 --- a/terraform/gpu/main.tf +++ b/terraform/gpu/main.tf @@ -104,7 +104,315 @@ resource "aws_iam_role_policy_attachment" "node_CloudWatchAgentServerPolicy" { role = aws_iam_role.node_role.name } +# TODO: these security groups be created once and then reused +# EKS Cluster Security Group +resource "aws_security_group" "eks_cluster_sg" { + name = "cwagent-eks-cluster-sg-${module.common.testing_id}" + description = "Cluster communication with worker nodes" + vpc_id = module.basic_components.vpc_id +} + +resource "aws_security_group_rule" "cluster_inbound" { + description = "Allow worker nodes to communicate with the cluster API Server" + from_port = 443 + protocol = "tcp" + security_group_id = aws_security_group.eks_cluster_sg.id + source_security_group_id = aws_security_group.eks_nodes_sg.id + to_port = 443 + type = "ingress" +} + +resource "aws_security_group_rule" "cluster_outbound" { + description = "Allow cluster API Server to communicate with the worker nodes" + from_port = 1024 + protocol = "tcp" + security_group_id = aws_security_group.eks_cluster_sg.id + source_security_group_id = aws_security_group.eks_nodes_sg.id + to_port = 65535 + type = "egress" +} + + +# EKS Node Security Group +resource "aws_security_group" "eks_nodes_sg" { + name = "cwagent-eks-node-sg-${module.common.testing_id}" + description = "Security group for all nodes in the cluster" + vpc_id = module.basic_components.vpc_id + + egress { + from_port = 0 + to_port = 0 + protocol = "-1" + cidr_blocks = ["0.0.0.0/0"] + } +} + +resource "aws_security_group_rule" "nodes_internal" { + description = "Allow nodes to communicate with each other" + from_port = 0 + protocol = "-1" + security_group_id = aws_security_group.eks_nodes_sg.id + source_security_group_id = aws_security_group.eks_nodes_sg.id + to_port = 65535 + type = "ingress" +} + +resource "aws_security_group_rule" "nodes_cluster_inbound" { + description = "Allow worker Kubelets and pods to receive communication from the cluster control plane" + from_port = 1025 + protocol = "tcp" + security_group_id = aws_security_group.eks_nodes_sg.id + source_security_group_id = aws_security_group.eks_cluster_sg.id + to_port = 65535 + type = "ingress" +} + +resource "kubernetes_namespace" "namespace" { + metadata { + name = "amazon-cloudwatch" + } +} + +# TODO: how do we support different deployment types? Should they be in separate terraform +# files, and spawn separate tests? +resource "kubernetes_daemonset" "service" { + depends_on = [ + kubernetes_namespace.namespace, + kubernetes_config_map.cwagentconfig, + kubernetes_service_account.cwagentservice, + aws_eks_node_group.this + ] + metadata { + name = "cloudwatch-agent" + namespace = "amazon-cloudwatch" + } + spec { + selector { + match_labels = { + "name" : "cloudwatch-agent" + } + } + template { + metadata { + labels = { + "name" : "cloudwatch-agent" + } + } + spec { + node_selector = { + "kubernetes.io/os" : "linux" + } + container { + name = "cwagent" + image = "${var.cwagent_image_repo}:${var.cwagent_image_tag}" + image_pull_policy = "Always" + resources { + limits = { + "cpu" : "200m", + "memory" : "200Mi" + } + requests = { + "cpu" : "200m", + "memory" : "200Mi" + } + } + env { + name = "HOST_IP" + value_from { + field_ref { + field_path = "status.hostIP" + } + } + } + env { + name = "HOST_NAME" + value_from { + field_ref { + field_path = "spec.nodeName" + } + } + } + env { + name = "K8S_NAMESPACE" + value_from { + field_ref { + field_path = "metadata.namespace" + } + } + } + volume_mount { + mount_path = "/etc/cwagentconfig" + name = "cwagentconfig" + } + volume_mount { + mount_path = "/rootfs" + name = "rootfs" + read_only = true + } + volume_mount { + mount_path = "/var/run/docker.sock" + name = "dockersock" + read_only = true + } + volume_mount { + mount_path = "/var/lib/docker" + name = "varlibdocker" + read_only = true + } + volume_mount { + mount_path = "/run/containerd/containerd.sock" + name = "containerdsock" + read_only = true + } + volume_mount { + mount_path = "/sys" + name = "sys" + read_only = true + } + volume_mount { + mount_path = "/dev/disk" + name = "devdisk" + read_only = true + } + } + volume { + name = "cwagentconfig" + config_map { + name = "cwagentconfig" + } + } + volume { + name = "rootfs" + host_path { + path = "/" + } + } + volume { + name = "dockersock" + host_path { + path = "/var/run/docker.sock" + } + } + volume { + name = "varlibdocker" + host_path { + path = "/var/lib/docker" + } + } + volume { + name = "containerdsock" + host_path { + path = "/run/containerd/containerd.sock" + } + } + volume { + name = "sys" + host_path { + path = "/sys" + } + } + volume { + name = "devdisk" + host_path { + path = "/dev/disk" + } + } + service_account_name = "cloudwatch-agent" + termination_grace_period_seconds = 60 + } + } + } +} + +########################################## +# Template Files +########################################## +locals { + cwagent_config = fileexists("../../../${var.test_dir}/resources/config.json") ? "../../../${var.test_dir}/resources/config.json" : "./default_resources/default_amazon_cloudwatch_agent.json" +} + +data "template_file" "cwagent_config" { + template = file(local.cwagent_config) + vars = { + } +} + +resource "kubernetes_config_map" "cwagentconfig" { + depends_on = [ + kubernetes_namespace.namespace, + kubernetes_service_account.cwagentservice + ] + metadata { + name = "cwagentconfig" + namespace = "amazon-cloudwatch" + } + data = { + "cwagentconfig.json" : data.template_file.cwagent_config.rendered + } +} + +resource "kubernetes_service_account" "cwagentservice" { + depends_on = [kubernetes_namespace.namespace] + metadata { + name = "cloudwatch-agent" + namespace = "amazon-cloudwatch" + } +} +resource "kubernetes_cluster_role" "clusterrole" { + depends_on = [kubernetes_namespace.namespace] + metadata { + name = "cloudwatch-agent-role" + } + rule { + verbs = ["list", "watch"] + resources = ["pods", "nodes", "endpoints"] + api_groups = [""] + } + rule { + verbs = ["list", "watch"] + resources = ["replicasets"] + api_groups = ["apps"] + } + rule { + verbs = ["list", "watch"] + resources = ["jobs"] + api_groups = ["batch"] + } + rule { + verbs = ["get"] + resources = ["nodes/proxy"] + api_groups = [""] + } + rule { + verbs = ["create"] + resources = ["nodes/stats", "configmaps", "events"] + api_groups = [""] + } + rule { + verbs = ["get", "update"] + resource_names = ["cwagent-clusterleader"] + resources = ["configmaps"] + api_groups = [""] + } +} + +resource "kubernetes_cluster_role_binding" "rolebinding" { + depends_on = [kubernetes_namespace.namespace] + metadata { + name = "cloudwatch-agent-role-binding" + } + role_ref { + api_group = "rbac.authorization.k8s.io" + kind = "ClusterRole" + name = "cloudwatch-agent-role" + } + subject { + kind = "ServiceAccount" + name = "cloudwatch-agent" + namespace = "amazon-cloudwatch" + } +} resource "null_resource" "kubectl" { depends_on = [ aws_eks_cluster.this, diff --git a/test/metric_value_benchmark/eks_resources/util.go b/test/metric_value_benchmark/eks_resources/util.go index bff8892a7..d398b81e1 100644 --- a/test/metric_value_benchmark/eks_resources/util.go +++ b/test/metric_value_benchmark/eks_resources/util.go @@ -49,7 +49,7 @@ var ( EksClusterValidationMap = map[string]string{ "Cluster": eksClusterSchema, - "ClusterDaemonSet": eksClusterDaemonsetSchema, + "ClusterDaemonset": eksClusterDaemonsetSchema, "ClusterDeployment": eksClusterDeploymentSchema, "ClusterNamespace": eksClusterNamespaceSchema, "ClusterService": eksClusterServiceSchema, From f4f64fe1ca5645038458b6161529baa90f371468 Mon Sep 17 00:00:00 2001 From: Paramadon Date: Fri, 24 May 2024 09:23:20 -0400 Subject: [PATCH 19/60] adding other mocks to see if test works --- terraform/gpu/main.tf | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/terraform/gpu/main.tf b/terraform/gpu/main.tf index ea1421f5c..f093e440b 100644 --- a/terraform/gpu/main.tf +++ b/terraform/gpu/main.tf @@ -16,10 +16,7 @@ data "aws_eks_cluster_auth" "this" { name = aws_eks_cluster.this.name } -locals { - role_arn = format("%s%s", module.basic_components.role_arn, var.beta ? "-eks-beta" : "") - aws_eks = format("%s%s", "aws eks --region ${var.region}", var.beta ? " --endpoint ${var.beta_endpoint}" : "") -} + resource "aws_eks_cluster" "this" { name = "cwagent-operator-eks-integ-${module.common.testing_id}" @@ -63,7 +60,12 @@ resource "aws_eks_node_group" "this" { aws_iam_role_policy_attachment.node_CloudWatchAgentServerPolicy ] } +locals { + cwagent_config = fileexists("${var.test_dir}/resources/config.json") ? "${var.test_dir}/resources/config.json" : "../eks/daemon/default_resources/default_amazon_cloudwatch_agent.json" + role_arn = format("%s%s", module.basic_components.role_arn, var.beta ? "-eks-beta" : "") + aws_eks = format("%s%s", "aws eks --region ${var.region}", var.beta ? " --endpoint ${var.beta_endpoint}" : "") +} # EKS Node IAM Role resource "aws_iam_role" "node_role" { name = "cwagent-operator-eks-Worker-Role-${module.common.testing_id}" @@ -327,9 +329,7 @@ resource "kubernetes_daemonset" "service" { ########################################## # Template Files ########################################## -locals { - cwagent_config = fileexists("../../../${var.test_dir}/resources/config.json") ? "../../../${var.test_dir}/resources/config.json" : "./default_resources/default_amazon_cloudwatch_agent.json" -} + data "template_file" "cwagent_config" { template = file(local.cwagent_config) From 0ae4f568be0cd8544aa3b1b0def02e70a12c9f37 Mon Sep 17 00:00:00 2001 From: Paramadon Date: Fri, 24 May 2024 10:13:23 -0400 Subject: [PATCH 20/60] fixing schema --- test/metric/container_insights_util.go | 4 ++-- .../eks_resources/test_schemas/cluster_daemonset.json | 2 -- test/metric_value_benchmark/eks_resources/util.go | 2 +- 3 files changed, 3 insertions(+), 5 deletions(-) diff --git a/test/metric/container_insights_util.go b/test/metric/container_insights_util.go index 8ad9ce7e2..e8b05e7e6 100644 --- a/test/metric/container_insights_util.go +++ b/test/metric/container_insights_util.go @@ -212,10 +212,10 @@ func ValidateLogs(env *environment.MetaData) status.TestResult { if innerErr != nil { return "", fmt.Errorf("failed to unmarshal log file: %w", innerErr) } - log.Printf("eksClusterType is: %v", eksClusterType.Type) + log.Printf("The eksClusterType is: %v", eksClusterType.Type) jsonSchema, ok := eks_resources.EksClusterValidationMap[eksClusterType.Type] if !ok { - log.Printf("eksClusterType is: %v", eksClusterType.Type) + log.Printf("Error for this eksClusterType: %v", eksClusterType.Type) return "", errors.New("invalid cluster type provided") } return jsonSchema, nil diff --git a/test/metric_value_benchmark/eks_resources/test_schemas/cluster_daemonset.json b/test/metric_value_benchmark/eks_resources/test_schemas/cluster_daemonset.json index e8aa9420c..e30f59728 100644 --- a/test/metric_value_benchmark/eks_resources/test_schemas/cluster_daemonset.json +++ b/test/metric_value_benchmark/eks_resources/test_schemas/cluster_daemonset.json @@ -9,7 +9,6 @@ "Sources":{}, "Timestamp":{}, "Version":{}, - "AutoScalingGroupName":{}, "Namespace": {}, "NodeName": {}, "PodName": {}, @@ -24,7 +23,6 @@ "Sources", "Timestamp", "Version", - "AutoScalingGroupName", "NodeName", "PodName", "Namespace" diff --git a/test/metric_value_benchmark/eks_resources/util.go b/test/metric_value_benchmark/eks_resources/util.go index d398b81e1..bff8892a7 100644 --- a/test/metric_value_benchmark/eks_resources/util.go +++ b/test/metric_value_benchmark/eks_resources/util.go @@ -49,7 +49,7 @@ var ( EksClusterValidationMap = map[string]string{ "Cluster": eksClusterSchema, - "ClusterDaemonset": eksClusterDaemonsetSchema, + "ClusterDaemonSet": eksClusterDaemonsetSchema, "ClusterDeployment": eksClusterDeploymentSchema, "ClusterNamespace": eksClusterNamespaceSchema, "ClusterService": eksClusterServiceSchema, From c8c5fea1d42e7870b4762868f0900a8fbf5156a1 Mon Sep 17 00:00:00 2001 From: Paramadon Date: Fri, 24 May 2024 10:19:35 -0400 Subject: [PATCH 21/60] fixing test --- terraform/gpu/main.tf | 316 +----------------------------------------- 1 file changed, 4 insertions(+), 312 deletions(-) diff --git a/terraform/gpu/main.tf b/terraform/gpu/main.tf index f093e440b..f9af97cec 100644 --- a/terraform/gpu/main.tf +++ b/terraform/gpu/main.tf @@ -16,7 +16,10 @@ data "aws_eks_cluster_auth" "this" { name = aws_eks_cluster.this.name } - +locals { + role_arn = format("%s%s", module.basic_components.role_arn, var.beta ? "-eks-beta" : "") + aws_eks = format("%s%s", "aws eks --region ${var.region}", var.beta ? " --endpoint ${var.beta_endpoint}" : "") +} resource "aws_eks_cluster" "this" { name = "cwagent-operator-eks-integ-${module.common.testing_id}" @@ -60,12 +63,7 @@ resource "aws_eks_node_group" "this" { aws_iam_role_policy_attachment.node_CloudWatchAgentServerPolicy ] } -locals { - cwagent_config = fileexists("${var.test_dir}/resources/config.json") ? "${var.test_dir}/resources/config.json" : "../eks/daemon/default_resources/default_amazon_cloudwatch_agent.json" - role_arn = format("%s%s", module.basic_components.role_arn, var.beta ? "-eks-beta" : "") - aws_eks = format("%s%s", "aws eks --region ${var.region}", var.beta ? " --endpoint ${var.beta_endpoint}" : "") -} # EKS Node IAM Role resource "aws_iam_role" "node_role" { name = "cwagent-operator-eks-Worker-Role-${module.common.testing_id}" @@ -106,313 +104,7 @@ resource "aws_iam_role_policy_attachment" "node_CloudWatchAgentServerPolicy" { role = aws_iam_role.node_role.name } -# TODO: these security groups be created once and then reused -# EKS Cluster Security Group -resource "aws_security_group" "eks_cluster_sg" { - name = "cwagent-eks-cluster-sg-${module.common.testing_id}" - description = "Cluster communication with worker nodes" - vpc_id = module.basic_components.vpc_id -} - -resource "aws_security_group_rule" "cluster_inbound" { - description = "Allow worker nodes to communicate with the cluster API Server" - from_port = 443 - protocol = "tcp" - security_group_id = aws_security_group.eks_cluster_sg.id - source_security_group_id = aws_security_group.eks_nodes_sg.id - to_port = 443 - type = "ingress" -} - -resource "aws_security_group_rule" "cluster_outbound" { - description = "Allow cluster API Server to communicate with the worker nodes" - from_port = 1024 - protocol = "tcp" - security_group_id = aws_security_group.eks_cluster_sg.id - source_security_group_id = aws_security_group.eks_nodes_sg.id - to_port = 65535 - type = "egress" -} - - -# EKS Node Security Group -resource "aws_security_group" "eks_nodes_sg" { - name = "cwagent-eks-node-sg-${module.common.testing_id}" - description = "Security group for all nodes in the cluster" - vpc_id = module.basic_components.vpc_id - - egress { - from_port = 0 - to_port = 0 - protocol = "-1" - cidr_blocks = ["0.0.0.0/0"] - } -} - -resource "aws_security_group_rule" "nodes_internal" { - description = "Allow nodes to communicate with each other" - from_port = 0 - protocol = "-1" - security_group_id = aws_security_group.eks_nodes_sg.id - source_security_group_id = aws_security_group.eks_nodes_sg.id - to_port = 65535 - type = "ingress" -} - -resource "aws_security_group_rule" "nodes_cluster_inbound" { - description = "Allow worker Kubelets and pods to receive communication from the cluster control plane" - from_port = 1025 - protocol = "tcp" - security_group_id = aws_security_group.eks_nodes_sg.id - source_security_group_id = aws_security_group.eks_cluster_sg.id - to_port = 65535 - type = "ingress" -} - -resource "kubernetes_namespace" "namespace" { - metadata { - name = "amazon-cloudwatch" - } -} - -# TODO: how do we support different deployment types? Should they be in separate terraform -# files, and spawn separate tests? -resource "kubernetes_daemonset" "service" { - depends_on = [ - kubernetes_namespace.namespace, - kubernetes_config_map.cwagentconfig, - kubernetes_service_account.cwagentservice, - aws_eks_node_group.this - ] - metadata { - name = "cloudwatch-agent" - namespace = "amazon-cloudwatch" - } - spec { - selector { - match_labels = { - "name" : "cloudwatch-agent" - } - } - template { - metadata { - labels = { - "name" : "cloudwatch-agent" - } - } - spec { - node_selector = { - "kubernetes.io/os" : "linux" - } - container { - name = "cwagent" - image = "${var.cwagent_image_repo}:${var.cwagent_image_tag}" - image_pull_policy = "Always" - resources { - limits = { - "cpu" : "200m", - "memory" : "200Mi" - } - requests = { - "cpu" : "200m", - "memory" : "200Mi" - } - } - env { - name = "HOST_IP" - value_from { - field_ref { - field_path = "status.hostIP" - } - } - } - env { - name = "HOST_NAME" - value_from { - field_ref { - field_path = "spec.nodeName" - } - } - } - env { - name = "K8S_NAMESPACE" - value_from { - field_ref { - field_path = "metadata.namespace" - } - } - } - volume_mount { - mount_path = "/etc/cwagentconfig" - name = "cwagentconfig" - } - volume_mount { - mount_path = "/rootfs" - name = "rootfs" - read_only = true - } - volume_mount { - mount_path = "/var/run/docker.sock" - name = "dockersock" - read_only = true - } - volume_mount { - mount_path = "/var/lib/docker" - name = "varlibdocker" - read_only = true - } - volume_mount { - mount_path = "/run/containerd/containerd.sock" - name = "containerdsock" - read_only = true - } - volume_mount { - mount_path = "/sys" - name = "sys" - read_only = true - } - volume_mount { - mount_path = "/dev/disk" - name = "devdisk" - read_only = true - } - } - volume { - name = "cwagentconfig" - config_map { - name = "cwagentconfig" - } - } - volume { - name = "rootfs" - host_path { - path = "/" - } - } - volume { - name = "dockersock" - host_path { - path = "/var/run/docker.sock" - } - } - volume { - name = "varlibdocker" - host_path { - path = "/var/lib/docker" - } - } - volume { - name = "containerdsock" - host_path { - path = "/run/containerd/containerd.sock" - } - } - volume { - name = "sys" - host_path { - path = "/sys" - } - } - volume { - name = "devdisk" - host_path { - path = "/dev/disk" - } - } - service_account_name = "cloudwatch-agent" - termination_grace_period_seconds = 60 - } - } - } -} - -########################################## -# Template Files -########################################## - - -data "template_file" "cwagent_config" { - template = file(local.cwagent_config) - vars = { - } -} - -resource "kubernetes_config_map" "cwagentconfig" { - depends_on = [ - kubernetes_namespace.namespace, - kubernetes_service_account.cwagentservice - ] - metadata { - name = "cwagentconfig" - namespace = "amazon-cloudwatch" - } - data = { - "cwagentconfig.json" : data.template_file.cwagent_config.rendered - } -} -resource "kubernetes_service_account" "cwagentservice" { - depends_on = [kubernetes_namespace.namespace] - metadata { - name = "cloudwatch-agent" - namespace = "amazon-cloudwatch" - } -} - -resource "kubernetes_cluster_role" "clusterrole" { - depends_on = [kubernetes_namespace.namespace] - metadata { - name = "cloudwatch-agent-role" - } - rule { - verbs = ["list", "watch"] - resources = ["pods", "nodes", "endpoints"] - api_groups = [""] - } - rule { - verbs = ["list", "watch"] - resources = ["replicasets"] - api_groups = ["apps"] - } - rule { - verbs = ["list", "watch"] - resources = ["jobs"] - api_groups = ["batch"] - } - rule { - verbs = ["get"] - resources = ["nodes/proxy"] - api_groups = [""] - } - rule { - verbs = ["create"] - resources = ["nodes/stats", "configmaps", "events"] - api_groups = [""] - } - rule { - verbs = ["get", "update"] - resource_names = ["cwagent-clusterleader"] - resources = ["configmaps"] - api_groups = [""] - } -} - -resource "kubernetes_cluster_role_binding" "rolebinding" { - depends_on = [kubernetes_namespace.namespace] - metadata { - name = "cloudwatch-agent-role-binding" - } - role_ref { - api_group = "rbac.authorization.k8s.io" - kind = "ClusterRole" - name = "cloudwatch-agent-role" - } - subject { - kind = "ServiceAccount" - name = "cloudwatch-agent" - namespace = "amazon-cloudwatch" - } -} resource "null_resource" "kubectl" { depends_on = [ aws_eks_cluster.this, From e0c5902def51dd29d2f0d69f7891178752f9e659 Mon Sep 17 00:00:00 2001 From: Paramadon Date: Fri, 24 May 2024 11:55:10 -0400 Subject: [PATCH 22/60] fixing cluster deployment --- .../eks_resources/test_schemas/cluster_deployment.json | 2 -- 1 file changed, 2 deletions(-) diff --git a/test/metric_value_benchmark/eks_resources/test_schemas/cluster_deployment.json b/test/metric_value_benchmark/eks_resources/test_schemas/cluster_deployment.json index e8aa9420c..e30f59728 100644 --- a/test/metric_value_benchmark/eks_resources/test_schemas/cluster_deployment.json +++ b/test/metric_value_benchmark/eks_resources/test_schemas/cluster_deployment.json @@ -9,7 +9,6 @@ "Sources":{}, "Timestamp":{}, "Version":{}, - "AutoScalingGroupName":{}, "Namespace": {}, "NodeName": {}, "PodName": {}, @@ -24,7 +23,6 @@ "Sources", "Timestamp", "Version", - "AutoScalingGroupName", "NodeName", "PodName", "Namespace" From ba229fa57ea4a8660ca1b82e16fc0d9640f18750 Mon Sep 17 00:00:00 2001 From: Paramadon Date: Fri, 24 May 2024 12:40:21 -0400 Subject: [PATCH 23/60] increasing size of instance --- terraform/gpu/main.tf | 2 +- terraform/gpu/variables.tf | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/terraform/gpu/main.tf b/terraform/gpu/main.tf index f9af97cec..bd9119cff 100644 --- a/terraform/gpu/main.tf +++ b/terraform/gpu/main.tf @@ -54,7 +54,7 @@ resource "aws_eks_node_group" "this" { ami_type = "AL2_x86_64_GPU" capacity_type = "ON_DEMAND" disk_size = 20 - instance_types = ["g4dn.xlarge"] + instance_types = ["g4dn.12xlarge"] depends_on = [ aws_iam_role_policy_attachment.node_AmazonEC2ContainerRegistryReadOnly, diff --git a/terraform/gpu/variables.tf b/terraform/gpu/variables.tf index 2bc2bec38..de0d5a2a6 100644 --- a/terraform/gpu/variables.tf +++ b/terraform/gpu/variables.tf @@ -44,7 +44,7 @@ variable "ami_type" { variable "instance_type" { type = string - default = "g4dn.xlarge" + default = "g4dn.12xlarge" } variable "beta" { From b07a1a62faef2d820e0aaf2d433cad9bff207524 Mon Sep 17 00:00:00 2001 From: Paramadon Date: Fri, 24 May 2024 14:32:51 -0400 Subject: [PATCH 24/60] adding a sleep --- terraform/gpu/main.tf | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/terraform/gpu/main.tf b/terraform/gpu/main.tf index bd9119cff..03774c59a 100644 --- a/terraform/gpu/main.tf +++ b/terraform/gpu/main.tf @@ -54,7 +54,7 @@ resource "aws_eks_node_group" "this" { ami_type = "AL2_x86_64_GPU" capacity_type = "ON_DEMAND" disk_size = 20 - instance_types = ["g4dn.12xlarge"] + instance_types = ["g4dn.xlarge"] depends_on = [ aws_iam_role_policy_attachment.node_AmazonEC2ContainerRegistryReadOnly, @@ -137,8 +137,11 @@ resource "null_resource" "validator" { provisioner "local-exec" { command = < pods.txt From 4469748e23fc9235819a980c74f2f8db66f6c170 Mon Sep 17 00:00:00 2001 From: Paramadon Date: Fri, 24 May 2024 16:14:42 -0400 Subject: [PATCH 25/60] removing logs just go to the commit before if you want lags --- terraform/eks/daemon/gpu/main.tf | 7 +++---- terraform/eks/daemon/gpuBurner.yaml | 1 + test/metric/container_insights_util.go | 5 ----- test/metric/metric_value_query.go | 1 - util/awsservice/cloudwatchlogs.go | 4 ---- 5 files changed, 4 insertions(+), 14 deletions(-) diff --git a/terraform/eks/daemon/gpu/main.tf b/terraform/eks/daemon/gpu/main.tf index b5e1eacce..658b78269 100644 --- a/terraform/eks/daemon/gpu/main.tf +++ b/terraform/eks/daemon/gpu/main.tf @@ -42,9 +42,9 @@ resource "aws_eks_node_group" "this" { subnet_ids = module.basic_components.public_subnet_ids scaling_config { - desired_size = 2 - max_size = 2 - min_size = 2 + desired_size = 1 + max_size = 1 + min_size = 1 } ami_type = var.ami_type @@ -712,7 +712,6 @@ resource "null_resource" "validator" { provisioner "local-exec" { command = <<-EOT echo "Validating EKS metrics/logs for EMF" - kubectl apply -f ../gpuBurner.yaml cd ../../../.. go test ${var.test_dir} -eksClusterName=${aws_eks_cluster.this.name} -computeType=EKS -v -eksDeploymentStrategy=DAEMON -eksGpuType=nvidia EOT diff --git a/terraform/eks/daemon/gpuBurner.yaml b/terraform/eks/daemon/gpuBurner.yaml index 1c3e164a5..4fad95303 100644 --- a/terraform/eks/daemon/gpuBurner.yaml +++ b/terraform/eks/daemon/gpuBurner.yaml @@ -26,3 +26,4 @@ spec: resources: limits: nvidia.com/gpu: 1 + diff --git a/test/metric/container_insights_util.go b/test/metric/container_insights_util.go index e8b05e7e6..ae014079d 100644 --- a/test/metric/container_insights_util.go +++ b/test/metric/container_insights_util.go @@ -193,9 +193,6 @@ func ValidateLogs(env *environment.MetaData) status.TestResult { } for _, instance := range eKSInstances { - fmt.Println("Number of eks Instances", len(eKSInstances)) - fmt.Println("This is the instance", instance.InstanceName) - fmt.Println("This is the instance", instance) stream := *instance.InstanceName err = awsservice.ValidateLogs( @@ -212,10 +209,8 @@ func ValidateLogs(env *environment.MetaData) status.TestResult { if innerErr != nil { return "", fmt.Errorf("failed to unmarshal log file: %w", innerErr) } - log.Printf("The eksClusterType is: %v", eksClusterType.Type) jsonSchema, ok := eks_resources.EksClusterValidationMap[eksClusterType.Type] if !ok { - log.Printf("Error for this eksClusterType: %v", eksClusterType.Type) return "", errors.New("invalid cluster type provided") } return jsonSchema, nil diff --git a/test/metric/metric_value_query.go b/test/metric/metric_value_query.go index 23f4464f8..eea358074 100644 --- a/test/metric/metric_value_query.go +++ b/test/metric/metric_value_query.go @@ -57,7 +57,6 @@ func (n *MetricValueFetcher) Fetch(namespace, metricName string, metricSpecificD EndTime: &endTime, MetricDataQueries: metricDataQueries, } - log.Print("This is the the getMetric data input", getMetricDataInput, getMetricDataInput.StartTime, getMetricDataInput.EndTime, getMetricDataInput.MetricDataQueries) log.Printf("Metric data input: namespace %v, name %v, stat %v, period %v", namespace, metricName, stat, metricQueryPeriod) diff --git a/util/awsservice/cloudwatchlogs.go b/util/awsservice/cloudwatchlogs.go index 7ba1f5d3e..10a9f830b 100644 --- a/util/awsservice/cloudwatchlogs.go +++ b/util/awsservice/cloudwatchlogs.go @@ -242,7 +242,6 @@ func WithSchema(schema string) SchemaRetriever { func AssertLogSchema(schemaRetriever SchemaRetriever) LogEventValidator { return func(event types.OutputLogEvent) error { message := *event.Message - fmt.Println("This is the message", message) if schemaRetriever == nil { return errors.New("nil schema retriever") } @@ -254,9 +253,6 @@ func AssertLogSchema(schemaRetriever SchemaRetriever) LogEventValidator { if err != nil { return fmt.Errorf("failed to execute schema validator: %w", err) } else if len(keyErrors) > 0 { - fmt.Println("This is the length of key errors", len(keyErrors)) - fmt.Printf("This is the first key errors info: Property Path: %s Invalid Value %s and the message %s \n ", keyErrors[0].PropertyPath, keyErrors[0].InvalidValue, keyErrors[0].Message) - return fmt.Errorf("failed schema validation: %v | schema: %s | log: %s", keyErrors, schema, message) } return nil From f785e3f7f2d064a05f060f53a31ffc01ae9e8680 Mon Sep 17 00:00:00 2001 From: Paramadon Date: Tue, 28 May 2024 14:41:57 -0400 Subject: [PATCH 26/60] resolving comments --- terraform/gpu/main.tf | 20 +------------------- terraform/gpu/variables.tf | 6 +++--- test/metric/container_insights_util.go | 3 ++- 3 files changed, 6 insertions(+), 23 deletions(-) diff --git a/terraform/gpu/main.tf b/terraform/gpu/main.tf index 03774c59a..fbf993170 100644 --- a/terraform/gpu/main.tf +++ b/terraform/gpu/main.tf @@ -54,7 +54,7 @@ resource "aws_eks_node_group" "this" { ami_type = "AL2_x86_64_GPU" capacity_type = "ON_DEMAND" disk_size = 20 - instance_types = ["g4dn.xlarge"] + instance_types = var.instance_type depends_on = [ aws_iam_role_policy_attachment.node_AmazonEC2ContainerRegistryReadOnly, @@ -137,29 +137,11 @@ resource "null_resource" "validator" { provisioner "local-exec" { command = < pods.txt - kubectl describe pods --all-namespaces > pods_describe.txt - - # Log the contents of the files - cat pods.txt - cat pods_describe.txt echo "Tests passed" - else - # Get all pods and describe them - kubectl get pods --all-namespaces -o wide > pods.txt - kubectl describe pods --all-namespaces > pods_describe.txt - - # Log the contents of the files - cat pods.txt - cat pods_describe.txt echo "Tests failed" exit 1 fi diff --git a/terraform/gpu/variables.tf b/terraform/gpu/variables.tf index de0d5a2a6..2ef05a053 100644 --- a/terraform/gpu/variables.tf +++ b/terraform/gpu/variables.tf @@ -18,7 +18,7 @@ variable "addon_name" { variable "addon_version" { type = string - default = "v1.6.0-eksbuild.1" + default = "latest" } @@ -34,7 +34,7 @@ variable "cwagent_image_tag" { variable "k8s_version" { type = string - default = "1.28" + default = "1.29" } variable "ami_type" { @@ -44,7 +44,7 @@ variable "ami_type" { variable "instance_type" { type = string - default = "g4dn.12xlarge" + default = "g4dn.xlarge" } variable "beta" { diff --git a/test/metric/container_insights_util.go b/test/metric/container_insights_util.go index ae014079d..a3ddff03d 100644 --- a/test/metric/container_insights_util.go +++ b/test/metric/container_insights_util.go @@ -193,7 +193,6 @@ func ValidateLogs(env *environment.MetaData) status.TestResult { } for _, instance := range eKSInstances { - stream := *instance.InstanceName err = awsservice.ValidateLogs( group, @@ -209,6 +208,8 @@ func ValidateLogs(env *environment.MetaData) status.TestResult { if innerErr != nil { return "", fmt.Errorf("failed to unmarshal log file: %w", innerErr) } + + //log.Printf("eksClusterType is: %s", eksClusterType.Type) jsonSchema, ok := eks_resources.EksClusterValidationMap[eksClusterType.Type] if !ok { return "", errors.New("invalid cluster type provided") From 09f98baafe4c5a13042a86ee7edce1719016096f Mon Sep 17 00:00:00 2001 From: Paramadon Date: Tue, 28 May 2024 16:17:03 -0400 Subject: [PATCH 27/60] fixing lint --- terraform/gpu/main.tf | 6 +++--- terraform/gpu/variables.tf | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/terraform/gpu/main.tf b/terraform/gpu/main.tf index fbf993170..0df63b78b 100644 --- a/terraform/gpu/main.tf +++ b/terraform/gpu/main.tf @@ -2,7 +2,7 @@ // SPDX-License-Identifier: MIT module "common" { - source = "../common" + source = "../common" } module "basic_components" { @@ -123,8 +123,8 @@ resource "aws_eks_addon" "this" { depends_on = [ null_resource.kubectl ] - addon_name = var.addon_name - cluster_name = aws_eks_cluster.this.name + addon_name = var.addon_name + cluster_name = aws_eks_cluster.this.name addon_version = var.addon_version } diff --git a/terraform/gpu/variables.tf b/terraform/gpu/variables.tf index 2ef05a053..b89773575 100644 --- a/terraform/gpu/variables.tf +++ b/terraform/gpu/variables.tf @@ -17,7 +17,7 @@ variable "addon_name" { } variable "addon_version" { - type = string + type = string default = "latest" } From ddc61eca8dc0e487699908bf4e80637ba09dbbba Mon Sep 17 00:00:00 2001 From: Paramadon Date: Thu, 6 Jun 2024 11:31:12 -0400 Subject: [PATCH 28/60] organized code --- .../{ => eks_addon_on_gpu}/gpuBurner.yaml | 0 .../daemon/eks_addon_on_gpu}/main.tf | 7 +-- .../daemon/eks_addon_on_gpu}/providers.tf | 0 .../daemon/eks_addon_on_gpu}/variables.tf | 13 +---- .../agent_configs/ethtool_config.json | 55 +++++++++++-------- 5 files changed, 35 insertions(+), 40 deletions(-) rename terraform/eks/daemon/{ => eks_addon_on_gpu}/gpuBurner.yaml (100%) rename terraform/{gpu => eks/daemon/eks_addon_on_gpu}/main.tf (97%) rename terraform/{gpu => eks/daemon/eks_addon_on_gpu}/providers.tf (100%) rename terraform/{gpu => eks/daemon/eks_addon_on_gpu}/variables.tf (77%) diff --git a/terraform/eks/daemon/gpuBurner.yaml b/terraform/eks/daemon/eks_addon_on_gpu/gpuBurner.yaml similarity index 100% rename from terraform/eks/daemon/gpuBurner.yaml rename to terraform/eks/daemon/eks_addon_on_gpu/gpuBurner.yaml diff --git a/terraform/gpu/main.tf b/terraform/eks/daemon/eks_addon_on_gpu/main.tf similarity index 97% rename from terraform/gpu/main.tf rename to terraform/eks/daemon/eks_addon_on_gpu/main.tf index 0df63b78b..a3195aea4 100644 --- a/terraform/gpu/main.tf +++ b/terraform/eks/daemon/eks_addon_on_gpu/main.tf @@ -2,12 +2,11 @@ // SPDX-License-Identifier: MIT module "common" { - source = "../common" + source = "../../../common" } module "basic_components" { - source = "../basic_components" - + source = "../../../basic_components" region = var.region } @@ -137,7 +136,7 @@ resource "null_resource" "validator" { provisioner "local-exec" { command = < Date: Thu, 6 Jun 2024 11:36:04 -0400 Subject: [PATCH 29/60] fixing terraform --- terraform/eks/daemon/eks_addon_on_gpu/main.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/terraform/eks/daemon/eks_addon_on_gpu/main.tf b/terraform/eks/daemon/eks_addon_on_gpu/main.tf index a3195aea4..e0a69e1ef 100644 --- a/terraform/eks/daemon/eks_addon_on_gpu/main.tf +++ b/terraform/eks/daemon/eks_addon_on_gpu/main.tf @@ -53,7 +53,7 @@ resource "aws_eks_node_group" "this" { ami_type = "AL2_x86_64_GPU" capacity_type = "ON_DEMAND" disk_size = 20 - instance_types = var.instance_type + instance_types = [var.instance_type] depends_on = [ aws_iam_role_policy_attachment.node_AmazonEC2ContainerRegistryReadOnly, From 133aca1e5f471936ab58e0ff262bc54c1db3697a Mon Sep 17 00:00:00 2001 From: Paramadon Date: Thu, 6 Jun 2024 11:48:21 -0400 Subject: [PATCH 30/60] fixing terraform --- terraform/eks/daemon/eks_addon_on_gpu/main.tf | 2 +- terraform/eks/daemon/eks_addon_on_gpu/variables.tf | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/terraform/eks/daemon/eks_addon_on_gpu/main.tf b/terraform/eks/daemon/eks_addon_on_gpu/main.tf index e0a69e1ef..6458ad101 100644 --- a/terraform/eks/daemon/eks_addon_on_gpu/main.tf +++ b/terraform/eks/daemon/eks_addon_on_gpu/main.tf @@ -21,7 +21,7 @@ locals { } resource "aws_eks_cluster" "this" { - name = "cwagent-operator-eks-integ-${module.common.testing_id}" + name = "cwagent-operator-eks-integ2-${module.common.testing_id}" role_arn = local.role_arn version = var.k8s_version enabled_cluster_log_types = [ diff --git a/terraform/eks/daemon/eks_addon_on_gpu/variables.tf b/terraform/eks/daemon/eks_addon_on_gpu/variables.tf index 7ecdf9845..6b5267aca 100644 --- a/terraform/eks/daemon/eks_addon_on_gpu/variables.tf +++ b/terraform/eks/daemon/eks_addon_on_gpu/variables.tf @@ -17,8 +17,8 @@ variable "addon_name" { } variable "addon_version" { - type = string - default = "latest" + type = string + default = "v1.1.0-eksbuild.1" } variable "k8s_version" { From ee4615759434bb9dc667990f70aab83385e8028a Mon Sep 17 00:00:00 2001 From: Paramadon Date: Thu, 6 Jun 2024 11:58:58 -0400 Subject: [PATCH 31/60] removing change to ethtool --- .../agent_configs/ethtool_config.json | 55 ++++++++----------- 1 file changed, 24 insertions(+), 31 deletions(-) diff --git a/test/metric_value_benchmark/agent_configs/ethtool_config.json b/test/metric_value_benchmark/agent_configs/ethtool_config.json index b313cc1fc..d2cc4912f 100644 --- a/test/metric_value_benchmark/agent_configs/ethtool_config.json +++ b/test/metric_value_benchmark/agent_configs/ethtool_config.json @@ -1,34 +1,27 @@ { - "metrics": { - "namespace": "EthtoolTest", - "metrics_collected": { - "ethtool": { - "metrics_include": [ - "bw_in_allowance_exceeded", - "bw_out_allowance_exceeded", - "pps_allowance_exceeded", - "conntrack_allowance_exceeded", - "linklocal_allowance_exceeded", - "rx_gso_checksum_fixup", - "ena_admin_q_no_completion", - "ena_admin_q_out_of_space", - "ena_admin_q_completed_cmd", - "ena_admin_q_submitted_cmd", - "ena_admin_q_aborted_cmd", - "queue_7_rx_zc_queue_pkt_copy", - "queue_7_rx_rx_copybreak_pkt", - "queue_0_tx_cnt", - "conntrack_allowance_available", - "linklocal_allowance_exceeded", - "pps_allowance_exceeded", - "queue_0_rx_bytes" - ], + "agent": { + "metrics_collection_interval": 10, + "run_as_user": "root", + "debug": true, + "logfile": "" + }, + "metrics": { + "namespace": "MetricValueBenchmarkTest", "append_dimensions": { - "name": "param", - "key": "chicken", - "key2": "chicken2" - } + "InstanceId": "${aws:InstanceId}" + }, + "metrics_collected": { + "ethtool": { + "interface_include": [ + "eth0", + "ens5" + ], + "metrics_include": [ + "queue_0_tx_cnt", + "queue_0_rx_cnt" + ] + } + }, + "force_flush_interval": 5 } - } - } -} +} \ No newline at end of file From 07e2141ec3bc0056e307d1c6758f2778e76aa15d Mon Sep 17 00:00:00 2001 From: Paramadon Date: Thu, 6 Jun 2024 12:01:22 -0400 Subject: [PATCH 32/60] fixing aws cluster name --- terraform/eks/daemon/eks_addon_on_gpu/main.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/terraform/eks/daemon/eks_addon_on_gpu/main.tf b/terraform/eks/daemon/eks_addon_on_gpu/main.tf index 6458ad101..e0a69e1ef 100644 --- a/terraform/eks/daemon/eks_addon_on_gpu/main.tf +++ b/terraform/eks/daemon/eks_addon_on_gpu/main.tf @@ -21,7 +21,7 @@ locals { } resource "aws_eks_cluster" "this" { - name = "cwagent-operator-eks-integ2-${module.common.testing_id}" + name = "cwagent-operator-eks-integ-${module.common.testing_id}" role_arn = local.role_arn version = var.k8s_version enabled_cluster_log_types = [ From 28ea49e3d8c391b98ed02342994a4133f6395cb4 Mon Sep 17 00:00:00 2001 From: Paramadon Date: Thu, 6 Jun 2024 12:07:48 -0400 Subject: [PATCH 33/60] fixing add_version --- terraform/eks/daemon/eks_addon_on_gpu/variables.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/terraform/eks/daemon/eks_addon_on_gpu/variables.tf b/terraform/eks/daemon/eks_addon_on_gpu/variables.tf index 6b5267aca..ce3fc6d44 100644 --- a/terraform/eks/daemon/eks_addon_on_gpu/variables.tf +++ b/terraform/eks/daemon/eks_addon_on_gpu/variables.tf @@ -18,7 +18,7 @@ variable "addon_name" { variable "addon_version" { type = string - default = "v1.1.0-eksbuild.1" + default = "v1.6.0-eksbuild.1" } variable "k8s_version" { From 2a28a1cbd2439262963287be5ab32034c99351f5 Mon Sep 17 00:00:00 2001 From: Paramadon Date: Thu, 6 Jun 2024 14:22:50 -0400 Subject: [PATCH 34/60] fixed file organization --- .../gpu}/gpuBurner.yaml | 0 .../eks_addon_on_gpu => addon/gpu}/main.tf | 8 +-- .../gpu}/providers.tf | 0 .../gpu}/variables.tf | 0 .../agent_configs/ethtool_config.json | 69 ++++++++++++------- 5 files changed, 49 insertions(+), 28 deletions(-) rename terraform/eks/{daemon/eks_addon_on_gpu => addon/gpu}/gpuBurner.yaml (100%) rename terraform/eks/{daemon/eks_addon_on_gpu => addon/gpu}/main.tf (96%) rename terraform/eks/{daemon/eks_addon_on_gpu => addon/gpu}/providers.tf (100%) rename terraform/eks/{daemon/eks_addon_on_gpu => addon/gpu}/variables.tf (100%) diff --git a/terraform/eks/daemon/eks_addon_on_gpu/gpuBurner.yaml b/terraform/eks/addon/gpu/gpuBurner.yaml similarity index 100% rename from terraform/eks/daemon/eks_addon_on_gpu/gpuBurner.yaml rename to terraform/eks/addon/gpu/gpuBurner.yaml diff --git a/terraform/eks/daemon/eks_addon_on_gpu/main.tf b/terraform/eks/addon/gpu/main.tf similarity index 96% rename from terraform/eks/daemon/eks_addon_on_gpu/main.tf rename to terraform/eks/addon/gpu/main.tf index e0a69e1ef..824331fe0 100644 --- a/terraform/eks/daemon/eks_addon_on_gpu/main.tf +++ b/terraform/eks/addon/gpu/main.tf @@ -21,7 +21,7 @@ locals { } resource "aws_eks_cluster" "this" { - name = "cwagent-operator-eks-integ-${module.common.testing_id}" + name = "cwagent-operator-eks-integ5-${module.common.testing_id}" role_arn = local.role_arn version = var.k8s_version enabled_cluster_log_types = [ @@ -45,9 +45,9 @@ resource "aws_eks_node_group" "this" { subnet_ids = module.basic_components.public_subnet_ids scaling_config { - desired_size = 1 - max_size = 1 - min_size = 1 + desired_size = 2 + max_size = 2 + min_size = 2 } ami_type = "AL2_x86_64_GPU" diff --git a/terraform/eks/daemon/eks_addon_on_gpu/providers.tf b/terraform/eks/addon/gpu/providers.tf similarity index 100% rename from terraform/eks/daemon/eks_addon_on_gpu/providers.tf rename to terraform/eks/addon/gpu/providers.tf diff --git a/terraform/eks/daemon/eks_addon_on_gpu/variables.tf b/terraform/eks/addon/gpu/variables.tf similarity index 100% rename from terraform/eks/daemon/eks_addon_on_gpu/variables.tf rename to terraform/eks/addon/gpu/variables.tf diff --git a/test/metric_value_benchmark/agent_configs/ethtool_config.json b/test/metric_value_benchmark/agent_configs/ethtool_config.json index d2cc4912f..b06cc3879 100644 --- a/test/metric_value_benchmark/agent_configs/ethtool_config.json +++ b/test/metric_value_benchmark/agent_configs/ethtool_config.json @@ -1,27 +1,48 @@ { - "agent": { - "metrics_collection_interval": 10, - "run_as_user": "root", - "debug": true, - "logfile": "" - }, - "metrics": { - "namespace": "MetricValueBenchmarkTest", + "metrics": { + "namespace": "EthtoolTest4", + "metrics_collected": { + "ethtool": { + "metrics_include": [ + "bw_in_allowance_exceeded", + "bw_out_allowance_exceeded", + "pps_allowance_exceeded", + "conntrack_allowance_exceeded", + "linklocal_allowance_exceeded", + "rx_gso_checksum_fixup", + "ena_admin_q_no_completion", + "ena_admin_q_out_of_space", + "ena_admin_q_completed_cmd", + "ena_admin_q_submitted_cmd", + "ena_admin_q_aborted_cmd", + "queue_7_rx_zc_queue_pkt_copy", + "queue_7_rx_rx_copybreak_pkt", + "queue_0_tx_cnt", + "conntrack_allowance_available", + "linklocal_allowance_exceeded", + "pps_allowance_exceeded", + "queue_0_rx_bytes" + ], "append_dimensions": { - "InstanceId": "${aws:InstanceId}" - }, - "metrics_collected": { - "ethtool": { - "interface_include": [ - "eth0", - "ens5" - ], - "metrics_include": [ - "queue_0_tx_cnt", - "queue_0_rx_cnt" - ] - } - }, - "force_flush_interval": 5 + "name": "param", + "key": "chicken", + "key2": "chicken2" + } + }, + "disk":{ + "resources":[ + "/", + "/tmp" + ], + "measurement":[ + "total", + "used" + ], + "append_dimensions":{ + "stackName":"Prod", + "name": "param" + } } -} \ No newline at end of file + } + } +} From 4fbdf0d7ffa6d0e7b93100232255095fc121a73e Mon Sep 17 00:00:00 2001 From: Paramadon Date: Thu, 6 Jun 2024 14:45:31 -0400 Subject: [PATCH 35/60] change cluster name --- terraform/eks/addon/gpu/main.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/terraform/eks/addon/gpu/main.tf b/terraform/eks/addon/gpu/main.tf index 824331fe0..5cc36dfed 100644 --- a/terraform/eks/addon/gpu/main.tf +++ b/terraform/eks/addon/gpu/main.tf @@ -21,7 +21,7 @@ locals { } resource "aws_eks_cluster" "this" { - name = "cwagent-operator-eks-integ5-${module.common.testing_id}" + name = "cwagent-operator-eks-integ-${module.common.testing_id}" role_arn = local.role_arn version = var.k8s_version enabled_cluster_log_types = [ From a1eec84bf18768d6f2cba0d96d7303bbb4ea3e25 Mon Sep 17 00:00:00 2001 From: Paramadon Date: Thu, 6 Jun 2024 15:51:12 -0400 Subject: [PATCH 36/60] adding to test generator --- generator/test_case_generator.go | 7 ++ .../agent_configs/ethtool_config.json | 69 +++++++------------ 2 files changed, 31 insertions(+), 45 deletions(-) diff --git a/generator/test_case_generator.go b/generator/test_case_generator.go index 9e2e85bf3..c8e0b71af 100644 --- a/generator/test_case_generator.go +++ b/generator/test_case_generator.go @@ -179,6 +179,13 @@ var testTypeToTestConfig = map[string][]testConfig{ targets: map[string]map[string]struct{}{"metadataEnabled": {"enabled": {}}}, }, }, + "eks_addon": { + { + testDir: "./test/metric_value_benchmark", + targets: map[string]map[string]struct{}{"arc": {"amd64": {}}}, + instanceType: "g4dn.xlarge", + }, + }, "eks_daemon": { { testDir: "./test/metric_value_benchmark", diff --git a/test/metric_value_benchmark/agent_configs/ethtool_config.json b/test/metric_value_benchmark/agent_configs/ethtool_config.json index b06cc3879..d2cc4912f 100644 --- a/test/metric_value_benchmark/agent_configs/ethtool_config.json +++ b/test/metric_value_benchmark/agent_configs/ethtool_config.json @@ -1,48 +1,27 @@ { - "metrics": { - "namespace": "EthtoolTest4", - "metrics_collected": { - "ethtool": { - "metrics_include": [ - "bw_in_allowance_exceeded", - "bw_out_allowance_exceeded", - "pps_allowance_exceeded", - "conntrack_allowance_exceeded", - "linklocal_allowance_exceeded", - "rx_gso_checksum_fixup", - "ena_admin_q_no_completion", - "ena_admin_q_out_of_space", - "ena_admin_q_completed_cmd", - "ena_admin_q_submitted_cmd", - "ena_admin_q_aborted_cmd", - "queue_7_rx_zc_queue_pkt_copy", - "queue_7_rx_rx_copybreak_pkt", - "queue_0_tx_cnt", - "conntrack_allowance_available", - "linklocal_allowance_exceeded", - "pps_allowance_exceeded", - "queue_0_rx_bytes" - ], - "append_dimensions": { - "name": "param", - "key": "chicken", - "key2": "chicken2" - } + "agent": { + "metrics_collection_interval": 10, + "run_as_user": "root", + "debug": true, + "logfile": "" }, - "disk":{ - "resources":[ - "/", - "/tmp" - ], - "measurement":[ - "total", - "used" - ], - "append_dimensions":{ - "stackName":"Prod", - "name": "param" - } + "metrics": { + "namespace": "MetricValueBenchmarkTest", + "append_dimensions": { + "InstanceId": "${aws:InstanceId}" + }, + "metrics_collected": { + "ethtool": { + "interface_include": [ + "eth0", + "ens5" + ], + "metrics_include": [ + "queue_0_tx_cnt", + "queue_0_rx_cnt" + ] + } + }, + "force_flush_interval": 5 } - } - } -} +} \ No newline at end of file From 05ce023c8eecdf1ad1767f44d870123a9fd7b9a5 Mon Sep 17 00:00:00 2001 From: Paramadon Date: Thu, 6 Jun 2024 17:03:04 -0400 Subject: [PATCH 37/60] adding to generate test matrix --- generator/resources/eks_addon_test_matrix.json | 11 +++++++++++ generator/test_case_generator.go | 2 +- 2 files changed, 12 insertions(+), 1 deletion(-) create mode 100644 generator/resources/eks_addon_test_matrix.json diff --git a/generator/resources/eks_addon_test_matrix.json b/generator/resources/eks_addon_test_matrix.json new file mode 100644 index 000000000..4992638c4 --- /dev/null +++ b/generator/resources/eks_addon_test_matrix.json @@ -0,0 +1,11 @@ +[ + { + "k8s_version": "1.29", + "instanceType":"g4dn.xlarge", + "installAgentCommand": "go run ./install/install_agent.go rpm", + "beta": "true", + "addon_name":"amazon-cloudwatch-observability", + "addon_version":"v1.6.0-eksbuild.1" + + } +] \ No newline at end of file diff --git a/generator/test_case_generator.go b/generator/test_case_generator.go index c8e0b71af..a3531b2a1 100644 --- a/generator/test_case_generator.go +++ b/generator/test_case_generator.go @@ -181,7 +181,7 @@ var testTypeToTestConfig = map[string][]testConfig{ }, "eks_addon": { { - testDir: "./test/metric_value_benchmark", + testDir: "./test/gpu", targets: map[string]map[string]struct{}{"arc": {"amd64": {}}}, instanceType: "g4dn.xlarge", }, From b4c57ec1e78fd0b980e4e3b97285a7c8d0caed3f Mon Sep 17 00:00:00 2001 From: Paramadon Date: Thu, 6 Jun 2024 20:24:58 -0400 Subject: [PATCH 38/60] fixing test --- generator/resources/eks_addon_test_matrix.json | 1 - generator/test_case_generator.go | 4 +--- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/generator/resources/eks_addon_test_matrix.json b/generator/resources/eks_addon_test_matrix.json index 4992638c4..93cd9340d 100644 --- a/generator/resources/eks_addon_test_matrix.json +++ b/generator/resources/eks_addon_test_matrix.json @@ -2,7 +2,6 @@ { "k8s_version": "1.29", "instanceType":"g4dn.xlarge", - "installAgentCommand": "go run ./install/install_agent.go rpm", "beta": "true", "addon_name":"amazon-cloudwatch-observability", "addon_version":"v1.6.0-eksbuild.1" diff --git a/generator/test_case_generator.go b/generator/test_case_generator.go index a3531b2a1..8643e2374 100644 --- a/generator/test_case_generator.go +++ b/generator/test_case_generator.go @@ -181,9 +181,7 @@ var testTypeToTestConfig = map[string][]testConfig{ }, "eks_addon": { { - testDir: "./test/gpu", - targets: map[string]map[string]struct{}{"arc": {"amd64": {}}}, - instanceType: "g4dn.xlarge", + testDir: "./test/gpu", }, }, "eks_daemon": { From 7e31f9b111fd0e4847a344e813b7ecea1efbbb3f Mon Sep 17 00:00:00 2001 From: Paramadon Date: Thu, 6 Jun 2024 20:58:59 -0400 Subject: [PATCH 39/60] update the agent image --- terraform/eks/addon/gpu/main.tf | 1 + 1 file changed, 1 insertion(+) diff --git a/terraform/eks/addon/gpu/main.tf b/terraform/eks/addon/gpu/main.tf index 5cc36dfed..006f0daa1 100644 --- a/terraform/eks/addon/gpu/main.tf +++ b/terraform/eks/addon/gpu/main.tf @@ -138,6 +138,7 @@ resource "null_resource" "validator" { command = < Date: Thu, 6 Jun 2024 21:32:37 -0400 Subject: [PATCH 40/60] toml file --- terraform/eks/addon/gpu/variables.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/terraform/eks/addon/gpu/variables.tf b/terraform/eks/addon/gpu/variables.tf index ce3fc6d44..23ddb275e 100644 --- a/terraform/eks/addon/gpu/variables.tf +++ b/terraform/eks/addon/gpu/variables.tf @@ -17,7 +17,7 @@ variable "addon_name" { } variable "addon_version" { - type = string + type = string default = "v1.6.0-eksbuild.1" } From ac98069db7e44d37ea2a05f242101788a2e2dbb7 Mon Sep 17 00:00:00 2001 From: Paramadon Date: Fri, 7 Jun 2024 15:14:57 -0400 Subject: [PATCH 41/60] adding extra vars --- .../resources/eks_addon_test_matrix.json | 6 ++--- generator/test_case_generator.go | 3 ++- terraform/eks/addon/gpu/main.tf | 8 ++++++- terraform/eks/addon/gpu/variables.tf | 24 +++++++++++++++++++ 4 files changed, 35 insertions(+), 6 deletions(-) diff --git a/generator/resources/eks_addon_test_matrix.json b/generator/resources/eks_addon_test_matrix.json index 93cd9340d..3fa51a553 100644 --- a/generator/resources/eks_addon_test_matrix.json +++ b/generator/resources/eks_addon_test_matrix.json @@ -1,10 +1,8 @@ [ { "k8s_version": "1.29", - "instanceType":"g4dn.xlarge", - "beta": "true", "addon_name":"amazon-cloudwatch-observability", - "addon_version":"v1.6.0-eksbuild.1" - + "addon_version":"v1.6.0-eksbuild.1", + "ami_type": "AL2_x86_64_GPU" } ] \ No newline at end of file diff --git a/generator/test_case_generator.go b/generator/test_case_generator.go index 8643e2374..26384bb6a 100644 --- a/generator/test_case_generator.go +++ b/generator/test_case_generator.go @@ -181,7 +181,8 @@ var testTypeToTestConfig = map[string][]testConfig{ }, "eks_addon": { { - testDir: "./test/gpu", + testDir: "./test/gpu", + terraformDir: "terraform/eks/addon", }, }, "eks_daemon": { diff --git a/terraform/eks/addon/gpu/main.tf b/terraform/eks/addon/gpu/main.tf index 006f0daa1..bb29c6acc 100644 --- a/terraform/eks/addon/gpu/main.tf +++ b/terraform/eks/addon/gpu/main.tf @@ -138,7 +138,13 @@ resource "null_resource" "validator" { command = < Date: Fri, 7 Jun 2024 15:58:16 -0400 Subject: [PATCH 42/60] correcting terraform dir --- generator/resources/eks_addon_test_matrix.json | 3 ++- generator/test_case_generator.go | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/generator/resources/eks_addon_test_matrix.json b/generator/resources/eks_addon_test_matrix.json index 3fa51a553..76b283a75 100644 --- a/generator/resources/eks_addon_test_matrix.json +++ b/generator/resources/eks_addon_test_matrix.json @@ -3,6 +3,7 @@ "k8s_version": "1.29", "addon_name":"amazon-cloudwatch-observability", "addon_version":"v1.6.0-eksbuild.1", - "ami_type": "AL2_x86_64_GPU" + "ami_type": "AL2_x86_64_GPU", + "terraform_dir": "terraform/eks/addon/gpu" } ] \ No newline at end of file diff --git a/generator/test_case_generator.go b/generator/test_case_generator.go index 26384bb6a..e5face343 100644 --- a/generator/test_case_generator.go +++ b/generator/test_case_generator.go @@ -182,7 +182,7 @@ var testTypeToTestConfig = map[string][]testConfig{ "eks_addon": { { testDir: "./test/gpu", - terraformDir: "terraform/eks/addon", + terraformDir: "terraform/eks/addon/gpu", }, }, "eks_daemon": { From 6fb6a51f30330cac0034033490055176f6154c2a Mon Sep 17 00:00:00 2001 From: Paramadon Date: Sun, 9 Jun 2024 14:10:55 -0400 Subject: [PATCH 43/60] correcting terraform dir --- terraform/eks/addon/gpu/main.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/terraform/eks/addon/gpu/main.tf b/terraform/eks/addon/gpu/main.tf index bb29c6acc..7271db98d 100644 --- a/terraform/eks/addon/gpu/main.tf +++ b/terraform/eks/addon/gpu/main.tf @@ -145,7 +145,7 @@ resource "null_resource" "validator" { "value": "${var.aws_ecr_private_registry}/${var.ecr_integration_test_repo}:${var.github_sha}" } ]' - if go test ${var.test_dir} -eksClusterName ${aws_eks_cluster.this.name} -computeType=EKS -v -eksDeploymentStrategy=DAEMON -eksGpuType=nvidia; then + if go test ../../../../test/gpu -eksClusterName ${aws_eks_cluster.this.name} -computeType=EKS -v -eksDeploymentStrategy=DAEMON -eksGpuType=nvidia; then echo "Tests passed" else echo "Tests failed" From 89d58e242ba6a95cf36973998e681fd84bb7c9ac Mon Sep 17 00:00:00 2001 From: Paramadon Date: Sun, 9 Jun 2024 14:17:30 -0400 Subject: [PATCH 44/60] fixing dir --- generator/test_case_generator.go | 2 +- terraform/eks/addon/gpu/main.tf | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/generator/test_case_generator.go b/generator/test_case_generator.go index e5face343..a9fee398f 100644 --- a/generator/test_case_generator.go +++ b/generator/test_case_generator.go @@ -181,7 +181,7 @@ var testTypeToTestConfig = map[string][]testConfig{ }, "eks_addon": { { - testDir: "./test/gpu", + testDir: "../../../../test/gpu", terraformDir: "terraform/eks/addon/gpu", }, }, diff --git a/terraform/eks/addon/gpu/main.tf b/terraform/eks/addon/gpu/main.tf index 7271db98d..bb29c6acc 100644 --- a/terraform/eks/addon/gpu/main.tf +++ b/terraform/eks/addon/gpu/main.tf @@ -145,7 +145,7 @@ resource "null_resource" "validator" { "value": "${var.aws_ecr_private_registry}/${var.ecr_integration_test_repo}:${var.github_sha}" } ]' - if go test ../../../../test/gpu -eksClusterName ${aws_eks_cluster.this.name} -computeType=EKS -v -eksDeploymentStrategy=DAEMON -eksGpuType=nvidia; then + if go test ${var.test_dir} -eksClusterName ${aws_eks_cluster.this.name} -computeType=EKS -v -eksDeploymentStrategy=DAEMON -eksGpuType=nvidia; then echo "Tests passed" else echo "Tests failed" From 388971052172d9f98e7d7f2d37fb68d447f0b4ab Mon Sep 17 00:00:00 2001 From: Paramadon Date: Sun, 9 Jun 2024 22:31:33 -0400 Subject: [PATCH 45/60] removing patching of agent to see if that is the problem --- terraform/eks/addon/gpu/main.tf | 7 ------- 1 file changed, 7 deletions(-) diff --git a/terraform/eks/addon/gpu/main.tf b/terraform/eks/addon/gpu/main.tf index bb29c6acc..5cc36dfed 100644 --- a/terraform/eks/addon/gpu/main.tf +++ b/terraform/eks/addon/gpu/main.tf @@ -138,13 +138,6 @@ resource "null_resource" "validator" { command = < Date: Sun, 9 Jun 2024 22:35:16 -0400 Subject: [PATCH 46/60] adding kubectl get pods -A --- terraform/eks/addon/gpu/main.tf | 1 + 1 file changed, 1 insertion(+) diff --git a/terraform/eks/addon/gpu/main.tf b/terraform/eks/addon/gpu/main.tf index 5cc36dfed..55bed6b7a 100644 --- a/terraform/eks/addon/gpu/main.tf +++ b/terraform/eks/addon/gpu/main.tf @@ -138,6 +138,7 @@ resource "null_resource" "validator" { command = < Date: Mon, 10 Jun 2024 13:54:06 -0400 Subject: [PATCH 47/60] adding test name --- terraform/eks/addon/gpu/main.tf | 3 +++ 1 file changed, 3 insertions(+) diff --git a/terraform/eks/addon/gpu/main.tf b/terraform/eks/addon/gpu/main.tf index 55bed6b7a..905b8d663 100644 --- a/terraform/eks/addon/gpu/main.tf +++ b/terraform/eks/addon/gpu/main.tf @@ -126,6 +126,9 @@ resource "aws_eks_addon" "this" { cluster_name = aws_eks_cluster.this.name addon_version = var.addon_version } +output "eks_cluster_name" { + value = aws_eks_cluster.this.name +} resource "null_resource" "validator" { depends_on = [ From 5855c506e21b6a835bf32abe59201d822db9aeac Mon Sep 17 00:00:00 2001 From: Paramadon Date: Mon, 10 Jun 2024 17:17:29 -0400 Subject: [PATCH 48/60] removing go test --- terraform/eks/addon/gpu/main.tf | 20 -------------------- 1 file changed, 20 deletions(-) diff --git a/terraform/eks/addon/gpu/main.tf b/terraform/eks/addon/gpu/main.tf index 905b8d663..e63d456b3 100644 --- a/terraform/eks/addon/gpu/main.tf +++ b/terraform/eks/addon/gpu/main.tf @@ -130,25 +130,5 @@ output "eks_cluster_name" { value = aws_eks_cluster.this.name } -resource "null_resource" "validator" { - depends_on = [ - aws_eks_node_group.this, - aws_eks_addon.this, - null_resource.kubectl - ] - provisioner "local-exec" { - command = < Date: Mon, 10 Jun 2024 19:30:20 -0400 Subject: [PATCH 49/60] Test works locally needed to add some metrics to dims --- test/gpu/nvidia_test.go | 50 ++++++++++--------- test/metric/container_insights_util.go | 1 + .../eks_resources/test_schemas/pod_gpu.json | 1 - 3 files changed, 27 insertions(+), 25 deletions(-) diff --git a/test/gpu/nvidia_test.go b/test/gpu/nvidia_test.go index ced990b36..f51e73b64 100644 --- a/test/gpu/nvidia_test.go +++ b/test/gpu/nvidia_test.go @@ -17,24 +17,28 @@ import ( const ( gpuMetricIndicator = "_gpu_" - containerMemTotal = "container_gpu_memory_total" - containerMemUsed = "container_gpu_memory_used" - containerPower = "container_gpu_power_draw" - containerTemp = "container_gpu_temperature" - containerUtil = "container_gpu_utilization" - containerMemUtil = "container_gpu_memory_utilization" - podMemTotal = "pod_gpu_memory_total" - podMemUsed = "pod_gpu_memory_used" - podPower = "pod_gpu_power_draw" - podTemp = "pod_gpu_temperature" - podUtil = "pod_gpu_utilization" - podMemUtil = "pod_gpu_memory_utilization" - nodeMemTotal = "node_gpu_memory_total" - nodeMemUsed = "node_gpu_memory_used" - nodePower = "node_gpu_power_draw" - nodeTemp = "node_gpu_temperature" - nodeUtil = "node_gpu_utilization" - nodeMemUtil = "node_gpu_memory_utilization" + containerMemTotal = "container_gpu_memory_total" + containerMemUsed = "container_gpu_memory_used" + containerPower = "container_gpu_power_draw" + containerTemp = "container_gpu_temperature" + containerUtil = "container_gpu_utilization" + containerMemUtil = "container_gpu_memory_utilization" + podMemTotal = "pod_gpu_memory_total" + podMemUsed = "pod_gpu_memory_used" + podPower = "pod_gpu_power_draw" + podTemp = "pod_gpu_temperature" + podUtil = "pod_gpu_utilization" + podMemUtil = "pod_gpu_memory_utilization" + podLimit = "pod_gpu_limit" + podRequest = "pod_gpu_request" + podTotal = "pod_gpu_total" + nodeMemTotal = "node_gpu_memory_total" + nodeMemUsed = "node_gpu_memory_used" + nodePower = "node_gpu_power_draw" + nodeTemp = "node_gpu_temperature" + nodeUtil = "node_gpu_utilization" + nodeMemUtil = "node_gpu_memory_utilization" + nodeCountTotal = "node_gpu_total" nodeCountRequest = "node_gpu_request" nodeCountLimit = "node_gpu_limit" @@ -46,18 +50,16 @@ var expectedDimsToMetrics = map[string][]string{ "ClusterName": { containerMemTotal, containerMemUsed, containerPower, containerTemp, containerUtil, containerMemUtil, podMemTotal, podMemUsed, podPower, podTemp, podUtil, podMemUtil, - nodeMemTotal, nodeMemUsed, nodePower, nodeTemp, nodeUtil, nodeMemUtil, - //nodeCountTotal, nodeCountRequest, nodeCountLimit, - //clusterCountTotal, clusterCountRequest, + nodeMemTotal, nodeMemUsed, nodePower, nodeTemp, nodeUtil, nodeMemUtil, podLimit, podTotal, podRequest, nodeCountTotal, nodeCountRequest, nodeCountLimit, clusterCountTotal, clusterCountRequest, }, "ClusterName-Namespace": { - podMemTotal, podMemUsed, podPower, podTemp, podUtil, podMemUtil, + podMemTotal, podMemUsed, podPower, podTemp, podUtil, podMemUtil, podLimit, podTotal, podRequest, }, //"ClusterName-Namespace-Service": { // podMemTotal, podMemUsed, podPower, podTemp, podUtil, podMemUtil, //}, "ClusterName-Namespace-PodName": { - podMemTotal, podMemUsed, podPower, podTemp, podUtil, podMemUtil, + podMemTotal, podMemUsed, podPower, podTemp, podUtil, podMemUtil, podLimit, podTotal, podRequest, }, "ClusterName-ContainerName-Namespace-PodName": { containerMemTotal, containerMemUsed, containerPower, containerTemp, containerUtil, containerMemUtil, @@ -69,7 +71,7 @@ var expectedDimsToMetrics = map[string][]string{ containerMemTotal, containerMemUsed, containerPower, containerTemp, containerUtil, containerMemUtil, }, "ClusterName-FullPodName-Namespace-PodName": { - podMemTotal, podMemUsed, podPower, podTemp, podUtil, podMemUtil, + podMemTotal, podMemUsed, podPower, podTemp, podUtil, podMemUtil, podLimit, podTotal, podRequest, }, "ClusterName-FullPodName-GpuDevice-Namespace-PodName": { podMemTotal, podMemUsed, podPower, podTemp, podUtil, podMemUtil, diff --git a/test/metric/container_insights_util.go b/test/metric/container_insights_util.go index a3ddff03d..6c474339d 100644 --- a/test/metric/container_insights_util.go +++ b/test/metric/container_insights_util.go @@ -57,6 +57,7 @@ func ValidateMetrics(env *environment.MetaData, metricFilter string, expectedDim continue } results = append(results, validateMetricsAvailability(dims, metrics, actual)) + for _, m := range metrics { // this is to prevent panic with rand.Intn when metrics are not yet ready in a cluster if _, ok := actual[m]; !ok { diff --git a/test/metric_value_benchmark/eks_resources/test_schemas/pod_gpu.json b/test/metric_value_benchmark/eks_resources/test_schemas/pod_gpu.json index 4b532094f..9e3124e3b 100644 --- a/test/metric_value_benchmark/eks_resources/test_schemas/pod_gpu.json +++ b/test/metric_value_benchmark/eks_resources/test_schemas/pod_gpu.json @@ -34,7 +34,6 @@ "required": [ "ClusterName", "FullPodName", - "GpuDevice", "InstanceId", "Namespace", "NodeName", From 29288f8b0f4b2a95c7dd2078788ee77e3044e221 Mon Sep 17 00:00:00 2001 From: Paramadon Date: Mon, 10 Jun 2024 20:26:57 -0400 Subject: [PATCH 50/60] cleaning up unused vars in terraform (test works) --- terraform/eks/addon/gpu/variables.tf | 24 ------------------------ test/metric/container_insights_util.go | 1 - 2 files changed, 25 deletions(-) diff --git a/terraform/eks/addon/gpu/variables.tf b/terraform/eks/addon/gpu/variables.tf index b143fbbcf..23ddb275e 100644 --- a/terraform/eks/addon/gpu/variables.tf +++ b/terraform/eks/addon/gpu/variables.tf @@ -25,30 +25,6 @@ variable "k8s_version" { type = string default = "1.29" } -variable "cwagent_image_repo" { - type = string - default = "public.ecr.aws/cloudwatch-agent/cloudwatch-agent" -} -variable "aws_ecr_private_registry" { - description = "The AWS ECR private registry" - type = string -} - -variable "ecr_integration_test_repo" { - description = "The ECR integration test repository" - type = string -} - -variable "github_sha" { - description = "The GitHub SHA" - type = string -} - -variable "cwagent_image_tag" { - type = string - default = "latest" -} - variable "ami_type" { type = string diff --git a/test/metric/container_insights_util.go b/test/metric/container_insights_util.go index 6c474339d..a3ddff03d 100644 --- a/test/metric/container_insights_util.go +++ b/test/metric/container_insights_util.go @@ -57,7 +57,6 @@ func ValidateMetrics(env *environment.MetaData, metricFilter string, expectedDim continue } results = append(results, validateMetricsAvailability(dims, metrics, actual)) - for _, m := range metrics { // this is to prevent panic with rand.Intn when metrics are not yet ready in a cluster if _, ok := actual[m]; !ok { From dd4ee5533dcc6cfa77f3d3e3a8595b0ceb88f1fc Mon Sep 17 00:00:00 2001 From: Paramadon Date: Tue, 11 Jun 2024 11:11:25 -0400 Subject: [PATCH 51/60] adding test_dir to fix yml --- generator/resources/eks_addon_test_matrix.json | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/generator/resources/eks_addon_test_matrix.json b/generator/resources/eks_addon_test_matrix.json index 76b283a75..6a84253ea 100644 --- a/generator/resources/eks_addon_test_matrix.json +++ b/generator/resources/eks_addon_test_matrix.json @@ -4,6 +4,7 @@ "addon_name":"amazon-cloudwatch-observability", "addon_version":"v1.6.0-eksbuild.1", "ami_type": "AL2_x86_64_GPU", - "terraform_dir": "terraform/eks/addon/gpu" + "terraform_dir": "terraform/eks/addon/gpu", + "test_dir": "../../../../test/gpu" } ] \ No newline at end of file From e76ae006cbc49cd6874d5c3c8806c2680b4f13b8 Mon Sep 17 00:00:00 2001 From: Paramadon Date: Wed, 12 Jun 2024 11:44:17 -0400 Subject: [PATCH 52/60] adding sleep and log statements --- test/gpu/nvidia_test.go | 1 + test/metric/container_insights_util.go | 6 ++++++ 2 files changed, 7 insertions(+) diff --git a/test/gpu/nvidia_test.go b/test/gpu/nvidia_test.go index f51e73b64..82da0941e 100644 --- a/test/gpu/nvidia_test.go +++ b/test/gpu/nvidia_test.go @@ -94,6 +94,7 @@ type NvidiaTestRunner struct { var _ test_runner.ITestRunner = (*NvidiaTestRunner)(nil) func (t *NvidiaTestRunner) Validate() status.TestGroupResult { + time.Sleep(120 * time.Second) var testResults []status.TestResult testResults = append(testResults, metric.ValidateMetrics(t.env, gpuMetricIndicator, expectedDimsToMetrics)...) testResults = append(testResults, metric.ValidateLogs(t.env)) diff --git a/test/metric/container_insights_util.go b/test/metric/container_insights_util.go index a3ddff03d..00d1a1584 100644 --- a/test/metric/container_insights_util.go +++ b/test/metric/container_insights_util.go @@ -85,6 +85,7 @@ func getMetricsInClusterDimension(env *environment.MetaData, metricFilter string }, } metrics, err := listFetcher.Fetch(ContainerInsightsNamespace, "", dims) + fmt.Println(dims) if err != nil { log.Println("failed to fetch metric list", err) return nil @@ -97,6 +98,7 @@ func getMetricsInClusterDimension(env *environment.MetaData, metricFilter string var results []dimToMetrics for _, m := range metrics { // filter by metric name filter + fmt.Println("This is the metric name and filter: ", *m.MetricName, metricFilter) if metricFilter != "" && !strings.Contains(*m.MetricName, metricFilter) { continue } @@ -132,6 +134,10 @@ func validateMetricsAvailability(dims string, expected []string, actual map[stri Name: dims, Status: status.FAILED, } + fmt.Println("Dims underneath") + fmt.Println(dims) + fmt.Println("This is the expected metrics: ", expected) + fmt.Println("This is the actual metrics: ", actual) log.Printf("expected metrics: %d, actual metrics: %d", len(expected), len(actual)) if compareMetrics(expected, actual) { testResult.Status = status.SUCCESSFUL From 5ad174fac8583eb496d241c09f8d251be6b20165 Mon Sep 17 00:00:00 2001 From: Paramadon Date: Wed, 12 Jun 2024 13:14:26 -0400 Subject: [PATCH 53/60] adding go test retry instead of retrying all of terraform --- test/gpu/nvidia_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/gpu/nvidia_test.go b/test/gpu/nvidia_test.go index 82da0941e..17e374140 100644 --- a/test/gpu/nvidia_test.go +++ b/test/gpu/nvidia_test.go @@ -94,7 +94,7 @@ type NvidiaTestRunner struct { var _ test_runner.ITestRunner = (*NvidiaTestRunner)(nil) func (t *NvidiaTestRunner) Validate() status.TestGroupResult { - time.Sleep(120 * time.Second) + time.Sleep(180 * time.Second) var testResults []status.TestResult testResults = append(testResults, metric.ValidateMetrics(t.env, gpuMetricIndicator, expectedDimsToMetrics)...) testResults = append(testResults, metric.ValidateLogs(t.env)) From 74844bdddb9f989527d160b3d927beb8c53df13e Mon Sep 17 00:00:00 2001 From: Paramadon Date: Wed, 12 Jun 2024 17:52:31 -0400 Subject: [PATCH 54/60] removing sleep --- test/gpu/nvidia_test.go | 1 - 1 file changed, 1 deletion(-) diff --git a/test/gpu/nvidia_test.go b/test/gpu/nvidia_test.go index 17e374140..f51e73b64 100644 --- a/test/gpu/nvidia_test.go +++ b/test/gpu/nvidia_test.go @@ -94,7 +94,6 @@ type NvidiaTestRunner struct { var _ test_runner.ITestRunner = (*NvidiaTestRunner)(nil) func (t *NvidiaTestRunner) Validate() status.TestGroupResult { - time.Sleep(180 * time.Second) var testResults []status.TestResult testResults = append(testResults, metric.ValidateMetrics(t.env, gpuMetricIndicator, expectedDimsToMetrics)...) testResults = append(testResults, metric.ValidateLogs(t.env)) From 63de98d78ba03c53c18bb83b283683a09c53f37a Mon Sep 17 00:00:00 2001 From: Paramadon Date: Wed, 12 Jun 2024 19:48:54 -0400 Subject: [PATCH 55/60] removing log lines --- test/metric/container_insights_util.go | 7 ------- 1 file changed, 7 deletions(-) diff --git a/test/metric/container_insights_util.go b/test/metric/container_insights_util.go index 00d1a1584..ba4ed33bc 100644 --- a/test/metric/container_insights_util.go +++ b/test/metric/container_insights_util.go @@ -85,7 +85,6 @@ func getMetricsInClusterDimension(env *environment.MetaData, metricFilter string }, } metrics, err := listFetcher.Fetch(ContainerInsightsNamespace, "", dims) - fmt.Println(dims) if err != nil { log.Println("failed to fetch metric list", err) return nil @@ -98,7 +97,6 @@ func getMetricsInClusterDimension(env *environment.MetaData, metricFilter string var results []dimToMetrics for _, m := range metrics { // filter by metric name filter - fmt.Println("This is the metric name and filter: ", *m.MetricName, metricFilter) if metricFilter != "" && !strings.Contains(*m.MetricName, metricFilter) { continue } @@ -134,11 +132,6 @@ func validateMetricsAvailability(dims string, expected []string, actual map[stri Name: dims, Status: status.FAILED, } - fmt.Println("Dims underneath") - fmt.Println(dims) - fmt.Println("This is the expected metrics: ", expected) - fmt.Println("This is the actual metrics: ", actual) - log.Printf("expected metrics: %d, actual metrics: %d", len(expected), len(actual)) if compareMetrics(expected, actual) { testResult.Status = status.SUCCESSFUL } else { From 82c2b9e1bd1f488fa617033dcde3dc6bb8c6dc41 Mon Sep 17 00:00:00 2001 From: Paramadon Date: Wed, 12 Jun 2024 23:47:36 -0400 Subject: [PATCH 56/60] making sure to seperate integ test metrics with e2e metrics (prev commit works) --- test/gpu/nvidia_test.go | 50 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 49 insertions(+), 1 deletion(-) diff --git a/test/gpu/nvidia_test.go b/test/gpu/nvidia_test.go index f51e73b64..f6f08df38 100644 --- a/test/gpu/nvidia_test.go +++ b/test/gpu/nvidia_test.go @@ -6,6 +6,7 @@ package emf import ( + "flag" "time" "github.com/aws/amazon-cloudwatch-agent-test/environment" @@ -14,6 +15,8 @@ import ( "github.com/aws/amazon-cloudwatch-agent-test/test/test_runner" ) +var useE2EMetrics = flag.Bool("useE2EMetrics", false, "Use E2E metrics mapping which uses latest build CWA") + const ( gpuMetricIndicator = "_gpu_" @@ -46,7 +49,48 @@ const ( clusterCountRequest = "cluster_gpu_request" ) -var expectedDimsToMetrics = map[string][]string{ +var expectedDimsToMetricsIntegTest = map[string][]string{ + "ClusterName": { + containerMemTotal, containerMemUsed, containerPower, containerTemp, containerUtil, containerMemUtil, + podMemTotal, podMemUsed, podPower, podTemp, podUtil, podMemUtil, + nodeMemTotal, nodeMemUsed, nodePower, nodeTemp, nodeUtil, nodeMemUtil, + //nodeCountTotal, nodeCountRequest, nodeCountLimit, + //clusterCountTotal, clusterCountRequest, + }, + "ClusterName-Namespace": { + podMemTotal, podMemUsed, podPower, podTemp, podUtil, podMemUtil, + }, + //"ClusterName-Namespace-Service": { + // podMemTotal, podMemUsed, podPower, podTemp, podUtil, podMemUtil, + //}, + "ClusterName-Namespace-PodName": { + podMemTotal, podMemUsed, podPower, podTemp, podUtil, podMemUtil, + }, + "ClusterName-ContainerName-Namespace-PodName": { + containerMemTotal, containerMemUsed, containerPower, containerTemp, containerUtil, containerMemUtil, + }, + "ClusterName-ContainerName-FullPodName-Namespace-PodName": { + containerMemTotal, containerMemUsed, containerPower, containerTemp, containerUtil, containerMemUtil, + }, + "ClusterName-ContainerName-FullPodName-GpuDevice-Namespace-PodName": { + containerMemTotal, containerMemUsed, containerPower, containerTemp, containerUtil, containerMemUtil, + }, + "ClusterName-FullPodName-Namespace-PodName": { + podMemTotal, podMemUsed, podPower, podTemp, podUtil, podMemUtil, + }, + "ClusterName-FullPodName-GpuDevice-Namespace-PodName": { + podMemTotal, podMemUsed, podPower, podTemp, podUtil, podMemUtil, + }, + "ClusterName-InstanceId-NodeName": { + nodeMemTotal, nodeMemUsed, nodePower, nodeTemp, nodeUtil, nodeMemUtil, + //nodeCountTotal, nodeCountRequest, nodeCountLimit, + }, + "ClusterName-GpuDevice-InstanceId-InstanceType-NodeName": { + nodeMemTotal, nodeMemUsed, nodePower, nodeTemp, nodeUtil, nodeMemUtil, + }, +} + +var expectedDimsToMetricsE2E = map[string][]string{ "ClusterName": { containerMemTotal, containerMemUsed, containerPower, containerTemp, containerUtil, containerMemUtil, podMemTotal, podMemUsed, podPower, podTemp, podUtil, podMemUtil, @@ -95,6 +139,10 @@ var _ test_runner.ITestRunner = (*NvidiaTestRunner)(nil) func (t *NvidiaTestRunner) Validate() status.TestGroupResult { var testResults []status.TestResult + expectedDimsToMetrics := expectedDimsToMetricsIntegTest + if *useE2EMetrics { + expectedDimsToMetrics = expectedDimsToMetricsE2E + } testResults = append(testResults, metric.ValidateMetrics(t.env, gpuMetricIndicator, expectedDimsToMetrics)...) testResults = append(testResults, metric.ValidateLogs(t.env)) return status.TestGroupResult{ From d4440eb078a9a4997a47aefffa326f9dd8a32a17 Mon Sep 17 00:00:00 2001 From: Paramadon Date: Thu, 13 Jun 2024 09:23:47 -0400 Subject: [PATCH 57/60] adding retries to integ test to help it pass --- terraform/eks/daemon/gpu/main.tf | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/terraform/eks/daemon/gpu/main.tf b/terraform/eks/daemon/gpu/main.tf index 658b78269..f81dfed79 100644 --- a/terraform/eks/daemon/gpu/main.tf +++ b/terraform/eks/daemon/gpu/main.tf @@ -701,7 +701,6 @@ resource "kubernetes_cluster_role_binding" "rolebinding" { namespace = "amazon-cloudwatch" } } - resource "null_resource" "validator" { depends_on = [ aws_eks_node_group.this, @@ -709,11 +708,19 @@ resource "null_resource" "validator" { kubernetes_cluster_role_binding.rolebinding, kubernetes_service_account.cwagentservice, ] + provisioner "local-exec" { command = <<-EOT echo "Validating EKS metrics/logs for EMF" cd ../../../.. - go test ${var.test_dir} -eksClusterName=${aws_eks_cluster.this.name} -computeType=EKS -v -eksDeploymentStrategy=DAEMON -eksGpuType=nvidia + for i in {1..10}; do + go test ${var.test_dir} -eksClusterName=${aws_eks_cluster.this.name} -computeType=EKS -v -eksDeploymentStrategy=DAEMON -eksGpuType=nvidia && break || echo "Attempt $i failed, retrying..." + if [ $i -eq 10 ]; then + echo "Validation failed after 10 attempts" + exit 1 + fi + sleep 30 + done EOT } } From 13ac63d00112d44d238f556fcfbf6ea02e48525b Mon Sep 17 00:00:00 2001 From: Paramadon Date: Thu, 13 Jun 2024 09:55:57 -0400 Subject: [PATCH 58/60] fixing test --- terraform/eks/daemon/gpu/main.tf | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/terraform/eks/daemon/gpu/main.tf b/terraform/eks/daemon/gpu/main.tf index f81dfed79..34577803b 100644 --- a/terraform/eks/daemon/gpu/main.tf +++ b/terraform/eks/daemon/gpu/main.tf @@ -711,16 +711,14 @@ resource "null_resource" "validator" { provisioner "local-exec" { command = <<-EOT - echo "Validating EKS metrics/logs for EMF" cd ../../../.. - for i in {1..10}; do - go test ${var.test_dir} -eksClusterName=${aws_eks_cluster.this.name} -computeType=EKS -v -eksDeploymentStrategy=DAEMON -eksGpuType=nvidia && break || echo "Attempt $i failed, retrying..." - if [ $i -eq 10 ]; then - echo "Validation failed after 10 attempts" - exit 1 - fi - sleep 30 + i=0 + while [ $i -lt 10 ]; do + i=$((i+1)) + go test ${var.test_dir} -eksClusterName=${aws_eks_cluster.this.name} -computeType=EKS -v -eksDeploymentStrategy=DAEMON -eksGpuType=nvidia && exit 0 + sleep 10 done + exit 1 EOT } -} +} \ No newline at end of file From cbbab979cc8d93ed76476259d9eace3a736c8eb1 Mon Sep 17 00:00:00 2001 From: Paramadon Date: Thu, 13 Jun 2024 09:57:09 -0400 Subject: [PATCH 59/60] fixing test --- terraform/eks/daemon/gpu/main.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/terraform/eks/daemon/gpu/main.tf b/terraform/eks/daemon/gpu/main.tf index 34577803b..1019682fd 100644 --- a/terraform/eks/daemon/gpu/main.tf +++ b/terraform/eks/daemon/gpu/main.tf @@ -716,7 +716,7 @@ resource "null_resource" "validator" { while [ $i -lt 10 ]; do i=$((i+1)) go test ${var.test_dir} -eksClusterName=${aws_eks_cluster.this.name} -computeType=EKS -v -eksDeploymentStrategy=DAEMON -eksGpuType=nvidia && exit 0 - sleep 10 + sleep 30 done exit 1 EOT From 234bc29848ee437b85f7d72ac61984750b60224d Mon Sep 17 00:00:00 2001 From: Paramadon Date: Thu, 13 Jun 2024 10:42:03 -0400 Subject: [PATCH 60/60] test works --- terraform/eks/daemon/gpu/main.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/terraform/eks/daemon/gpu/main.tf b/terraform/eks/daemon/gpu/main.tf index 1019682fd..baec68266 100644 --- a/terraform/eks/daemon/gpu/main.tf +++ b/terraform/eks/daemon/gpu/main.tf @@ -716,7 +716,7 @@ resource "null_resource" "validator" { while [ $i -lt 10 ]; do i=$((i+1)) go test ${var.test_dir} -eksClusterName=${aws_eks_cluster.this.name} -computeType=EKS -v -eksDeploymentStrategy=DAEMON -eksGpuType=nvidia && exit 0 - sleep 30 + sleep 60 done exit 1 EOT