Skip to content

Latest commit

 

History

History
554 lines (482 loc) · 22.6 KB

README.md

File metadata and controls

554 lines (482 loc) · 22.6 KB

Terraform module for connecting an AWS EKS cluster to CAST AI

Website: https://www.cast.ai

Requirements

Using the module

A module to connect an EKS cluster to CAST AI.

Requires castai/castai and hashicorp/aws providers to be configured.

module "castai-eks-cluster" {
  source = "castai/eks-cluster/castai"

  aws_account_id     = var.aws_account_id
  aws_cluster_region = var.cluster_region
  aws_cluster_name   = var.cluster_id

  aws_assume_role_arn      = module.castai-eks-role-iam.role_arn
  autoscaler_policies_json = var.autoscaler_policies_json

  // Default node configuration will be used for all CAST provisioned nodes unless specific configuration is requested.
  default_node_configuration = module.castai-eks-cluster.castai_node_configurations["default"]

  node_configurations = {
    default = {
      subnets                   = module.vpc.private_subnets
      dns_cluster_ip            = "10.100.0.10"
      instance_profile_role_arn = var.instance_profile_arn
      ssh_public_key            = var.ssh_public_key
      security_groups           = [
        module.eks.node_security_group_id,
      ]
      tags = {
        "team" : "core"
      }
      init_script   = base64encode(var.init_script)
      docker_config = jsonencode({
        "insecure-registries"      = ["registry.com:5000"],
        "max-concurrent-downloads" = 10
      })
      kubelet_config = jsonencode({
        "registryBurst" : 20,
        "registryPullQPS" : 10
      })
      container_runtime = "dockerd"
    }
  }

  node_templates = {
    spot_tmpl = {
      configuration_id = module.castai-eks-cluster.castai_node_configurations["default"]

      should_taint = true

      custom_labels = {
        custom-label-key-1 = "custom-label-value-1"
        custom-label-key-2 = "custom-label-value-2"
      }

      custom_taints = [
        {
          key   = "custom-taint-key-1"
          value = "custom-taint-value-1"
        },
        {
          key   = "custom-taint-key-2"
          value = "custom-taint-value-2"
        }
      ]

      constraints = {
        fallback_restore_rate_seconds = 1800
        spot                          = true
        use_spot_fallbacks            = true
        min_cpu                       = 4
        max_cpu                       = 100
        instance_families             = {
          exclude = ["m5"]
        }
        compute_optimized_state = "disabled"
        storage_optimized_state = "disabled"
        is_gpu_only              = false
        architectures            = ["amd64"]
      }
    }
  }

  autoscaler_settings = {
    enabled                                 = true
    node_templates_partial_matching_enabled = false

    unschedulable_pods = {
      enabled = true

      headroom = {
        enabled           = true
        cpu_percentage    = 10
        memory_percentage = 10
      }

      headroom_spot = {
        enabled           = true
        cpu_percentage    = 10
        memory_percentage = 10
      }
    }

    node_downscaler = {
      enabled = true

      empty_nodes = {
        enabled = true
      }

      evictor = {
        aggressive_mode           = false
        cycle_interval            = "5m10s"
        dry_run                   = false
        enabled                   = true
        node_grace_period_minutes = 10
        scoped_mode               = false
      }
    }

    cluster_limits = {
      enabled = true

      cpu = {
        max_cores = 20
        min_cores = 1
      }
    }
  }
}

Migrating from 2.x.x to 3.x.x

Existing configuration:

module "castai-eks-cluster" {
  // ...
  
  subnets                   = module.vpc.private_subnets
  dns_cluster_ip            = "10.100.0.10"
  instance_profile_role_arn = var.instance_profile_arn
  ssh_public_key            = var.ssh_public_key
  override_security_groups  = [
    module.eks.node_security_group_id,
  ]
  tags = {
    "team" : "core"
  }
}

New configuration:

module "castai-eks-cluster" {
  // ...
  
  // Default node configuration will be used for all CAST provisioned nodes unless specific configuration is requested.
  default_node_configuration = module.castai-eks-cluster.castai_node_configurations["default"]

  node_configurations = {
    default = {
      subnets                   = module.vpc.private_subnets
      dns_cluster_ip            = "10.100.0.10"
      instance_profile_role_arn = var.instance_profile_arn
      ssh_public_key            = var.ssh_public_key
      security_groups           = [
        module.eks.node_security_group_id,
      ]
      tags = {
        "team" : "core"
      }
    }
  }
}

Migrating from 5.x.x to 6.x.x

Existing configuration:

module "castai-eks-cluster" {
  // ...

  node_templates = {
    // ...
  }
  autoscaler_policies_json = <<-EOT
    {
        "enabled": true,
        "unschedulablePods": {
            "enabled": true
        },
        "spotInstances": {
            "enabled": true,
            "clouds": ["aws"],
            "spotBackups": {
                "enabled": true
            },
            "spotDiversityEnabled": false,
            "spotDiversityPriceIncreaseLimitPercent": 20,
            "spotInterruptionPredictions": {
              "enabled": true,
              "type": "AWSRebalanceRecommendations"
            }
        },
        "nodeDownscaler": {
            "enabled": true,
            "emptyNodes": {
                "enabled": true
            },
            "evictor": {
                "aggressiveMode": true,
                "cycleInterval": "5m10s",
                "dryRun": false,
                "enabled": true,
                "nodeGracePeriodMinutes": 10,
                "scopedMode": false
            }
        }
    }
  EOT
}

New configuration:

module "castai-eks-cluster" {
  // ...

  node_templates = {
    default_by_castai = {
      name = "default-by-castai"
      configuration_id = module.castai-eks-cluster.castai_node_configurations["default"]
      is_default   = true
      should_taint = false

      constraints = {
        on_demand          = true
        spot               = true
        use_spot_fallbacks = true

        enable_spot_diversity                       = false
        spot_diversity_price_increase_limit_percent = 20

        spot_interruption_predictions_enabled = true
        spot_interruption_predictions_type = "aws-rebalance-recommendations"
      }
    }
  }
  autoscaler_policies_json = <<-EOT
    {
        "enabled": true,
        "unschedulablePods": {
            "enabled": true
        },
        "nodeDownscaler": {
            "enabled": true,
            "emptyNodes": {
                "enabled": true
            },
            "evictor": {
                "aggressiveMode": true,
                "cycleInterval": "5m10s",
                "dryRun": false,
                "enabled": true,
                "nodeGracePeriodMinutes": 10,
                "scopedMode": false
            }
        }
    }
  EOT
}

Migrating from 6.x.x to 7.x.x

Version 7.x.x changes:

  • Removed custom_label attribute in castai_node_template resource. Use custom_labels instead.

Old configuration:

module "castai-eks-cluster" {
  // ...

  node_templates = {
    spot_tmpl = {
      custom_label = {
        key = "custom-label-key-1"
        value = "custom-label-value-1"
      }
    }
  }
}

New configuration:

module "castai-eks-cluster" {
  // ...

  node_templates = {
    spot_tmpl = {
      custom_labels = {
        custom-label-key-1 = "custom-label-value-1"
      }
    }
  }
}

Migrating from 7.x.x to 8.x.x

Version 8.x.x changes:

  • Removed compute_optimized and storage_optimized attributes in castai_node_template resource, constraints object. Use compute_optimized_state and storage_optimized_state instead.

Old configuration:

module "castai-eks-cluster" {
  node_templates = {
    spot_tmpl = {
      constraints = {
        compute_optimized = false
        storage_optimized = true
      }
    }
  }
}

New configuration:

module "castai-eks-cluster" {
  node_templates = {
    spot_tmpl = {
      constraints = {
        compute_optimized_state = "disabled"
        storage_optimized_state = "enabled"
      }
    }
  }
}

Migrating from 9.x.x to 9.3.x

Version 9.3.x changes:

  • Deprecated autoscaler_policies_json attribute. Use autoscaler_settings instead.

Old configuration:

module "castai-eks-cluster" {
  autoscaler_policies_json = <<-EOT
    {
        "enabled": true,
        "unschedulablePods": {
            "enabled": true
        },
        "nodeDownscaler": {
            "enabled": true,
            "emptyNodes": {
                "enabled": true
            },
            "evictor": {
                "aggressiveMode": false,
                "cycleInterval": "5m10s",
                "dryRun": false,
                "enabled": true,
                "nodeGracePeriodMinutes": 10,
                "scopedMode": false
            }
        },
        "nodeTemplatesPartialMatchingEnabled": false,
        "clusterLimits": {
            "cpu": {
                "maxCores": 20,
                "minCores": 1
            },
            "enabled": true
        }
    }
  EOT
}

New configuration:

module "castai-eks-cluster" {
  autoscaler_settings = {
    enabled                                 = true
    node_templates_partial_matching_enabled = false

    unschedulable_pods = {
      enabled = true
    }

    node_downscaler = {
      enabled = true

      empty_nodes = {
        enabled = true
      }

      evictor = {
        aggressive_mode           = false
        cycle_interval            = "5m10s"
        dry_run                   = false
        enabled                   = true
        node_grace_period_minutes = 10
        scoped_mode               = false
      }
    }

    cluster_limits = {
      enabled = true

      cpu = {
        max_cores = 20
        min_cores = 1
      }
    }
  }
}

Examples

Usage examples are located in the CAST AI Terraform provider repository.

Generate docs

terraform-docs markdown table . --output-file README.md

Requirements

Name Version
terraform >= 0.13
aws >= 2.49
castai ~> 7.4.0
helm >= 2.0.0

Providers

Name Version
castai 7.4.0
helm 2.13.2
null 3.2.2

Modules

No modules.

Resources

Name Type
castai_autoscaler.castai_autoscaler_policies resource
castai_eks_cluster.my_castai_cluster resource
castai_node_configuration.this resource
castai_node_configuration_default.this resource
castai_node_template.this resource
helm_release.castai_agent resource
helm_release.castai_cluster_controller resource
helm_release.castai_cluster_controller_self_managed resource
helm_release.castai_egressd resource
helm_release.castai_egressd_self_managed resource
helm_release.castai_evictor resource
helm_release.castai_evictor_ext resource
helm_release.castai_evictor_self_managed resource
helm_release.castai_kvisor resource
helm_release.castai_kvisor_self_managed resource
helm_release.castai_pod_pinner resource
helm_release.castai_pod_pinner_self_managed resource
helm_release.castai_spot_handler resource
helm_release.castai_workload_autoscaler resource
helm_release.castai_workload_autoscaler_self_managed resource
null_resource.wait_for_cluster resource

Inputs

Name Description Type Default Required
agent_aws_access_key_id AWS access key for CAST AI agent to fetch instance details. string "" no
agent_aws_iam_service_account_role_arn ARN of the role to be used by the CAST AI agent to fetch instance details. Only the read-only AmazonEC2ReadOnlyAccess policy is needed. string "" no
agent_aws_secret_access_key AWS access key secret for CAST AI agent to fetch instance details. string "" no
agent_values List of YAML formatted string with agent values list(string) [] no
agent_version Version of castai-agent helm chart. Default latest string null no
api_grpc_addr CAST AI GRPC API address string "api-grpc.cast.ai:443" no
api_url URL of alternative CAST AI API to be used during development or testing string "https://api.cast.ai" no
autoscaler_policies_json Optional json object to override CAST AI cluster autoscaler policies. Deprecated, use autoscaler_settings instead. string null no
autoscaler_policy_overrides Optional Autoscaler policy definitions to override current autoscaler settings any null no
aws_account_id ID of AWS account the cluster is located in. string n/a yes
aws_assume_role_arn Arn of the role to be used by CAST AI for IAM access string null no
aws_cluster_name Name of the cluster to be connected to CAST AI. string n/a yes
aws_cluster_region Region of the cluster to be connected to CAST AI. string n/a yes
castai_api_token Optional CAST AI API token created in console.cast.ai API Access keys section. Used only when wait_for_cluster_ready is set to true string "" no
castai_components_labels Optional additional Kubernetes labels for CAST AI pods map(any) {} no
cluster_controller_values List of YAML formatted string with cluster-controller values list(string) [] no
cluster_controller_version Version of castai-cluster-controller helm chart. Default latest string null no
default_node_configuration ID of the default node configuration string n/a yes
delete_nodes_on_disconnect Optionally delete CAST AI created nodes when the cluster is destroyed bool false no
egressd_values List of YAML formatted string with egressd values list(string) [] no
egressd_version Version of castai-egressd helm chart. Default latest string null no
evictor_ext_values List of YAML formatted string with evictor-ext values list(string) [] no
evictor_ext_version Version of castai-evictor-ext chart. Default latest string null no
evictor_values List of YAML formatted string with evictor values list(string) [] no
evictor_version Version of castai-evictor chart. Default latest string null no
grpc_url gRPC endpoint used by pod-pinner string "grpc.cast.ai:443" no
install_egressd Optional flag for installation of Egressd (Network cost monitoring) (https://docs.cast.ai/docs/network-cost) bool false no
install_security_agent Optional flag for installation of security agent (https://docs.cast.ai/product-overview/console/security-insights/) bool false no
install_workload_autoscaler Optional flag for installation of workload autoscaler (https://docs.cast.ai/docs/workload-autoscaling-configuration) bool false no
kvisor_values List of YAML formatted string with kvisor values list(string) [] no
kvisor_version Version of kvisor chart. Default latest string null no
kvisor_controller_extra_args Map of extra arguments for the kvisor controller map(string) { kube-linter-enabled = true, image-scan-enabled = true, kube-bench-enabled = true, kube-bench-cloud-provider = eks } no
node_configurations Map of EKS node configurations to create any {} no
node_templates Map of node templates to create any {} no
pod_pinner_version Version of pod-pinner helm chart. Default latest string null no
self_managed Whether upgrades of CAST AI components are managed by the customer; by default, upgrades are managed by the CAST AI central system. bool false no
spot_handler_values List of YAML formatted string with spot-handler values list(string) [] no
spot_handler_version Version of castai-spot-handler helm chart. Default latest string null no
wait_for_cluster_ready Wait for cluster to be ready before finishing the module execution, this option requires castai_api_token to be set bool false no
workload_autoscaler_values List of YAML formatted string with cluster-workload-autoscaler values list(string) [] no
workload_autoscaler_version Version of castai-workload-autoscaler helm chart. Default latest string null no

Outputs

Name Description
castai_node_configurations Map of node configurations ids by name
castai_node_templates Map of node templates by name
cluster_id CAST AI cluster id, which can be used for accessing cluster data using API