diff --git a/debian/changelog b/debian/changelog index d6dc86d357..0630df3a6c 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,3 +1,50 @@ +paasta-tools (0.189.0) xenial; urgency=medium + + * 0.189.0 tagged with 'make release' + Commit: Revert "TRON-1636: Setup tron secret_volumes in + setup_tron_namespace" (#3652) + + -- Wilmer Bandres Wed, 19 Jul 2023 05:26:34 -0700 + +paasta-tools (0.188.0) xenial; urgency=medium + + * 0.188.0 tagged with 'make release' + Commit: Fixing script bug (#3651) * Fixing script bug + + -- Wilmer Bandres Wed, 19 Jul 2023 01:34:56 -0700 + +paasta-tools (0.187.0) xenial; urgency=medium + + * 0.187.0 tagged with 'make release' + Commit: Merge pull request #3615 from Yelp/u/vit/tron-1636-add- + secret-volume TRON-1636: Setup tron secret_volumes in + setup_tron_namespace + + -- Vincent Thibault Tue, 18 Jul 2023 14:12:30 -0700 + +paasta-tools (0.186.0) xenial; urgency=medium + + * 0.186.0 tagged with 'make release' + Commit: Adding extensions to update_crds function (#3649) * Adding + extensions to update_crds function * removing unused client * + Fixing internal update crd * fixing internal update crd + + -- Wilmer Bandres Mon, 17 Jul 2023 01:23:12 -0700 + +paasta-tools (0.185.0) xenial; urgency=medium + + * 0.185.0 tagged with 'make release' + Commit: Adding support for extensions v1 v1beta1 (#3648) * Adding + support for extensions v1 v1beta1 * Fixing tests * updating + setup_kubernetes_cr * Fixing call to super * fixing typo * adding + minimal req for tests * putting older version back * Rebasing * + Removing second client * Adding comments for clairification * + Update paasta_tools/setup_kubernetes_cr.py Co-authored-by: Luis + Pérez * Fixing some whitespace errors --------- + Co-authored-by: Luis Pérez + + -- Wilmer Bandres Fri, 14 Jul 2023 00:18:35 -0700 + paasta-tools (0.184.0) xenial; urgency=medium * 0.184.0 tagged with 'make release' diff --git a/docs/source/autoscaling.rst b/docs/source/autoscaling.rst index 088bab585c..c3f6865e5f 100644 --- a/docs/source/autoscaling.rst +++ b/docs/source/autoscaling.rst @@ -80,6 +80,10 @@ The currently available metrics providers are: the port that your uWSGI master process will respond to with stats. Defaults to 8889. +:gunicorn: + With the ``gunicorn`` metrics provider, Paasta will configure your pods to run an additional container with the `statsd_exporter `_ image. + This sidecar will listen on port 9117 and receive stats from the gunicorn service. The ``statsd_exporter`` will translate the stats into Prometheus format, which Prometheus will scrape. + Decision policies ^^^^^^^^^^^^^^^^^ diff --git a/docs/source/yelpsoa_configs.rst b/docs/source/yelpsoa_configs.rst index ed4b65af32..c1457dbbc1 100644 --- a/docs/source/yelpsoa_configs.rst +++ b/docs/source/yelpsoa_configs.rst @@ -387,7 +387,7 @@ instance MAY have: * ``autoscaling``: See the `autoscaling docs `_ for details * ``metrics_provider``: Which method the autoscaler will use to determine a service's utilization. - Should be ``cpu`` or ``uwsgi``. + Should be ``cpu``, ``uwsgi``, or ``gunicorn``. * ``decision_policy``: Which method the autoscaler will use to determine when to autoscale a service. Should be ``proportional`` or ``bespoke``. diff --git a/paasta_tools/__init__.py b/paasta_tools/__init__.py index c214113303..e7b99b31aa 100644 --- a/paasta_tools/__init__.py +++ b/paasta_tools/__init__.py @@ -17,4 +17,4 @@ # setup phase, the dependencies may not exist on disk yet. # # Don't bump version manually. See `make release` docs in ./Makefile -__version__ = "0.184.0" +__version__ = "0.189.0" diff --git a/paasta_tools/kubernetes_tools.py b/paasta_tools/kubernetes_tools.py index 0a7d609597..b0bb828a0a 100644 --- a/paasta_tools/kubernetes_tools.py +++ b/paasta_tools/kubernetes_tools.py @@ -47,6 +47,8 @@ from kubernetes.client import models from kubernetes.client import V1Affinity from kubernetes.client import V1AWSElasticBlockStoreVolumeSource +from kubernetes.client import V1beta1CustomResourceDefinition +from kubernetes.client import V1beta1CustomResourceDefinitionList from kubernetes.client import V1beta1PodDisruptionBudget from kubernetes.client import V1beta1PodDisruptionBudgetSpec from kubernetes.client import V1Capabilities @@ -64,12 +66,12 @@ from kubernetes.client import V1EnvVar from kubernetes.client import V1EnvVarSource from kubernetes.client import V1ExecAction +from kubernetes.client import V1Handler from kubernetes.client import V1HostPathVolumeSource from kubernetes.client import V1HTTPGetAction from kubernetes.client import V1KeyToPath from kubernetes.client import V1LabelSelector from kubernetes.client import V1Lifecycle -from kubernetes.client import V1LifecycleHandler from kubernetes.client import V1Namespace from kubernetes.client import V1Node from kubernetes.client import V1NodeAffinity @@ -172,7 +174,12 @@ } HACHECK_POD_NAME = "hacheck" UWSGI_EXPORTER_POD_NAME = "uwsgi--exporter" -SIDECAR_CONTAINER_NAMES = [HACHECK_POD_NAME, UWSGI_EXPORTER_POD_NAME] +GUNICORN_EXPORTER_POD_NAME = "gunicorn--exporter" +SIDECAR_CONTAINER_NAMES = [ + HACHECK_POD_NAME, + UWSGI_EXPORTER_POD_NAME, + GUNICORN_EXPORTER_POD_NAME, +] KUBERNETES_NAMESPACE = "paasta" PAASTA_WORKLOAD_OWNER = "compute_infra_platform_experience" MAX_EVENTS_TO_RETRIEVE = 200 @@ -320,6 +327,7 @@ def _set_disrupted_pods(self: Any, disrupted_pods: Mapping[str, datetime]) -> No "paasta.yelp.com/prometheus_shard": str, "paasta.yelp.com/scrape_uwsgi_prometheus": str, "paasta.yelp.com/scrape_piscina_prometheus": str, + "paasta.yelp.com/scrape_gunicorn_prometheus": str, "paasta.yelp.com/service": str, "paasta.yelp.com/autoscaled": str, "yelp.com/paasta_git_sha": str, @@ -517,6 +525,13 @@ def __init__( self.core = kube_client.CoreV1Api(self.api_client) self.policy = kube_client.PolicyV1beta1Api(self.api_client) self.apiextensions = kube_client.ApiextensionsV1Api(self.api_client) + + # We need to support apiextensions /v1 and /v1beta1 in order + # to make our upgrade to k8s 1.22 smooth, otherwise + # updating the CRDs make this script fail + self.apiextensions_v1_beta1 = kube_client.ApiextensionsV1beta1Api( + self.api_client + ) self.custom = kube_client.CustomObjectsApi(self.api_client) self.autoscaling = kube_client.AutoscalingV2beta2Api(self.api_client) self.rbac = kube_client.RbacAuthorizationV1Api(self.api_client) @@ -772,7 +787,7 @@ def get_autoscaling_metric_spec( ), ) ) - elif metrics_provider in {"uwsgi", "piscina"}: + elif metrics_provider in {"uwsgi", "piscina", "gunicorn"}: metrics.append( V2beta2MetricSpec( type="Object", @@ -951,12 +966,17 @@ def get_sidecar_containers( uwsgi_exporter_container = self.get_uwsgi_exporter_sidecar_container( system_paasta_config ) + gunicorn_exporter_container = self.get_gunicorn_exporter_sidecar_container( + system_paasta_config + ) sidecars = [] if hacheck_container: sidecars.append(hacheck_container) if uwsgi_exporter_container: sidecars.append(uwsgi_exporter_container) + if gunicorn_exporter_container: + sidecars.append(gunicorn_exporter_container) return sidecars def get_readiness_check_prefix( @@ -1016,7 +1036,7 @@ def get_hacheck_sidecar_container( return V1Container( image=system_paasta_config.get_hacheck_sidecar_image_url(), lifecycle=V1Lifecycle( - pre_stop=V1LifecycleHandler( + pre_stop=V1Handler( _exec=V1ExecAction( command=[ "/bin/sh", @@ -1064,7 +1084,7 @@ def get_uwsgi_exporter_sidecar_container( env=self.get_kubernetes_environment() + [stats_port_env], ports=[V1ContainerPort(container_port=9117)], lifecycle=V1Lifecycle( - pre_stop=V1LifecycleHandler( + pre_stop=V1Handler( _exec=V1ExecAction( command=[ "/bin/sh", @@ -1095,6 +1115,42 @@ def should_run_uwsgi_exporter_sidecar( return True return False + def get_gunicorn_exporter_sidecar_container( + self, + system_paasta_config: SystemPaastaConfig, + ) -> Optional[V1Container]: + + if self.should_run_gunicorn_exporter_sidecar(): + return V1Container( + image=system_paasta_config.get_gunicorn_exporter_sidecar_image_url(), + resources=self.get_sidecar_resource_requirements("gunicorn_exporter"), + name=GUNICORN_EXPORTER_POD_NAME, + env=self.get_kubernetes_environment(), + ports=[V1ContainerPort(container_port=9117)], + lifecycle=V1Lifecycle( + pre_stop=V1Handler( + _exec=V1ExecAction( + command=[ + "/bin/sh", + "-c", + # we sleep for the same amount of time as we do after an hadown to ensure that we have accurate + # metrics up until our Pod dies + f"sleep {DEFAULT_HADOWN_PRESTOP_SLEEP_SECONDS}", + ] + ) + ) + ), + ) + + return None + + def should_run_gunicorn_exporter_sidecar(self) -> bool: + if self.is_autoscaling_enabled(): + autoscaling_params = self.get_autoscaling_params() + if autoscaling_params["metrics_provider"] == "gunicorn": + return True + return False + def should_setup_piscina_prometheus_scraping( self, ) -> bool: @@ -1395,20 +1451,20 @@ def get_readiness_probe( else: return self.get_liveness_probe(service_namespace_config) - def get_kubernetes_container_termination_action(self) -> V1LifecycleHandler: + def get_kubernetes_container_termination_action(self) -> V1Handler: command = self.config_dict.get("lifecycle", KubeLifecycleDict({})).get( "pre_stop_command", [] ) # default pre stop hook for the container if not command: - return V1LifecycleHandler( + return V1Handler( _exec=V1ExecAction( command=["/bin/sh", "-c", f"sleep {DEFAULT_PRESTOP_SLEEP_SECONDS}"] ) ) if isinstance(command, str): command = [command] - return V1LifecycleHandler(_exec=V1ExecAction(command=command)) + return V1Handler(_exec=V1ExecAction(command=command)) def get_pod_volumes( self, @@ -1919,7 +1975,7 @@ def has_routable_ip( ) -> str: """Return whether the routable_ip label should be true or false. - Services with a `prometheus_port` defined or that use the uwsgi_exporter sidecar must have a routable IP + Services with a `prometheus_port` defined or that use certain sidecars must have a routable IP address to allow Prometheus shards to scrape metrics. """ if ( @@ -1927,6 +1983,7 @@ def has_routable_ip( or service_namespace_config.is_in_smartstack() or self.get_prometheus_port() is not None or self.should_run_uwsgi_exporter_sidecar(system_paasta_config) + or self.should_run_gunicorn_exporter_sidecar() ): return "true" return "false" @@ -2099,6 +2156,10 @@ def get_pod_template_spec( labels["paasta.yelp.com/deploy_group"] = self.get_deploy_group() labels["paasta.yelp.com/scrape_piscina_prometheus"] = "true" + elif self.should_run_gunicorn_exporter_sidecar(): + labels["paasta.yelp.com/deploy_group"] = self.get_deploy_group() + labels["paasta.yelp.com/scrape_gunicorn_prometheus"] = "true" + return V1PodTemplateSpec( metadata=V1ObjectMeta( labels=labels, @@ -3818,30 +3879,37 @@ def mode_to_int(mode: Optional[Union[str, int]]) -> Optional[int]: def update_crds( kube_client: KubeClient, - desired_crds: Collection[V1CustomResourceDefinition], - existing_crds: V1CustomResourceDefinitionList, + desired_crds: Collection[ + Union[V1CustomResourceDefinition, V1beta1CustomResourceDefinition] + ], + existing_crds: Union[ + V1CustomResourceDefinitionList, V1beta1CustomResourceDefinitionList + ], ) -> bool: - success = True for desired_crd in desired_crds: existing_crd = None for crd in existing_crds.items: if crd.metadata.name == desired_crd.metadata["name"]: existing_crd = crd break - try: + + if "apiextensions.k8s.io/v1beta1" == desired_crd.api_version: + apiextensions = kube_client.apiextensions_v1_beta1 + else: + apiextensions = kube_client.apiextensions + if existing_crd: desired_crd.metadata[ "resourceVersion" ] = existing_crd.metadata.resource_version - kube_client.apiextensions.replace_custom_resource_definition( + + apiextensions.replace_custom_resource_definition( name=desired_crd.metadata["name"], body=desired_crd ) else: try: - kube_client.apiextensions.create_custom_resource_definition( - body=desired_crd - ) + apiextensions.create_custom_resource_definition(body=desired_crd) except ValueError as err: # TODO: kubernetes server will sometimes reply with conditions:null, # figure out how to deal with this correctly, for more details: @@ -3857,9 +3925,9 @@ def update_crds( f"status: {exc.status}, reason: {exc.reason}" ) log.debug(exc.body) - success = False + return False - return success + return True def sanitise_label_value(value: str) -> str: diff --git a/paasta_tools/long_running_service_tools.py b/paasta_tools/long_running_service_tools.py index 7f987dd1a6..2322ee5a58 100644 --- a/paasta_tools/long_running_service_tools.py +++ b/paasta_tools/long_running_service_tools.py @@ -31,6 +31,7 @@ DEFAULT_AUTOSCALING_SETPOINT = 0.8 DEFAULT_UWSGI_AUTOSCALING_MOVING_AVERAGE_WINDOW = 1800 DEFAULT_PISCINA_AUTOSCALING_MOVING_AVERAGE_WINDOW = 1800 +DEFAULT_GUNICORN_AUTOSCALING_MOVING_AVERAGE_WINDOW = 1800 # we set a different default moving average window so that we can reuse our existing PromQL # without having to write a different query for existing users that want to autoscale on # instantaneous CPU diff --git a/paasta_tools/setup_kubernetes_cr.py b/paasta_tools/setup_kubernetes_cr.py index c63935db00..ac9037a746 100644 --- a/paasta_tools/setup_kubernetes_cr.py +++ b/paasta_tools/setup_kubernetes_cr.py @@ -152,44 +152,61 @@ def setup_all_custom_resources( service: str = None, instance: str = None, ) -> bool: - cluster_crds = { - crd.spec.names.kind - for crd in kube_client.apiextensions.list_custom_resource_definition( - label_selector=paasta_prefixed("service") - ).items - } - log.debug(f"CRDs found: {cluster_crds}") - results = [] - for crd in custom_resource_definitions: - if crd.kube_kind.singular not in cluster_crds: - # TODO: kube_kind.singular seems to correspond to `crd.names.kind` - # and not `crd.names.singular` - log.warning(f"CRD {crd.kube_kind.singular} " f"not found in {cluster}") - continue - # by convention, entries where key begins with _ are used as templates - # and will be filter out here - config_dicts = load_all_configs( - cluster=cluster, file_prefix=crd.file_prefix, soa_dir=soa_dir - ) + got_results = False + # We support two versions due to our upgrade to 1.22 + # this functions runs succefully when any of the two apiextensions + # succeed to update the CRDs as the cluster could be in any version + # we need to try both possibilities + for apiextension in [ + kube_client.apiextensions, + kube_client.apiextensions_v1_beta1, + ]: + cluster_crds = { + crd.spec.names.kind + for crd in apiextension.list_custom_resource_definition( + label_selector=paasta_prefixed("service") + ).items + } + log.debug(f"CRDs found: {cluster_crds}") + results = [] + for crd in custom_resource_definitions: + if crd.kube_kind.singular not in cluster_crds: + # TODO: kube_kind.singular seems to correspond to `crd.names.kind` + # and not `crd.names.singular` + log.warning(f"CRD {crd.kube_kind.singular} " f"not found in {cluster}") + continue + + # by convention, entries where key begins with _ are used as templates + # and will be filter out here + config_dicts = load_all_configs( + cluster=cluster, file_prefix=crd.file_prefix, soa_dir=soa_dir + ) - ensure_namespace( - kube_client=kube_client, namespace=f"paasta-{crd.kube_kind.plural}" - ) - results.append( - setup_custom_resources( - kube_client=kube_client, - kind=crd.kube_kind, - crd=crd, - config_dicts=config_dicts, - version=crd.version, - group=crd.group, - cluster=cluster, - service=service, - instance=instance, + ensure_namespace( + kube_client=kube_client, namespace=f"paasta-{crd.kube_kind.plural}" ) - ) - return any(results) if results else True + results.append( + setup_custom_resources( + kube_client=kube_client, + kind=crd.kube_kind, + crd=crd, + config_dicts=config_dicts, + version=crd.version, + group=crd.group, + cluster=cluster, + service=service, + instance=instance, + ) + ) + if results: + got_results = True + if any(results): + return True + # we want to return True if we never called `setup_custom_resources` + # (i.e., we noop'd) or if any call to `setup_custom_resources` + # succeed (handled above) - otherwise, we want to return False + return not got_results def setup_custom_resources( diff --git a/paasta_tools/setup_kubernetes_crd.py b/paasta_tools/setup_kubernetes_crd.py index 856a9d4136..eabc667c06 100644 --- a/paasta_tools/setup_kubernetes_crd.py +++ b/paasta_tools/setup_kubernetes_crd.py @@ -27,7 +27,9 @@ from typing import Sequence import service_configuration_lib +from kubernetes.client import V1beta1CustomResourceDefinition from kubernetes.client import V1CustomResourceDefinition +from kubernetes.client.exceptions import ApiException from paasta_tools.kubernetes_tools import KubeClient from paasta_tools.kubernetes_tools import paasta_prefixed @@ -104,7 +106,22 @@ def setup_kube_crd( label_selector=paasta_prefixed("service") ) + # This step can fail in k8s 1.22 since this version is not existing anymore + # we need to support this for the transition + try: + existing_crds_v1_beta1 = ( + kube_client.apiextensions_v1_beta1.list_custom_resource_definition( + label_selector=paasta_prefixed("service") + ) + ) + except ApiException: + existing_crds_v1_beta1 = [] + log.debug( + "Listing CRDs with apiextensions/v1beta1 not supported on this cluster, falling back to v1" + ) + desired_crds = [] + desired_crds_v1_beta1 = [] for service in services: crd_config = service_configuration_lib.read_extra_service_information( service, f"crd-{cluster}", soa_dir=soa_dir @@ -118,18 +135,32 @@ def setup_kube_crd( metadata["labels"] = {} metadata["labels"]["yelp.com/paasta_service"] = service metadata["labels"][paasta_prefixed("service")] = service - desired_crd = V1CustomResourceDefinition( - api_version=crd_config.get("apiVersion"), - kind=crd_config.get("kind"), - metadata=metadata, - spec=crd_config.get("spec"), - ) - desired_crds.append(desired_crd) + + if "apiextensions.k8s.io/v1beta1" == crd_config["apiVersion"]: + desired_crd = V1beta1CustomResourceDefinition( + api_version=crd_config.get("apiVersion"), + kind=crd_config.get("kind"), + metadata=metadata, + spec=crd_config.get("spec"), + ) + desired_crds_v1_beta1.append(desired_crd) + else: + desired_crd = V1CustomResourceDefinition( + api_version=crd_config.get("apiVersion"), + kind=crd_config.get("kind"), + metadata=metadata, + spec=crd_config.get("spec"), + ) + desired_crds.append(desired_crd) return update_crds( kube_client=kube_client, desired_crds=desired_crds, existing_crds=existing_crds, + ) and update_crds( + kube_client=kube_client, + desired_crds=desired_crds_v1_beta1, + existing_crds=existing_crds_v1_beta1, ) diff --git a/paasta_tools/setup_kubernetes_internal_crd.py b/paasta_tools/setup_kubernetes_internal_crd.py index 0d1d3e2778..ae84e08e5d 100755 --- a/paasta_tools/setup_kubernetes_internal_crd.py +++ b/paasta_tools/setup_kubernetes_internal_crd.py @@ -143,9 +143,10 @@ def setup_kube_internal_crd( existing_crds = kube_client.apiextensions.list_custom_resource_definition( label_selector=paasta_prefixed("internal") ) - return update_crds( - kube_client=kube_client, desired_crds=INTERNAL_CRDS, existing_crds=existing_crds + kube_client=kube_client, + desired_crds=INTERNAL_CRDS, + existing_crds=existing_crds, ) diff --git a/paasta_tools/setup_prometheus_adapter_config.py b/paasta_tools/setup_prometheus_adapter_config.py index 78477c97ba..4539763192 100755 --- a/paasta_tools/setup_prometheus_adapter_config.py +++ b/paasta_tools/setup_prometheus_adapter_config.py @@ -43,6 +43,9 @@ from paasta_tools.long_running_service_tools import ( DEFAULT_CPU_AUTOSCALING_MOVING_AVERAGE_WINDOW, ) +from paasta_tools.long_running_service_tools import ( + DEFAULT_GUNICORN_AUTOSCALING_MOVING_AVERAGE_WINDOW, +) from paasta_tools.long_running_service_tools import ( DEFAULT_PISCINA_AUTOSCALING_MOVING_AVERAGE_WINDOW, ) @@ -200,6 +203,19 @@ def should_create_uwsgi_scaling_rule( return False, "did not request uwsgi autoscaling" +def should_create_gunicorn_scaling_rule( + autoscaling_config: AutoscalingParamsDict, +) -> Tuple[bool, Optional[str]]: + """ + Determines whether we should configure the prometheus adapter for a given service. + Returns a 2-tuple of (should_create, reason_to_skip) + """ + if autoscaling_config["metrics_provider"] == "gunicorn": + return True, None + + return False, "did not request gunicorn autoscaling" + + def should_create_piscina_scaling_rule( autoscaling_config: AutoscalingParamsDict, ) -> Tuple[bool, Optional[str]]: @@ -574,6 +590,106 @@ def create_instance_cpu_scaling_rule( } +def create_instance_gunicorn_scaling_rule( + service: str, + instance: str, + autoscaling_config: AutoscalingParamsDict, + paasta_cluster: str, + namespace: str = "paasta", +) -> PrometheusAdapterRule: + """ + Creates a Prometheus adapter rule config for a given service instance. + """ + setpoint = autoscaling_config["setpoint"] + moving_average_window = autoscaling_config.get( + "moving_average_window_seconds", + DEFAULT_GUNICORN_AUTOSCALING_MOVING_AVERAGE_WINDOW, + ) + + deployment_name = get_kubernetes_app_name(service=service, instance=instance) + + # In order for autoscaling to work safely while a service migrates from one namespace to another, the HPA needs to + # make sure that the deployment in the new namespace is scaled up enough to handle _all_ the load. + # This is because once the new deployment is 100% healthy, cleanup_kubernetes_job will delete the deployment out of + # the old namespace all at once, suddenly putting all the load onto the deployment in the new namespace. + # To ensure this, we must: + # - DO NOT filter on namespace in worker_filter_terms (which is used when calculating desired_instances). + # - DO filter on namespace in replica_filter_terms (which is used to calculate current_replicas). + # This makes sure that desired_instances includes load from all namespaces, but that the scaling ratio calculated + # by (desired_instances / current_replicas) is meaningful for each namespace. + worker_filter_terms = f"paasta_cluster='{paasta_cluster}',paasta_service='{service}',paasta_instance='{instance}'" + replica_filter_terms = f"paasta_cluster='{paasta_cluster}',deployment='{deployment_name}',namespace='{namespace}'" + + current_replicas = f""" + sum( + label_join( + ( + kube_deployment_spec_replicas{{{replica_filter_terms}}} >= 0 + or + max_over_time( + kube_deployment_spec_replicas{{{replica_filter_terms}}}[{DEFAULT_EXTRAPOLATION_TIME}s] + ) + ), + "kube_deployment", "", "deployment" + ) + ) by (kube_deployment) + """ + # k8s:deployment:pods_status_ready is a metric created by summing kube_pod_status_ready + # over paasta service/instance/cluster. it counts the number of ready pods in a paasta + # deployment. + ready_pods = f""" + (sum( + k8s:deployment:pods_status_ready{{{worker_filter_terms}}} >= 0 + or + max_over_time( + k8s:deployment:pods_status_ready{{{worker_filter_terms}}}[{DEFAULT_EXTRAPOLATION_TIME}s] + ) + ) by (kube_deployment)) + """ + load_per_instance = f""" + avg( + gunicorn_worker_busy{{{worker_filter_terms}}} + ) by (kube_pod, kube_deployment) + """ + missing_instances = f""" + clamp_min( + {ready_pods} - count({load_per_instance}) by (kube_deployment), + 0 + ) + """ + total_load = f""" + ( + sum( + {load_per_instance} + ) by (kube_deployment) + + + {missing_instances} + ) + """ + desired_instances_at_each_point_in_time = f""" + {total_load} / {setpoint} + """ + desired_instances = f""" + avg_over_time( + ( + {desired_instances_at_each_point_in_time} + )[{moving_average_window}s:] + ) + """ + metrics_query = f""" + {desired_instances} / {current_replicas} + """ + + metric_name = f"{deployment_name}-gunicorn-prom" + + return { + "name": {"as": metric_name}, + "seriesQuery": f"gunicorn_worker_busy{{{worker_filter_terms}}}", + "resources": {"template": "kube_<<.Resource>>"}, + "metricsQuery": _minify_promql(metrics_query), + } + + def should_create_arbitrary_promql_scaling_rule( autoscaling_config: AutoscalingParamsDict, ) -> Tuple[bool, Optional[str]]: @@ -664,6 +780,7 @@ def get_rules_for_service_instance( (should_create_uwsgi_scaling_rule, create_instance_uwsgi_scaling_rule), (should_create_piscina_scaling_rule, create_instance_piscina_scaling_rule), (should_create_cpu_scaling_rule, create_instance_cpu_scaling_rule), + (should_create_gunicorn_scaling_rule, create_instance_gunicorn_scaling_rule), ): should_create, skip_reason = should_create_scaling_rule( autoscaling_config=autoscaling_config, diff --git a/paasta_tools/utils.py b/paasta_tools/utils.py index 0dfc2a85ea..5eadfc7fb5 100644 --- a/paasta_tools/utils.py +++ b/paasta_tools/utils.py @@ -2003,6 +2003,7 @@ class SystemPaastaConfigDict(TypedDict, total=False): taskproc: Dict tron: Dict uwsgi_exporter_sidecar_image_url: str + gunicorn_exporter_sidecar_image_url: str vault_cluster_map: Dict vault_environment: str volumes: List[DockerVolume] @@ -2684,6 +2685,13 @@ def get_uwsgi_exporter_sidecar_image_url(self) -> str: def default_should_run_uwsgi_exporter_sidecar(self) -> bool: return self.config_dict.get("default_should_run_uwsgi_exporter_sidecar", False) + def get_gunicorn_exporter_sidecar_image_url(self) -> str: + """Get the docker image URL for the gunicorn_exporter sidecar container""" + return self.config_dict.get( + "gunicorn_exporter_sidecar_image_url", + "docker-paasta.yelpcorp.com/gunicorn_exporter-k8s-sidecar:v0.24.0-yelp0", + ) + def get_mark_for_deployment_max_polling_threads(self) -> int: return self.config_dict.get("mark_for_deployment_max_polling_threads", 4) diff --git a/requirements.txt b/requirements.txt index 65296519c3..89da4ec80b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -44,7 +44,7 @@ jmespath==0.9.3 jsonref==0.1 jsonschema==2.5.1 kazoo==2.8.0 -kubernetes==24.2.0 +kubernetes==21.7.0 ldap3==2.6 manhole==1.5.0 marathon==0.12.0 diff --git a/tests/test_kubernetes_tools.py b/tests/test_kubernetes_tools.py index 24c1611dff..cf49372f85 100644 --- a/tests/test_kubernetes_tools.py +++ b/tests/test_kubernetes_tools.py @@ -24,12 +24,12 @@ from kubernetes.client import V1EnvVar from kubernetes.client import V1EnvVarSource from kubernetes.client import V1ExecAction +from kubernetes.client import V1Handler from kubernetes.client import V1HostPathVolumeSource from kubernetes.client import V1HTTPGetAction from kubernetes.client import V1KeyToPath from kubernetes.client import V1LabelSelector from kubernetes.client import V1Lifecycle -from kubernetes.client import V1LifecycleHandler from kubernetes.client import V1NodeAffinity from kubernetes.client import V1NodeSelector from kubernetes.client import V1NodeSelectorRequirement @@ -574,7 +574,7 @@ def test_get_sidecar_containers(self): ], image="some-docker-image", lifecycle=V1Lifecycle( - pre_stop=V1LifecycleHandler( + pre_stop=V1Handler( _exec=V1ExecAction( command=[ "/bin/sh", @@ -620,7 +620,7 @@ def test_get_sidecar_containers(self): ], image="some-docker-image", lifecycle=V1Lifecycle( - pre_stop=V1LifecycleHandler( + pre_stop=V1Handler( _exec=V1ExecAction( command=[ "/bin/sh", @@ -905,6 +905,50 @@ def test_should_run_uwsgi_exporter_sidecar_defaults(self): is True ) + def test_get_gunicorn_exporter_sidecar_container_should_run(self): + system_paasta_config = mock.Mock( + get_gunicorn_exporter_sidecar_image_url=mock.Mock( + return_value="gunicorn_exporter_image" + ) + ) + with mock.patch.object( + self.deployment, "should_run_gunicorn_exporter_sidecar", return_value=True + ): + ret = self.deployment.get_gunicorn_exporter_sidecar_container( + system_paasta_config + ) + assert ret is not None + assert ret.image == "gunicorn_exporter_image" + assert ret.ports[0].container_port == 9117 + + def test_get_gunicorn_exporter_sidecar_container_shouldnt_run(self): + system_paasta_config = mock.Mock( + get_gunicorn_exporter_sidecar_image_url=mock.Mock( + return_value="gunicorn_exporter_image" + ) + ) + with mock.patch.object( + self.deployment, "should_run_gunicorn_exporter_sidecar", return_value=False + ): + assert ( + self.deployment.get_gunicorn_exporter_sidecar_container( + system_paasta_config + ) + is None + ) + + def test_should_run_gunicorn_exporter_sidecar(self): + self.deployment.config_dict.update( + { + "max_instances": 5, + "autoscaling": { + "metrics_provider": "gunicorn", + }, + } + ) + + assert self.deployment.should_run_gunicorn_exporter_sidecar() is True + def test_get_env(self): with mock.patch( "paasta_tools.kubernetes_tools.LongRunningServiceConfig.get_env", @@ -1052,7 +1096,7 @@ def test_get_kubernetes_containers(self, prometheus_port, expected_ports): resources=mock_get_resource_requirements.return_value, image=mock_get_docker_url.return_value, lifecycle=V1Lifecycle( - pre_stop=V1LifecycleHandler( + pre_stop=V1Handler( _exec=V1ExecAction(command=["/bin/sh", "-c", "sleep 30"]) ) ), @@ -1637,14 +1681,6 @@ def test_format_kubernetes_app_dict(self, _): "paasta_tools.kubernetes_tools.KubernetesDeploymentConfig.get_volumes", autospec=True, ) - @mock.patch( - "paasta_tools.kubernetes_tools.KubernetesDeploymentConfig.get_service", - autospec=True, - ) - @mock.patch( - "paasta_tools.kubernetes_tools.KubernetesDeploymentConfig.get_instance", - autospec=True, - ) @mock.patch( "paasta_tools.kubernetes_tools.KubernetesDeploymentConfig.get_kubernetes_containers", autospec=True, @@ -1674,6 +1710,9 @@ def test_format_kubernetes_app_dict(self, _): "paasta_tools.kubernetes_tools.create_pod_topology_spread_constraints", autospec=True, ) + @pytest.mark.parametrize( + "autoscaling_metric_provider", [None, "uwsgi", "piscina", "gunicorn"] + ) @pytest.mark.parametrize( "in_smtstk,routable_ip,node_affinity,anti_affinity,spec_affinity,termination_grace_period,pod_topology", [ @@ -1715,8 +1754,6 @@ def test_get_pod_template_spec( mock_get_node_affinity, mock_get_pod_volumes, mock_get_kubernetes_containers, - mock_get_instance, - mock_get_service, mock_get_volumes, in_smtstk, routable_ip, @@ -1725,6 +1762,7 @@ def test_get_pod_template_spec( anti_affinity, spec_affinity, termination_grace_period, + autoscaling_metric_provider, ): mock_service_namespace_config = mock.Mock() mock_load_service_namespace_config.return_value = mock_service_namespace_config @@ -1740,9 +1778,27 @@ def test_get_pod_template_spec( mock_system_paasta_config.get_pod_defaults.return_value = dict(dns_policy="foo") mock_get_termination_grace_period.return_value = termination_grace_period - ret = self.deployment.get_pod_template_spec( - git_sha="aaaa123", system_paasta_config=mock_system_paasta_config - ) + if autoscaling_metric_provider: + mock_config_dict = KubernetesDeploymentConfigDict( + min_instances=1, + max_instances=3, + autoscaling={"metrics_provider": autoscaling_metric_provider}, + deploy_group="fake_group", + ) + autoscaled_deployment = KubernetesDeploymentConfig( + service="kurupt", + instance="fm", + cluster="brentford", + config_dict=mock_config_dict, + branch_dict=None, + ) + ret = autoscaled_deployment.get_pod_template_spec( + git_sha="aaaa123", system_paasta_config=mock_system_paasta_config + ) + else: + ret = self.deployment.get_pod_template_spec( + git_sha="aaaa123", system_paasta_config=mock_system_paasta_config + ) assert mock_load_service_namespace_config.called assert mock_service_namespace_config.is_in_smartstack.called @@ -1763,12 +1819,14 @@ def test_get_pod_template_spec( expected_labels = { "paasta.yelp.com/pool": "default", "yelp.com/paasta_git_sha": "aaaa123", - "yelp.com/paasta_instance": mock_get_instance.return_value, - "yelp.com/paasta_service": mock_get_service.return_value, + "yelp.com/paasta_instance": "fm", + "yelp.com/paasta_service": "kurupt", "paasta.yelp.com/git_sha": "aaaa123", - "paasta.yelp.com/instance": mock_get_instance.return_value, - "paasta.yelp.com/service": mock_get_service.return_value, - "paasta.yelp.com/autoscaled": "false", + "paasta.yelp.com/instance": "fm", + "paasta.yelp.com/service": "kurupt", + "paasta.yelp.com/autoscaled": "true" + if autoscaling_metric_provider + else "false", "paasta.yelp.com/cluster": "brentford", "registrations.paasta.yelp.com/kurupt.fm": "true", "yelp.com/owner": "compute_infra_platform_experience", @@ -1777,18 +1835,32 @@ def test_get_pod_template_spec( if in_smtstk: expected_labels["paasta.yelp.com/weight"] = "10" - assert ret == V1PodTemplateSpec( + if autoscaling_metric_provider: + expected_labels["paasta.yelp.com/deploy_group"] = "fake_group" + expected_labels[ + f"paasta.yelp.com/scrape_{autoscaling_metric_provider}_prometheus" + ] = "true" + if autoscaling_metric_provider in ("uwsgi", "gunicorn"): + routable_ip = "true" + + expected_annotations = { + "smartstack_registrations": '["kurupt.fm"]', + "paasta.yelp.com/routable_ip": routable_ip, + "iam.amazonaws.com/role": "", + } + if autoscaling_metric_provider == "uwsgi": + expected_annotations["autoscaling"] = "uwsgi" + + expected = V1PodTemplateSpec( metadata=V1ObjectMeta( labels=expected_labels, - annotations={ - "smartstack_registrations": '["kurupt.fm"]', - "paasta.yelp.com/routable_ip": routable_ip, - "iam.amazonaws.com/role": "", - }, + annotations=expected_annotations, ), spec=V1PodSpec(**pod_spec_kwargs), ) + assert ret == expected + @mock.patch( "paasta_tools.kubernetes_tools.KubernetesDeploymentConfig.get_prometheus_port", autospec=True, @@ -1797,31 +1869,41 @@ def test_get_pod_template_spec( "paasta_tools.kubernetes_tools.KubernetesDeploymentConfig.should_run_uwsgi_exporter_sidecar", autospec=True, ) + @mock.patch( + "paasta_tools.kubernetes_tools.KubernetesDeploymentConfig.should_run_gunicorn_exporter_sidecar", + autospec=True, + ) @pytest.mark.parametrize( - "ip_configured,in_smtstk,prometheus_port,should_run_uwsgi_exporter_sidecar_retval,expected", + "ip_configured,in_smtstk,prometheus_port,should_run_uwsgi_exporter_sidecar_retval,should_run_gunicorn_exporter_sidecar_retval,expected", [ - (False, True, 8888, False, "true"), - (False, False, 8888, False, "true"), - (False, True, None, False, "true"), - (True, False, None, False, "true"), - (False, False, None, True, "true"), - (False, False, None, False, "false"), + (False, True, 8888, False, False, "true"), + (False, False, 8888, False, False, "true"), + (False, True, None, False, False, "true"), + (True, False, None, False, False, "true"), + (False, False, None, True, False, "true"), + (False, False, None, False, False, "false"), + (False, False, None, False, True, "true"), ], ) def test_routable_ip( self, mock_should_run_uwsgi_exporter_sidecar, + mock_should_run_gunicorn_exporter_sidecar, mock_get_prometheus_port, ip_configured, in_smtstk, prometheus_port, should_run_uwsgi_exporter_sidecar_retval, + should_run_gunicorn_exporter_sidecar_retval, expected, ): mock_get_prometheus_port.return_value = prometheus_port mock_should_run_uwsgi_exporter_sidecar.return_value = ( should_run_uwsgi_exporter_sidecar_retval ) + mock_should_run_gunicorn_exporter_sidecar.return_value = ( + should_run_gunicorn_exporter_sidecar_retval + ) mock_service_namespace_config = mock.Mock() mock_service_namespace_config.is_in_smartstack.return_value = in_smtstk mock_system_paasta_config = mock.Mock() @@ -1996,7 +2078,7 @@ def test_kubernetes_container_termination_action( self.deployment.config_dict["lifecycle"] = { "pre_stop_command": termination_action } - handler = V1LifecycleHandler(_exec=V1ExecAction(command=expected)) + handler = V1Handler(_exec=V1ExecAction(command=expected)) assert self.deployment.get_kubernetes_container_termination_action() == handler @pytest.mark.parametrize( @@ -2330,6 +2412,83 @@ def test_get_autoscaling_metric_spec_uwsgi_prometheus( == return_value ) + @mock.patch( + "paasta_tools.kubernetes_tools.load_system_paasta_config", + autospec=True, + return_value=mock.Mock( + get_legacy_autoscaling_signalflow=lambda: "fake_signalflow_query" + ), + ) + def test_get_autoscaling_metric_spec_gunicorn_prometheus( + self, fake_system_paasta_config + ): + config_dict = KubernetesDeploymentConfigDict( + { + "min_instances": 1, + "max_instances": 3, + "autoscaling": { + "metrics_provider": "gunicorn", + "setpoint": 0.5, + "forecast_policy": "moving_average", + "moving_average_window_seconds": 300, + }, + } + ) + mock_config = KubernetesDeploymentConfig( # type: ignore + service="service", + cluster="cluster", + instance="instance", + config_dict=config_dict, + branch_dict=None, + ) + return_value = KubernetesDeploymentConfig.get_autoscaling_metric_spec( + mock_config, + "fake_name", + "cluster", + KubeClient(), + ) + expected_res = V2beta2HorizontalPodAutoscaler( + kind="HorizontalPodAutoscaler", + metadata=V1ObjectMeta( + name="fake_name", + namespace="paasta", + annotations={}, + ), + spec=V2beta2HorizontalPodAutoscalerSpec( + max_replicas=3, + min_replicas=1, + metrics=[ + V2beta2MetricSpec( + type="Object", + object=V2beta2ObjectMetricSource( + metric=V2beta2MetricIdentifier( + name="service-instance-gunicorn-prom", + ), + target=V2beta2MetricTarget( + type="Value", + value=1, + ), + described_object=V2beta2CrossVersionObjectReference( + api_version="apps/v1", + kind="Deployment", + name="fake_name", + ), + ), + ), + ], + scale_target_ref=V2beta2CrossVersionObjectReference( + api_version="apps/v1", + kind="Deployment", + name="fake_name", + ), + ), + ) + + assert ( + self.patch_expected_autoscaling_spec(expected_res, mock_config) + == return_value + ) + def test_override_scaledown_policies(self): config_dict = KubernetesDeploymentConfigDict( { @@ -4012,7 +4171,7 @@ def test_warning_big_bounce(): job_config.format_kubernetes_app().spec.template.metadata.labels[ "paasta.yelp.com/config_sha" ] - == "configd6531f39" + == "config5abf6b07" ), "If this fails, just change the constant in this test, but be aware that deploying this change will cause every service to bounce!" @@ -4058,7 +4217,7 @@ def test_warning_big_bounce_routable_pod(): job_config.format_kubernetes_app().spec.template.metadata.labels[ "paasta.yelp.com/config_sha" ] - == "configa5abd828" + == "config8ee1bd70" ), "If this fails, just change the constant in this test, but be aware that deploying this change will cause every smartstack-registered service to bounce!" diff --git a/tests/test_setup_kubernetes_cr.py b/tests/test_setup_kubernetes_cr.py index 21027400a8..6626a14829 100644 --- a/tests/test_setup_kubernetes_cr.py +++ b/tests/test_setup_kubernetes_cr.py @@ -60,8 +60,12 @@ def test_setup_all_custom_resources(): cassandra_crd.spec.names = mock.Mock( plural="cassandraclusters", kind="CassandraCluster" ) + mock_client.apiextensions_v1_beta1.list_custom_resource_definition.return_value = mock.Mock( + items=[flink_crd, cassandra_crd] + ) + mock_client.apiextensions.list_custom_resource_definition.return_value = ( - mock.Mock(items=[flink_crd, cassandra_crd]) + mock.Mock(items=[]) ) custom_resource_definitions = [ diff --git a/tests/test_setup_prometheus_adapter_config.py b/tests/test_setup_prometheus_adapter_config.py index e9c2838c8f..e02381d9c1 100644 --- a/tests/test_setup_prometheus_adapter_config.py +++ b/tests/test_setup_prometheus_adapter_config.py @@ -9,11 +9,17 @@ from paasta_tools.setup_prometheus_adapter_config import ( create_instance_cpu_scaling_rule, ) +from paasta_tools.setup_prometheus_adapter_config import ( + create_instance_gunicorn_scaling_rule, +) from paasta_tools.setup_prometheus_adapter_config import ( create_instance_uwsgi_scaling_rule, ) from paasta_tools.setup_prometheus_adapter_config import get_rules_for_service_instance from paasta_tools.setup_prometheus_adapter_config import should_create_cpu_scaling_rule +from paasta_tools.setup_prometheus_adapter_config import ( + should_create_gunicorn_scaling_rule, +) from paasta_tools.setup_prometheus_adapter_config import ( should_create_uwsgi_scaling_rule, ) @@ -196,6 +202,79 @@ def test_create_instance_cpu_scaling_rule() -> None: ) +@pytest.mark.parametrize( + "autoscaling_config,expected", + [ + ( + { + "metrics_provider": "cpu", + "decision_policy": "bespoke", + "moving_average_window_seconds": 123, + "setpoint": 0.653, + }, + False, + ), + ( + { + "metrics_provider": "gunicorn", + "moving_average_window_seconds": 124, + "setpoint": 0.425, + }, + True, + ), + ], +) +def test_should_create_gunicorn_scaling_rule( + autoscaling_config: AutoscalingParamsDict, expected: bool +) -> None: + should_create, reason = should_create_gunicorn_scaling_rule( + autoscaling_config=autoscaling_config + ) + + assert should_create == expected + if expected: + assert reason is None + else: + assert reason is not None + + +def test_create_instance_gunicorn_scaling_rule() -> None: + service_name = "test_service" + instance_name = "test_instance" + paasta_cluster = "test_cluster" + autoscaling_config: AutoscalingParamsDict = { + "metrics_provider": "gunicorn", + "setpoint": 0.1234567890, + "moving_average_window_seconds": 20120302, + "use_prometheus": True, + } + + with mock.patch( + "paasta_tools.setup_prometheus_adapter_config.load_system_paasta_config", + autospec=True, + return_value=MOCK_SYSTEM_PAASTA_CONFIG, + ): + rule = create_instance_gunicorn_scaling_rule( + service=service_name, + instance=instance_name, + paasta_cluster=paasta_cluster, + autoscaling_config=autoscaling_config, + ) + + # we test that the format of the dictionary is as expected with mypy + # and we don't want to test the full contents of the retval since then + # we're basically just writting a change-detector test - instead, we test + # that we're actually using our inputs + assert service_name in rule["seriesQuery"] + assert instance_name in rule["seriesQuery"] + assert paasta_cluster in rule["seriesQuery"] + # these two numbers are distinctive and unlikely to be used as constants + assert str(autoscaling_config["setpoint"]) in rule["metricsQuery"] + assert ( + str(autoscaling_config["moving_average_window_seconds"]) in rule["metricsQuery"] + ) + + @pytest.mark.parametrize( "autoscaling_config,expected_rules", [ diff --git a/yelp_package/Makefile b/yelp_package/Makefile index 9123f9e42b..7b6d5c3bad 100644 --- a/yelp_package/Makefile +++ b/yelp_package/Makefile @@ -13,7 +13,7 @@ # limitations under the License. # Edit this release and run "make release" -RELEASE=0.184.0 +RELEASE=0.189.0 SHELL=/bin/bash