diff --git a/design/MetadataProfile.md b/design/MetadataProfile.md new file mode 100644 index 000000000..265ef28b0 --- /dev/null +++ b/design/MetadataProfile.md @@ -0,0 +1,81 @@ +# Metadata Profile + +The metadata profile contains a list of queries used to retrieve datasource metadata such as the list of namespaces, workloads +and containers. Users can create metadata profiles based on their cluster or datasource provider, such as Prometheus or +Thanos. These profiles can be tagged to the import metadata API, which will then fetch metadata according to the metadata +profile; this in turn helps to create experiments and then generate recommendations. + +This document describes the fields of the Metadata Profile and the different sets of queries supported by Kruize. +Documentation is still in progress; stay tuned. + +## Attributes + +- **apiVersion** \ + A string representing the version of the Kubernetes API used to create the metadata profile +- **kind** \ + A string representing the type of Kubernetes object +- **metadata** \ + A JSON object containing data that helps to uniquely identify the metadata profile, including a name string + - **name** \ + A unique string name for identifying each metadata profile. +- **profile_version** \ + A double value specifying the current version of the profile. +- **datasource** \ + A string representing the datasource to import metadata from +- **query_variables** \ + Define the query variables to be used + - **name** \ + name of the variable + - **datasource** \ + datasource of the query + - **value_type** \ + can be double or integer + - **query** \ + one of _query_ or _aggregation_functions_ is mandatory. Both can be present. 
+ - **kubernetes_object** \ + k8s object that this query is tied to: "_deployment_", "_pod_" or "_container_" + - **aggregation_functions** \ + aggregate functions associated with this variable + - **function** \ + can be '_avg_', '_sum_', '_min_', '_max_' + - **query** \ + corresponding query + - **version** \ + Any specific version that this query is tied to + +### Different set of metadata queries + +#### Queries to import metadata across the cluster + +These set of queries fetch list of all the namespaces, workloads and containers present across the cluster + +| Name | Query | +|-------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| namespacesAcrossCluster | sum by (namespace) (avg_over_time(kube_namespace_status_phase{namespace!=""}[$MEASUREMENT_DURATION_IN_MIN$m])) | +| workloadsAcrossCluster | sum by (namespace, workload, workload_type) (avg_over_time(namespace_workload_pod:kube_pod_owner:relabel{workload!=""}[$MEASUREMENT_DURATION_IN_MIN$m])) | +| containersAcrossCluster | sum by (container, image, workload, workload_type, namespace) (avg_over_time(kube_pod_container_info{container!=""}[$MEASUREMENT_DURATION_IN_MIN$m])
* on (pod, namespace) group_left(workload, workload_type) avg_over_time(namespace_workload_pod:kube_pod_owner:relabel{workload!=""}[$MEASUREMENT_DURATION_IN_MIN$m])) | + + +
+ +#### Queries to import metadata for specific org_id and cluster_id + +These set of queries fetch list of namespaces, workloads and containers for specific `org_id` and `cluster_id` + +| Name | Query | +|------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| namespacesForOrgAndClusterId | sum by (namespace) (avg_over_time(kube_namespace_status_phase{namespace!="", org_id="$ORG_ID$", cluster_id="$CLUSTER_ID$"}[$MEASUREMENT_DURATION_IN_MIN$m])) | +| workloadsForOrgAndClusterId | sum by (namespace, workload, workload_type) (avg_over_time(namespace_workload_pod:kube_pod_owner:relabel{workload!="", org_id="$ORG_ID$", cluster_id="$CLUSTER_ID$"}[$MEASUREMENT_DURATION_IN_MIN$m])) | +| containersForOrgAndClusterId | sum by (container, image, workload, workload_type, namespace) (avg_over_time(kube_pod_container_info{container!="", org_id="$ORG_ID$", cluster_id="$CLUSTER_ID$"}[$MEASUREMENT_DURATION_IN_MIN$m])
* on (pod, namespace) group_left(workload, workload_type) avg_over_time(namespace_workload_pod:kube_pod_owner:relabel{workload!="", org_id="$ORG_ID$", cluster_id="$CLUSTER_ID$"}[$MEASUREMENT_DURATION_IN_MIN$m])) | + +
+ +#### Queries to import metadata for custom label - ADDITIONAL_LABEL + +These set of queries fetch list of namespaces, workloads and containers for specific `ADDITIONAL_LABEL` - currently used by bulk and thanos demos + +| Name | Query | +|------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| namespacesForAdditionalLabel | sum by (namespace) (avg_over_time(kube_namespace_status_phase{namespace!="" ADDITIONAL_LABEL}[$MEASUREMENT_DURATION_IN_MIN$m])) | +| workloadsForAdditionalLabel | sum by (namespace, workload, workload_type) (avg_over_time(namespace_workload_pod:kube_pod_owner:relabel{workload!="" ADDITIONAL_LABEL}[$MEASUREMENT_DURATION_IN_MIN$m])) | +| containersForAdditionalLabel | sum by (container, image, workload, workload_type, namespace) (avg_over_time(kube_pod_container_info{container!="" ADDITIONAL_LABEL}[$MEASUREMENT_DURATION_IN_MIN$m])
* on (pod, namespace) group_left(workload, workload_type) avg_over_time(namespace_workload_pod:kube_pod_owner:relabel{workload!="" ADDITIONAL_LABEL}[$MEASUREMENT_DURATION_IN_MIN$m])) | diff --git a/manifests/autotune/metadata-profiles/bulk_cluster_metadata_local_monitoring.json b/manifests/autotune/metadata-profiles/bulk_cluster_metadata_local_monitoring.json new file mode 100644 index 000000000..b1248012a --- /dev/null +++ b/manifests/autotune/metadata-profiles/bulk_cluster_metadata_local_monitoring.json @@ -0,0 +1,48 @@ +{ + "apiVersion": "recommender.com/v1", + "kind": "KruizeMetadataProfile", + "metadata": { + "name": "cluster-metadata-local-monitoring" + }, + "profile_version": 1, + "k8s_type": "openshift", + "datasource": "prometheus", + "query_variables": [ + { + "name": "namespacesForAdditionalLabel", + "datasource": "prometheus", + "value_type": "double", + "kubernetes_object": "container", + "aggregation_functions": [ + { + "function": "sum", + "query": "sum by (namespace) (avg_over_time(kube_namespace_status_phase{namespace!=\"\" ADDITIONAL_LABEL}[$MEASUREMENT_DURATION_IN_MIN$m]))" + } + ] + }, + { + "name": "workloadsForAdditionalLabel", + "datasource": "prometheus", + "value_type": "double", + "kubernetes_object": "container", + "aggregation_functions": [ + { + "function": "sum", + "query": "sum by (namespace, workload, workload_type) (avg_over_time(namespace_workload_pod:kube_pod_owner:relabel{workload!=\"\" ADDITIONAL_LABEL}[$MEASUREMENT_DURATION_IN_MIN$m]))" + } + ] + }, + { + "name": "containersForAdditionalLabel", + "datasource": "prometheus", + "value_type": "double", + "kubernetes_object": "container", + "aggregation_functions": [ + { + "function": "sum", + "query": "sum by (container, image, workload, workload_type, namespace) (avg_over_time(kube_pod_container_info{container!=\"\" ADDITIONAL_LABEL}[$MEASUREMENT_DURATION_IN_MIN$m]) * on (pod, namespace) group_left(workload, workload_type) 
avg_over_time(namespace_workload_pod:kube_pod_owner:relabel{workload!=\"\" ADDITIONAL_LABEL}[$MEASUREMENT_DURATION_IN_MIN$m]))" + } + ] + } + ] +} diff --git a/manifests/autotune/metadata-profiles/bulk_cluster_metadata_local_monitoring.yaml b/manifests/autotune/metadata-profiles/bulk_cluster_metadata_local_monitoring.yaml new file mode 100644 index 000000000..bacf66948 --- /dev/null +++ b/manifests/autotune/metadata-profiles/bulk_cluster_metadata_local_monitoring.yaml @@ -0,0 +1,32 @@ +apiVersion: "recommender.com/v1" +kind: "KruizeMetadataProfile" +metadata: + name: "cluster-metadata-local-monitoring" +profile_version: 1.0 +k8s_type: openshift +datasource: prometheus +query_variables: + +- name: namespacesForAdditionalLabel + datasource: prometheus + value_type: "double" + kubernetes_object: "namespace" + aggregation_functions: + - function: sum + query: 'sum by (namespace) (avg_over_time(kube_namespace_status_phase{namespace!="" ADDITIONAL_LABEL}[$MEASUREMENT_DURATION_IN_MIN$m]))' + +- name: workloadsForAdditionalLabel + datasource: prometheus + value_type: "double" + kubernetes_object: "container" + aggregation_functions: + - function: sum + query: 'sum by (namespace, workload, workload_type) (avg_over_time(namespace_workload_pod:kube_pod_owner:relabel{workload!="" ADDITIONAL_LABEL}[$MEASUREMENT_DURATION_IN_MIN$m]))' + +- name: containersForAdditionalLabel + datasource: prometheus + value_type: "double" + kubernetes_object: "container" + aggregation_functions: + - function: sum + query: 'sum by (container, image, workload, workload_type, namespace) (avg_over_time(kube_pod_container_info{container!="" ADDITIONAL_LABEL}[$MEASUREMENT_DURATION_IN_MIN$m]) * on (pod, namespace) group_left(workload, workload_type) avg_over_time(namespace_workload_pod:kube_pod_owner:relabel{workload!="" ADDITIONAL_LABEL}[$MEASUREMENT_DURATION_IN_MIN$m]))' diff --git a/manifests/autotune/metadata-profiles/cluster_metadata_local_monitoring.json 
b/manifests/autotune/metadata-profiles/cluster_metadata_local_monitoring.json new file mode 100644 index 000000000..d24e06805 --- /dev/null +++ b/manifests/autotune/metadata-profiles/cluster_metadata_local_monitoring.json @@ -0,0 +1,48 @@ +{ + "apiVersion": "recommender.com/v1", + "kind": "KruizeMetadataProfile", + "metadata": { + "name": "cluster-metadata-local-monitoring" + }, + "profile_version": 1, + "k8s_type": "openshift", + "datasource": "prometheus", + "query_variables": [ + { + "name": "namespacesAcrossCluster", + "datasource": "prometheus", + "value_type": "double", + "kubernetes_object": "container", + "aggregation_functions": [ + { + "function": "sum", + "query": "sum by (namespace) (avg_over_time(kube_namespace_status_phase{namespace!=\"\"}[$MEASUREMENT_DURATION_IN_MIN$m]))" + } + ] + }, + { + "name": "workloadsAcrossCluster", + "datasource": "prometheus", + "value_type": "double", + "kubernetes_object": "container", + "aggregation_functions": [ + { + "function": "sum", + "query": "sum by (namespace, workload, workload_type) (avg_over_time(namespace_workload_pod:kube_pod_owner:relabel{workload!=\"\"}[$MEASUREMENT_DURATION_IN_MIN$m]))" + } + ] + }, + { + "name": "containersAcrossCluster", + "datasource": "prometheus", + "value_type": "double", + "kubernetes_object": "container", + "aggregation_functions": [ + { + "function": "sum", + "query": "sum by (container, image, workload, workload_type, namespace) (avg_over_time(kube_pod_container_info{container!=\"\"}[$MEASUREMENT_DURATION_IN_MIN$m]) * on (pod, namespace) group_left(workload, workload_type) avg_over_time(namespace_workload_pod:kube_pod_owner:relabel{workload!=\"\"}[$MEASUREMENT_DURATION_IN_MIN$m]))" + } + ] + } + ] +} diff --git a/manifests/autotune/metadata-profiles/cluster_metadata_local_monitoring.yaml b/manifests/autotune/metadata-profiles/cluster_metadata_local_monitoring.yaml new file mode 100644 index 000000000..6d232c9e3 --- /dev/null +++ 
b/manifests/autotune/metadata-profiles/cluster_metadata_local_monitoring.yaml @@ -0,0 +1,32 @@ +apiVersion: "recommender.com/v1" +kind: "KruizeMetadataProfile" +metadata: + name: "cluster-metadata-local-monitoring" +profile_version: 1.0 +k8s_type: openshift +datasource: prometheus +query_variables: + +- name: namespacesAcrossCluster + datasource: prometheus + value_type: "double" + kubernetes_object: "namespace" + aggregation_functions: + - function: sum + query: 'sum by (namespace) (avg_over_time(kube_namespace_status_phase{namespace!=""}[$MEASUREMENT_DURATION_IN_MIN$m]))' + +- name: workloadsAcrossCluster + datasource: prometheus + value_type: "double" + kubernetes_object: "container" + aggregation_functions: + - function: sum + query: 'sum by (namespace, workload, workload_type) (avg_over_time(namespace_workload_pod:kube_pod_owner:relabel{workload!=""}[$MEASUREMENT_DURATION_IN_MIN$m]))' + +- name: containersAcrossCluster + datasource: prometheus + value_type: "double" + kubernetes_object: "container" + aggregation_functions: + - function: sum + query: 'sum by (container, image, workload, workload_type, namespace) (avg_over_time(kube_pod_container_info{container!=""}[$MEASUREMENT_DURATION_IN_MIN$m]) * on (pod, namespace) group_left(workload, workload_type) avg_over_time(namespace_workload_pod:kube_pod_owner:relabel{workload!=""}[$MEASUREMENT_DURATION_IN_MIN$m]))' diff --git a/manifests/autotune/metadata-profiles/kruize-metadata-profile-crd.yaml b/manifests/autotune/metadata-profiles/kruize-metadata-profile-crd.yaml new file mode 100644 index 000000000..94be682f6 --- /dev/null +++ b/manifests/autotune/metadata-profiles/kruize-metadata-profile-crd.yaml @@ -0,0 +1,88 @@ +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + # name must match the spec fields below, and be in the form: <plural>.<group>
+ name: kruizemetadataprofiles.recommender.com +spec: + # group name to use for REST API: /apis/<group>/<version> + group: "recommender.com" + names: + plural: kruizemetadataprofiles + singular: kruizemetadataprofile + #types can be identified with this tag + kind: KruizeMetadataProfile + scope: Namespaced + versions: + - name: v1 + served: true + storage: true + schema: + openAPIV3Schema: + type: object + properties: + apiVersion: + description: 'APIVersion defines the versioned schema of this representation + of an object. Servers should convert recognized schemas to the latest + internal value, and may reject unrecognized values. More info: https://git.k8s.io/ + community/contributors/devel/sig-architecture/api-conventions.md#resources' + type: string + kind: + description: 'Kind is a string value representing the REST resource this + object represents. Servers may infer this from the endpoint the client + submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/ + community/contributors/devel/sig-architecture/api-conventions.md#types-kinds' + type: string + profile_version: + description: 'Version of the profile' + type: number + k8s_type: + description: 'minikube or openshift' + type: string + datasource: + description: 'datasource to import metadata from for eg. 
Prometheus, Thanos, Datadog etc' + type: string + query_variables: + description: 'Query variables to be used' + type: array + items: + type: object + properties: + name: + description: 'name of the variable' + type: string + datasource: + description: 'datasource of the query' + type: string + value_type: + description: 'can be double or integer' + type: string + kubernetes_object: + description: 'k8s object that this query is tied to: "deployment", "pod", "namespace" or "container"' + type: string + query: + description: 'one of the query or aggregation_functions is mandatory' + type: string + aggregation_functions: + description: 'one of the query or aggregation_functions is mandatory' + type: array + items: + type: object + properties: + function: + description: 'aggregate functions associated with this variable' + type: string + query: + description: 'query' + type: string + version: + description: 'Any specific version that this query is tied to' + type: string + required: + - function + - query + required: + - name + - datasource + - value_type + required: + - query_variables diff --git a/manifests/autotune/metadata-profiles/metadata-profile-template.yaml b/manifests/autotune/metadata-profiles/metadata-profile-template.yaml new file mode 100644 index 000000000..37836316f --- /dev/null +++ b/manifests/autotune/metadata-profiles/metadata-profile-template.yaml @@ -0,0 +1,66 @@ +apiVersion: "recommender.com/v1" +kind: "KruizeMetadataProfile" +metadata: + name: "add_name_here" + +# Version of the profile. +# This helps when queries change often +profile_version: 1.0 + +# Is this for a specific kubernetes type? +# OPTIONAL. +# If not present, assumed to be used universally. +# If specified, it should have a corresponding config in +# manifests/configmaps dir (i.e. supported target by Kruize) +# Eg. 
manifests/configmaps/openshift-config.yaml +k8s_type: openshift + +# Name of the datasource to import the metadata from +# By default Prometheus datasource is connected to Kruize during initialization +# MANDATORY +datasource: prometheus + +# Describe the query variables to be used +# MANDATORY +query_variables: + +# name of the variable +# MANDATORY +- name: namespacesAcrossCluster + # datasource of the query + datasource: prometheus + # value_type. Supported are "double" or "integer". + # MANDATORY + value_type: "double" + + # Any k8s object that this query is tied to + # eg. "deployment", "pod", "namespace", "container" + # OPTIONAL + kubernetes_object: "namespace" + + # One of query or aggregation_functions is mandatory + # Both can be present together + query: 'sum by (namespace) (avg_over_time(kube_namespace_status_phase{namespace!=""}[$MEASUREMENT_DURATION_IN_MIN$m]))' + + # aggregate functions associated with this variable + # Eg. "avg", "sum", "max", "min" + aggregation_functions: + # MANDATORY + - function: sum + # query + # MANDATORY + query: 'sum by (namespace) (avg_over_time(kube_namespace_status_phase{namespace!=""}[$MEASUREMENT_DURATION_IN_MIN$m]))' + + # Any specific versions that this query is tied to + # OPTIONAL + version: ">4.9" + +# list of all the workloads present in the cluster +- name: workloadsAcrossCluster + datasource: prometheus + value_type: "double" + kubernetes_object: "container" + # sum of the pod ownership metrics, grouped by namespace, workload, and workload_type, filtering out empty workloads. + aggregation_functions: + - function: sum + query: 'sum by (namespace, workload, workload_type) (avg_over_time(namespace_workload_pod:kube_pod_owner:relabel{workload!=""}[$MEASUREMENT_DURATION_IN_MIN$m]))'