From 18dffa65dc35dcc168c01ac54dec3917116df429 Mon Sep 17 00:00:00 2001 From: 35C4n0r Date: Wed, 29 Jan 2025 21:42:23 +0530 Subject: [PATCH 1/2] feat: grafana tempo topology Signed-off-by: 35C4n0r --- .../documentation/grafana-provider.mdx | 12 +- .../argocd_provider/argocd_provider.py | 2 +- .../grafana_provider/grafana_provider.py | 134 +++++++++++++++++- 3 files changed, 142 insertions(+), 6 deletions(-) diff --git a/docs/providers/documentation/grafana-provider.mdx b/docs/providers/documentation/grafana-provider.mdx index 500d6b15c..886a98e6c 100644 --- a/docs/providers/documentation/grafana-provider.mdx +++ b/docs/providers/documentation/grafana-provider.mdx @@ -1,8 +1,8 @@ --- title: "Grafana Provider" -description: "Grafana Provider allows either pull/push alerts from Grafana to Keep." +description: "Grafana Provider allows pulling/pushing alerts and pulling the Topology Map from Grafana to Keep." --- -Grafana currently supports pulling/pushing alerts. We will add querying and notifying soon. +Grafana currently supports pulling/pushing alerts and pulling the Topology Map. We will add querying and notifying soon. ## Legacy vs Unified Alerting @@ -114,6 +114,14 @@ If Keep is not accessible externally and the webhook cannot be created, you can 4. **Network and Connectivity Check:** - Use network monitoring tools to ensure Grafana can reach Keep or any alternative endpoint configured for alerts. + +**Topology Map** is generated from the traces collected by Tempo. +To get the Datasource UID, go to: +1. Connections > Data Sources. +2. Click the Prometheus instance which is scraping data from Tempo > Your URL is in the format `https://host/connections/datasources/edit/<DATASOURCE_UID>` +3. Copy that DATASOURCE_UID and use it while installing the provider. + + ## Webhook Integration Modifications The webhook integration adds Keep as a contact point in the Grafana instance. This integration can be located under the "Contact Points" section. 
Keep also gains access to the following scopes: diff --git a/keep/providers/argocd_provider/argocd_provider.py b/keep/providers/argocd_provider/argocd_provider.py index 897a1ad59..5fed4a460 100644 --- a/keep/providers/argocd_provider/argocd_provider.py +++ b/keep/providers/argocd_provider/argocd_provider.py @@ -234,4 +234,4 @@ def pull_topology(self): node["uid"] ] = "unknown" - return list(service_topology.values()) + return list(service_topology.values()), {} diff --git a/keep/providers/grafana_provider/grafana_provider.py b/keep/providers/grafana_provider/grafana_provider.py index dad11c42e..ccc56116b 100644 --- a/keep/providers/grafana_provider/grafana_provider.py +++ b/keep/providers/grafana_provider/grafana_provider.py @@ -12,9 +12,10 @@ from packaging.version import Version from keep.api.models.alert import AlertDto, AlertSeverity, AlertStatus +from keep.api.models.db.topology import TopologyServiceInDto from keep.contextmanager.contextmanager import ContextManager from keep.exceptions.provider_exception import ProviderException -from keep.providers.base.base_provider import BaseProvider +from keep.providers.base.base_provider import BaseProvider, BaseTopologyProvider from keep.providers.base.provider_exceptions import GetAlertException from keep.providers.grafana_provider.grafana_alert_format_description import ( GrafanaAlertFormatDescription, @@ -47,11 +48,18 @@ class GrafanaProviderAuthConfig: "validation": "any_http_url", }, ) + datasource_uid: str = dataclasses.field( + metadata={ + "required": False, + "description": "Datasource UID", + "hint": "Provide if you want to pull topology data", + }, + ) -class GrafanaProvider(BaseProvider): +class GrafanaProvider(BaseTopologyProvider): PROVIDER_DISPLAY_NAME = "Grafana" - """Pull/Push alerts from Grafana.""" + """Pull/Push alerts & Topology map from Grafana.""" PROVIDER_CATEGORY = ["Monitoring", "Developer Tools"] KEEP_GRAFANA_WEBHOOK_INTEGRATION_NAME = "keep-grafana-webhook-integration" @@ -856,6 +864,126 @@ 
def simulate_alert(cls, **kwargs) -> dict: return {"keep_source_type": "grafana", "event": final_payload} return final_payload + def query_datasource_for_topology(self): + self.logger.info("Attempting to query datasource for topology data.") + headers = {"Authorization": f"Bearer {self.authentication_config.token}", "Content-Type": "application/json",} + json_data = { + "queries": [ + { + "format": "table", + "refId": "traces_service_graph_request_total", + "expr": "sum by (client, server) (rate(traces_service_graph_request_total[3600s]))", + "instant": True, + "exemplar": False, + "requestId": "service_map_request", + "utcOffsetSec": 19800, + "interval": "", + "legendFormat": "", + "datasource": { + "uid": self.authentication_config.datasource_uid, + }, + "datasourceId": 1, + "intervalMs": 5000, + "maxDataPoints": 954, + }, + { + "format": "table", + "refId": "traces_service_graph_request_server_seconds_sum", + "expr": "sum by (client, server) (rate(traces_service_graph_request_server_seconds_sum[3600s]))", + "instant": True, + "exemplar": False, + "requestId": "service_map_request_avg", + "utcOffsetSec": 19800, + "interval": "", + "legendFormat": "", + "datasource": { + "uid": self.authentication_config.datasource_uid, + }, + "datasourceId": 1, + "intervalMs": 5000, + "maxDataPoints": 954, + }, + ], + "to": "now", + } + try: + response = requests.post( + f"{self.authentication_config.host}/api/ds/query", + verify=False, + headers=headers, + json=json_data, + timeout=10, + ) + if response.status_code != 200: + raise Exception(response.text) + return response.json() + except Exception as e: + self.logger.error("Error while querying datasource for topology map", extra={"exception": str(e)}) + + @staticmethod + def __extract_schema_value_pair(results, query: str): + client_server_data = {} + for frames in results.get(query, {}).get("frames", []): + value_index = 0 + for fields in frames.get("schema", {}).get("fields", []): + if ( + "labels" in fields + and "client" 
in fields["labels"] + and "server" in fields["labels"] + ): + client_server_data[ + (fields["labels"]["client"], fields["labels"]["server"]) + ] = float(frames["data"]["values"][value_index][0]) + break + value_index += 1 + return client_server_data + + def pull_topology(self): + self.logger.info("Pulling Topology data from Grafana...") + try: + service_topology = {} + results = self.query_datasource_for_topology().get("results", {}) + + self.logger.info("Scraping traces_service_graph_request_total data from the response") + requests_per_second_data = GrafanaProvider.__extract_schema_value_pair( + results=results, query="traces_service_graph_request_total" + ) + + self.logger.info("Scraping traces_service_graph_request_server_seconds_sum data from the response") + total_response_times_data = GrafanaProvider.__extract_schema_value_pair( + results=results, query="traces_service_graph_request_server_seconds_sum" + ) + + self.logger.info("Building Topology map.") + for client_server in requests_per_second_data: + client, server = client_server + requests_per_second = requests_per_second_data[client_server] + total_response_time = total_response_times_data.get(client_server, None) + + if client not in service_topology: + service_topology[client] = TopologyServiceInDto( + source_provider_id=self.provider_id, + service=client, + display_name=client, + ) + if server not in service_topology: + service_topology[server] = TopologyServiceInDto( + source_provider_id=self.provider_id, + service=server, + display_name=server, + ) + + service_topology[client].dependencies[server] = ( + "unknown" + if total_response_time is None + else f"{round(requests_per_second, 2)}r/sec || {round((total_response_time / requests_per_second) * 1000, 2)}ms/r" + ) + self.logger.info("Successfully pulled Topology data from Grafana...") + return list(service_topology.values()), {} + except Exception as e: + self.logger.error("Error while pulling topology data from Grafana", extra={"exception": 
str(e)}) + raise e + if __name__ == "__main__": # Output debug messages From 391b85158e1fc7eaeece058c2ccbf4fa49b8050c Mon Sep 17 00:00:00 2001 From: 35C4n0r Date: Wed, 29 Jan 2025 21:47:17 +0530 Subject: [PATCH 2/2] chore: quickstart instructions Signed-off-by: 35C4n0r --- keep/providers/grafana_provider/README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/keep/providers/grafana_provider/README.md b/keep/providers/grafana_provider/README.md index f82746dd9..145b33ddd 100644 --- a/keep/providers/grafana_provider/README.md +++ b/keep/providers/grafana_provider/README.md @@ -56,3 +56,7 @@ curl -X POST -H "Content-Type: application/json" \ # and get {"id":1,"name":"keep-token","key":"glsa_XXXXXX"}% ``` + +### For Topology Quickstart +Follow this guide: +https://grafana.com/docs/tempo/latest/getting-started/docker-example/ \ No newline at end of file