Add Get Alarm Runtime Status API. (#13028)

apache · Feb 12, 2025 · 0cd4354 · 0cd4354
1 parent b7e961b
commit 0cd4354
Show file tree

Hide file tree

Showing 10 changed files with 389 additions and 1 deletion.
diff --git a/docs/en/changes/changes.md b/docs/en/changes/changes.md
@@ -70,6 +70,8 @@
 * Add type descriptor when converting Envoy logs to JSON for persistence, to avoid conversion error.
 * Baseline: Support query baseline with MQE and use in the Alarm Rule.
 * Bump up netty to 4.1.118 to fix CVE-2025-24970.
+* Add `Get Alarm Runtime Status` API.
+* Add `lock` when querying the Alarm metrics window values.
 
 #### UI
 

diff --git a/docs/en/status/query_alarm_runtime_status.md b/docs/en/status/query_alarm_runtime_status.md
@@ -0,0 +1,161 @@
+# Get Alarm Runtime Status
+
+OAP calculates the alarm conditions in memory, based on the alarm rules and the metrics data.
+The following APIs are exposed to make the running state of the alerting kernel visible.
+
+## Get Alarm Running Rules
+
+Return the list of alarm running rules.
+
+- URL, `http://{core restHost}:{core restPort}/status/alarm/rules`
+- HTTP GET method.
+
+```json
+{
+  "ruleNames": [
+    "service_percentile_rule",
+    "service_resp_time_rule"
+  ]
+}
+```
+
+## Get Alarm Running Rule Info
+
+Return the detailed information of the alarm running rule.
+
+- URL, `http://{core restHost}:{core restPort}/status/alarm/rules/{ruleName}`
+- HTTP GET method.
+
+```json
+{
+  "ruleName": "service_resp_time_rule",
+  "expression": "sum(service_resp_time > baseline(service_resp_time,upper)) >= 1",
+  "period": 10,
+  "silentPeriod": 10,
+  "additonalPeriod": 0,
+  "includeNames": [
+    "mock_a_service",
+    "mock_b_service",
+    "mock_c_service"
+  ],
+  "excludeNames": [],
+  "includeNamesRegex": "",
+  "excludeNamesRegex": "",
+  "affectedEntities": [
+    {
+      "scope": "SERVICE",
+      "name": "mock_b_service"
+    },
+    {
+      "scope": "SERVICE",
+      "name": "mock_a_service"
+    },
+    {
+      "scope": "SERVICE",
+      "name": "mock_c_service"
+    }
+  ],
+  "tags": [
+    {
+      "key": "level",
+      "value": "WARNING"
+    }
+  ],
+  "hooks": [
+    "webhook.default",
+    "wechat.default"
+  ],
+  "includeMetrics": [
+    "service_resp_time"
+  ],
+  "formattedMessages": [
+    {
+      "mock_b_service": "Response time of service mock_b_service is more than upper baseline in 1 minutes of last 10 minutes."
+    },
+    {
+      "mock_a_service": "Response time of service mock_a_service is more than upper baseline in 1 minutes of last 10 minutes."
+    },
+    {
+      "mock_c_service": "Response time of service mock_c_service is more than upper baseline in 1 minutes of last 10 minutes."
+    }
+  ]
+}
+```
+
+- `additonalPeriod` is the additional period if the expression includes the [increase/rate function](../api/metrics-query-expression.md#trend-operation).
+This additional period is used to enlarge window size for calculating the trend value.
+- `affectedEntities` are the entities that have metrics data and are being calculated by the alarm rule.
+- `formattedMessages` are the result messages generated from the message template and the affected entities.
+
+## Get Alarm Running Context
+
+Return the running context of the alarm rule.
+
+- URL, `http://{core restHost}:{core restPort}/status/alarm/{ruleName}/{entityName}`
+- HTTP GET method.
+
+```json
+{
+  "expression": "sum(service_resp_time > baseline(service_resp_time,upper)) >= 1",
+  "endTime": "2025-02-12T13:39:00.000",
+  "additionalPeriod": 0,
+  "size": 10,
+  "silenceCountdown": 10,
+  "windowValues": [
+    {
+      "index": 0,
+      "metrics": []
+    },
+    {
+      "index": 1,
+      "metrics": []
+    },
+    {
+      "index": 2,
+      "metrics": []
+    },
+    {
+      "index": 3,
+      "metrics": []
+    },
+    {
+      "index": 4,
+      "metrics": []
+    },
+    {
+      "index": 5,
+      "metrics": []
+    },
+    {
+      "index": 6,
+      "metrics": []
+    },
+    {
+      "index": 7,
+      "metrics": [
+        {
+          "timeBucket": 202502121437,
+          "name": "service_resp_time",
+          "value": "6000"
+        }
+      ]
+    },
+    {
+      "index": 8,
+      "metrics": []
+    },
+    {
+      "index": 9,
+      "metrics": []
+    }
+  ],
+  "mqeMetricsSnapshot": {
+    "service_resp_time": "[{\"metric\":{\"labels\":[]},\"values\":[{\"id\":\"202502121430\",\"doubleValue\":0.0,\"isEmptyValue\":true},{\"id\":\"202502121431\",\"doubleValue\":0.0,\"isEmptyValue\":true},{\"id\":\"202502121432\",\"doubleValue\":0.0,\"isEmptyValue\":true},{\"id\":\"202502121433\",\"doubleValue\":0.0,\"isEmptyValue\":true},{\"id\":\"202502121434\",\"doubleValue\":0.0,\"isEmptyValue\":true},{\"id\":\"202502121435\",\"doubleValue\":0.0,\"isEmptyValue\":true},{\"id\":\"202502121436\",\"doubleValue\":0.0,\"isEmptyValue\":true},{\"id\":\"202502121437\",\"doubleValue\":6000.0,\"isEmptyValue\":false},{\"id\":\"202502121438\",\"doubleValue\":0.0,\"isEmptyValue\":true},{\"id\":\"202502121439\",\"doubleValue\":0.0,\"isEmptyValue\":true}]}]",
+    "baseline(service_resp_time,upper)": "[{\"metric\":{\"labels\":[]},\"values\":[{\"id\":\"202502121430\",\"doubleValue\":10.0,\"isEmptyValue\":false},{\"id\":\"202502121431\",\"doubleValue\":10.0,\"isEmptyValue\":false},{\"id\":\"202502121432\",\"doubleValue\":10.0,\"isEmptyValue\":false},{\"id\":\"202502121433\",\"doubleValue\":10.0,\"isEmptyValue\":false},{\"id\":\"202502121434\",\"doubleValue\":10.0,\"isEmptyValue\":false},{\"id\":\"202502121435\",\"doubleValue\":10.0,\"isEmptyValue\":false},{\"id\":\"202502121436\",\"doubleValue\":10.0,\"isEmptyValue\":false},{\"id\":\"202502121437\",\"doubleValue\":10.0,\"isEmptyValue\":false},{\"id\":\"202502121438\",\"doubleValue\":10.0,\"isEmptyValue\":false},{\"id\":\"202502121439\",\"doubleValue\":10.0,\"isEmptyValue\":false}]}]"
+  }
+}
+```
+- `size` is the window size. It is equal to `period + additionalPeriod`.
+- `silenceCountdown` is the countdown of the silence period. `-1` means the silence countdown is not running.
+- `windowValues` is the original metrics data. The `index` is the index of the window, starting from 0.
+- `mqeMetricsSnapshot` is the metrics data in the MQE format. When checking conditions, this data is calculated according to the expression.
diff --git a/docs/en/status/status_apis.md b/docs/en/status/status_apis.md
@@ -11,6 +11,7 @@ logs and self-observability solutions.
 - [Tracing Query Execution APIs](../debugging/query-tracing.md)
 - [Get Effective TTL Configurations API](query_ttl_setup.md)
 - [Query Cluster Nodes API](query_cluster_nodes.md)
+- [Get Alarm Runtime Status API](query_alarm_runtime_status.md)
 
 If you have a proposal about new status API, please don't hesitate
 to [create a discussion](https://github.com/apache/skywalking/discussions/new?category=ideas).

diff --git a/docs/menu.yml b/docs/menu.yml
@@ -346,6 +346,8 @@ catalog:
                 path: "/en/status/query_ttl_setup"
               - name: "Get Node List in the Cluster"
                 path: "/en/status/query_cluster_nodes"
+              - name: "Get Alarm Runtime Status"
+                path: "/en/status/query_alarm_runtime_status"
   - name: "Customization"
     catalog:
       - name: "Overview"

diff --git a/...main/java/org/apache/skywalking/oap/server/core/alarm/provider/AlarmMessageFormatter.java b/...main/java/org/apache/skywalking/oap/server/core/alarm/provider/AlarmMessageFormatter.java
@@ -20,6 +20,7 @@
 
 import java.util.ArrayList;
 import java.util.List;
+import lombok.Getter;
 
 /**
  * This is a formatter especially for alarm message.
@@ -28,6 +29,7 @@
  * <p>
  * - Successful rate of endpoint {name} is lower than 75%
  */
+@Getter
 public class AlarmMessageFormatter {
     private List<String> formatSegments;
     private List<ValueFrom> valueFroms;
@@ -88,7 +90,7 @@ public String format(AlarmEntity alarmEntity) {
         return message.toString();
     }
 
-    private enum ValueFrom {
+    public enum ValueFrom {
         ID, NAME
     }
 }
diff --git a/...c/main/java/org/apache/skywalking/oap/server/core/alarm/provider/AlarmModuleProvider.java b/...c/main/java/org/apache/skywalking/oap/server/core/alarm/provider/AlarmModuleProvider.java
@@ -20,6 +20,7 @@
 
 import java.io.FileNotFoundException;
 import java.io.Reader;
+import lombok.Getter;
 import org.apache.skywalking.oap.server.configuration.api.ConfigurationModule;
 import org.apache.skywalking.oap.server.configuration.api.DynamicConfigurationService;
 import org.apache.skywalking.oap.server.core.CoreModule;
@@ -35,6 +36,7 @@
 public class AlarmModuleProvider extends ModuleProvider {
 
     private NotifyHandler notifyHandler;
+    @Getter
     private AlarmRulesWatcher alarmRulesWatcher;
 
     @Override

diff --git a/...lugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/RunningRule.java b/...lugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/RunningRule.java
@@ -29,8 +29,10 @@
 import java.util.Set;
 import java.util.concurrent.ConcurrentHashMap;
 import java.util.concurrent.locks.ReentrantLock;
+import java.util.function.Consumer;
 import java.util.regex.Pattern;
 import java.util.stream.Collectors;
+import lombok.Getter;
 import lombok.RequiredArgsConstructor;
 import lombok.ToString;
 import lombok.extern.slf4j.Slf4j;
@@ -68,6 +70,7 @@
  * RunningRule represents each rule in running status. Based on the {@link AlarmRule} definition,
  */
 @Slf4j
+@Getter
 public class RunningRule {
     private static DateTimeFormatter TIME_BUCKET_FORMATTER = DateTimeFormat.forPattern("yyyyMMddHHmm");
 
@@ -243,12 +246,17 @@ public List<AlarmMessage> check() {
      * buckets.
      */
     public class Window {
+        @Getter
         private LocalDateTime endTime;
+        @Getter
         private final int additionalPeriod;
+        @Getter
         private final int size;
+        @Getter
         private int silenceCountdown;
         private LinkedList<Map<String, Metrics>> values;
         private ReentrantLock lock = new ReentrantLock();
+        @Getter
         private JsonObject mqeMetricsSnapshot;
         private AlarmEntity entity;
 
@@ -356,6 +364,7 @@ public Optional<AlarmMessage> checkAlarm() {
         }
 
         private boolean isMatch() {
+            this.lock.lock();
             int isMatch = 0;
             try {
                 TRACE_CONTEXT.set(new DebuggingTraceContext(expression, false, false));
@@ -407,6 +416,7 @@ private boolean isMatch() {
                 this.mqeMetricsSnapshot = visitor.getMqeMetricsSnapshot();
                 return isMatch == 1;
             } finally {
+                this.lock.unlock();
                 TRACE_CONTEXT.remove();
             }
         }
@@ -422,6 +432,15 @@ public boolean isExpired() {
             return true;
         }
 
+        /**
+         * Runs the given function against this window's metrics values while holding the window lock.
+         * <p>
+         * The lock is acquired before the function is invoked and is always released afterwards,
+         * even if the function throws. The function receives the internal {@code values} list itself,
+         * not a copy, and it executes while the lock is held.
+         *
+         * @param scanFunction the callback that receives the window values list
+         */
+        public void scanWindowValues(Consumer<LinkedList<Map<String, Metrics>>> scanFunction) {
+            lock.lock();
+            try {
+                scanFunction.accept(values);
+            } finally {
+                // Always release the lock, including when the callback throws.
+                lock.unlock();
+            }
+        }
+
         private void init() {
             values = new LinkedList<>();
             for (int i = 0; i < size; i++) {

diff --git a/oap-server/server-query-plugin/status-query-plugin/pom.xml b/oap-server/server-query-plugin/status-query-plugin/pom.xml
@@ -44,5 +44,10 @@
             <artifactId>zipkin-query-plugin</artifactId>
             <version>${project.version}</version>
         </dependency>
+        <dependency>
+            <groupId>org.apache.skywalking</groupId>
+            <artifactId>server-alarm-plugin</artifactId>
+            <version>${project.version}</version>
+        </dependency>
     </dependencies>
 </project>