diff --git a/monitor/aws/docker-compose.yml b/monitor/aws/docker-compose.yml index 41b87b1..19a4362 100644 --- a/monitor/aws/docker-compose.yml +++ b/monitor/aws/docker-compose.yml @@ -28,6 +28,7 @@ services: - 'fc:172.31.43.254' - "monitor:${SERVER_IP}" - 'ac:172.31.20.115' + - 'dc:172.31.25.96' command: - '--config.file=/etc/prometheus/prometheus.yml' - '--storage.tsdb.path=/prometheus' @@ -48,6 +49,7 @@ services: extra_hosts: - 'fc:172.31.43.254' - "monitor:${SERVER_IP}" + - 'dc:172.31.25.96' environment: - "GF_AUTH_ANONYMOUS_ENABLED=true" - "GF_AUTH_ANONYMOUS_ORG_NAME=${GRAFANA_ORG_NAME}" diff --git a/monitor/aws/grafana/provisioning/dashboards/domain_crawler_dashboard.json b/monitor/aws/grafana/provisioning/dashboards/domain_crawler_dashboard.json new file mode 100644 index 0000000..bde2ba1 --- /dev/null +++ b/monitor/aws/grafana/provisioning/dashboards/domain_crawler_dashboard.json @@ -0,0 +1,890 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "id": 17, + "links": [], + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "DC AWS Embedded Prometheus", + "description": "Compares the last hour's queue activity with the same hour yesterday.", + "fill": 1, + "gridPos": { + "h": 5, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 17, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": 
"sum(delta(kafka_log_logendoffset[1h] offset 1d)) by (topic) - sum(delta(kafka_log_logendoffset[1h])) by (topic)", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "24hr Offset Activity Variation", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "DC AWS Embedded Prometheus", + "fill": 1, + "gridPos": { + "h": 5, + "w": 9, + "x": 0, + "y": 5 + }, + "id": 11, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(kafka_crawl_messages_total) by (topic, outcome)", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{ topic }} {{ outcome }}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Kafka Message Handling", + "tooltip": { + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + 
"buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "DC AWS Embedded Prometheus", + "fill": 1, + "gridPos": { + "h": 5, + "w": 9, + "x": 9, + "y": 5 + }, + "id": 12, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(kafka_crawl_messages_total[15m])) by (topic, outcome)", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{ topic }} {{ outcome }}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Kafka Message Handling Rates", + "tooltip": { + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, 
+ "datasource": "DC AWS Embedded Prometheus", + "fill": 1, + "gridPos": { + "h": 5, + "w": 6, + "x": 18, + "y": 5 + }, + "hideTimeOverride": false, + "id": 15, + "interval": "", + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(kafka_log_logendoffset - 1) by (topic) - sum(kafka_partition_offsets) by (topic)", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{ topic }}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Candidates Lag", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "DC AWS Embedded Prometheus", + "fill": 1, + "gridPos": { + "h": 5, + "w": 6, + "x": 0, + "y": 10 + }, + "id": 6, + "legend": { + "alignAsTable": true, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + 
"seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(heritrix3_crawl_job_uris_total) by (kind)", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{ kind }}", + "refId": "A" + }, + { + "expr": "scalar(heritrix3_crawl_job_uris_total{kind='queued'}) - scalar(heritrix3_crawl_job_uris_total{kind='downloaded'})", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "PENDING", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "URI Totals", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "DC AWS Embedded Prometheus", + "fill": 1, + "gridPos": { + "h": 5, + "w": 6, + "x": 6, + "y": 10 + }, + "id": 7, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(heritrix3_crawl_job_uris_total{kind='queued'}[10m])) by (kind)", + "format": "time_series", + "hide": true, + "interval": 
"", + "intervalFactor": 1, + "legendFormat": "{{ kind }}", + "refId": "C" + }, + { + "expr": "sum(rate(heritrix3_crawl_job_uris_total{kind='downloaded'}[10m])) by (kind)", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{ kind }}", + "refId": "A" + }, + { + "expr": "sum(rate(heritrix3_crawl_job_uris_total{kind='novel'}[10m])) by (kind)", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{ kind }}", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Downloaded URI Rates", + "tooltip": { + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "DC AWS Embedded Prometheus", + "fill": 1, + "gridPos": { + "h": 5, + "w": 12, + "x": 12, + "y": 10 + }, + "id": 8, + "legend": { + "alignAsTable": true, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(heritrix3_crawl_job_bytes_total[1h])) by (kind)", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{ kind }}", + "refId": "A" 
+ } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Bytes Download Rate", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "DC AWS Embedded Prometheus", + "fill": 1, + "gridPos": { + "h": 5, + "w": 12, + "x": 0, + "y": 15 + }, + "id": 2, + "legend": { + "alignAsTable": true, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(heritrix3_crawl_job_queues_total{kind!='total',kind!='exhausted'}) by (kind)", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{ kind }}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Queues (excluding Total and Exhausted)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": 
null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "DC AWS Embedded Prometheus", + "fill": 1, + "gridPos": { + "h": 5, + "w": 12, + "x": 12, + "y": 15 + }, + "id": 4, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(heritrix3_crawl_job_threads_total) by (kind)", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{ kind }}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Crawl Threads", + "tooltip": { + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "refresh": "1m", + "schemaVersion": 18, + "style": "dark", + "tags": [], + "templating": { + "list": [] + }, + "time": { + "from": "now-7d", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", 
+ "2d", + "7d", + "30d" + ] + }, + "timezone": "", + "title": "Domain Crawler Dashboard", + "uid": "wadc1", + "version": 1 +} diff --git a/monitor/aws/grafana/provisioning/datasources/domain_crawl.yaml-template b/monitor/aws/grafana/provisioning/datasources/domain_crawl.yaml-template new file mode 100644 index 0000000..df42438 --- /dev/null +++ b/monitor/aws/grafana/provisioning/datasources/domain_crawl.yaml-template @@ -0,0 +1,13 @@ +# Instances of grafana will only update datasource configurations +# for apiVersion numbers of same or lower. Thus, a new lower number +# will not affect higher number configuration settings. +# Typically BLUKWA keep this set to 1 +apiVersion: 1 + +datasources: + - name: 'DC AWS Embedded Prometheus' + type: "prometheus" + access: "proxy" + orgId: 1 + url: "http://${DC_EMBEDDED_SERVICE_IP}:9191" + isDefault: false diff --git a/monitor/aws/prometheus/alert.rules.yml b/monitor/aws/prometheus/alert.rules.yml index dde2527..01a920c 100644 --- a/monitor/aws/prometheus/alert.rules.yml +++ b/monitor/aws/prometheus/alert.rules.yml @@ -1,5 +1,5 @@ groups: -- name: UKWA metrics +- name: FC metrics rules: - alert: low_crawler_activity # The NPLD crawler is busier: @@ -31,6 +31,27 @@ groups: description: "The frequent crawls do not appear to be running as fast as it should be." +- name: DC metrics + rules: + - alert: low_dc_crawler_activity + expr: increase(heritrix3_crawl_job_uris_total{kind="finished", jobname="dc2024"}[4h]) < 10 + for: 10m + labels: + severity: severe + annotations: + description: Low crawl rate for crawl job {{ $labels.jobname }}. + summary: The {{ $labels.jobname }} crawl job is crawling too slowly. + + - alert: dc_crawl_is_slow + expr: sum(rate(kafka_log_logendoffset{topic="dc.crawled"}[10m])) < 5 + for: 30m + labels: + severity: severe + annotations: + summary: "The domain crawl is not running as fast as expected!" + description: "The domain crawl does not appear to be running as fast as it should be." 
+ + - name: Generic metrics rules: diff --git a/monitor/aws/prometheus/prometheus.yml-template b/monitor/aws/prometheus/prometheus.yml-template index 57539e5..8674a20 100644 --- a/monitor/aws/prometheus/prometheus.yml-template +++ b/monitor/aws/prometheus/prometheus.yml-template @@ -28,6 +28,7 @@ scrape_configs: - "${PROMETHEUS_SERVICE_NAME}:9100" - "${FC_PROMETHEUS_SERVICE_NAME}:9100" - "${AC_PROMETHEUS_SERVICE_NAME}:9100" + - "${DC_PROMETHEUS_SERVICE_NAME}:9100" - job_name: 'federate' scrape_interval: 20s @@ -38,6 +39,9 @@ scrape_configs: - '{job="npld-heritrix-workers"}' - '{job="bypm-heritrix-workers"}' - '{job="kafka"}' + - '{job="npld-dc-heritrix-worker"}' + - '{job="kafka-1"}' static_configs: - targets: - "${FC_PROMETHEUS_SERVICE_NAME}:9191" + - "${DC_PROMETHEUS_SERVICE_NAME}:9191" diff --git a/monitor/aws/start_monitor.sh b/monitor/aws/start_monitor.sh index 43be7d2..75c4098 100755 --- a/monitor/aws/start_monitor.sh +++ b/monitor/aws/start_monitor.sh @@ -52,6 +52,7 @@ export GRAFANA_SERVICE_NAME='monitor' export GRAFANA_PORT=3000 export GRAFANA_DATA=${STORAGE_PATH}/grafana export FC_EMBEDDED_SERVICE_IP='172.31.43.254' +export DC_EMBEDDED_SERVICE_IP='172.31.25.96' export GRAFANA_ORG_NAME='blukwa' [[ -d ${GRAFANA_DATA}/ ]] || mkdir -p ${GRAFANA_DATA} chown -R ${USER}:${USER} ${GRAFANA_DATA} @@ -59,6 +60,7 @@ envsubst < ./grafana/grafana.ini-template > ./grafana/grafana.ini envsubst < ./grafana/provisioning/dashboards/blukwa.yaml-template > ./grafana/provisioning/dashboards/blukwa.yaml envsubst < ./grafana/provisioning/datasources/prometheus.yaml-template > ./grafana/provisioning/datasources/prometheus.yaml envsubst < ./grafana/provisioning/datasources/frequent_crawl.yaml-template > ./grafana/provisioning/datasources/frequent_crawl.yaml +envsubst < ./grafana/provisioning/datasources/domain_crawl.yaml-template > ./grafana/provisioning/datasources/domain_crawl.yaml # start monitoring stacks