-
Notifications
You must be signed in to change notification settings - Fork 125
/
Copy pathprometheus.ConfigMap.yaml
405 lines (379 loc) · 17.3 KB
/
prometheus.ConfigMap.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
# Kubernetes ConfigMap whose `data` entries are the Prometheus configuration
# file (prometheus.yml) and the rule files it loads.
apiVersion: v1
data:
# Main Prometheus configuration.
prometheus.yml: |
global:
# Scrape targets and evaluate rules every 30 seconds.
scrape_interval: 30s
evaluation_interval: 30s
alerting:
alertmanagers:
# Discover Alertmanager through Kubernetes endpoints and keep only the
# endpoints whose service name matches `alertmanager`.
- kubernetes_sd_configs:
- role: endpoints
relabel_configs:
- source_labels: [__meta_kubernetes_service_name]
regex: alertmanager
action: keep
# Rule files are selected by glob. The other keys of this ConfigMap
# (alert_rules.yml, extra_rules.yml, node_rules.yml, sourcegraph_rules.yml)
# all fit the `*_rules.yml` pattern; the two absolute directories are
# presumably mounted from elsewhere - verify against the Deployment.
rule_files:
- '*_rules.yml'
- "/sg_config_prometheus/*_rules.yml"
- "/sg_prometheus_add_ons/*_rules.yml"
# A scrape configuration for running Prometheus on a Kubernetes cluster.
# This uses separate scrape configs for cluster components (i.e. API server, node)
# and services to allow each to use different authentication configs.
#
# Kubernetes labels will be added as Prometheus labels on metrics via the
# `labelmap` relabeling action.
#
# Scrape config for API servers.
#
# Kubernetes exposes API servers as endpoints to the default/kubernetes
# service so this uses `endpoints` role and uses relabelling to only keep
# the endpoints associated with the default/kubernetes service using the
# default named port `https`. This works for single API server deployments as
# well as HA API server deployments.
scrape_configs:
- job_name: 'kubernetes-apiservers'
kubernetes_sd_configs:
- role: endpoints
# Default to scraping over https. If required, just disable this or change to
# `http`.
scheme: https
# This TLS & bearer token file config is used to connect to the actual scrape
# endpoints for cluster components. This is separate to discovery auth
# configuration because discovery & scraping are two separate concerns in
# Prometheus. The discovery auth config is automatic if Prometheus runs inside
# the cluster. Otherwise, more config options have to be provided within the
# <kubernetes_sd_config>.
tls_config:
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
# If your node certificates are self-signed or use a different CA to the
# master CA, then disable certificate verification below. Note that
# certificate verification is an integral part of a secure infrastructure
# so this should only be disabled in a controlled environment. You can
# disable certificate verification by uncommenting the line below.
#
# insecure_skip_verify: true
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
# Keep only the default/kubernetes service endpoints for the https port. This
# will add targets for each API server which Kubernetes adds an endpoint to
# the default/kubernetes service.
#
# The three source label values are joined with `;` before matching, which
# is why the regex is written as `default;kubernetes;https`.
relabel_configs:
- source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
action: keep
regex: default;kubernetes;https
# Scrape config for nodes.
#
# Targets are discovered with the `node` role, but the relabel rules below
# point every scrape at kubernetes.default.svc:443 with the metrics path
# /api/v1/nodes/<node name>/proxy/metrics instead of the node's own address.
- job_name: 'kubernetes-nodes'
# Default to scraping over https. If required, just disable this or change to
# `http`.
scheme: https
# This TLS & bearer token file config is used to connect to the actual scrape
# endpoints for cluster components. This is separate to discovery auth
# configuration because discovery & scraping are two separate concerns in
# Prometheus. The discovery auth config is automatic if Prometheus runs inside
# the cluster. Otherwise, more config options have to be provided within the
# <kubernetes_sd_config>.
tls_config:
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
# Certificate verification is an integral part of a secure infrastructure
# and should only be disabled in a controlled environment, e.g. when node
# certificates are self-signed or use a different CA to the master CA.
#
# NOTE: verification IS disabled for this job by the line below. Remove the
# line (or set it to false) to verify against `ca_file` again.
insecure_skip_verify: true
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
kubernetes_sd_configs:
- role: node
relabel_configs:
# Copy every Kubernetes node label onto the target.
- action: labelmap
regex: __meta_kubernetes_node_label_(.+)
# Send the scrape to the in-cluster API server address ...
- target_label: __address__
replacement: kubernetes.default.svc:443
# ... using the per-node proxy path built from the node name.
- source_labels: [__meta_kubernetes_node_name]
regex: (.+)
target_label: __metrics_path__
replacement: /api/v1/nodes/${1}/proxy/metrics
# Scrape config for service endpoints.
#
# The relabeling allows the actual service scrape endpoint to be configured
# via the following service annotations:
#
# * `sourcegraph.prometheus/scrape`: Only scrape services that have a value of `true`
# * `prometheus.io/scheme`: If the metrics endpoint is secured then you will need
# to set this to `https` & most likely set the `tls_config` of the scrape config.
# * `prometheus.io/path`: If the metrics path is not `/metrics` override this.
# * `prometheus.io/port`: If the metrics are exposed on a different port to the
# service then set this appropriately.
- job_name: 'kubernetes-service-endpoints'
kubernetes_sd_configs:
- role: endpoints
relabel_configs:
# Opt-in: drop every endpoint whose service is not annotated for scraping.
- source_labels: [__meta_kubernetes_service_annotation_sourcegraph_prometheus_scrape]
action: keep
regex: true
# Optional scheme override; only `http` or `https` is accepted.
- source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme]
action: replace
target_label: __scheme__
regex: (https?)
# Optional metrics path override.
- source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path]
action: replace
target_label: __metrics_path__
regex: (.+)
# Optional port override: replace the port of the discovered address with
# the annotated one.
- source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port]
action: replace
target_label: __address__
regex: (.+)(?::\d+);(\d+)
replacement: $1:$2
# Copy every Kubernetes service label onto the target (this is where the
# `app` label used further down comes from).
- action: labelmap
regex: __meta_kubernetes_service_label_(.+)
- source_labels: [__meta_kubernetes_namespace]
action: replace
# Sourcegraph specific customization. We want a more convenient to type label.
# target_label: kubernetes_namespace
target_label: ns
- source_labels: [__meta_kubernetes_service_name]
action: replace
target_label: kubernetes_name
# Sourcegraph specific customization. We want a nicer name for job
- source_labels: [app]
action: replace
target_label: job
# Sourcegraph specific customization. We want a nicer name for instance
- source_labels: [__meta_kubernetes_pod_name]
action: replace
target_label: instance
# Scrape config for federated Prometheus instance endpoints. This is currently only
# used to proxy the metrics from multiple processes inside the precise code intel
# containers to be made available (as we can't have Prometheus scrape a dynamic port
# range based on a ConfigMap value).
#
# The relabeling allows the actual service scrape endpoint to be configured
# via the following service annotations:
#
# * `sourcegraph.prometheus/federate`: Only scrape services that have a value of `true`
# * `prometheus.io/scheme`: If the metrics endpoint is secured then you will need
# to set this to `https` & most likely set the `tls_config` of the scrape config.
# * `prometheus.io/port`: The port the Prometheus cluster is exposed on (usually 9090).
#
# TODO(efritz,uwedeportivo) - see if the value for 'match[]' can be supplied as an
# annotation so that this selector can be generalized if we want to use federation for
# another application in the future.
- job_name: 'kubernetes-federate'
# Keep the labels delivered by the federated instance when they clash with
# labels attached by this scrape.
honor_labels: true
metrics_path: '/federate'
params:
# Only pull series whose metric name starts with `lsif_`.
'match[]':
- '{__name__=~"lsif_.*"}'
kubernetes_sd_configs:
- role: endpoints
relabel_configs:
# Opt-in: drop every endpoint whose service is not annotated for federation.
- source_labels: [__meta_kubernetes_service_annotation_sourcegraph_prometheus_federate]
action: keep
regex: true
# Optional scheme override; only `http` or `https` is accepted.
- source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme]
action: replace
target_label: __scheme__
regex: (https?)
# Optional port override: replace the port of the discovered address with
# the annotated one.
- source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port]
action: replace
target_label: __address__
regex: (.+)(?::\d+);(\d+)
replacement: $1:$2
# Copy every Kubernetes service label onto the target (this is where the
# `app` label used further down comes from).
- action: labelmap
regex: __meta_kubernetes_service_label_(.+)
- source_labels: [__meta_kubernetes_namespace]
action: replace
# Sourcegraph specific customization. We want a more convenient to type label.
# target_label: kubernetes_namespace
target_label: ns
- source_labels: [__meta_kubernetes_service_name]
action: replace
target_label: kubernetes_name
# Sourcegraph specific customization. We want a nicer name for job
- source_labels: [app]
action: replace
target_label: job
# Sourcegraph specific customization. We want a nicer name for instance
- source_labels: [__meta_kubernetes_pod_name]
action: replace
target_label: instance
# Example scrape config for probing services via the Blackbox Exporter.
#
# The relabeling allows the actual service scrape endpoint to be configured
# via the following annotations:
#
# * `prometheus.io/probe`: Only probe services that have a value of `true`
- job_name: 'kubernetes-services'
  metrics_path: /probe
  params:
    module: [http_2xx]
  kubernetes_sd_configs:
  - role: service
  relabel_configs:
  # Opt-in: drop every service that is not annotated for probing.
  - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_probe]
    action: keep
    regex: true
  # Hand the discovered service address to the exporter as the `target`
  # URL parameter ...
  - source_labels: [__address__]
    target_label: __param_target
  # ... and send the actual scrape to the `blackbox` host instead.
  - target_label: __address__
    replacement: blackbox
  # Report the probed address, not the exporter, as the instance.
  - source_labels: [__param_target]
    target_label: instance
  # Copy every Kubernetes service label onto the target.
  - action: labelmap
    regex: __meta_kubernetes_service_label_(.+)
  # The `service` role publishes the namespace as __meta_kubernetes_namespace;
  # a __meta_kubernetes_service_namespace label is not provided, so reading
  # it left `ns` empty on every probe target.
  - source_labels: [__meta_kubernetes_namespace]
    # Sourcegraph specific customization. We want a more convenient to type label.
    # target_label: kubernetes_namespace
    target_label: ns
  - source_labels: [__meta_kubernetes_service_name]
    target_label: kubernetes_name
# Example scrape config for pods
#
# The relabeling allows the actual pod scrape endpoint to be configured via the
# following pod annotations:
#
# * `sourcegraph.prometheus/scrape`: Only scrape pods that have a value of `true`
# * `prometheus.io/path`: If the metrics path is not `/metrics` override this.
# * `prometheus.io/port`: Scrape the pod on the indicated port instead of the
#   port discovered for the pod.
- job_name: 'kubernetes-pods'
  kubernetes_sd_configs:
  - role: pod
  relabel_configs:
  # Opt-in: drop every pod that is not annotated for scraping. The `pod`
  # role only exposes pod-level meta labels, so the annotation has to be read
  # from the pod; a service annotation label is never set here and a `keep`
  # rule on it drops every target.
  - source_labels: [__meta_kubernetes_pod_annotation_sourcegraph_prometheus_scrape]
    action: keep
    regex: true
  # Optional metrics path override.
  - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
    action: replace
    target_label: __metrics_path__
    regex: (.+)
  # Optional port override: replace the port of the discovered address with
  # the annotated one.
  - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
    action: replace
    regex: (.+):(?:\d+);(\d+)
    replacement: ${1}:${2}
    target_label: __address__
  # Copy every Kubernetes pod label onto the target.
  - action: labelmap
    regex: __meta_kubernetes_pod_label_(.+)
  - source_labels: [__meta_kubernetes_namespace]
    action: replace
    # Sourcegraph specific customization. We want a more convenient to type label.
    # target_label: kubernetes_namespace
    target_label: ns
  - source_labels: [__meta_kubernetes_pod_name]
    action: replace
    target_label: kubernetes_pod_name
# Alerting rules. Alerts labelled `severity: page` are the ones marked for
# paging; the remaining alerts carry no severity label.
#
# The backtick-quoted double-brace sequences inside the annotations are
# presumably escapes for a template rendering step that runs before
# Prometheus sees this file - confirm against the deployment tooling.
alert_rules.yml: |
groups:
- name: alert.rules
rules:
# Fewer than 90% of an app's targets have been up for 10 minutes.
- alert: PodsMissing
expr: app:up:ratio{app!=""} < 0.9
for: 10m
labels:
severity: page
annotations:
description: 'Pods missing from {{`{{`}} $labels.app {{`}}`}}: {{`{{`}} $value
{{`}}`}}'
help: Alerts when pods are missing.
summary: Pods missing from {{`{{`}} $labels.app {{`}}`}}
# Fewer than 10% of an app's targets have been up for 2 minutes.
- alert: NoPodsRunning
expr: app:up:ratio{app!=""} < 0.1
for: 2m
labels:
severity: page
annotations:
description: 'No pods are running for {{`{{`}} $labels.app {{`}}`}}: {{`{{`}}
$value {{`}}`}}'
help: Alerts when no pods are running for a service.
summary: No pods are running for {{`{{`}} $labels.app {{`}}`}}
# 90th percentile frontend request duration over 10 minutes, excluding the
# `raw` route, is above 20 seconds.
- alert: ProdPageLoadLatency
expr: histogram_quantile(0.9, sum by(le) (rate(src_http_request_duration_seconds_bucket{job="sourcegraph-frontend",route!="raw"}[10m])))
> 20
labels:
severity: page
annotations:
description: 'Page load latency > 20s (90th percentile over all routes; current
value: {{`{{`}}$value{{`}}`}}s)'
help: Alerts when the page load latency is too high.
summary: High page load latency
# A process has reported 10000 or more goroutines for 10 minutes.
- alert: GoroutineLeak
expr: go_goroutines >= 10000
for: 10m
annotations:
description: '{{`{{`}} $labels.app {{`}}`}} has more than 10k goroutines. This
is probably a regression causing a goroutine leak'
help: Alerts when a service has excessive running goroutines.
summary: Excessive number of goroutines
# NOTE(review): the expression sums `container_fs_inodes_total` per instance
# and fires above 3e+06, while the alert name and help text talk about
# *remaining* inodes being low - confirm which one is intended.
- alert: FSINodesRemainingLow
expr: sum by(instance) (container_fs_inodes_total{pod_name!=""}) > 3e+06
labels:
severity: page
annotations:
description: '{{`{{`}}$labels.instance{{`}}`}} is using {{`{{`}}humanize $value{{`}}`}}
inodes'
help: Alerts when a node's remaining FS inodes are low.
summary: '{{`{{`}}$labels.instance{{`}}`}} remaining fs inodes is low'
# Available/size ratio of a node filesystem is below 10%.
- alert: DiskSpaceLow
expr: node:k8snode_filesystem_avail_bytes:ratio < 0.1
annotations:
help: Alerts when a node has less than 10% available disk space.
summary: '{{`{{`}}$labels.exported_name{{`}}`}} has less than 10% available
disk space'
# Same ratio below 5%, restricted to series whose `exported_name` contains
# `prod`.
- alert: DiskSpaceLowCritical
expr: node:k8snode_filesystem_avail_bytes:ratio{exported_name=~".*prod.*"} < 0.05
labels:
severity: page
annotations:
help: Alerts when a node has less than 5% available disk space.
summary: Critical! {{`{{`}}$labels.exported_name{{`}}`}} has less than 5% available
disk space
# gitserver reports less than 10% of its disk space as available.
# NOTE(review): the help text reads "gitserverdisk" (missing space).
- alert: GitserverDiskSpaceLow
expr: src_gitserver_disk_space_available / src_gitserver_disk_space_total < 0.1
annotations:
help: Alerts when gitserverdisk space is low.
summary: gitserver {{`{{`}}$labels.instance{{`}}`}} disk space is less than 10% of available disk space
# gitserver reports less than 5% of its disk space as available.
# NOTE(review): the help text reads "gitserverdisk" (missing space).
- alert: GitserverDiskSpaceLowCritical
expr: src_gitserver_disk_space_available / src_gitserver_disk_space_total < 0.05
labels:
severity: page
annotations:
help: Alerts when gitserverdisk space is critically low.
summary: Critical! gitserver {{`{{`}}$labels.instance{{`}}`}} disk space is less than 5% of available disk space
# `searcher_errors:ratio10m` is not recorded by any rule file in this
# ConfigMap; it is presumably supplied by one of the other rule_files
# directories - verify.
- alert: SearcherErrorRatioTooHigh
expr: searcher_errors:ratio10m > 0.1
for: 20m
annotations:
help: Alerts when the search service has more than 10% of requests failing.
summary: Error ratio exceeds 10%
# Median response size of the `prometheus` handler exceeds 20000 bytes for a
# job other than the API server and node jobs.
- alert: PrometheusMetricsBloat
expr: http_response_size_bytes{handler="prometheus",job!="kubernetes-apiservers",job!="kubernetes-nodes",quantile="0.5"}
> 20000
annotations:
help: Alerts when a service is probably leaking metrics (unbounded attribute).
summary: '{{`{{`}}$labels.job{{`}}`}} in {{`{{`}}$labels.ns{{`}}`}} is probably
leaking metrics (unbounded attribute)'
# Empty rule file; presumably a placeholder for deployment-specific rules.
extra_rules.yml: ""
# Recording rules for node and container resource usage.
node_rules.yml: |
groups:
- name: nodes.rules
rules:
# Per-instance CPU usage rate over 5 minutes for series with an empty
# `kubernetes_pod_name`, divided by the instance's core count.
- record: node:container_cpu_usage_seconds_total:ratio_rate5m
expr: sum by(instance) (rate(container_cpu_usage_seconds_total{kubernetes_pod_name=""}[5m]))
/ max by(instance) (machine_cpu_cores)
# Highest memory usage per namespace and container name.
- record: task:container_memory_usage_bytes:max
expr: max by(namespace, container_name) (container_memory_usage_bytes{container_name!=""})
# Instantaneous CPU usage rate (1 minute window) per id, namespace and
# container name.
- record: task:container_cpu_usage_seconds_total:sum
expr: sum by(id, namespace, container_name) (irate(container_cpu_usage_seconds_total{container_name!=""}[1m]))
# Lowest available/size filesystem ratio per `exported_name`; this series is
# what the DiskSpaceLow alerts in alert_rules.yml evaluate.
- record: node:k8snode_filesystem_avail_bytes:ratio
expr: min by(exported_name) (k8snode_filesystem_avail_bytes / k8snode_filesystem_size_bytes)
# Recording rules that summarise target availability per `app` label; the
# resulting `app:up:ratio` series is what the PodsMissing and NoPodsRunning
# alerts in alert_rules.yml evaluate.
sourcegraph_rules.yml: |
groups:
- name: sourcegraph.rules
rules:
# Sum of `up` per app, i.e. the number of targets currently up.
- record: app:up:sum
expr: sum by(app) (up)
# Number of `up` series per app, i.e. the number of known targets.
- record: app:up:count
expr: count by(app) (up)
# Fraction of an app's targets that are up.
- record: app:up:ratio
expr: app:up:sum / on(app) app:up:count
# Object identity: a ConfigMap named `prometheus`, labelled `deploy: sourcegraph`.
kind: ConfigMap
metadata:
labels:
deploy: sourcegraph
name: prometheus