Merge remote-tracking branch 'upstream/main' into dataobj-explorer
cyriltovena committed Jan 23, 2025
2 parents fd1abad + 93cee63 commit bd6fefd
Showing 88 changed files with 2,276 additions and 1,123 deletions.
34 changes: 21 additions & 13 deletions .github/renovate.json5
@@ -8,18 +8,16 @@
],
"prHourlyLimit": 4,
"baseBranches": [
"main"
"main",
"release-3.3.x", // Update when a new release is out, 2 minors, 1 major.
"release-3.2.x", // Also ensure to update the 'packageRules' section to match
"release-2.9.x"
],
"packageRules": [
{
"matchBaseBranches": [
"release-2.9.x",
"release-2.8.x"
],
"enabled": false,
"matchPackageNames": [
"*"
]
// Disable updates for all branches - we only want security updates
"matchBaseBranches": ["release-3.3.x", "release-3.2.x", "release-2.9.x"],
"enabled": false
},
{
// Disable Go version updates
@@ -60,12 +58,20 @@
"automerge": false
},
{
// Enable all other updates
// Enable all other updates, and auto-merge minor and patch updates
"matchFileNames": ["!operator/go.mod", "!operator/api/loki/go.mod"],
"groupName": "{{packageName}}",
"enabled": true,
"matchUpdateTypes": ["major", "minor", "patch"],
// After we have tested the above configuration, we can enable the following
"matchUpdateTypes": ["minor", "patch"],
"automerge": true,
"autoApprove": true
},
{
// Enable all other updates, don't auto-merge major updates
"matchFileNames": ["!operator/go.mod", "!operator/api/loki/go.mod"],
"groupName": "{{packageName}}",
"enabled": true,
"matchUpdateTypes": ["major"],
"automerge": false,
"autoApprove": false
}
@@ -77,7 +83,9 @@
"enabled": true,
"addLabels": [
"area/security"
]
],
"automerge": true,
"autoApprove": true
},
"osvVulnerabilityAlerts": true,
"prConcurrentLimit": 10,
6 changes: 5 additions & 1 deletion .github/workflows/helm-diff-ci.yml
@@ -3,8 +3,9 @@ name: Helm Loki Diff CI
on:
pull_request:
paths:
- 'production/helm/loki/**'
- "production/helm/loki/**"

# These permissions are needed to assume roles from Github's OIDC.
permissions:
contents: read
pull-requests: write
@@ -273,6 +274,7 @@ jobs:
summary-diff-outputs:
name: Summary Diffs
runs-on: ubuntu-latest
if: github.event.pull_request.head.repo.fork == false
needs:
- single-binary-diff
- default-values-diff
@@ -283,6 +285,8 @@
steps:
- name: Checkout code
uses: actions/checkout@v4
with:
persist-credentials: false

- uses: actions/download-artifact@v4
with:
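Taken together, the two additions to this workflow follow a common hardening pattern for `pull_request` workflows: skip jobs for PRs opened from forks (which cannot assume the repository's OIDC role) and avoid leaving the checkout token behind for later steps. A minimal sketch of just that pattern, with the rest of the real job (its `needs`, artifact downloads, and summary steps) omitted:

```yaml
jobs:
  summary-diff-outputs:
    name: Summary Diffs
    runs-on: ubuntu-latest
    # Skip this job entirely for fork PRs, which do not get the
    # repository's OIDC role or write permissions.
    if: github.event.pull_request.head.repo.fork == false
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          # Do not persist the GITHUB_TOKEN in the local git config after
          # checkout, so later steps cannot reuse it implicitly.
          persist-credentials: false
```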
2 changes: 1 addition & 1 deletion clients/cmd/logstash/Dockerfile
@@ -1,4 +1,4 @@
FROM logstash:8.17.0
FROM logstash:8.17.1

USER logstash
ENV PATH /usr/share/logstash/vendor/jruby/bin:/usr/share/logstash/vendor/bundle/jruby/2.5.0/bin:/usr/share/logstash/jdk/bin:$PATH
7 changes: 0 additions & 7 deletions cmd/logql-analyzer/main.go
@@ -11,7 +11,6 @@ import (
"github.com/prometheus/client_golang/prometheus"

"github.com/grafana/loki/v3/pkg/logqlanalyzer"
"github.com/grafana/loki/v3/pkg/sizing"
util_log "github.com/grafana/loki/v3/pkg/util/log"
)

@@ -48,12 +47,6 @@ func createServer(cfg server.Config, logger log.Logger) (*server.Server, error)
s.HTTP.Use(logqlanalyzer.CorsMiddleware())
s.HTTP.Handle("/api/logql-analyze", &logqlanalyzer.LogQLAnalyzeHandler{}).Methods(http.MethodPost, http.MethodOptions)

sizingHandler := sizing.NewHandler(log.With(logger, "component", "sizing"))

s.HTTP.Handle("/api/sizing/helm", http.HandlerFunc(sizingHandler.GenerateHelmValues)).Methods(http.MethodGet, http.MethodOptions)
s.HTTP.Handle("/api/sizing/nodes", http.HandlerFunc(sizingHandler.Nodes)).Methods(http.MethodGet, http.MethodOptions)
s.HTTP.Handle("/api/sizing/cluster", http.HandlerFunc(sizingHandler.Cluster)).Methods(http.MethodGet, http.MethodOptions)

s.HTTP.HandleFunc("/ready", func(w http.ResponseWriter, _ *http.Request) {
http.Error(w, "ready", http.StatusOK)
}).Methods(http.MethodGet)
2 changes: 2 additions & 0 deletions cmd/querytee/main.go
@@ -3,6 +3,7 @@ package main
import (
"flag"
"os"
"time"

"github.com/go-kit/log/level"
"github.com/grafana/dskit/log"
@@ -62,6 +63,7 @@ func lokiReadRoutes(cfg Config) []querytee.Route {
Tolerance: cfg.ProxyConfig.ValueComparisonTolerance,
UseRelativeError: cfg.ProxyConfig.UseRelativeError,
SkipRecentSamples: cfg.ProxyConfig.SkipRecentSamples,
SkipSamplesBefore: time.Time(cfg.ProxyConfig.SkipSamplesBefore),
})

return []querytee.Route{
42 changes: 19 additions & 23 deletions docs/sources/operations/storage/wal.md
@@ -17,18 +17,17 @@ This section will use Kubernetes as a reference deployment paradigm in the examp

The Write Ahead Log in Loki takes a few particular tradeoffs compared to other WALs you may be familiar with. The WAL aims to add additional durability guarantees, but _not at the expense of availability_. Particularly, there are two scenarios where the WAL sacrifices these guarantees.

1) Corruption/Deletion of the WAL prior to replaying it
1. Corruption/Deletion of the WAL prior to replaying it

In the event the WAL is corrupted/partially deleted, Loki will not be able to recover all of its data. In this case, Loki will attempt to recover any data it can, but this will not prevent Loki from starting.
In the event the WAL is corrupted/partially deleted, Loki will not be able to recover all of its data. In this case, Loki will attempt to recover any data it can, but this will not prevent Loki from starting.

You can use the Prometheus metric `loki_ingester_wal_corruptions_total` to track and alert when this happens.
You can use the Prometheus metric `loki_ingester_wal_corruptions_total` to track and alert when this happens.

1) No space left on disk
1. No space left on disk

In the event the underlying WAL disk is full, Loki will not fail incoming writes, but neither will it log them to the WAL. In this case, the persistence guarantees across process restarts will not hold.

You can use the Prometheus metric `loki_ingester_wal_disk_full_failures_total` to track and alert when this happens.
In the event the underlying WAL disk is full, Loki will not fail incoming writes, but neither will it log them to the WAL. In this case, the persistence guarantees across process restarts will not hold.

You can use the Prometheus metric `loki_ingester_wal_disk_full_failures_total` to track and alert when this happens.

### Backpressure

@@ -47,18 +46,16 @@ The following metrics are available for monitoring the WAL:

1. Since ingesters need to have the same persistent volume across restarts/rollout, all the ingesters should be run on [StatefulSet](https://kubernetes.io/docs/concepts/workloads/controllers/statefulset/) with fixed volumes.

2. The following flags need to be set:
1. The following flags need to be set:
* `--ingester.wal-enabled` to `true` which enables writing to WAL during ingestion.
* `--ingester.wal-dir` to the directory where the WAL data should be stored and/or recovered from. Note that this should be on the mounted volume.
* `--ingester.checkpoint-duration` to the interval at which checkpoints should be created.
* `--ingester.wal-replay-memory-ceiling` (default 4GB) may be set higher/lower depending on your resource settings. It handles memory pressure during WAL replays, allowing a WAL many times larger than available memory to be replayed. This is provided to minimize reconciliation time after very bad situations, i.e. an outage, and will likely not impact regular operations/rollouts _at all_. We suggest setting this to a high percentage (~75%) of available memory.

## Changes in lifecycle when WAL is enabled


Flushing of data to the chunk store during rollouts or scale down is disabled. This is because during a rollout of a StatefulSet there are no ingesters that are simultaneously leaving and joining; rather, the same ingester is shut down and brought back again with an updated config. Hence flushing is skipped and the data is recovered from the WAL. If you need to ensure that data is always flushed to the chunk store when your pod shuts down, you can set the `--ingester.flush-on-shutdown` flag to `true`.


## Disk space requirements

Based on tests in real world:
@@ -67,7 +64,7 @@ Based on tests in real world:
* Checkpoint period was 5mins.
* disk utilization on a WAL-only disk was steady at ~10-15GB.

You should not target 100% disk utilisation.
You should not target 100% disk utilization.

## Migrating from stateless deployments

@@ -76,17 +73,17 @@ The ingester _Deployment without WAL_ and _StatefulSet with WAL_ should be scale
Let's take an example of 4 ingesters. The migration would look something like this:

1. Bring up one stateful ingester `ingester-0` and wait until it's ready (accepting read and write requests).
2. Scale down the old ingester deployment to 3 and wait until the leaving ingester flushes all the data to chunk store.
3. Once that ingester has disappeared from `kc get pods ...`, add another stateful ingester and wait until it's ready. Now you have `ingester-0` and `ingester-1`.
4. Repeat step 2 to remove another ingester from the old deployment.
5. Repeat step 3 to add another stateful ingester. Now you have `ingester-0 ingester-1 ingester-2`.
6. Repeat steps 4 and 5, and now you will finally have `ingester-0 ingester-1 ingester-2 ingester-3`.
1. Scale down the old ingester deployment to 3 and wait until the leaving ingester flushes all the data to chunk store.
1. Once that ingester has disappeared from `kc get pods ...`, add another stateful ingester and wait until it's ready. Now you have `ingester-0` and `ingester-1`.
1. Repeat step 2 to remove another ingester from the old deployment.
1. Repeat step 3 to add another stateful ingester. Now you have `ingester-0 ingester-1 ingester-2`.
1. Repeat steps 4 and 5, and now you will finally have `ingester-0 ingester-1 ingester-2 ingester-3`.

## How to scale up/down

### Scale up

Scaling up is the same as what you would do without WAL or statefulsets. Nothing to change here.
Scaling up is the same as what you would do without WAL or StatefulSets. Nothing to change here.

### Scale down

@@ -100,12 +97,11 @@ After hitting the endpoint for `ingester-2 ingester-3`, scale down the ingesters

Also you can set the `--ingester.flush-on-shutdown` flag to `true`. This enables chunks to be flushed to long-term storage when the ingester is shut down.


## Additional notes

### Kubernetes hacking

Statefulsets are significantly more cumbersome to work with, upgrade, and so on. Much of this stems from immutable fields on the specification. For example, if one wants to start using the WAL with single store Loki and wants separate volume mounts for the WAL and the boltdb-shipper, you may see immutability errors when attempting to update the Kubernetes statefulsets.
StatefulSets are significantly more cumbersome to work with, upgrade, and so on. Much of this stems from immutable fields on the specification. For example, if one wants to start using the WAL with single store Loki and wants separate volume mounts for the WAL and the boltdb-shipper, you may see immutability errors when attempting to update the Kubernetes StatefulSets.

In this case, try `kubectl -n <namespace> delete sts ingester --cascade=false`.
This will leave the Pods alive but delete the StatefulSet.
@@ -115,16 +111,16 @@ Then you may recreate the (updated) StatefulSet and one-by-one start deleting th

1. **StatefulSets for Ordered Scaling Down**: The Loki ingesters should be scaled down one by one, which is efficiently handled by Kubernetes StatefulSets. This ensures an ordered and reliable scaling process, as described in the [Deployment and Scaling Guarantees](https://kubernetes.io/docs/concepts/workloads/controllers/statefulset/#deployment-and-scaling-guarantees) documentation.

2. **Using PreStop Lifecycle Hook**: During the Pod scaling down process, the PreStop [lifecycle hook](https://kubernetes.io/docs/concepts/containers/container-lifecycle-hooks/) triggers the `/flush_shutdown` endpoint on the ingester. This action flushes the chunks and removes the ingester from the ring, allowing it to register as unready and become eligible for deletion.
1. **Using PreStop Lifecycle Hook**: During the Pod scaling down process, the PreStop [lifecycle hook](https://kubernetes.io/docs/concepts/containers/container-lifecycle-hooks/) triggers the `/flush_shutdown` endpoint on the ingester. This action flushes the chunks and removes the ingester from the ring, allowing it to register as unready and become eligible for deletion.

3. **Using terminationGracePeriodSeconds**: Provides time for the ingester to flush its data before being deleted; if flushing data takes more than 30 minutes, you may need to increase it.
1. **Using terminationGracePeriodSeconds**: Provides time for the ingester to flush its data before being deleted; if flushing data takes more than 30 minutes, you may need to increase it.

4. **Cleaning Persistent Volumes**: Persistent volumes are automatically cleaned up by leveraging the [enableStatefulSetAutoDeletePVC](https://kubernetes.io/blog/2021/12/16/kubernetes-1-23-statefulset-pvc-auto-deletion/) feature in Kubernetes.
1. **Cleaning Persistent Volumes**: Persistent volumes are automatically cleaned up by leveraging the [enableStatefulSetAutoDeletePVC](https://kubernetes.io/blog/2021/12/16/kubernetes-1-23-statefulset-pvc-auto-deletion/) feature in Kubernetes.

By following the above steps, you can ensure a smooth scaling down process for the Loki ingesters while maintaining data integrity and minimizing potential disruptions.

### Non-Kubernetes or baremetal deployments

* When the ingester restarts for any reason (upgrade, crash, etc), it should be able to attach to the same volume in order to recover back the WAL and tokens.
* 2 ingesters should not be working with the same volume/directory for the WAL.
* A rollout should bring down an ingester completely and then start the new ingester, not the other way around.
* A rollout should bring down an ingester completely and then start the new ingester, not the other way around.
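The WAL guidance above (fixed volumes on a StatefulSet, the ingester WAL flags, the PreStop flush/shutdown hook, `terminationGracePeriodSeconds`, and PVC auto-deletion) usually comes together in a single manifest. The following is only a hedged, abridged sketch of that shape: the image tag, port, paths, durations, and storage size are placeholders, the flag list is not exhaustive, and the exact flush/shutdown endpoint should be checked against your Loki version — it is not the project's reference manifest.

```yaml
apiVersion: apps/v1
kind: StatefulSet
metadata:
  name: ingester
spec:
  serviceName: ingester
  replicas: 3
  # Auto-delete PVCs when scaling down (the enableStatefulSetAutoDeletePVC feature).
  persistentVolumeClaimRetentionPolicy:
    whenScaled: Delete
    whenDeleted: Retain
  selector:
    matchLabels:
      app: ingester
  template:
    metadata:
      labels:
        app: ingester
    spec:
      # Give the ingester time to flush before the Pod is killed;
      # increase this if flushing takes more than 30 minutes.
      terminationGracePeriodSeconds: 1800
      containers:
        - name: ingester
          image: grafana/loki:3.3.2   # placeholder tag
          args:
            - -target=ingester
            - --ingester.wal-enabled=true
            - --ingester.wal-dir=/loki/wal        # must live on the mounted volume
            - --ingester.checkpoint-duration=5m   # example checkpoint period
            - --ingester.flush-on-shutdown=true
          lifecycle:
            preStop:
              httpGet:
                # Flush chunks and leave the ring before shutdown;
                # path and port are illustrative and version-dependent.
                path: /ingester/flush_shutdown
                port: 3100
          volumeMounts:
            - name: wal
              mountPath: /loki/wal
  volumeClaimTemplates:
    - metadata:
        name: wal
      spec:
        accessModes: ["ReadWriteOnce"]
        resources:
          requests:
            storage: 20Gi   # sized against the ~10-15GB steady-state figure above
```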
2 changes: 1 addition & 1 deletion docs/sources/send-data/alloy/_index.md
@@ -2,7 +2,7 @@
title: Ingesting logs to Loki using Alloy
menuTitle: Grafana Alloy
description: Configuring Grafana Alloy to send logs to Loki.
weight: 250
weight: 100
---


4 changes: 2 additions & 2 deletions docs/sources/send-data/otel/_index.md
@@ -1,10 +1,10 @@
---
title: Ingesting logs to Loki using OpenTelemetry Collector
menuTitle: OTel Collector
menuTitle: OpenTelemetry
description: Configuring the OpenTelemetry Collector to send logs to Loki.
aliases:
- ../clients/k6/
weight: 250
weight: 200
---

# Ingesting logs to Loki using OpenTelemetry Collector
2 changes: 1 addition & 1 deletion docs/sources/send-data/promtail/_index.md
@@ -4,7 +4,7 @@ menuTitle: Promtail
description: How to use the Promtail agent to ship logs to Loki
aliases:
- ../clients/promtail/ # /docs/loki/latest/clients/promtail/
weight: 200
weight: 300
---
# Promtail agent

2 changes: 1 addition & 1 deletion docs/sources/send-data/promtail/cloud/eks/values.yaml
@@ -17,7 +17,7 @@ initContainer:

image:
repository: grafana/promtail
tag: 3.3.1
tag: 3.3.2
pullPolicy: IfNotPresent
## Optionally specify an array of imagePullSecrets.
## Secrets must be manually created in the namespace.