fix: Qos latency spikes due to low IOPS (#2615)
rahulguptajss authored Jan 31, 2024
1 parent fb65ef3 commit 26e5f27
Showing 9 changed files with 128 additions and 74 deletions.
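Background for the change: ONTAP latency counters are cumulative microsecond totals whose base counter is an ops counter, so Harvest cooks per-operation latency as delta(latency) / delta(ops) between polls. When very few operations occur in the interval, that quotient is noisy and can spike. Before this commit the guard was a flat ops-delta threshold (`latency_io_reqd`, default 100); the commit scales the threshold by the elapsed time between polls so it acts as a minimum IOPS rate, and lowers the default to 10. A minimal sketch of the idea (names are illustrative, not Harvest's API):

    package main

    import "fmt"

    // cookLatency derives average latency (µs/op) from raw counter deltas.
    // latencyDelta and opsDelta are differences between two polls; elapsedSec
    // is the poll interval in seconds; ioReqd mirrors latency_io_reqd.
    func cookLatency(latencyDelta, opsDelta, elapsedSec, ioReqd float64) float64 {
    	minimumBase := elapsedSec * ioReqd // before this fix: a flat ioReqd, ignoring elapsed time
    	if opsDelta <= minimumBase {
    		return 0 // too few operations to trust the average
    	}
    	return latencyDelta / opsDelta
    }

    func main() {
    	// 1800 µs accumulated over only 2 ops in 60 s: suppressed instead of spiking to 900 µs/op.
    	fmt.Println(cookLatency(1800, 2, 60, 10)) // 0
    	// The same 1800 µs over 1000 ops clears the 600-op minimum (60 s * 10 IOPS).
    	fmt.Println(cookLatency(1800, 1000, 60, 10)) // 1.8
    }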
2 changes: 1 addition & 1 deletion cmd/collectors/restperf/restperf.go
@@ -1162,7 +1162,7 @@ func (r *RestPerf) pollData(startTime time.Time, perfRecords []rest.PerfRecord)
 	if property == "average" || property == "percent" {

 		if strings.HasSuffix(metric.GetName(), "latency") {
-			skips, err = curMat.DivideWithThreshold(key, counter.denominator, r.perfProp.latencyIoReqd, r.Logger)
+			skips, err = curMat.DivideWithThreshold(key, counter.denominator, r.perfProp.latencyIoReqd, cachedData, prevMat, r.Logger)
 		} else {
 			skips, err = curMat.Divide(key, counter.denominator, r.Logger)
 		}
4 changes: 2 additions & 2 deletions cmd/collectors/restperf/restperf_test.go
@@ -269,7 +269,7 @@ func TestQosVolume(t *testing.T) {
 			pollDataPath2: "testdata/qos-volume-poll-2.json",
 			numInstances:  9,
 			numMetrics:    234,
-			sum:           81,
+			sum:           18,
 		},
 	}
 	for _, tt := range tests {
@@ -296,7 +296,7 @@ func TestQosVolume(t *testing.T) {
 			t.Fatal(err)
 		}

-		future := now.Add(time.Minute * 15)
+		future := now.Add(time.Minute * 1)
 		pollData = jsonToPerfRecords(tt.pollDataPath2)
 		pollData[0].Timestamp = future.UnixNano()
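Why the shorter gap matters: the guard now scales with elapsed time, so moving the second poll from 15 minutes to 1 minute changes which fixture instances get their latency cooked, and the expected `sum` changes with it. A rough check, assuming the matrix tracks the timestamp delta in seconds (the fixtures set timestamps via `UnixNano`):

    package main

    import "fmt"

    func main() {
    	const ioReqd = 10.0 // new default latency_io_reqd

    	// Minimum ops delta required before latency is cooked:
    	fmt.Println(60.0 * ioReqd)  // 1-minute gap  -> 600 ops
    	fmt.Println(900.0 * ioReqd) // 15-minute gap -> 9000 ops

    	// The poll-2 fixtures report read_ops of 1000, so a cumulative delta of
    	// at most 1000 can clear the 600-op bar but never the 9000-op bar.
    }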
20 changes: 10 additions & 10 deletions cmd/collectors/restperf/testdata/qos-volume-poll-1.json
The same one-line change repeats for each of the ten instances in the fixture (hunks at lines 38, 158, 278, 398, 518, 638, 758, 878, 998, and 1118):

@@ -38,7 +38,7 @@
   },
   {
     "name": "read_latency",
-    "value": 1
+    "value": 200
   },
   {
     "name": "read_ops",
40 changes: 20 additions & 20 deletions cmd/collectors/restperf/testdata/qos-volume-poll-2.json
Again the same two-line change repeats for each of the ten instances (hunks at lines 38, 158, 278, 398, 518, 638, 758, 878, 998, and 1118):

@@ -38,11 +38,11 @@
   },
   {
     "name": "read_latency",
-    "value": 100
+    "value": 2000
   },
   {
     "name": "read_ops",
-    "value": 110
+    "value": 1000
   },
   {
     "name": "sequential_reads_percent",
2 changes: 1 addition & 1 deletion cmd/collectors/zapiperf/zapiperf.go
@@ -846,7 +846,7 @@ func (z *ZapiPerf) PollData() (map[string]*matrix.Matrix, error) {
 	if property == "average" || property == "percent" {

 		if strings.HasSuffix(metric.GetName(), "latency") {
-			skips, err = curMat.DivideWithThreshold(key, metric.GetComment(), z.latencyIoReqd, z.Logger)
+			skips, err = curMat.DivideWithThreshold(key, metric.GetComment(), z.latencyIoReqd, cachedData, prevMat, z.Logger)
 		} else {
 			skips, err = curMat.Divide(key, metric.GetComment(), z.Logger)
 		}
2 changes: 1 addition & 1 deletion docs/configure-rest.md
@@ -226,7 +226,7 @@ of these parameters can be defined in the Harvest or object configuration files
 |--------------------|----------------------|------------------------------------------------------------------------------------------------------------------|-----------:|
 | `use_insecure_tls` | bool, optional       | skip verifying TLS certificate of the target system                                                               | false      |
 | `client_timeout`   | duration (Go-syntax) | how long to wait for server responses                                                                             | 30s        |
-| `latency_io_reqd`  | int, optional        | threshold of IOPs for calculating latency metrics (latencies based on very few IOPs are unreliable)               | 100        |
+| `latency_io_reqd`  | int, optional        | threshold of IOPs for calculating latency metrics (latencies based on very few IOPs are unreliable)               | 10         |
 | `schedule`         | list, required       | the poll frequencies of the collector/object; should include exactly these three elements in exactly this order:  |            |
 | - `counter`        | duration (Go-syntax) | poll frequency of updating the counter metadata cache                                                             | 20 minutes |
 | - `instance`       | duration (Go-syntax) | poll frequency of updating the instance cache                                                                     | 10 minutes |
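As an example, the threshold can be overridden per collector; a sketch, assuming the parameter sits at the top level of the collector template as the table above describes (exact placement may differ in your deployment):

    # conf/restperf/default.yaml (placement illustrative)
    collector: RestPerf

    # Cook latency only when the interval carries at least this many IOPS;
    # lower-traffic intervals report 0 instead of a spiky average.
    latency_io_reqd: 10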
2 changes: 1 addition & 1 deletion docs/configure-zapi.md
@@ -132,7 +132,7 @@ of these parameters can be defined in the Harvest or object configuration files
 | `use_insecure_tls` | bool, optional       | skip verifying TLS certificate of the target system                                                               | `false` |
 | `client_timeout`   | duration (Go-syntax) | how long to wait for server responses                                                                             | 30s     |
 | `batch_size`       | int, optional        | max instances per API request                                                                                     | `500`   |
-| `latency_io_reqd`  | int, optional        | threshold of IOPs for calculating latency metrics (latencies based on very few IOPs are unreliable)               | `100`   |
+| `latency_io_reqd`  | int, optional        | threshold of IOPs for calculating latency metrics (latencies based on very few IOPs are unreliable)               | `10`    |
 | `schedule`         | list, required       | the poll frequencies of the collector/object; should include exactly these three elements in exactly this order:  |         |
 | - `counter`        | duration (Go-syntax) | poll frequency of updating the counter metadata cache (example value: `20m`)                                      |         |
 | - `instance`       | duration (Go-syntax) | poll frequency of updating the instance cache (example value: `10m`)                                              |         |
40 changes: 36 additions & 4 deletions pkg/matrix/matrix.go
@@ -432,15 +432,25 @@ func (m *Matrix) Divide(metricKey string, baseKey string, logger *logging.Logger
 	return skips, nil
 }

-func (m *Matrix) DivideWithThreshold(metricKey string, baseKey string, threshold int, logger *logging.Logger) (int, error) {
+// DivideWithThreshold is applicable to latency counters
+func (m *Matrix) DivideWithThreshold(metricKey string, baseKey string, threshold int, curRawMat *Matrix, prevRawMat *Matrix, logger *logging.Logger) (int, error) {
 	var skips int
 	x := float64(threshold)
+	curRawMetric := curRawMat.GetMetric(metricKey)
+	curBaseRawMetric := curRawMat.GetMetric(baseKey)
+	prevRawMetric := prevRawMat.GetMetric(metricKey)
+	prevBaseRawMetric := prevRawMat.GetMetric(baseKey)
 	metric := m.GetMetric(metricKey)
 	base := m.GetMetric(baseKey)
+	time := m.GetMetric("timestamp")
+	var tValues []float64
+	if time != nil {
+		tValues = time.values
+	}
 	sValues := base.values
 	sRecord := base.GetRecords()
-	if len(metric.values) != len(sValues) {
-		return 0, errs.New(ErrUnequalVectors, fmt.Sprintf("numerator=%d, denominator=%d", len(metric.values), len(sValues)))
+	if len(metric.values) != len(sValues) || len(sValues) != len(tValues) {
+		return 0, errs.New(ErrUnequalVectors, fmt.Sprintf("numerator=%d, denominator=%d, time=%d", len(metric.values), len(sValues), len(tValues)))
 	}
 	for i := 0; i < len(metric.values); i++ {
 		v := metric.values[i]
@@ -456,8 +466,30 @@ func (m *Matrix) DivideWithThreshold(metricKey string, baseKey string, threshold
 				Float64("denominator", sValues[i]).
 				Msg("Negative values")
 		} else if metric.record[i] && sRecord[i] {
-			if sValues[i] >= x {
+			// For a latency counter, ensure that the base counter has sufficient operations for an accurate calculation.
+			minimumBase := tValues[i] * x
+			if metric.GetName() == "optimal_point_latency" {
+				// An exception is made for headroom latency because the base counter always has a few IOPS
+				minimumBase = 0
+			}
+			if sValues[i] > minimumBase {
 				metric.values[i] /= sValues[i]
+				// if cooked latency is greater than 5 secs, log the delta values
+				if metric.values[i] > 5_000_000 {
+					if len(metric.values) == len(curRawMetric.values) && len(curRawMetric.values) == len(prevRawMetric.values) &&
+						len(prevRawMetric.values) == len(curBaseRawMetric.values) && len(curBaseRawMetric.values) == len(prevBaseRawMetric.values) {
+						logger.Debug().
+							Str("metric", metric.GetName()).
+							Str("key", metricKey).
+							Float64("numerator", v).
+							Float64("denominator", sValues[i]).
+							Float64("prev_raw_latency", prevRawMetric.values[i]).
+							Float64("current_raw_latency", curRawMetric.values[i]).
+							Float64("prev_raw_base", prevBaseRawMetric.values[i]).
+							Float64("current_raw_base", curBaseRawMetric.values[i]).
+							Msg("Detected high latency value in the metric")
+					}
+				}
 			} else {
 				metric.values[i] = 0
 			}
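Taken together, the new guard in `DivideWithThreshold` compares each ops delta against elapsedTime * threshold instead of the bare threshold, exempts `optimal_point_latency` (whose base counter always carries a few IOPS), zeroes values that fail the check, and logs raw deltas whenever a cooked latency exceeds 5 seconds (5,000,000 µs). A condensed, self-contained rendering of that decision logic (signatures simplified; the real method walks metric vectors inside the matrix):

    package main

    import "fmt"

    // divideWithThreshold mirrors the updated guard for a single instance:
    // latency is cooked only when the base (ops) delta exceeds elapsed*threshold,
    // with an exemption for optimal_point_latency.
    func divideWithThreshold(name string, latencyDelta, opsDelta, elapsedSec, threshold float64) (float64, bool) {
    	minimumBase := elapsedSec * threshold
    	if name == "optimal_point_latency" {
    		minimumBase = 0 // headroom latency's base counter always has a few IOPS
    	}
    	if opsDelta <= minimumBase {
    		return 0, true // suppressed: too few ops to trust the average
    	}
    	cooked := latencyDelta / opsDelta
    	if cooked > 5_000_000 { // > 5 s cooked latency; the commit logs raw deltas here
    		fmt.Printf("high latency: metric=%s numerator=%.0f denominator=%.0f\n", name, latencyDelta, opsDelta)
    	}
    	return cooked, false
    }

    func main() {
    	// read_latency delta from the fixtures (2000 - 200) over a 1-minute poll;
    	// the ops deltas are illustrative, since poll-1's read_ops value is truncated above.
    	fmt.Println(divideWithThreshold("read_latency", 1800, 700, 60, 10)) // ~2.57 false (700 > 600)
    	fmt.Println(divideWithThreshold("read_latency", 1800, 300, 60, 10)) // 0 true     (300 <= 600)
    }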