Skip to content

Commit

Permalink
More minor updates
Browse files (browse the repository at this point in the history)
Signed-off-by: Thomas Newton <[email protected]>
  • Loading branch information
Tom-Newton committed Apr 19, 2024
1 parent 598bd8c commit c35d80a
Show file tree
Hide file tree
Showing 4 changed files with 40 additions and 39 deletions.
2 changes: 1 addition & 1 deletion deployment/stats/prometheus/flyteadmin-dashboard.json
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
"refresh": "10s",
"rows": [
{
"collapse": false,
"collapse": true,
"editable": true,
"height": "250px",
"panels": [
Expand Down
38 changes: 19 additions & 19 deletions deployment/stats/prometheus/flytepropeller-dashboard.json
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,7 @@
"yaxes": [
{
"decimals": null,
"format": "ops",
"format": "none",
"label": null,
"logBase": 1,
"max": null,
Expand Down Expand Up @@ -446,7 +446,7 @@
"instant": false,
"interval": "",
"intervalFactor": 2,
"legendFormat": "abort",
"legendFormat": "not-found",
"metric": "",
"query": "sum(rate(flyte:propeller:all:round:not_found[5m]))",
"refId": "D",
Expand Down Expand Up @@ -618,7 +618,7 @@
"yaxes": [
{
"decimals": null,
"format": "none",
"format": "ops",
"label": null,
"logBase": 1,
"max": null,
Expand Down Expand Up @@ -2256,7 +2256,7 @@
"thresholds": [],
"timeFrom": null,
"timeShift": null,
"title": "Failures from metastore",
"title": "Metastore failure rate",
"tooltip": {
"msResolution": true,
"shared": true,
Expand Down Expand Up @@ -2890,7 +2890,7 @@
"instant": false,
"interval": "",
"intervalFactor": 2,
"legendFormat": "proto-fetch",
"legendFormat": "proto-fetch-P{{quantile}}",
"metric": "",
"query": "sum(flyte:propeller:all:metastore:proto_fetch_ms) by (quantile, wf)",
"refId": "A",
Expand All @@ -2905,7 +2905,7 @@
"instant": false,
"interval": "",
"intervalFactor": 2,
"legendFormat": "remote-fetch",
"legendFormat": "remote-fetch-P{{quantile}}",
"metric": "",
"query": "sum(flyte:propeller:all:metastore:remote_fetch_ms) by (quantile, wf)",
"refId": "B",
Expand Down Expand Up @@ -3468,7 +3468,7 @@
"instant": false,
"interval": "",
"intervalFactor": 2,
"legendFormat": "user error",
"legendFormat": "unknown error",
"metric": "",
"query": "sum(rate(flyte:propeller:all:node:perma_unknown_error_duration_unlabeled_ms[5m]))",
"refId": "C",
Expand All @@ -3479,7 +3479,7 @@
"thresholds": [],
"timeFrom": null,
"timeShift": null,
"title": "node event recording rate",
"title": "node event recording error rate breakdown",
"tooltip": {
"msResolution": true,
"shared": true,
Expand Down Expand Up @@ -4001,7 +4001,7 @@
"instant": false,
"interval": "",
"intervalFactor": 2,
"legendFormat": "success",
"legendFormat": "success-{{wf}}",
"metric": "",
"query": "sum(rate(flyte:propeller:all:node:event_recording:success_duration_ms_count[5m])) by (wf)",
"refId": "A",
Expand All @@ -4016,7 +4016,7 @@
"instant": false,
"interval": "",
"intervalFactor": 2,
"legendFormat": "failure",
"legendFormat": "failure-{{wf}}",
"metric": "",
"query": "sum(rate(flyte:propeller:all:node:event_recording:failure_duration_ms_count[5m])) by (wf)",
"refId": "B",
Expand Down Expand Up @@ -4270,7 +4270,7 @@
"instant": false,
"interval": "",
"intervalFactor": 2,
"legendFormat": "success wf",
"legendFormat": "success-{{wf}}",
"metric": "",
"query": "sum(rate(flyte:propeller:all:task:event_recording:success_duration_ms_count[5m])) by (wf)",
"refId": "A",
Expand All @@ -4285,7 +4285,7 @@
"instant": false,
"interval": "",
"intervalFactor": 2,
"legendFormat": "failure",
"legendFormat": "failure-{{wf}}",
"metric": "",
"query": "sum(rate(flyte:propeller:all:task:event_recording:failure_duration_ms_count[5m])) by (wf)",
"refId": "B",
Expand Down Expand Up @@ -6618,7 +6618,7 @@
"thresholds": [],
"timeFrom": null,
"timeShift": null,
"title": "Update events from informer",
"title": "Update event rate from informer",
"tooltip": {
"msResolution": true,
"shared": true,
Expand All @@ -6637,7 +6637,7 @@
"yaxes": [
{
"decimals": null,
"format": "short",
"format": "ops",
"label": null,
"logBase": 1,
"max": null,
Expand Down Expand Up @@ -6745,7 +6745,7 @@
"thresholds": [],
"timeFrom": null,
"timeShift": null,
"title": "Update events dropped becacuse they have the same resource version",
"title": "Update events drop rate becacuse they have the same resource version",
"tooltip": {
"msResolution": true,
"shared": true,
Expand All @@ -6764,7 +6764,7 @@
"yaxes": [
{
"decimals": null,
"format": "short",
"format": "ops",
"label": null,
"logBase": 1,
"max": null,
Expand Down Expand Up @@ -6901,7 +6901,7 @@
"yaxes": [
{
"decimals": null,
"format": "short",
"format": "ops",
"label": null,
"logBase": 1,
"max": null,
Expand Down Expand Up @@ -7028,7 +7028,7 @@
"yaxes": [
{
"decimals": null,
"format": "short",
"format": "ops",
"label": null,
"logBase": 1,
"max": null,
Expand Down Expand Up @@ -7155,7 +7155,7 @@
"yaxes": [
{
"decimals": null,
"format": "short",
"format": "ops",
"label": null,
"logBase": 1,
"max": null,
Expand Down
1 change: 1 addition & 0 deletions stats/flyteadmin.dashboard.py
Original file line number Diff line number Diff line change
Expand Up @@ -210,6 +210,7 @@ def create_all_apis(interval: int = 5) -> typing.List[Row]:
def grpc_latency_row() -> Graph:
return Row(
title="GRPC latency metrics",
collapse=True,
panels=[
BarGauge(
title="All GRPC calls latency",
Expand Down
38 changes: 19 additions & 19 deletions stats/flytepropeller.dashboard.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ def create_free_workers() -> Graph:
),
],
yAxes=YAxes(
YAxis(format=OPS_FORMAT),
YAxis(format=NO_FORMAT),
YAxis(format=SHORT_FORMAT),
),
)
Expand Down Expand Up @@ -105,7 +105,7 @@ def error_breakdown() -> Graph:
Target(
expr="sum(rate(flyte:propeller:all:round:not_found[5m]))",
refId="D",
legendFormat="abort",
legendFormat="not-found",
),
Target(
expr="sum(rate(flyte:propeller:all:round:skipped[5m]))",
Expand All @@ -132,7 +132,7 @@ def streak_rate() -> Graph:
),
],
yAxes=YAxes(
YAxis(format=NO_FORMAT),
YAxis(format=OPS_FORMAT),
YAxis(format=SHORT_FORMAT),
),
)
Expand Down Expand Up @@ -284,7 +284,7 @@ def node_input_latency() -> Graph:
def metastore_failures():
# Copy counts sum(rate(flyte:propeller:all:metastore:copy:overall_unlabeled_ms_count[5m]))
return Graph(
title=f"Failures from metastore",
title=f"Metastore failure rate",
dataSource=DATASOURCE,
targets=[
Target(
Expand Down Expand Up @@ -392,12 +392,12 @@ def metastore_latencies(collapse: bool) -> Row:
targets=[
Target(
expr="sum(flyte:propeller:all:metastore:proto_fetch_ms) by (quantile, wf)",
legendFormat="proto-fetch",
legendFormat="proto-fetch-P{{quantile}}",
refId="A",
),
Target(
expr="sum(flyte:propeller:all:metastore:remote_fetch_ms) by (quantile, wf)",
legendFormat="remote-fetch",
legendFormat="remote-fetch-P{{quantile}}",
refId="B",
),
],
Expand Down Expand Up @@ -477,12 +477,12 @@ def task_event_recording() -> typing.List[Graph]:
targets=[
Target(
expr=f"sum(rate(flyte:propeller:all:task:event_recording:success_duration_ms_count[5m])) by (wf)",
legendFormat="success wf",
legendFormat="success-{{wf}}",
refId="A",
),
Target(
expr=f"sum(rate(flyte:propeller:all:task:event_recording:failure_duration_ms_count[5m])) by (wf)",
legendFormat="failure",
legendFormat="failure-{{wf}}",
refId="B",
),
],
Expand Down Expand Up @@ -510,12 +510,12 @@ def node_event_recording() -> typing.List[Graph]:
targets=[
Target(
expr=f"sum(rate(flyte:propeller:all:node:event_recording:success_duration_ms_count[5m])) by (wf)",
legendFormat="success",
legendFormat="success-{{wf}}",
refId="A",
),
Target(
expr=f"sum(rate(flyte:propeller:all:node:event_recording:failure_duration_ms_count[5m])) by (wf)",
legendFormat="failure",
legendFormat="failure-{{wf}}",
refId="B",
),
],
Expand Down Expand Up @@ -678,7 +678,7 @@ def metastore_metrics(interval: int, collapse: bool) -> Row:
@staticmethod
def node_errors() -> Graph:
return Graph(
title="node event recording rate",
title="node event recording error rate breakdown",
dataSource=DATASOURCE,
targets=[
Target(
Expand All @@ -693,7 +693,7 @@ def node_errors() -> Graph:
),
Target(
expr=f"sum(rate(flyte:propeller:all:node:perma_unknown_error_duration_unlabeled_ms[5m]))",
legendFormat="user error",
legendFormat="unknown error",
refId="C",
),
],
Expand Down Expand Up @@ -887,26 +887,26 @@ def k8s_pod_informers(collapse: bool) -> Row:
collapse=collapse,
panels=[
Graph(
title=f"Update events from informer",
title=f"Update event rate from informer",
dataSource=DATASOURCE,
targets=[
Target(
expr=f"sum(rate(flyte:propeller:all:node:container:container:informer_update[5m]))",
refId="A",
),
],
yAxes=single_y_axis(format=SHORT_FORMAT),
yAxes=single_y_axis(format=OPS_FORMAT),
),
Graph(
title=f"Update events dropped becacuse they have the same resource version",
title=f"Update events drop rate becacuse they have the same resource version",
dataSource=DATASOURCE,
targets=[
Target(
expr=f"sum(rate(flyte:propeller:all:node:container:container:informer_update_dropped[5m]))",
refId="A",
),
],
yAxes=single_y_axis(format=SHORT_FORMAT),
yAxes=single_y_axis(format=OPS_FORMAT),
),
],
)
Expand Down Expand Up @@ -979,7 +979,7 @@ def workflowstore(collapse: bool) -> Row:
refId="A",
),
],
yAxes=single_y_axis(format=SHORT_FORMAT),
yAxes=single_y_axis(format=OPS_FORMAT),
),
Graph(
title="Evict workflows rate",
Expand All @@ -990,7 +990,7 @@ def workflowstore(collapse: bool) -> Row:
refId="A",
),
],
yAxes=single_y_axis(format=SHORT_FORMAT),
yAxes=single_y_axis(format=OPS_FORMAT),
),
Graph(
title="Workflow redundant updates rate",
Expand All @@ -1001,7 +1001,7 @@ def workflowstore(collapse: bool) -> Row:
refId="A",
),
],
yAxes=single_y_axis(format=SHORT_FORMAT),
yAxes=single_y_axis(format=OPS_FORMAT),
),
],
)
Expand Down

0 comments on commit c35d80a

Please sign in to comment.