From 77f6557e17c94c784674c48a0c7d3269535aed7a Mon Sep 17 00:00:00 2001
From: Adam Gardner Let's plumb that together now. Sample k6 teardown test finished event For information only, no action is required. This is already coded into the demo load test script. Ensure you are still on the The UI will change this to Remove
Automate the Site Reliability Gu
Sample k6 teardown test finished event#
+
+export function teardown() {
- let post_params = {
- headers: {
- 'Content-Type': 'application/json',
- 'Authorization': `Api-Token ${__ENV.K6_DYNATRACE_APITOKEN}`
- },
- };
-
- let test_duration = 2m;
-
- // Send SDLC event at the end of the test
- let payload = {
- "event.provider": "k6",
- "event.type": "test",
- "event.category": "finished",
- "service": "checkoutservice",
- "duration": test_duration
- }
- let res = http.post(`${__ENV.K6_DYNATRACE_URL}/platform/ingest/v1/events.sdlc`, JSON.stringify(payload), post_params);
-}
-
export function teardown() {
+ // Send event at the end of the test
+ let payload = {
+ "entitySelector": "type(SERVICE),entityName.equals(checkoutservice)",
+ "eventType": "CUSTOM_INFO",
+ "properties": {
+ "tool": "k6",
+ "action": "test",
+ "state": "finished",
+ "purpose": `${__ENV.LOAD_TEST_PURPOSE}`,
+ "duration": test_duration
+ },
+ "title": "k6 load test finished"
+ }
+
+ let res = http.post(`${__ENV.K6_DYNATRACE_URL}/api/v2/events/ingest`, JSON.stringify(payload), post_params);
+ }
+}
+
Create a Workflow to Trigger Guardian#
Three golden signals (checkoutservice)
screen.
@@ -654,25 +655,30 @@
-Create a Workflow to Trigger Guar
event type
from bizevents
to events
.Filter query
to:
+event.type == "test"
-AND event.category == "finished"
-AND service == "checkoutservice"
-
event.type == "CUSTOM_INFO" and
+dt.entity.service.name == "checkoutservice" and
+tool == "k6" and
+action == "test" and
+state == "finished"
+
-run_validation
node.event.timeframe.from
and replace with:
+now-{{ event()['duration'] }}
-
now-{{ event()['duration'] }}
+
now-event.duration
.
-
-event.timeframe.to
and replace with:
-now
-
-
Save
button.event.timeframe.to
and replace with:
+now
+
Click the Save
button.
The workflow is now created and connected to the guardian. It will be triggered whenever the platform receives an event like below.
diff --git a/cleanup/index.html b/cleanup/index.html index dd7054b..3783697 100755 --- a/cleanup/index.html +++ b/cleanup/index.html @@ -203,7 +203,7 @@
Go to https://github.com/codespaces and delete the codespace which will delete the demo environment.
+You may also wish to delete the API token.
Note: This process can be automated for at-scale usage using Monaco or Terraform.
-## TODO: Need secondary tutorial on this? Link to it for each...
-
+Automate at scale
+This process can be automated for at-scale usage using Monaco or Terraform.
+In a real scenario, these test runs would likely be spread over hours, days or weeks. This provides Dynatrace with ample time to gather sufficient usage data.
For demo purposes, 5 separate "load tests" will be triggered in quick succession to enable the baselining.
First, open a new terminal window and apply the load test script:
-kubectl apply -f .devcontainer/k6/k6-load-test-script.yaml
-
+kubectl apply -f .devcontainer/k6/k6-load-test-script.yaml
+
kubectl apply -f .devcontainer/k6/k6-srg-training-run1.yaml
-
+kubectl apply -f .devcontainer/k6/k6-srg-training-run1.yaml
+
Wait a few seconds and trigger the second load test:
-kubectl apply -f .devcontainer/k6/k6-srg-training-run2.yaml
-
+kubectl apply -f .devcontainer/k6/k6-srg-training-run2.yaml
+
Wait a few seconds and trigger the third load test:
-kubectl apply -f .devcontainer/k6/k6-srg-training-run3.yaml
-
+kubectl apply -f .devcontainer/k6/k6-srg-training-run3.yaml
+
Wait a few seconds and trigger the fourth load test:
-kubectl apply -f .devcontainer/k6/k6-srg-training-run4.yaml
-
+kubectl apply -f .devcontainer/k6/k6-srg-training-run4.yaml
+
Wait a few seconds and trigger the final (fifth) load test:
-kubectl apply -f .devcontainer/k6/k6-srg-training-run5.yaml
-
+kubectl apply -f .devcontainer/k6/k6-srg-training-run5.yaml
+
Each load test runs for 1 minute so keep running the following command until you see all jobs listed as Complete
:
kubectl get jobs
-
-➜ /workspaces/obslab-release-validation (main) $ kubectl get jobs
+Each load test runs for 1 minute. Run this command to wait for all jobs to complete.
+This command will appear to hang until the jobs are done. Be patient. It should take about 2mins:
+kubectl -n default wait --for=condition=Complete --all --timeout 120s jobs
+
+➜ /workspaces/obslab-release-validation (main) $ kubectl get jobs
NAME STATUS COMPLETIONS DURATION AGE
k6-training-run1 Complete 1/1 95s 2m2s
k6-training-run2 Complete 1/1 93s 115s
k6-training-run3 Complete 1/1 93s 108s
k6-training-run4 Complete 1/1 90s 100s
k6-training-run5 Complete 1/1 84s 94s
-
+
View Completed Training Runs#
In Dynatrace, go to workflows
and select Executions
. You should see 5 successful workflow executions:
View SRG Status using DQL#
You can also use this DQL to see the Site Reliability Guardian results in a notebook:
-fetch bizevents
+fetch bizevents
| filter event.provider == "dynatrace.site.reliability.guardian"
| filter event.type == "guardian.validation.finished"
| fieldsKeep guardian.id, validation.id, timestamp, guardian.name, validation.status, validation.summary, validation.from, validation.to
-
+
View SRG Status in the Site Reliability Guardian App#
The SRG results are also available in the Site Reliability Guardian app:
@@ -830,9 +813,11 @@ View SRG Status in
You should see the 5
runs listed:
-Training Complete#
+
+Training Complete
The automatic baselines for the guardian are now enabled.
You can proceed to use the guardian for "real" evaluations.
+
- Click Here to Continue
diff --git a/enable-change/index.html b/enable-change/index.html
index 39177e6..0300002 100755
--- a/enable-change/index.html
+++ b/enable-change/index.html
@@ -203,7 +203,7 @@
- GitHub
+ View Code on GitHub
@@ -249,7 +249,7 @@
- GitHub
+ View Code on GitHub
@@ -451,6 +451,15 @@
+
+
+
+
+
+ Change Flag Value
+
+
+
@@ -590,6 +599,15 @@
+
+
+
+
+
+ Change Flag Value
+
+
+
@@ -633,25 +651,31 @@ 8. Make a Change
A product manager informs you that they're ready to release their new feature. They ask you to enable the feature and run the load test in a dev environment.
They tell you that the new feature is behind a flag called paymentServiceFailure
(yes, an obvious name for this demo) and they tell you to change the defaultValue
from off
to on
.
Update the Feature Flag and Inform Dynatrace#
-Run the following script which:
-
-- Notifies Dynatrace using a
CUSTOM_INFO
event of the change inc. the new value
-- Changes the
defaultValue
of the paymentServiceFailure
feature flag to on
-- Applies the configuration change
-
-./runtimeChange.sh paymentServiceFailure on
-
+Run the following script which notifies Dynatrace using a CUSTOM_INFO
event of the change inc. the new value.
+./runtimeChange.sh paymentServiceFailure on
+
+Change Flag Value#
+Locate the flags.yaml
file.
+Change the defaultValue
of the paymentServiceFailure
flag from "off"
to "on"
(line 84
).
+Apply those changes:
+kubectl apply -f $CODESPACE_VSCODE_FOLDER/flags.yaml
+
+You should see:
+configmap/my-otel-demo-flagd-config configured
+
Run Acceptance Load Test#
It is time to run an acceptance load test to see if the new feature has caused a regression.
This load test will run for 3 minutes and then trigger the site reliability guardian again:
-kubectl apply -f .devcontainer/k6/k6-after-change.yaml
-
+kubectl apply -f .devcontainer/k6/k6-after-change.yaml
+
Configuration Change Events#
-While you are waiting for the load test to complete, it is worth noting that each time a feature flag is changed, the runtimeChange.sh
shell script sends an event to the service that is affected.
+While you are waiting for the load test to complete, it is worth noting that each time a feature flag is changed, you should execute the runtimeChange.sh
shell script to send an event to the service that is affected.
The feature flag changes the behaviour of the paymentservice
(which the checkoutservice
depends on).
-Look at the paymentservice
and notice the configuration changed events noted.
-Dynatrace AI uses these events as part of the root cause analysis engine.
+Look at the paymentservice
and notice the configuration changed events.
+
+Tip
You can send events for anything you like: deployments, load tests, security scans, configuration changes and more.
+
diff --git a/getting-started/index.html b/getting-started/index.html
index 817a766..6a473b1 100755
--- a/getting-started/index.html
+++ b/getting-started/index.html
@@ -203,7 +203,7 @@
- GitHub
+ View Code on GitHub
@@ -249,7 +249,7 @@
- GitHub
+ View Code on GitHub
@@ -329,12 +329,27 @@
-
+
+
+ Format Dynatrace Environment URL
+
+
+
+
+
-
@@ -581,12 +596,27 @@
-
+
+
+ Format Dynatrace Environment URL
+
+
+
+
+
-
@@ -622,9 +652,18 @@
Getting Startedsign up here)
- A Dynatrace API token (see below)
-Save the Dynatrace environment URL without the trailing slash and without the .apps.
in the URL:
-https://abc12345.live.dynatrace.com
-
+Format Dynatrace Environment URL#
+Save the Dynatrace environment URL:
+
+- Without the trailing slash
+- Without
.apps.
in the URL
+
+The generic format is:
+https://<EnvironmentID>.<Environment>.<URL>
+
+For example:
+
https://abc12345.live.dynatrace.com
+
Create API Token#
In Dynatrace:
@@ -634,7 +673,6 @@ Create API Token
- GitHub
+ View Code on GitHub
@@ -247,7 +247,7 @@
- GitHub
+ View Code on GitHub
@@ -668,7 +668,7 @@ Logical ArchitectureThe load testing tool is responsible for sending an event to signal "test is finished".
Integrators are responsible for crafting this event to contain any important information required by Dynatrace
such as the test duration.
diff --git a/resources/index.html b/resources/index.html
index 1a5712b..f768d84 100755
--- a/resources/index.html
+++ b/resources/index.html
@@ -203,7 +203,7 @@
- GitHub
+ View Code on GitHub
@@ -249,7 +249,7 @@
- GitHub
+ View Code on GitHub
diff --git a/run-production-srg/index.html b/run-production-srg/index.html
index a6e5bd5..9e272a7 100755
--- a/run-production-srg/index.html
+++ b/run-production-srg/index.html
@@ -203,7 +203,7 @@
- GitHub
+ View Code on GitHub
@@ -249,7 +249,7 @@
- GitHub
+ View Code on GitHub
@@ -431,21 +431,6 @@
-
-
@@ -587,21 +572,6 @@
-
-
@@ -624,46 +594,55 @@
7. Run a Production SRG
-The preparation phase is now complete. Everything before now is a one-off task.
+
+Preparation Complete
+The preparation phase is now complete.
+Everything before now is a one-off task.
In day-to-day operations, you would begin from here.
+
Run an Evaluation#
Now that the Site Reliability Guardian is trained, run another evaluation by triggering a load test.
-
-Remember, the workflow is currently configured to listen for test finished
events but you could easily create additional workflows with different triggers such as on-demand or time-based CRON triggers.
-
-Run another load test to trigger a sixth evaluation.
-kubectl apply -f .devcontainer/k6/k6.yaml
-
-After about 90 seconds, kubectl get jobs
should show:
-➜ /workspaces/obslab-release-validation (main) $ kubectl get jobs
-NAME STATUS COMPLETIONS DURATION AGE
+
+Tip
+Remember, the workflow is currently configured to listen for test finished
events
+but you could easily create additional workflows with different triggers such as on-demand or time-based CRON triggers.
+This provides an ability to continuously test your service (eg. in production).
+
+Run another load test to trigger a sixth evaluation.
+
kubectl apply -f .devcontainer/k6/k6.yaml
+
+Again, wait for all jobs to complete. This run will take longer. Approximately 2mins.
+kubectl -n default wait --for=condition=Complete --all --timeout 120s jobs
+
+When the above command returns, you should see:
+NAME STATUS COMPLETIONS DURATION AGE
k6-training-run1 Complete 1/1 102s 9m41s
k6-training-run2 Complete 1/1 100s 9m33s
k6-training-run3 Complete 1/1 101s 9m23s
k6-training-run4 Complete 1/1 93s 9m17s
k6-training-run5 Complete 1/1 91s 9m11s
run-k6 Complete 1/1 79s 81s
-
-
+
When this evaluation is completed, click the Refresh
button in the Validation history
panel of the site reliability guardian app (when viewing an individual guardian) and the heatmap should look like the image below
-
-Note: Your results may vary.
-In this example below, the Traffic
objective failed because the auto-adaptive thresholds detected that a traffic level below 1171
requests is too low and the actual traffic level was 1158
.
-Because one objective failed, the guardian failed.
-
+
+Your results may vary
+Your results may vary.
+In this example below, the Traffic
objective failed because the auto-adaptive thresholds detected that a traffic level below 1171
requests is too low and the actual traffic level was 1158
.
+Because one objective failed, the guardian failed.
+
5 training runs and 1 "real" run:
-Setting Objectives to Informational Only#
+
+Information Only Objectives
It is possible to add objectives that are "informational only" and do not contribute to the pass / fail decisions.
This is useful for new services where you are trying to "get a feel for" the real-world data values of your metrics.
-To set an objective as "information only":
-
-- Select the objective to open the side panel
-- Scroll down to
Define thresholds
-- Select the
No thresholds
option
-
+To set an objective as "information only":
+* Select the objective to open the side panel
+* Scroll down to Define thresholds
+* Select the No thresholds
option
+
- Click Here to Continue
diff --git a/search/search_index.json b/search/search_index.json
index 8ab14b1..1ac1d2f 100755
--- a/search/search_index.json
+++ b/search/search_index.json
@@ -1 +1 @@
-{"config":{"lang":["en"],"separator":"[\\s\\-]+","pipeline":["stopWordFilter"]},"docs":[{"location":"","title":"Release Validation for DevOps Engineers with Site Reliability Guardian","text":"In this demo, you take on the role of a Product Manager or DevOps engineer. You are running an application, and wish to enable a new feature.
The application is already instrumented to emit tracing data, using the OpenTelemetry standard. The demo system will be automatically configured to transport that data to Dynatrace for storage and processing.
Your job is to:
- Ensure each service in the application is healthy.
- Ensure that any new release of a microservice does not negatively impact the application.
To achieve these objectives, you will:
- Create a Site Reliability Guardian to test and ensure the health of your microservices (starting with the most user impacting service - the
checkoutservice
) - Use the auto baselining capability of Dynatrace to suggest (and dynamically adjust) thresholds based on current and past performance.
"},{"location":"#a-new-release","title":"A New Release","text":"Your company utilises feature flags to enable new features. A product manager informs you that they wish to release a new feature.
It is your job to:
- Enable that feature flag in a development environment.
- Judge the impact (if any) of that change on the application.
- If an impact is observed, gather the evidence and then disable the feature flag.
- Make the \"go / no go\" decision for that feature.
- Provide feedback on why you made the decision you did.
"},{"location":"#logical-architecture","title":"Logical Architecture","text":"Below is the \"flow\" of information and actors during this demo.
This architecture also holds true for other load testing tools (eg. JMeter).
-
A load test is executed. The HTTP requests are annotated with the standard header values.
-
Metrics are streamed during the load test (if the load testing tool supports this) or the metrics are sent at the end of the load test.
-
The load testing tool is responsible for sending an SDLC event to signal \"test is finished\". Integrators are responsible for crafting this event to contain any important information required by Dynatrace such as the test duration.
-
A workflow is triggered on receipt of this event. The workflow triggers the Site Reliability Guardian.
-
The Site Reliability Guardian processes the load testing metrics and to provide an automated load testing report. This can be used for information only or can be used as an automated \"go / no go\" decision point.
-
Dynatrace users can view the results in a dashboard, notebook or use the result as a trigger for further automated workflows.
-
Integrators have the choice to send (emit) the results to an external tool. This external tool can then use this result. One example would be sending the SRG result to Jenkins to progress or prevent a deployment.
"},{"location":"#compatibility","title":"Compatibility","text":"Deployment Tutorial Compatible Dynatrace Managed \u274c Dynatrace SaaS \u2714\ufe0f - Click Here to Begin
"},{"location":"automate-srg/","title":"Automate the Site Reliability Guardian","text":"Site reliability guardians can be automated so they happen whenever you prefer (on demand / on schedule / event based). A Dynatrace workflow is used to achieve this.
In this demo:
- A load test will run and send a \"load test finished\" Software Delivery Lifecycle event into Dynatrace (see below).
- A Dynatrace workflow will react to that event and trigger a guardian.
Let's plumb that together now.
"},{"location":"automate-srg/#sample-k6-teardown-test-finished-event","title":"Sample k6 teardown test finished event","text":"This is already coded into the demo load test script.
export function teardown() {\n let post_params = {\n headers: {\n 'Content-Type': 'application/json',\n 'Authorization': `Api-Token ${__ENV.K6_DYNATRACE_APITOKEN}`\n },\n };\n\n let test_duration = 2m;\n\n // Send SDLC event at the end of the test\n let payload = {\n \"event.provider\": \"k6\",\n \"event.type\": \"test\",\n \"event.category\": \"finished\",\n \"service\": \"checkoutservice\",\n \"duration\": test_duration\n }\n let res = http.post(`${__ENV.K6_DYNATRACE_URL}/platform/ingest/v1/events.sdlc`, JSON.stringify(payload), post_params);\n}\n
"},{"location":"automate-srg/#create-a-workflow-to-trigger-guardian","title":"Create a Workflow to Trigger Guardian","text":"Ensure you are still on the Three golden signals (checkoutservice)
screen.
- Click the
Automate
button. This will create a template workflow. - Change the
event type
from bizevents
to events
. - Change the
Filter query
to:
event.type == \"test\"\nAND event.category == \"finished\"\nAND service == \"checkoutservice\"\n
- Click the
run_validation
node. - Remove
event.timeframe.from
and replace with:
now-{{ event()['duration'] }}\n
The UI will change this to now-event.duration
.
- Remove
event.timeframe.to
and replace with:
now\n
- Click the
Save
button.
The workflow is now created and connected to the guardian. It will be triggered whenever the platform receives an event like below.
The workflow is now live and listening for events.
- Click Here to Continue
"},{"location":"cleanup/","title":"Cleanup","text":"Go to https://github.com/codespaces and delete the codespace which will delete the demo environment.
- View all resources related to this demo
"},{"location":"create-srg/","title":"Create Site Reliability Guardian","text":"Site reliability guardians are a mechanism to automate analysis when changes are made. They can be used in production (on a CRON) or as deployment checks (eg. pre and post deployment health checks, security checks, infrastructure health checks).
We will create a guardian to check the checkoutservice
microservice which is used during the purchase journey.
- Press
ctrl + k
search for Site Reliability Guardian
and select the app. - Click
+ Guardian
to add a new guardian. - Under
Four Golden Signals
choose Use template
. - Click
Run query
and toggle 50
rows per page to see more services. - Select the
checkoutservice
. Click Apply to template (1)
. - Hover over the
Saturation
objective and delete it (there are no resource statistics from OpenTelemetry available so this objective cannot be evaluated). - At the top right of the screen, customise the guardian name to be called
Three golden signals (checkoutservice)
. - Click
Save
Note: This process can be automated for at-scale usage using Monaco or Terraform.
## TODO: Need secondary tutorial on this? Link to it for each...\n
- Click Here to Continue
"},{"location":"enable-auto-baselines/","title":"Enable Automatic Baselining for Site Reliability Guardian","text":"Objectives that are set to \"auto baseline\" in Dynatrace Site Reliability Guardians require 5
runs in order to enable the baselines.
In a real scenario, these test runs would likely be spread over hours, days or weeks. This provides Dynatrace with ample time to gather sufficient usage data.
For demo purposes, 5 seperate \"load tests\" will be triggered in quick succession to enable the baselining.
First, open a new terminal window and apply the load test script:
kubectl apply -f .devcontainer/k6/k6-load-test-script.yaml\n
"},{"location":"enable-auto-baselines/#trigger-the-first-load-test","title":"Trigger the First Load Test","text":"kubectl apply -f .devcontainer/k6/k6-srg-training-run1.yaml\n
"},{"location":"enable-auto-baselines/#trigger-the-second-load-test","title":"Trigger the Second Load Test","text":"Wait a few seconds and trigger the second load test:
kubectl apply -f .devcontainer/k6/k6-srg-training-run2.yaml\n
"},{"location":"enable-auto-baselines/#trigger-the-third-load-test","title":"Trigger the Third Load Test","text":"Wait a few seconds and trigger the third load test:
kubectl apply -f .devcontainer/k6/k6-srg-training-run3.yaml\n
"},{"location":"enable-auto-baselines/#trigger-the-fourth-load-test","title":"Trigger the Fourth Load Test","text":"Wait a few seconds and trigger the fourth load test:
kubectl apply -f .devcontainer/k6/k6-srg-training-run4.yaml\n
"},{"location":"enable-auto-baselines/#trigger-the-final-training-load-test","title":"Trigger the Final Training Load Test","text":"Wait a few seconds and trigger the final (fifth) load test:
kubectl apply -f .devcontainer/k6/k6-srg-training-run5.yaml\n
"},{"location":"enable-auto-baselines/#wait-for-completion","title":"Wait for Completion","text":"Each load test runs for 1 minute so keep running the following command until you see all jobs listed as Complete
:
kubectl get jobs\n
\u279c /workspaces/obslab-release-validation (main) $ kubectl get jobs\nNAME STATUS COMPLETIONS DURATION AGE\nk6-training-run1 Complete 1/1 95s 2m2s\nk6-training-run2 Complete 1/1 93s 115s\nk6-training-run3 Complete 1/1 93s 108s\nk6-training-run4 Complete 1/1 90s 100s\nk6-training-run5 Complete 1/1 84s 94s\n
"},{"location":"enable-auto-baselines/#view-completed-training-runs","title":"View Completed Training Runs","text":"In Dynatrace, go to workflows
and select Executions
. You should see 5 successful workflow executions:
"},{"location":"enable-auto-baselines/#view-srg-status-using-dql","title":"View SRG Status using DQL","text":"You can also use this DQL to see the Site Reliability Guardian results in a notebook:
fetch bizevents\n| filter event.provider == \"dynatrace.site.reliability.guardian\"\n| filter event.type == \"guardian.validation.finished\"\n| fieldsKeep guardian.id, validation.id, timestamp, guardian.name, validation.status, validation.summary, validation.from, validation.to\n
"},{"location":"enable-auto-baselines/#view-srg-status-in-the-site-reliability-guardian-app","title":"View SRG Status in the Site Reliability Guardian App","text":"The SRG results are also available in the Site Reliabiltiy Guardian app:
- Press
ctrl + k
- Search for
site reliability guardian
or srg
- Open the app and click
Open
on your guardian
You should see the 5
runs listed:
"},{"location":"enable-auto-baselines/#training-complete","title":"Training Complete","text":"The automatic baselines for the guardian are now enabled.
You can proceed to use the guardian for \"real\" evaluations.
- Click Here to Continue
"},{"location":"enable-change/","title":"8. Make a Change","text":"A product manager informs you that they're ready to release their new feature. They ask you to enable the feature and run the load test in a dev environment.
They tell you that the new feature is behind a flag called paymentServiceFailure
(yes, an obvious name for this demo) and they tell you to change the defaultValue
from off
to on
.
"},{"location":"enable-change/#update-the-feature-flag-and-inform-dynatrce","title":"Update the Feature Flag and Inform Dynatrce","text":"Run the following script which:
- Notifies Dynatrace using a
CUSTOM_INFO
event of the change inc. the new value - Changes the
defaultValue
of the paymentServiceFailure
feature flag to on
- Applies the configuration change
./runtimeChange.sh paymentServiceFailure on\n
"},{"location":"enable-change/#run-acceptance-load-test","title":"Run Acceptance Load Test","text":"It is time to run an acceptance load test to see if the new feature has caused a regression.
This load test will run for 3 minutes and then trigger the site reliability guardian again:
kubectl apply -f .devcontainer/k6/k6-after-change.yaml\n
"},{"location":"enable-change/#configuration-change-events","title":"Configuration Change Events","text":"While you are waiting for the load test to complete, it is worth noting that each time a feature flag is changed, the runtimeChange.sh
shell script sends an event to the service that is affected.
The feature flag changes the behaviour of the paymentservice
(which the checkoutservice
depends on).
Look at the paymentservice
and notice the configuration changed events noted.
Dynatrace AI uses these events as part of the root cause analysis engine.
You can send events for anything you like: deployments, load tests, security scans, configuration changes and more.
- Click Here to Continue
"},{"location":"getting-started/","title":"Getting Started","text":"You must have the following to use this hands on demo.
- A Dynatrace environment (sign up here)
- A Dynatrace API token (see below)
Save the Dynatrace environment URL without the trailing slash and without the .apps.
in the URL:
https://abc12345.live.dynatrace.com\n
"},{"location":"getting-started/#create-api-token","title":"Create API Token","text":"In Dynatrace:
- Press
ctrl + k
. Search for access tokens
. - Create a new access token with the following permissions:
metrics.ingest
logs.ingest
events.ingest
openTelemetryTrace.ingest
openpipeline.events_sdlc
"},{"location":"getting-started/#start-demo","title":"Start Demo","text":"Click this button to open the demo environment. This will open in a new tab.
- Click Here to Continue
"},{"location":"resources/","title":"Resources","text":" - Free Dynatrace trial
- This repository and documentation on GitHub
- Where to next?
"},{"location":"run-production-srg/","title":"7. Run a Production SRG","text":"The preparation phase is now complete. Everything before now is a one-off task.
In day-to-day operations, you would begin from here.
"},{"location":"run-production-srg/#run-an-evaluation","title":"Run an Evaluation","text":"Now that the Site Reliability Guardian is trained, run another evaluation by triggering a load test.
Remember, the workflow is currently configured to listen for test finished
events but you could easily create additional workflows with different triggers such as on-demand or time-based CRON triggers.
Run another load test to trigger a sixth evaluation.
kubectl apply -f .devcontainer/k6/k6.yaml\n
After about 90 seconds, kubectl get jobs
should show:
\u279c /workspaces/obslab-release-validation (main) $ kubectl get jobs\nNAME STATUS COMPLETIONS DURATION AGE\nk6-training-run1 Complete 1/1 102s 9m41s\nk6-training-run2 Complete 1/1 100s 9m33s\nk6-training-run3 Complete 1/1 101s 9m23s\nk6-training-run4 Complete 1/1 93s 9m17s\nk6-training-run5 Complete 1/1 91s 9m11s\nrun-k6 Complete 1/1 79s 81s\n\n
When this evaluation is completed, click the Refresh
button in the Validation history
panel of the site reliability guardian app (when viewing an individual guardian) and the heatmap should look like the image below
Note: Your results may vary.
In this example below, the Traffic
objective failed because the auto-adaptive thresholds detected that a traffic level below 1171
requests is too low and the actual traffic level was 1158
. Because one objective failed, the guardian failed.
5 training runs and 1 \"real\" run:
"},{"location":"run-production-srg/#setting-objectives-to-informational-only","title":"Setting Objectives to Informational Only","text":"It is possible to add objectives that are \"informational only\" and do not contribute to the pass / fail decisions.
This is useful for new services where you are trying to \"get a feel for\" the real-world data values of your metrics.
To set an objective as \"information only\":
- Select the objective to open the side panel
- Scroll down to
Define thresholds
- Select the
No thresholds
option
- Click Here to Continue
"},{"location":"validate-telemetry/","title":"Start The Demo","text":"After the codespaces has started, the post creation script should begin. This will install everything and will take a few moments.
When the script has completed, a success message will briefly be displayed (it is so quick you'll probably miss it) and an empty terminal window will be shown.
"},{"location":"validate-telemetry/#wait-for-demo-to-start","title":"Wait For Demo to Start","text":"Wait for the demo application pods to start:
kubectl -n default wait --for=condition=Ready --all --timeout 300s pod\n
"},{"location":"validate-telemetry/#access-demo-user-interface","title":"Access Demo User Interface","text":"Start port forwarding to access the user interface:
kubectl -n default port-forward svc/my-otel-demo-frontendproxy 8080\n
Leave this command running. Open a new terminal window to run any other commands.
Go to ports tab, right click the demo app
entry and choose Open in browser
.
You should see the OpenTelemetry demo:
"},{"location":"validate-telemetry/#validate-telemetry","title":"Validate Telemetry","text":"It is time to ensure telemetry is flowing correctly into Dynatrace.
In Dynatrace, follow these steps:
"},{"location":"validate-telemetry/#validate-services","title":"Validate Services","text":" - Press
ctrl + k
. Search for services
. Go to services screen and validate you can see services. - Open a service and validate that the URL contains
SERVICE-****
. - If the URL contains
CUSTOM_DEVICE-****
: - Press
ctrl + k
and search for settings
. - Go to
Service Detection > Unified services for OpenTelemetry
and ensure the toggle is on.
"},{"location":"validate-telemetry/#validate-traces","title":"Validate Traces","text":" - Press
ctrl + k
. Search for distributed traces
. - Go to distributed traces and validate data is flowing.
"},{"location":"validate-telemetry/#validate-metrics","title":"Validate Metrics","text":" - Press
ctrl + k
. Search for metrics
. - Go to metrics and search for
app.
and validate you can see some metrics.
"},{"location":"validate-telemetry/#validate-logs","title":"Validate Logs","text":" - Press
ctrl + k
. Search for notebooks
. - Create a new notebook then click
+
to add a new DQL
section. - Use this Dynatrace Query Language. Validate you can see some log lines.
fetch logs, scanLimitGBytes: 1\n| filter contains(content, \"conversion\")\n
"},{"location":"validate-telemetry/#telemetry-flowing","title":"Telemetry Flowing?","text":"If these four things are OK, your telemetry is flowing correctly into Dynatrace.
If not, please search for similar problems and / or raise an issue here.
- Click Here to Continue
"},{"location":"view-acceptance-test-results/","title":"9. View Acceptance Test Results","text":""},{"location":"view-acceptance-test-results/#view-data","title":"View Data","text":"After ~3 minutes, kubectl get jobs
should show the acceptance-load-test
to be Complete
.
Refresh the Site Reliability Guardian results heatmap again and you should see that the guardian has failed.
The guardian has failed due to the error rate being too high.
Navigating to the checkoutservice
(ctrl + k
> services
> checkoutservice
), you can see the increase in failure rate.
Scroll down the services screen until you see the OpenTelemetry traces list. Notice lots of failed requests:
"},{"location":"view-acceptance-test-results/#analyse-a-failed-request","title":"Analyse a Failed Request","text":"Drill into one of the failed requests and notice lots of failures.
These failures are bubbling up through the request chain back towards the checkoutservice.
Ultimately though, the failure comes from the final span in the trace: The call to PaymentService/Charge
.
Investigating the span events the cause of the failure becomes clear: The payment service cuase an exception. The exception message and stacktrace is given:
exception.message PaymentService Fail Feature Flag Enabled\nexception.stacktrace Error: PaymentService Fail Feature Flag Enabled at module.exports.charge\n (/usr/src/app/charge.js:21:11) at process.processTicksAndRejections\n (node:internal/process/task_queues:95:5) at async Object.chargeServiceHandler\n [as charge] (/usr/src/app/index.js:21:22)\nexception.type Error\n
"},{"location":"view-acceptance-test-results/#roll-back-change","title":"Roll Back Change","text":"Roll back the change:
./runtimeChange.sh paymentServiceFailure off\n
"},{"location":"view-acceptance-test-results/#summary","title":"Summary","text":"Looking back at the initial brief, it was your job to:
- Enable that feature flag in a development environment.
- Judge the impact (if any) of that change on the application.
- If an impact is observed, gather the evidence and then disable the feature flag.
- Make the \"go / no go\" decision for that feature.
- Provide feedback to the product managers on why you made the decision you did.
So how did things turn out?
- You have enabled a feature flag and send contextual event information to Dynatrace.
- You used OpenTelemetry and Dynatrace to make an evidence-based analysis of the new software quality.
- You have automated the change analysis, noticing an impact and remediated it.
- You have protected users by automating this analysis in a development environment (of course, you could repeat this setup in production too).
- You have made the
no go
decision based on evidence provided by OpenTelemetry and the Dynatrace Site Reliability Guardian. - You can provide this evidence (down to the stacktrace and line of code) back to the product manager so they can prioritise fixes.
The Dynatrace Platform, Site Reliability Guardian and Workflows have provided visibility and automated change analysis.
- Cleanup Resources
"},{"location":"whats-next/","title":"What's Next?","text":"TODO
Content about how the user progresses after this demo.
"}]}
\ No newline at end of file
+{"config":{"lang":["en"],"separator":"[\\s\\-]+","pipeline":["stopWordFilter"]},"docs":[{"location":"","title":"Release Validation for DevOps Engineers with Site Reliability Guardian","text":"In this demo, you take on the role of a Product Manager or DevOps engineer. You are running an application, and wish to enable a new feature.
The application is already instrumented to emit tracing data, using the OpenTelemetry standard. The demo system will be automatically configured to transport that data to Dynatrace for storage and processing.
Your job is to:
- Ensure each service in the application is healthy.
- Ensure that any new release of a microservice does not negatively impact the application.
To achieve these objectives, you will:
- Create a Site Reliability Guardian to test and ensure the health of your microservices (starting with the most user impacting service - the
checkoutservice
) - Use the auto baselining capability of Dynatrace to suggest (and dynamically adjust) thresholds based on current and past performance.
"},{"location":"#a-new-release","title":"A New Release","text":"Your company utilises feature flags to enable new features. A product manager informs you that they wish to release a new feature.
It is your job to:
- Enable that feature flag in a development environment.
- Judge the impact (if any) of that change on the application.
- If an impact is observed, gather the evidence and then disable the feature flag.
- Make the \"go / no go\" decision for that feature.
- Provide feedback on why you made the decision you did.
"},{"location":"#logical-architecture","title":"Logical Architecture","text":"Below is the \"flow\" of information and actors during this demo.
This architecture also holds true for other load testing tools (eg. JMeter).
-
A load test is executed. The HTTP requests are annotated with the standard header values.
-
Metrics are streamed during the load test (if the load testing tool supports this) or the metrics are sent at the end of the load test.
-
The load testing tool is responsible for sending an event to signal \"test is finished\". Integrators are responsible for crafting this event to contain any important information required by Dynatrace such as the test duration.
-
A workflow is triggered on receipt of this event. The workflow triggers the Site Reliability Guardian.
-
The Site Reliability Guardian processes the load testing metrics and to provide an automated load testing report. This can be used for information only or can be used as an automated \"go / no go\" decision point.
-
Dynatrace users can view the results in a dashboard, notebook or use the result as a trigger for further automated workflows.
-
Integrators have the choice to send (emit) the results to an external tool. This external tool can then use this result. One example would be sending the SRG result to Jenkins to progress or prevent a deployment.
"},{"location":"#compatibility","title":"Compatibility","text":"Deployment Tutorial Compatible Dynatrace Managed \u274c Dynatrace SaaS \u2714\ufe0f - Click Here to Begin
"},{"location":"automate-srg/","title":"Automate the Site Reliability Guardian","text":"Site reliability guardians can be automated so they happen whenever you prefer (on demand / on schedule / event based). A Dynatrace workflow is used to achieve this.
In this demo:
- A load test will run and send a \"load test finished\" Software Delivery Lifecycle event into Dynatrace (see below).
- A Dynatrace workflow will react to that event and trigger a guardian.
Let's plumb that together now.
Sample k6 teardown test finished event
For information only, no action is required.
This is already coded into the demo load test script.
export function teardown() {\n // Send event at the end of the test\n let payload = {\n \"entitySelector\": \"type(SERVICE),entityName.equals(checkoutservice)\",\n \"eventType\": \"CUSTOM_INFO\",\n \"properties\": {\n \"tool\": \"k6\",\n \"action\": \"test\",\n \"state\": \"finished\",\n \"purpose\": `${__ENV.LOAD_TEST_PURPOSE}`,\n \"duration\": test_duration\n },\n \"title\": \"k6 load test finished\"\n }\n\n let res = http.post(`${__ENV.K6_DYNATRACE_URL}/api/v2/events/ingest`, JSON.stringify(payload), post_params);\n }\n}\n
"},{"location":"automate-srg/#create-a-workflow-to-trigger-guardian","title":"Create a Workflow to Trigger Guardian","text":"Ensure you are still on the Three golden signals (checkoutservice)
screen.
- Click the
Automate
button. This will create a template workflow. - Change the
event type
from bizevents
to events
. - Change the
Filter query
to:
event.type == \"CUSTOM_INFO\" and\ndt.entity.service.name == \"checkoutservice\" and\ntool == \"k6\" and\naction == \"test\" and\nstate == \"finished\"\n
- Click the
run_validation
node. - Remove
event.timeframe.from
and replace with:
now-{{ event()['duration'] }}\n
The UI will change this to now-event.duration
.
-
Remove event.timeframe.to
and replace with:
now\n
-
Click the Save
button.
"},{"location":"automate-srg/#workflow-created","title":"Workflow Created","text":"The workflow is now created and connected to the guardian. It will be triggered whenever the platform receives an event like below.
The workflow is now live and listening for events.
- Click Here to Continue
"},{"location":"cleanup/","title":"Cleanup","text":"Go to https://github.com/codespaces and delete the codespace which will delete the demo environment.
You may also wish to delete the API token.
- View all resources related to this demo
"},{"location":"create-srg/","title":"Create Site Reliability Guardian","text":"Site reliability guardians are a mechanism to automate analysis when changes are made. They can be used in production (on a CRON) or as deployment checks (eg. pre and post deployment health checks, security checks, infrastructure health checks).
We will create a guardian to check the checkoutservice
microservice which is used during the purchase journey.
- Press
ctrl + k
search for Site Reliability Guardian
and select the app. - Click
+ Guardian
to add a new guardian. - Under
Four Golden Signals
choose Use template
. - Click
Run query
and toggle 50
rows per page to see more services. - Select the
checkoutservice
. Click Apply to template (1)
. - Hover over the
Saturation
objective and delete it (there are no resource statistics from OpenTelemetry available so this objective cannot be evaluated). - At the top right of the screen, customise the guardian name to be called
Three golden signals (checkoutservice)
. - Click
Save
Automate at scale
This process can be automated for at-scale usage using Monaco or Terraform.
- Click Here to Continue
"},{"location":"enable-auto-baselines/","title":"Enable Automatic Baselining for Site Reliability Guardian","text":"Objectives that are set to \"auto baseline\" in Dynatrace Site Reliability Guardians require 5
runs in order to enable the baselines.
In a real scenario, these test runs would likely be spread over hours, days or weeks. This provides Dynatrace with ample time to gather sufficient usage data.
For demo purposes, 5 separate \"load tests\" will be triggered in quick succession to enable the baselining.
First, open a new terminal window and apply the load test script:
kubectl apply -f .devcontainer/k6/k6-load-test-script.yaml\n
"},{"location":"enable-auto-baselines/#trigger-the-first-load-test","title":"Trigger the First Load Test","text":"kubectl apply -f .devcontainer/k6/k6-srg-training-run1.yaml\n
"},{"location":"enable-auto-baselines/#trigger-the-second-load-test","title":"Trigger the Second Load Test","text":"Wait a few seconds and trigger the second load test:
kubectl apply -f .devcontainer/k6/k6-srg-training-run2.yaml\n
"},{"location":"enable-auto-baselines/#trigger-the-third-load-test","title":"Trigger the Third Load Test","text":"Wait a few seconds and trigger the third load test:
kubectl apply -f .devcontainer/k6/k6-srg-training-run3.yaml\n
"},{"location":"enable-auto-baselines/#trigger-the-fourth-load-test","title":"Trigger the Fourth Load Test","text":"Wait a few seconds and trigger the fourth load test:
kubectl apply -f .devcontainer/k6/k6-srg-training-run4.yaml\n
"},{"location":"enable-auto-baselines/#trigger-the-final-training-load-test","title":"Trigger the Final Training Load Test","text":"Wait a few seconds and trigger the final (fifth) load test:
kubectl apply -f .devcontainer/k6/k6-srg-training-run5.yaml\n
"},{"location":"enable-auto-baselines/#wait-for-completion","title":"Wait for Completion","text":"Each load test runs for 1 minute. Run this command to wait for all jobs to complete.
This command will appear to hang until the jobs are done. Be patient. It should take about 2mins:
kubectl -n default wait --for=condition=Complete --all --timeout 120s jobs\n
\u279c /workspaces/obslab-release-validation (main) $ kubectl get jobs\nNAME STATUS COMPLETIONS DURATION AGE\nk6-training-run1 Complete 1/1 95s 2m2s\nk6-training-run2 Complete 1/1 93s 115s\nk6-training-run3 Complete 1/1 93s 108s\nk6-training-run4 Complete 1/1 90s 100s\nk6-training-run5 Complete 1/1 84s 94s\n
"},{"location":"enable-auto-baselines/#view-completed-training-runs","title":"View Completed Training Runs","text":"In Dynatrace, go to workflows
and select Executions
. You should see 5 successful workflow executions:
"},{"location":"enable-auto-baselines/#view-srg-status-using-dql","title":"View SRG Status using DQL","text":"You can also use this DQL to see the Site Reliability Guardian results in a notebook:
fetch bizevents\n| filter event.provider == \"dynatrace.site.reliability.guardian\"\n| filter event.type == \"guardian.validation.finished\"\n| fieldsKeep guardian.id, validation.id, timestamp, guardian.name, validation.status, validation.summary, validation.from, validation.to\n
"},{"location":"enable-auto-baselines/#view-srg-status-in-the-site-reliability-guardian-app","title":"View SRG Status in the Site Reliability Guardian App","text":"The SRG results are also available in the Site Reliability Guardian app:
- Press
ctrl + k
- Search for
site reliability guardian
or srg
- Open the app and click
Open
on your guardian
You should see the 5
runs listed:
Training Complete
The automatic baselines for the guardian are now enabled.
You can proceed to use the guardian for \"real\" evaluations.
- Click Here to Continue
"},{"location":"enable-change/","title":"8. Make a Change","text":"A product manager informs you that they're ready to release their new feature. They ask you to enable the feature and run the load test in a dev environment.
They tell you that the new feature is behind a flag called paymentServiceFailure
(yes, an obvious name for this demo) and they tell you to change the defaultValue
from off
to on
.
"},{"location":"enable-change/#update-the-feature-flag-and-inform-dynatrce","title":"Update the Feature Flag and Inform Dynatrace","text":"Run the following script which notifies Dynatrace using a CUSTOM_INFO
event of the change, including the new value.
./runtimeChange.sh paymentServiceFailure on\n
"},{"location":"enable-change/#change-flag-value","title":"Change Flag Value","text":"Locate the flags.yaml
file. Change the defaultValue
of the paymentServiceFailure
flag from \"off\"
to \"on\"
(line 84
).
Apply those changes:
kubectl apply -f $CODESPACE_VSCODE_FOLDER/flags.yaml\n
You should see:
configmap/my-otel-demo-flagd-config configured\n
"},{"location":"enable-change/#run-acceptance-load-test","title":"Run Acceptance Load Test","text":"It is time to run an acceptance load test to see if the new feature has caused a regression.
This load test will run for 3 minutes and then trigger the site reliability guardian again:
kubectl apply -f .devcontainer/k6/k6-after-change.yaml\n
"},{"location":"enable-change/#configuration-change-events","title":"Configuration Change Events","text":"While you are waiting for the load test to complete, it is worth noting that each time a feature flag is changed, you should execute the runtimeChange.sh
shell script to send an event to the service that is affected.
The feature flag changes the behaviour of the paymentservice
(which the checkoutservice
depends on).
Look at the paymentservice
and notice the configuration changed events.
Tip
You can send events for anything you like: deployments, load tests, security scans, configuration changes and more.
- Click Here to Continue
"},{"location":"getting-started/","title":"Getting Started","text":"You must have the following to use this hands on demo.
- A Dynatrace environment (sign up here)
- A Dynatrace API token (see below)
"},{"location":"getting-started/#format-dynatrace-environment-url","title":"Format Dynatrace Environment URL","text":"Save the Dynatrace environment URL:
- Without the trailing slash
- Without
.apps.
in the URL
The generic format is:
https://<EnvironmentID>.<Environment>.<URL>\n
For example:
https://abc12345.live.dynatrace.com\n
"},{"location":"getting-started/#create-api-token","title":"Create API Token","text":"In Dynatrace:
- Press
ctrl + k
. Search for access tokens
. - Create a new access token with the following permissions:
metrics.ingest
logs.ingest
events.ingest
openTelemetryTrace.ingest
"},{"location":"getting-started/#start-demo","title":"Start Demo","text":"Click this button to open the demo environment. This will open in a new tab.
- Click Here to Continue
"},{"location":"resources/","title":"Resources","text":" - Free Dynatrace trial
- This repository and documentation on GitHub
- Where to next?
"},{"location":"run-production-srg/","title":"7. Run a Production SRG","text":"Preparation Complete
The preparation phase is now complete. Everything before now is a one-off task.
In day-to-day operations, you would begin from here.
"},{"location":"run-production-srg/#run-an-evaluation","title":"Run an Evaluation","text":"Now that the Site Reliability Guardian is trained, run another evaluation by triggering a load test.
Tip
Remember, the workflow is currently configured to listen for test finished
events but you could easily create additional workflows with different triggers such as on-demand or time-based CRON triggers.
This provides an ability to continuously test your service (eg. in production).
Run another load test to trigger a sixth evaluation.
kubectl apply -f .devcontainer/k6/k6.yaml\n
Again, wait for all jobs to complete. This run will take longer. Approximately 2mins.
kubectl -n default wait --for=condition=Complete --all --timeout 120s jobs\n
When the above command returns, you should see:
NAME STATUS COMPLETIONS DURATION AGE\nk6-training-run1 Complete 1/1 102s 9m41s\nk6-training-run2 Complete 1/1 100s 9m33s\nk6-training-run3 Complete 1/1 101s 9m23s\nk6-training-run4 Complete 1/1 93s 9m17s\nk6-training-run5 Complete 1/1 91s 9m11s\nrun-k6 Complete 1/1 79s 81s\n
When this evaluation is completed, click the Refresh
button in the Validation history
panel of the site reliability guardian app (when viewing an individual guardian) and the heatmap should look like the image below
Your results may vary
Your results may vary. In this example below, the Traffic
objective failed because the auto-adaptive thresholds detected that a traffic level below 1171
requests is too low and the actual traffic level was 1158
.
Because one objective failed, the guardian failed.
5 training runs and 1 \"real\" run:
Information Only Objectives
It is possible to add objectives that are \"informational only\" and do not contribute to the pass / fail decisions.
This is useful for new services where you are trying to \"get a feel for\" the real-world data values of your metrics.
To set an objective as \"information only\": * Select the objective to open the side panel * Scroll down to Define thresholds
* Select the No thresholds
option
- Click Here to Continue
"},{"location":"validate-telemetry/","title":"Start The Demo","text":"After the codespaces has started, the post creation script should begin. This will install everything and will take a few moments.
When the script has completed, a success message will briefly be displayed (it is so quick you'll probably miss it) and an empty terminal window will be shown.
"},{"location":"validate-telemetry/#wait-for-demo-to-start","title":"Wait For Demo to Start","text":"Wait for the demo application pods to start:
kubectl -n default wait --for=condition=Ready --all --timeout 300s pod\n
"},{"location":"validate-telemetry/#access-demo-user-interface","title":"Access Demo User Interface","text":"Start port forwarding to access the user interface:
kubectl -n default port-forward svc/my-otel-demo-frontendproxy 8080\n
Leave this command running. Open a new terminal window to run any other commands.
Go to ports tab, right click the demo app
entry and choose Open in browser
.
You should see the OpenTelemetry demo:
"},{"location":"validate-telemetry/#validate-telemetry","title":"Validate Telemetry","text":"It is time to ensure telemetry is flowing correctly into Dynatrace.
In Dynatrace, follow these steps:
"},{"location":"validate-telemetry/#validate-services","title":"Validate Services","text":" - Press
ctrl + k
. Search for services
. Go to services screen and validate you can see services. - Open a service and validate that the URL contains
SERVICE-****
. - If the URL contains
CUSTOM_DEVICE-****
: - Press
ctrl + k
and search for settings
. - Go to
Service Detection > Unified services for OpenTelemetry
and ensure the toggle is on.
"},{"location":"validate-telemetry/#validate-traces","title":"Validate Traces","text":" - Press
ctrl + k
. Search for distributed traces
. - Go to distributed traces and validate data is flowing.
"},{"location":"validate-telemetry/#validate-metrics","title":"Validate Metrics","text":" - Press
ctrl + k
. Search for metrics
. - Go to metrics and search for
app.
and validate you can see some metrics.
"},{"location":"validate-telemetry/#validate-logs","title":"Validate Logs","text":" - Press
ctrl + k
. Search for notebooks
. - Create a new notebook then click
+
to add a new DQL
section. - Use this Dynatrace Query Language. Validate you can see some log lines.
fetch logs, scanLimitGBytes: 1\n| filter contains(content, \"conversion\")\n
"},{"location":"validate-telemetry/#telemetry-flowing","title":"Telemetry Flowing?","text":"If these four things are OK, your telemetry is flowing correctly into Dynatrace.
If not, please search for similar problems and / or raise an issue here.
- Click Here to Continue
"},{"location":"view-acceptance-test-results/","title":"9. View Acceptance Test Results","text":""},{"location":"view-acceptance-test-results/#view-data","title":"View Data","text":"Wait for all jobs to complete:
kubectl -n default wait --for=condition=Complete --all --timeout 120s jobs\n
All jobs (including the acceptance-load-test
) should now be Complete
.
Refresh the Site Reliability Guardian results heatmap again and notice that the guardian has failed.
The guardian has failed due to the error rate being too high.
Navigating to the checkoutservice
(ctrl + k
> services
> checkoutservice
), you can see the increase in failure rate.
Scroll down the services screen until you see the OpenTelemetry traces list. Notice lots of failed requests:
"},{"location":"view-acceptance-test-results/#analyse-a-failed-request","title":"Analyse a Failed Request","text":"Drill into one of the failed requests and notice lots of failures.
These failures are bubbling up through the request chain back towards the checkoutservice.
Ultimately though, the failure comes from the final span in the trace: The call to PaymentService/Charge
.
Investigating the span events, the cause of the failure becomes clear: the payment service caused an exception. The exception message and stacktrace are given:
exception.message PaymentService Fail Feature Flag Enabled\nexception.stacktrace Error: PaymentService Fail Feature Flag Enabled at module.exports.charge\n (/usr/src/app/charge.js:21:11) at process.processTicksAndRejections\n (node:internal/process/task_queues:95:5) at async Object.chargeServiceHandler\n [as charge] (/usr/src/app/index.js:21:22)\nexception.type Error\n
"},{"location":"view-acceptance-test-results/#roll-back-change","title":"Roll Back Change","text":"Inform Dynatrace that a change in configuration is coming. The paymentServiceFailure
flag will be set to off
./runtimeChange.sh paymentServiceFailure off\n
Again edit flags.yaml
and set the defaultValue
of paymentServiceFailure
from \"on\"
to \"off\"
(line 84
)
Apply the changes:
kubectl apply -f $CODESPACE_VSCODE_FOLDER/flags.yaml\n
"},{"location":"view-acceptance-test-results/#summary","title":"Summary","text":"Looking back at the initial brief, it was your job to:
- Enable a feature flag in a development environment.
- Judge the impact (if any) of that change on the application.
- If an impact is observed, gather the evidence and then disable the feature flag.
- Make the \"go / no go\" decision for that feature.
- Provide feedback to the product managers on why you made the decision you did.
So how did things turn out?
- You have enabled a feature flag and sent contextual event information to Dynatrace.
- You used OpenTelemetry and Dynatrace to make an evidence-based analysis of the new software quality.
- You have automated the change analysis, noticing an impact and remediated it.
- You have protected users by automating this analysis in a development environment (of course, you could repeat this setup in production too).
- You have made the
no go
decision based on evidence provided by OpenTelemetry and the Dynatrace Site Reliability Guardian. - You can provide this evidence (down to the stacktrace and line of code) back to the product manager so they can prioritise fixes.
Works with any metric
The techniques described here work with any metric, from any source.
You are encouraged to use metrics from other devices and sources (such as business related metrics like revenue).
Success
The Dynatrace Platform, Site Reliability Guardian and Workflows have provided visibility and automated change analysis.
- Cleanup Resources
"},{"location":"whats-next/","title":"What's Next?","text":"Content about how the user progresses after this demo.
"}]}
\ No newline at end of file
diff --git a/validate-telemetry/index.html b/validate-telemetry/index.html
index 92f8aad..6e47764 100755
--- a/validate-telemetry/index.html
+++ b/validate-telemetry/index.html
@@ -203,7 +203,7 @@
- GitHub
+ View Code on GitHub
@@ -249,7 +249,7 @@
- GitHub
+ View Code on GitHub
@@ -743,12 +743,12 @@ Start The Demo
Wait For Demo to Start#
Wait for the demo application pods to start:
-kubectl -n default wait --for=condition=Ready --all --timeout 300s pod
-
+kubectl -n default wait --for=condition=Ready --all --timeout 300s pod
+
Access Demo User Interface#
Start port forwarding to access the user interface:
-kubectl -n default port-forward svc/my-otel-demo-frontendproxy 8080
-
+kubectl -n default port-forward svc/my-otel-demo-frontendproxy 8080
+
Leave this command running. Open a new terminal window to run any other commands.
Go to ports tab, right click the demo app
entry and choose Open in browser
.
@@ -788,9 +788,9 @@ Validate LogsCreate a new notebook then click +
to add a new DQL
section.
Use this Dynatrace Query Language. Validate you can see some log lines.
-fetch logs, scanLimitGBytes: 1
+fetch logs, scanLimitGBytes: 1
| filter contains(content, "conversion")
-
+
Telemetry Flowing?#
If these four things are OK, your telemetry is flowing correctly into Dynatrace.
diff --git a/view-acceptance-test-results/index.html b/view-acceptance-test-results/index.html
index 38c16ec..1c16226 100755
--- a/view-acceptance-test-results/index.html
+++ b/view-acceptance-test-results/index.html
@@ -203,7 +203,7 @@
- GitHub
+ View Code on GitHub
@@ -249,7 +249,7 @@
- GitHub
+ View Code on GitHub
@@ -649,8 +649,11 @@
9. View Acceptance Test Results
View Data#
-
After ~3 minutes, kubectl get jobs
should show the acceptance-load-test
to be Complete
.
-Refresh the Site Reliability Guardian results heatmap again and you should see that the guardian has failed.
+Wait for all jobs to complete:
+kubectl -n default wait --for=condition=Complete --all --timeout 120s jobs
+
+All jobs (including the acceptance-load-test
) should now be Complete
.
+Refresh the Site Reliability Guardian results heatmap again and notice that the guardian has failed.
The guardian has failed due to the error rate being too high.
@@ -664,22 +667,27 @@ Analyse a Failed RequestThese failures are bubbling up through the request chain back towards the checkoutservice.
Ultimately though, the failure comes from the final span in the trace: The call to PaymentService/Charge
.
Investigating the span events the cause of the failure becomes clear: The payment service cuase an exception. The exception message and stacktrace is given:
-exception.message PaymentService Fail Feature Flag Enabled
+exception.message PaymentService Fail Feature Flag Enabled
exception.stacktrace Error: PaymentService Fail Feature Flag Enabled at module.exports.charge
(/usr/src/app/charge.js:21:11) at process.processTicksAndRejections
(node:internal/process/task_queues:95:5) at async Object.chargeServiceHandler
[as charge] (/usr/src/app/index.js:21:22)
exception.type Error
-
+
Roll Back Change#
-Roll back the change:
-./runtimeChange.sh paymentServiceFailure off
-
+Inform Dynatrace that a change in configuration is coming.
+The paymentServiceFailure
flag will be set to off
+./runtimeChange.sh paymentServiceFailure off
+
+Again edit flags.yaml
and set the defaultValue
of paymentServiceFailure
from "on"
to "off"
(line 84
)
+Apply the changes:
+kubectl apply -f $CODESPACE_VSCODE_FOLDER/flags.yaml
+
Summary#
Looking back at the initial brief, it was your job to:
-- Enable that feature flag in a development environment.
+- Enable a feature flag in a development environment.
- Judge the impact (if any) of that change on the application.
- If an impact is observed, gather the evidence and then disable the feature flag.
- Make the "go / no go" decision for that feature.
@@ -694,7 +702,15 @@ Summary
+Works with any metric
+The techniques described here work with any metric, from any source.
+You are encouraged to use metrics from other devices and sources (such as business related metrics like revenue).
+
+
+Success
The Dynatrace Platform, Site Reliability Guardian and Workflows have provided visibility and automated change analysis.
+
- Cleanup Resources
diff --git a/whats-next/index.html b/whats-next/index.html
index ac44f1c..75ea437 100755
--- a/whats-next/index.html
+++ b/whats-next/index.html
@@ -201,7 +201,7 @@
- GitHub
+ View Code on GitHub
@@ -247,7 +247,7 @@
- GitHub
+ View Code on GitHub
@@ -541,7 +541,6 @@
What's Next?#
-
TODO
Content about how the user progresses after this demo.