From b538e919112d925ee2f3e5bb8a6826a48c1b4f52 Mon Sep 17 00:00:00 2001 From: Rigel <64812637+rsgel@users.noreply.github.com> Date: Fri, 11 Oct 2024 17:00:06 -0700 Subject: [PATCH 01/11] Initial batch of AKS Entra changes --- articles/chaos-studio/TOC.yml | 2 + .../chaos-studio-aks-authentication.md | 196 ++++++++++++++++++ .../chaos-studio-fault-providers.md | 50 ++++- .../chaos-studio-tutorial-aks-cli.md | 11 +- .../chaos-studio-tutorial-aks-portal.md | 1 - articles/chaos-studio/troubleshooting.md | 10 + 6 files changed, 263 insertions(+), 7 deletions(-) create mode 100644 articles/chaos-studio/chaos-studio-aks-authentication.md diff --git a/articles/chaos-studio/TOC.yml b/articles/chaos-studio/TOC.yml index 74f518a09d..5026edf241 100644 --- a/articles/chaos-studio/TOC.yml +++ b/articles/chaos-studio/TOC.yml @@ -67,6 +67,8 @@ href: chaos-studio-tutorial-aks-portal.md - name: CLI href: chaos-studio-tutorial-aks-cli.md + - name: Using Microsoft Entra authentication with Chaos Mesh + href: chaos-studio-aks-authentication.md - name: Dynamic targeting items: - name: Portal diff --git a/articles/chaos-studio/chaos-studio-aks-authentication.md b/articles/chaos-studio/chaos-studio-aks-authentication.md new file mode 100644 index 0000000000..83f8c3858c --- /dev/null +++ b/articles/chaos-studio/chaos-studio-aks-authentication.md @@ -0,0 +1,196 @@ +--- +title: Using Microsoft Entra authentication with Chaos Studio AKS faults +description: Learn about the different ways for Chaos Studio to authenticate with your AKS cluster. +services: chaos-studio +author: rsgel +ms.topic: article +ms.date: 10/11/2024 +ms.author: carlsonr +ms.reviewer: abbyweisberg +ms.service: azure-chaos-studio +--- + +# Using Microsoft Entra authentication with Chaos Studio AKS faults + +## Overview + +Azure Chaos Studio integrates with Chaos Mesh to run faults on Azure Kubernetes Service (AKS) clusters, like removing pods, CPU stress, network disruption, and more. 
Depending on your configuration and preferences, you can run these faults with either of two authentication types, local accounts or AKS-Managed Microsoft Entra authentication: + +* Kubernetes local accounts are stored in the Kubernetes API server and can be used to authenticate and authorize requests to the cluster. Learn more about local accounts at this page: [Manage local accounts](/azure/aks/manage-local-accounts-managed-azure-ad). +* AKS-Managed Microsoft Entra authentication allows you to sign in and manage permissions for your cluster using Microsoft Entra credentials and Azure RBAC. Learn how to [Enable AKS-Managed Microsoft Entra authentication](/azure/aks/enable-authentication-microsoft-entra-id). + +Chaos Studio previously only supported using Chaos Mesh with local accounts, but version 2.2 of every AKS fault now supports both local accounts and Microsoft Entra authentication. + +## Updating targets + +Before using the updated faults, you need to update the target, which represents your AKS cluster in Chaos Studio's resource model. You can do this in one of two ways: +- Disable and re-enable the target resource. + - To do this in the Azure portal, visit the **Targets** pane in the Chaos Studio portal interface, select the relevant AKS cluster(s), and select **Disable targets**. Wait 1-2 minutes or for a confirmation notification, then select **Enable targets** > **Enable service-direct targets** and go through the Review & Create screen. +- Update the enabled capabilities. + - To do this in the Azure portal, visit the **Targets** pane in Chaos Studio, find the AKS cluster(s), select **Manage actions**, and make sure all of the capabilities are enabled. Select **Save** to finalize the update. 
+ +If you're using the API or command-line, follow the instructions at [Create a chaos experiment that uses a Chaos Mesh fault with the Azure CLI](chaos-studio-tutorial-aks-cli.md#enable-chaos-studio-on-your-aks-cluster) to ensure the latest available capabilities are enabled. + +## Creating a new experiment + +When you create a new experiment that uses AKS Chaos Mesh faults in the Azure portal, you may see two versions of each fault, such as "AKS Chaos Mesh DNS Chaos" and "AKS Chaos Mesh DNS Chaos (deprecated)". Select the first option, not the deprecated option. + +If you don't see your AKS cluster as a possible target after selecting the fault, you may need to enable the new fault version on the cluster. Visit the Targets page, find your AKS cluster and select **Manage actions**, then make sure all of the capabilities are selected before selecting **Save**. + +Follow the [Create a chaos experiment that uses a Chaos Mesh fault to kill AKS pods with the Azure portal](chaos-studio-tutorial-aks-portal.md) tutorial to create an experiment. + +## Updating an existing experiment + +### Azure portal +1. Open an experiment that contains at least one AKS Chaos Mesh fault. +1. Select **Edit** on the fault and copy the `jsonSpec` parameter value to your clipboard. +1. Open the fault selection dropdown and select the version of your desired fault without the "(deprecated)" marking. +1. Paste the `jsonSpec` from your clipboard into the parameter field. +1. Save the fault and the experiment. + +### Command-line +1. Use the [REST API](chaos-studio-samples-rest-api.md) to get the experiment JSON. 
+ ```azurecli-interactive + az rest --method get --url "https://management.azure.com/subscriptions/$SUBSCRIPTION_ID/resourceGroups/$RESOURCE_GROUP/providers/Microsoft.Chaos/experiments/$EXPERIMENT_NAME?api-version=2024-01-01" + ``` + ```json + { + "id": "/subscriptions/$SUBSCRIPTION_ID/resourceGroups/$RESOURCE_GROUP/providers/Microsoft.Chaos/experiments/$EXPERIMENT_NAME", + "identity": { + "principalId": "30221089-383b-433f-8439-4eee8b011b1b", + "tenantId": "72f988bf-86f1-41af-91ab-2d7cd011db47", + "type": "SystemAssigned" + }, + "location": "eastus", + "name": "aks-private-1", + "properties": { + "selectors": [ + { + "id": "1925533b-5a3d-4733-a86d-167ab82f1931", + "targets": [ + { + "id": "/subscriptions/$SUBSCRIPTION_ID/resourceGroups/$RESOURCE_GROUP/providers/Microsoft.ContainerService/managedClusters/$AKS_CLUSTER_NAME/providers/Microsoft.Chaos/targets/Microsoft-AzureKubernetesServiceChaosMesh", + "type": "ChaosTarget" + } + ], + "type": "List" + } + ], + "steps": [ + { + "branches": [ + { + "actions": [ + { + "duration": "PT10M", + "name": "urn:csci:microsoft:azureKubernetesServiceChaosMesh:podChaos/2.1", + "parameters": [ + { + "key": "jsonSpec", + "value": "{\"action\":\"pod-failure\",\"mode\":\"all\",\"selector\":{\"namespaces\":[\"default\"]}}" + } + ], + "selectorId": "1925533b-5a3d-4733-a86d-167ab82f1931", + "type": "continuous" + } + ], + "name": "Branch 1" + } + ], + "name": "Step 1" + } + ] + }, + "systemData": { + "createdAt": "2023-07-05T19:08:56.0145761+00:00", + "createdByType": "User", + "lastModifiedAt": "2024-10-08T00:00:12.701+00:00" + }, + "tags": {}, + "type": "Microsoft.Chaos/experiments" + } + ``` +1. Use a text or code editor to update the fault version for the corresponding fault(s) from 2.1 to 2.2. For example, change the line `"name": "urn:csci:microsoft:azureKubernetesServiceChaosMesh:podChaos/2.1"` to `"name": "urn:csci:microsoft:azureKubernetesServiceChaosMesh:podChaos/2.2"`. 
Save the file with a name you can reference in the next step, such as `experimentBody.json`. +1. Send the updated experiment JSON to Chaos Studio. + ```azurecli-interactive + az rest --method put --url "https://management.azure.com/subscriptions/$SUBSCRIPTION_ID/resourceGroups/$RESOURCE_GROUP/providers/Microsoft.Chaos/experiments/$EXPERIMENT_NAME?api-version=2024-01-01" --body @experimentBody.json + ``` + +## Permissions + +Chaos Studio needs permission to execute faults on your resources. + +When creating an experiment in the Azure portal, you can optionally select **Enable custom role creation and assignment** to let Chaos Studio attempt to assign the necessary permissions to the experiment's managed identity. + +If you choose not to use custom role creation, or you're not using the Azure portal, you must do **one of the following** after creating your experiment: +* Manually assign the [Azure Kubernetes Service RBAC Admin](/azure/role-based-access-control/built-in-roles/containers#azure-kubernetes-service-rbac-admin) and [Azure Kubernetes Service Cluster User Role](/azure/role-based-access-control/built-in-roles/containers#azure-kubernetes-service-cluster-user-role) roles to the experiment managed identity (system-assigned or user-assigned). +* Manually create a custom role allowing the full list of operations listed in [RBAC operations](#rbac-operations). +* Manually create a custom role allowing a partial list of the operations needed, and deploy a custom YAML file. This process is detailed in [Optional least-privilege access](#optional-least-privilege-access). 
+ +### RBAC operations + +The following RBAC operations are used for AKS Chaos Mesh faults: + +- Actions: + - Microsoft.ContainerService/managedClusters/read +- Data Actions: + - Microsoft.ContainerService/managedClusters/namespaces/read + - Microsoft.ContainerService/managedClusters/pods/read + - Microsoft.ContainerService/managedClusters/apiextensions.k8s.io/customresourcedefinitions/write + - Microsoft.ContainerService/managedClusters/apiextensions.k8s.io/customresourcedefinitions/read + - Microsoft.ContainerService/managedClusters/authorization.k8s.io/subjectaccessreviews/write + - Microsoft.ContainerService/managedClusters/rbac.authorization.k8s.io/clusterroles/read + - Microsoft.ContainerService/managedClusters/rbac.authorization.k8s.io/clusterroles/write + - Microsoft.ContainerService/managedClusters/rbac.authorization.k8s.io/clusterroles/delete + - Microsoft.ContainerService/managedClusters/rbac.authorization.k8s.io/clusterroles/bind/action + - Microsoft.ContainerService/managedClusters/rbac.authorization.k8s.io/clusterroles/escalate/action + - Microsoft.ContainerService/managedClusters/rbac.authorization.k8s.io/clusterrolebindings/read + - Microsoft.ContainerService/managedClusters/rbac.authorization.k8s.io/clusterrolebindings/write + - Microsoft.ContainerService/managedClusters/rbac.authorization.k8s.io/clusterrolebindings/delete + + +### Optional least-privilege access + +If you prefer not to grant full ClusterRole and ClusterRoleBinding read/write access to the Chaos Studio experiment identity, you can manually create the necessary role and binding for Chaos Mesh. This is necessary for Chaos Mesh to ensure the experiment has permission to target the specified tenant namespace. + +There are two steps to this optional configuration. + +1. When assigning permissions to the experiment's managed identity, use a custom role with a limited set of permissions. 
The permissions required are: + + - Actions: + - Microsoft.ContainerService/managedClusters/read + - Data Actions: + - Microsoft.ContainerService/managedClusters/namespaces/read + - Microsoft.ContainerService/managedClusters/pods/read + - Microsoft.ContainerService/managedClusters/apiextensions.k8s.io/customresourcedefinitions/write + - Microsoft.ContainerService/managedClusters/apiextensions.k8s.io/customresourcedefinitions/read + - Microsoft.ContainerService/managedClusters/authorization.k8s.io/subjectaccessreviews/write + +1. Deploy the following YAML configuration to create the role and binding. Learn more about deployments in the AKS documentation: [Deploy an Azure Kubernetes Service (AKS) cluster using Azure portal](/azure/aks/learn/quick-kubernetes-deploy-portal?tabs=azure-cli). + + ```yml + kind: ClusterRole + apiVersion: rbac.authorization.k8s.io/v1 + metadata: + name: role-cluster-manager-pdmas + rules: + - apiGroups: + - chaos-mesh.org + resources: [ "*" ] + verbs: ["get", "list", "watch", "create", "delete", "patch", "update"] + + --- + kind: ClusterRoleBinding + apiVersion: rbac.authorization.k8s.io/v1 + metadata: + name: cluster-manager-binding + namespace: {Namespace targeted by experiment} + subjects: + - kind: User + name: {CHAOS-STUDIO-EXPERIMENT-MSI-OBJECT-ID} + roleRef: + kind: ClusterRole + name: role-cluster-manager-pdmas + apiGroup: rbac.authorization.k8s.io + + ``` diff --git a/articles/chaos-studio/chaos-studio-fault-providers.md b/articles/chaos-studio/chaos-studio-fault-providers.md index 294c80e185..6929f20829 100644 --- a/articles/chaos-studio/chaos-studio-fault-providers.md +++ b/articles/chaos-studio/chaos-studio-fault-providers.md @@ -24,8 +24,9 @@ More information about role assignments can be found on the [Azure built-in role | Microsoft.Compute/virtualMachineScaleSets (agent-based) | Microsoft-Agent | [Reader](/azure/role-based-access-control/built-in-roles#reader) | | Microsoft.Compute/virtualMachines (service-direct) | 
Microsoft-VirtualMachine | [Virtual Machine Contributor](/azure/role-based-access-control/built-in-roles#virtual-machine-contributor) | | Microsoft.Compute/virtualMachineScaleSets (service-direct) | Microsoft-VirtualMachineScaleSet | [Virtual Machine Contributor](/azure/role-based-access-control/built-in-roles#virtual-machine-contributor) | -| Microsoft.ContainerService/managedClusters (service-direct) | Microsoft-AzureKubernetesServiceChaosMesh | [Azure Kubernetes Service Cluster Admin Role](/azure/role-based-access-control/built-in-roles#azure-kubernetes-service-cluster-admin-role) | -| Microsoft.DocumentDb/databaseAccounts (Cosmos DB, service-direct) | Microsoft-Cosmos DB | [Azure Cosmos DB Operator](/azure/role-based-access-control/built-in-roles#cosmos-db-operator) | +| Microsoft.ContainerService/managedClusters (service-direct) | Microsoft-AzureKubernetesServiceChaosMesh (fault version 2.2, recommended) | [Azure Kubernetes Service RBAC Admin](/azure/role-based-access-control/built-in-roles#azure-kubernetes-service-rbac-admin) and [Azure Kubernetes Service Cluster User Role](/azure/role-based-access-control/built-in-roles#azure-kubernetes-service-cluster-user-role) | +| Microsoft.ContainerService/managedClusters (service-direct) | Microsoft-AzureKubernetesServiceChaosMesh (fault version 2.1 with Kubernetes local accounts only) | [Azure Kubernetes Service Cluster Admin Role](/azure/role-based-access-control/built-in-roles#azure-kubernetes-service-cluster-admin-role) | +| Microsoft.DocumentDb/databaseAccounts (Cosmos DB, service-direct) | Microsoft-Cosmos DB | [Azure Cosmos DB Operator](/azure/role-based-access-control/built-in-roles#cosmos-db-operator) | | Microsoft.Insights/autoscalesettings (service-direct) | Microsoft-AutoScaleSettings | [Web Plan Contributor](/azure/role-based-access-control/built-in-roles#web-plan-contributor) | | Microsoft.KeyVault/vaults (service-direct) | Microsoft-KeyVault | [Azure Key Vault 
Contributor](/azure/role-based-access-control/built-in-roles#key-vault-contributor) | | Microsoft.Network/networkSecurityGroups (service-direct) | Microsoft-NetworkSecurityGroup | [Network Contributor](/azure/role-based-access-control/built-in-roles#network-contributor) | @@ -33,3 +34,48 @@ More information about role assignments can be found on the [Azure built-in role | Microsoft.ServiceBus/namespaces (service-direct) | Microsoft-ServiceBus | [Azure Service Bus Data Owner](/azure/role-based-access-control/built-in-roles#azure-service-bus-data-owner) | | Microsoft.EventHub/namespaces (service-direct) | Microsoft-EventHub | [Azure Event Hubs Data Owner](/azure/role-based-access-control/built-in-roles#azure-event-hubs-data-owner) | | Microsoft.LoadTestService/loadtests (service-direct) | Microsoft-AzureLoadTest | [Load Test Contributor](/azure/role-based-access-control/built-in-roles#load-test-contributor) | + +## Custom role operations + +If you prefer not to use the listed built-in roles, you can create custom roles and assign the exact operations needed for each fault. There are two ways to do this. + +While creating an experiment within the Azure portal, you can select **"Enable custom role creation and assignment"** in the **Permissions** tab to allow Chaos Studio to deploy a custom role with the necessary operations. + +Alternatively, if you aren't using the Azure portal or you prefer to manage operations individually, you can find the operations needed for each fault and manually assign them to a custom role. 
To see which operations are needed for a Chaos Studio fault, run the following Azure CLI REST command: + +```azurecli-interactive +az rest --method get --url "https://management.azure.com/subscriptions/$SUBSCRIPTION_ID/providers/Microsoft.Chaos/locations/eastus/targetTypes/$TARGET_TYPE/capabilityTypes/$CAPABILITY_NAME?api-version=2024-01-01" +``` + +As an example, the following output shows `properties.azureRbacActions` and `properties.azureRbacDataActions` for the Cosmos DB Failover fault. +```json +> az rest --method get --url "https://management.azure.com/subscriptions/$SUBSCRIPTION_ID/providers/Microsoft.Chaos/locations/eastus/targetTypes/Microsoft-CosmosDB/capabilityTypes/Failover-1.0?api-version=2024-01-01" +{ + "id": "/subscriptions/$SUBSCRIPTION_ID/providers/Microsoft.Chaos/locations/eastus/targetTypes/CosmosDB/capabilityTypes/Failover-1.0", + "location": "eastus", + "name": "Failover-1.0", + "properties": { + "azureRbacActions": [ + "Microsoft.DocumentDB/databaseAccounts/read", + "Microsoft.DocumentDB/databaseAccounts/failoverPriorityChange/action" + ], + "azureRbacDataActions": null, + "description": "", + "displayName": "", + "kind": "Fault", + "parametersSchema": "https://schema-tc.eastus.chaos-prod.azure.com/targetTypes/Microsoft-CosmosDB/capabilityTypes/Failover-1.0/parametersSchema.json", + "publisher": "Microsoft", + "runtimeProperties": { + "kind": "Continuous" + }, + "targetType": "CosmosDB", + "urn": "urn:csci:microsoft:cosmosDB:failover/1.0" + }, + "systemData": { + "createdAt": "2024-10-10T17:28:41.7377834+00:00", + "createdByType": "Application", + "lastModifiedAt": "2024-10-10T17:28:41.7377834+00:00" + }, + "type": "Microsoft.Chaos/locations/targetTypes/capabilityTypes" +} +``` \ No newline at end of file diff --git a/articles/chaos-studio/chaos-studio-tutorial-aks-cli.md b/articles/chaos-studio/chaos-studio-tutorial-aks-cli.md index febc45a493..18c016462b 100644 --- a/articles/chaos-studio/chaos-studio-tutorial-aks-cli.md +++ 
b/articles/chaos-studio/chaos-studio-tutorial-aks-cli.md @@ -26,7 +26,6 @@ Chaos Studio uses [Chaos Mesh](https://chaos-mesh.org/), a free, open-source cha * You can use Chaos Mesh faults with private clusters by configuring [VNet Injection in Chaos Studio](chaos-studio-private-networking.md). Any commands issued to the private cluster, including the steps in this article to set up Chaos Mesh, need to follow the [private cluster guidance](/azure/aks/private-clusters). Recommended methods include connecting from a VM in the same virtual network or using the [AKS command invoke](/azure/aks/access-private-cluster) feature. * AKS Chaos Mesh faults are only supported on Linux node pools. -* Currently, Chaos Mesh faults don't work if the AKS cluster has [local accounts disabled](/azure/aks/manage-local-accounts-managed-azure-ad). * If your AKS cluster is configured to only allow authorized IP ranges, you need to allow Chaos Studio's IP ranges. You can find them by querying the `ChaosStudio` [service tag with the Service Tag Discovery API or downloadable JSON files](/azure/virtual-network/service-tags-overview). ## Open Azure Cloud Shell @@ -170,7 +169,7 @@ Now you can create your experiment. A chaos experiment defines the actions you w "value": "{\"action\":\"pod-failure\",\"mode\":\"all\",\"selector\":{\"namespaces\":[\"default\"]}}" } ], - "name": "urn:csci:microsoft:azureKubernetesServiceChaosMesh:podChaos/2.1" + "name": "urn:csci:microsoft:azureKubernetesServiceChaosMesh:podChaos/2.2" } ] } @@ -210,13 +209,17 @@ When you create a chaos experiment, Chaos Studio creates a system-assigned manag az rest --method get --uri https://management.azure.com/subscriptions/$SUBSCRIPTION_ID/resourceGroups/$RESOURCE_GROUP/providers/Microsoft.Chaos/experiments/$EXPERIMENT_NAME?api-version=2024-01-01 ``` -2. Give the experiment access to your resources by using the following command. Replace `$EXPERIMENT_PRINCIPAL_ID` with the principal ID from the previous step. 
Replace `$SUBSCRIPTION_ID`, `$resourceGroupName`, and `$AKS_CLUSTER_NAME` with the relevant strings of the AKS cluster. +2. Give the experiment access to your resources by using the following commands. Replace `$EXPERIMENT_PRINCIPAL_ID` with the principal ID from the previous step. Replace `$SUBSCRIPTION_ID`, `$resourceGroupName`, and `$AKS_CLUSTER_NAME` with the relevant strings of the AKS cluster. ```azurecli-interactive -az role assignment create --role "Azure Kubernetes Service Cluster Admin Role" --assignee-principal-type "ServicePrincipal" --assignee-object-id $EXPERIMENT_PRINCIPAL_ID --scope subscriptions/$SUBSCRIPTION_ID/resourceGroups/$resourceGroupName/providers/Microsoft.ContainerService/managedClusters/$AKS_CLUSTER_NAME +az role assignment create --role "Azure Kubernetes Service RBAC Admin" --assignee-principal-type "ServicePrincipal" --assignee-object-id $EXPERIMENT_PRINCIPAL_ID --scope subscriptions/$SUBSCRIPTION_ID/resourceGroups/$resourceGroupName/providers/Microsoft.ContainerService/managedClusters/$AKS_CLUSTER_NAME + +az role assignment create --role "Azure Kubernetes Service Cluster User Role" --assignee-principal-type "ServicePrincipal" --assignee-object-id $EXPERIMENT_PRINCIPAL_ID --scope subscriptions/$SUBSCRIPTION_ID/resourceGroups/$resourceGroupName/providers/Microsoft.ContainerService/managedClusters/$AKS_CLUSTER_NAME ``` +If you prefer to create custom roles instead of the built-in AKS roles, follow the instructions on the [Supported resource types and role assignments for Chaos Studio](chaos-studio-fault-providers.md) page to list the role-based access control operations needed for a specific fault and add them to a manually created custom role. + ## Run your experiment You're now ready to run your experiment. To see the effect, we recommend that you open your AKS cluster overview and go to **Insights** in a separate browser tab. Live data for the **Active Pod Count** shows the effect of running your experiment. 
diff --git a/articles/chaos-studio/chaos-studio-tutorial-aks-portal.md b/articles/chaos-studio/chaos-studio-tutorial-aks-portal.md index 1c069b4ae2..f8e80d496f 100644 --- a/articles/chaos-studio/chaos-studio-tutorial-aks-portal.md +++ b/articles/chaos-studio/chaos-studio-tutorial-aks-portal.md @@ -25,7 +25,6 @@ Chaos Studio uses [Chaos Mesh](https://chaos-mesh.org/), a free, open-source cha * You can use Chaos Mesh faults with private clusters by configuring [VNet Injection in Chaos Studio](chaos-studio-private-networking.md). Any commands issued to the private cluster, including the steps in this article to set up Chaos Mesh, need to follow the [private cluster guidance](/azure/aks/private-clusters). Recommended methods include connecting from a VM in the same virtual network or using the [AKS command invoke](/azure/aks/access-private-cluster) feature. * AKS Chaos Mesh faults are only supported on Linux node pools. -* Currently, Chaos Mesh faults don't work if the AKS cluster has [local accounts disabled](/azure/aks/manage-local-accounts-managed-azure-ad). * If your AKS cluster is configured to only allow authorized IP ranges, you need to allow Chaos Studio's IP ranges. You can find them by querying the `ChaosStudio` [service tag with the Service Tag Discovery API or downloadable JSON files](/azure/virtual-network/service-tags-overview). ## Set up Chaos Mesh on your AKS cluster diff --git a/articles/chaos-studio/troubleshooting.md b/articles/chaos-studio/troubleshooting.md index 621a3c911e..86c9b4a1b1 100644 --- a/articles/chaos-studio/troubleshooting.md +++ b/articles/chaos-studio/troubleshooting.md @@ -148,6 +148,16 @@ After starting an experiment, you might see an error message like: `The long-run To resolve this error, ensure that the experiment's system-assigned or user-assigned managed identity has permission to all resources in the experiment. 
Learn more about permissions here: [Permissions and security in Azure Chaos Studio](chaos-studio-permissions-security.md). For example, if the experiment targets a virtual machine, navigate to the virtual machine's identity page and assign the "Virtual Machine Contributor" role to the experiment's managed identity. +### My AKS Chaos Mesh experiment failed + +There are several common errors you may encounter when using AKS Chaos Mesh faults. + +| Error message | Suggested action | +| --- | --- | +| Getting static credential is not allowed because this cluster is set to disable local accounts. | Update the experiment to use version 2.2 of the AKS Chaos Mesh faults. As of version 2.2, the faults can use either Kubernetes local accounts or Microsoft Entra authentication. Learn more about these authentication types in [Using Microsoft Entra authentication with Chaos Studio AKS faults](chaos-studio-aks-authentication.md). | +| The Chaos Mesh experiment could not be started because the provided configuration was invalid | Ensure the `jsonSpec` contains all the required fields. | +| Chaos Mesh version 'x.x.x' is not currently supported by Chaos Studio | Verify the installed version against the [Azure Chaos Studio version compatibility](chaos-studio-versions.md) page and submit a [feature request](https://feedback.azure.com/d365community/forum/18f8dc01-dc37-ec11-b6e6-000d3a9c7101) if the desired version isn't listed. | + ## Problems when setting up a managed identity ### When I try to add a system-assigned/user-assigned managed identity to my existing experiment, it fails to save. 
From c1cca9782c52ca58d1832c077766d0cc6eb3a898 Mon Sep 17 00:00:00 2001 From: Rigel <64812637+rsgel@users.noreply.github.com> Date: Fri, 11 Oct 2024 17:38:42 -0700 Subject: [PATCH 02/11] fault library version changes --- .../chaos-studio-fault-library.md | 48 +++++++++---------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/articles/chaos-studio/chaos-studio-fault-library.md b/articles/chaos-studio/chaos-studio-fault-library.md index 490713105b..7f1e2a341d 100644 --- a/articles/chaos-studio/chaos-studio-fault-library.md +++ b/articles/chaos-studio/chaos-studio-fault-library.md @@ -1025,12 +1025,12 @@ Currently, a maximum of 4 process names can be listed in the processNames parame | Property | Value | |-|-| -| Capability name | NetworkChaos-2.1 | +| Capability name | NetworkChaos-2.2 | | Target type | Microsoft-AzureKubernetesServiceChaosMesh | | Supported node pool OS types | Linux | | Description | Causes a network fault available through [Chaos Mesh](https://chaos-mesh.org/docs/simulate-network-chaos-on-kubernetes/) to run against your Azure Kubernetes Service (AKS) cluster. Useful for re-creating AKS incidents that result from network outages, delays, duplications, loss, and corruption. | | Prerequisites | The AKS cluster must [have Chaos Mesh deployed](chaos-studio-tutorial-aks-portal.md). | -| Urn | urn:csci:microsoft:azureKubernetesServiceChaosMesh:networkChaos/2.1 | +| Urn | urn:csci:microsoft:azureKubernetesServiceChaosMesh:networkChaos/2.2 | | Parameters (key, value) | | | jsonSpec | A JSON-formatted Chaos Mesh spec that uses the [NetworkChaos kind](https://chaos-mesh.org/docs/simulate-network-chaos-on-kubernetes/#create-experiments-using-the-yaml-files). You can use a YAML-to-JSON converter like [Convert YAML To JSON](https://www.convertjson.com/yaml-to-json.htm) to convert the Chaos Mesh YAML to JSON and minify it. Use single-quotes within the JSON or escape the quotes with a backslash character. 
Only include the YAML under the `jsonSpec` property. Don't include information like metadata and kind. Specifying duration within the `jsonSpec` isn't necessary, but it's used if available. | @@ -1042,7 +1042,7 @@ Currently, a maximum of 4 process names can be listed in the processNames parame "actions": [ { "type": "continuous", - "name": "urn:csci:microsoft:azureKubernetesServiceChaosMesh:networkChaos/2.1", + "name": "urn:csci:microsoft:azureKubernetesServiceChaosMesh:networkChaos/2.2", "parameters": [ { "key": "jsonSpec", @@ -1059,12 +1059,12 @@ Currently, a maximum of 4 process names can be listed in the processNames parame | Property | Value | |-|-| -| Capability name | PodChaos-2.1 | +| Capability name | PodChaos-2.2 | | Target type | Microsoft-AzureKubernetesServiceChaosMesh | | Supported node pool OS types | Linux | | Description | Causes a pod fault available through [Chaos Mesh](https://chaos-mesh.org/docs/simulate-pod-chaos-on-kubernetes/) to run against your AKS cluster. Useful for re-creating AKS incidents that are a result of pod failures or container issues. | | Prerequisites | The AKS cluster must [have Chaos Mesh deployed](chaos-studio-tutorial-aks-portal.md). | -| Urn | urn:csci:microsoft:azureKubernetesServiceChaosMesh:podChaos/2.1 | +| Urn | urn:csci:microsoft:azureKubernetesServiceChaosMesh:podChaos/2.2 | | Parameters (key, value) | | | jsonSpec | A JSON-formatted Chaos Mesh spec that uses the [PodChaos kind](https://chaos-mesh.org/docs/simulate-pod-chaos-on-kubernetes/#create-experiments-using-yaml-configuration-files). You can use a YAML-to-JSON converter like [Convert YAML To JSON](https://www.convertjson.com/yaml-to-json.htm) to convert the Chaos Mesh YAML to JSON and minify it. Use single-quotes within the JSON or escape the quotes with a backslash character. Only include the YAML under the `jsonSpec` property. Don't include information like metadata and kind. 
Specifying duration within the `jsonSpec` isn't necessary, but it's used if available. | @@ -1076,7 +1076,7 @@ Currently, a maximum of 4 process names can be listed in the processNames parame "actions": [ { "type": "continuous", - "name": "urn:csci:microsoft:azureKubernetesServiceChaosMesh:podChaos/2.1", + "name": "urn:csci:microsoft:azureKubernetesServiceChaosMesh:podChaos/2.2", "parameters": [ { "key": "jsonSpec", @@ -1093,12 +1093,12 @@ Currently, a maximum of 4 process names can be listed in the processNames parame | Property | Value | |-|-| -| Capability name | StressChaos-2.1 | +| Capability name | StressChaos-2.2 | | Target type | Microsoft-AzureKubernetesServiceChaosMesh | | Supported node pool OS types | Linux | | Description | Causes a stress fault available through [Chaos Mesh](https://chaos-mesh.org/docs/simulate-heavy-stress-on-kubernetes/) to run against your AKS cluster. Useful for re-creating AKS incidents because of stresses over a collection of pods, for example, due to high CPU or memory consumption. | | Prerequisites | The AKS cluster must [have Chaos Mesh deployed](chaos-studio-tutorial-aks-portal.md). | -| Urn | urn:csci:microsoft:azureKubernetesServiceChaosMesh:stressChaos/2.1 | +| Urn | urn:csci:microsoft:azureKubernetesServiceChaosMesh:stressChaos/2.2 | | Parameters (key, value) | | | jsonSpec | A JSON-formatted Chaos Mesh spec that uses the [StressChaos kind](https://chaos-mesh.org/docs/simulate-heavy-stress-on-kubernetes/#create-experiments-using-the-yaml-file). You can use a YAML-to-JSON converter like [Convert YAML To JSON](https://www.convertjson.com/yaml-to-json.htm) to convert the Chaos Mesh YAML to JSON and minify it. Use single-quotes within the JSON or escape the quotes with a backslash character. Only include the YAML under the `jsonSpec` property. Don't include information like metadata and kind. Specifying duration within the `jsonSpec` isn't necessary, but it's used if available. 
| @@ -1110,7 +1110,7 @@ Currently, a maximum of 4 process names can be listed in the processNames parame "actions": [ { "type": "continuous", - "name": "urn:csci:microsoft:azureKubernetesServiceChaosMesh:stressChaos/2.1", + "name": "urn:csci:microsoft:azureKubernetesServiceChaosMesh:stressChaos/2.2", "parameters": [ { "key": "jsonSpec", @@ -1127,12 +1127,12 @@ Currently, a maximum of 4 process names can be listed in the processNames parame | Property | Value | |-|-| -| Capability name | IOChaos-2.1 | +| Capability name | IOChaos-2.2 | | Target type | Microsoft-AzureKubernetesServiceChaosMesh | | Supported node pool OS types | Linux | | Description | Causes an IO fault available through [Chaos Mesh](https://chaos-mesh.org/docs/simulate-io-chaos-on-kubernetes/) to run against your AKS cluster. Useful for re-creating AKS incidents because of IO delays and read/write failures when you use IO system calls such as `open`, `read`, and `write`. | | Prerequisites | The AKS cluster must [have Chaos Mesh deployed](chaos-studio-tutorial-aks-portal.md). | -| Urn | urn:csci:microsoft:azureKubernetesServiceChaosMesh:IOChaos/2.1 | +| Urn | urn:csci:microsoft:azureKubernetesServiceChaosMesh:IOChaos/2.2 | | Parameters (key, value) | | | jsonSpec | A JSON-formatted Chaos Mesh spec that uses the [IOChaos kind](https://chaos-mesh.org/docs/simulate-io-chaos-on-kubernetes/#create-experiments-using-the-yaml-files). You can use a YAML-to-JSON converter like [Convert YAML To JSON](https://www.convertjson.com/yaml-to-json.htm) to convert the Chaos Mesh YAML to JSON and minify it. Use single-quotes within the JSON or escape the quotes with a backslash character. Only include the YAML under the `jsonSpec` property. Don't include information like metadata and kind. Specifying duration within the `jsonSpec` isn't necessary, but it's used if available. 
| @@ -1144,7 +1144,7 @@ Currently, a maximum of 4 process names can be listed in the processNames parame "actions": [ { "type": "continuous", - "name": "urn:csci:microsoft:azureKubernetesServiceChaosMesh:IOChaos/2.1", + "name": "urn:csci:microsoft:azureKubernetesServiceChaosMesh:IOChaos/2.2", "parameters": [ { "key": "jsonSpec", @@ -1161,12 +1161,12 @@ Currently, a maximum of 4 process names can be listed in the processNames parame | Property | Value | |-|-| -| Capability name | TimeChaos-2.1 | +| Capability name | TimeChaos-2.2 | | Target type | Microsoft-AzureKubernetesServiceChaosMesh | | Supported node pool OS types | Linux | | Description | Causes a change in the system clock on your AKS cluster by using [Chaos Mesh](https://chaos-mesh.org/docs/simulate-time-chaos-on-kubernetes/). Useful for re-creating AKS incidents that result from distributed systems falling out of sync, missing/incorrect leap year/leap second logic, and more. | | Prerequisites | The AKS cluster must [have Chaos Mesh deployed](chaos-studio-tutorial-aks-portal.md). | -| Urn | urn:csci:microsoft:azureKubernetesServiceChaosMesh:timeChaos/2.1 | +| Urn | urn:csci:microsoft:azureKubernetesServiceChaosMesh:timeChaos/2.2 | | Parameters (key, value) | | | jsonSpec | A JSON-formatted Chaos Mesh spec that uses the [TimeChaos kind](https://chaos-mesh.org/docs/simulate-time-chaos-on-kubernetes/#create-experiments-using-the-yaml-file). You can use a YAML-to-JSON converter like [Convert YAML To JSON](https://www.convertjson.com/yaml-to-json.htm) to convert the Chaos Mesh YAML to JSON and minify it. Use single-quotes within the JSON or escape the quotes with a backslash character. Only include the YAML under the `jsonSpec` property. Don't include information like metadata and kind. Specifying duration within the `jsonSpec` isn't necessary, but it's used if available. 
| @@ -1178,7 +1178,7 @@ Currently, a maximum of 4 process names can be listed in the processNames parame "actions": [ { "type": "continuous", - "name": "urn:csci:microsoft:azureKubernetesServiceChaosMesh:timeChaos/2.1", + "name": "urn:csci:microsoft:azureKubernetesServiceChaosMesh:timeChaos/2.2", "parameters": [ { "key": "jsonSpec", @@ -1195,12 +1195,12 @@ Currently, a maximum of 4 process names can be listed in the processNames parame | Property | Value | |-|-| -| Capability name | KernelChaos-2.1 | +| Capability name | KernelChaos-2.2 | | Target type | Microsoft-AzureKubernetesServiceChaosMesh | | Supported node pool OS types | Linux | | Description | Causes a kernel fault available through [Chaos Mesh](https://chaos-mesh.org/docs/simulate-kernel-chaos-on-kubernetes/) to run against your AKS cluster. Useful for re-creating AKS incidents because of Linux kernel-level errors, such as a mount failing or memory not being allocated. | | Prerequisites | The AKS cluster must [have Chaos Mesh deployed](chaos-studio-tutorial-aks-portal.md). | -| Urn | urn:csci:microsoft:azureKubernetesServiceChaosMesh:kernelChaos/2.1 | +| Urn | urn:csci:microsoft:azureKubernetesServiceChaosMesh:kernelChaos/2.2 | | Parameters (key, value) | | | jsonSpec | A JSON-formatted Chaos Mesh spec that uses the [KernelChaos kind](https://chaos-mesh.org/docs/simulate-kernel-chaos-on-kubernetes/#configuration-file). You can use a YAML-to-JSON converter like [Convert YAML To JSON](https://www.convertjson.com/yaml-to-json.htm) to convert the Chaos Mesh YAML to JSON and minify it. Use single-quotes within the JSON or escape the quotes with a backslash character. Only include the YAML under the `jsonSpec` property. Don't include information like metadata and kind. Specifying duration within the `jsonSpec` isn't necessary, but it's used if available. 
| @@ -1212,7 +1212,7 @@ Currently, a maximum of 4 process names can be listed in the processNames parame "actions": [ { "type": "continuous", - "name": "urn:csci:microsoft:azureKubernetesServiceChaosMesh:kernelChaos/2.1", + "name": "urn:csci:microsoft:azureKubernetesServiceChaosMesh:kernelChaos/2.2", "parameters": [ { "key": "jsonSpec", @@ -1229,12 +1229,12 @@ Currently, a maximum of 4 process names can be listed in the processNames parame | Property | Value | |-|-| -| Capability name | HTTPChaos-2.1 | +| Capability name | HTTPChaos-2.2 | | Target type | Microsoft-AzureKubernetesServiceChaosMesh | | Supported node pool OS types | Linux | | Description | Causes an HTTP fault available through [Chaos Mesh](https://chaos-mesh.org/docs/simulate-http-chaos-on-kubernetes/) to run against your AKS cluster. Useful for re-creating incidents because of HTTP request and response processing failures, such as delayed or incorrect responses. | | Prerequisites | The AKS cluster must [have Chaos Mesh deployed](chaos-studio-tutorial-aks-portal.md). | -| Urn | urn:csci:microsoft:azureKubernetesServiceChaosMesh:httpChaos/2.1 | +| Urn | urn:csci:microsoft:azureKubernetesServiceChaosMesh:httpChaos/2.2 | | Parameters (key, value) | | | jsonSpec | A JSON-formatted Chaos Mesh spec that uses the [HTTPChaos kind](https://chaos-mesh.org/docs/simulate-http-chaos-on-kubernetes/#create-experiments). You can use a YAML-to-JSON converter like [Convert YAML To JSON](https://www.convertjson.com/yaml-to-json.htm) to convert the Chaos Mesh YAML to JSON and minify it. Use single-quotes within the JSON or escape the quotes with a backslash character. Only include the YAML under the `jsonSpec` property. Don't include information like metadata and kind. Specifying duration within the `jsonSpec` isn't necessary, but it's used if available. 
| @@ -1246,7 +1246,7 @@ Currently, a maximum of 4 process names can be listed in the processNames parame "actions": [ { "type": "continuous", - "name": "urn:csci:microsoft:azureKubernetesServiceChaosMesh:httpChaos/2.1", + "name": "urn:csci:microsoft:azureKubernetesServiceChaosMesh:httpChaos/2.2", "parameters": [ { "key": "jsonSpec", @@ -1263,12 +1263,12 @@ Currently, a maximum of 4 process names can be listed in the processNames parame | Property | Value | |-|-| -| Capability name | DNSChaos-2.1 | +| Capability name | DNSChaos-2.2 | | Target type | Microsoft-AzureKubernetesServiceChaosMesh | | Supported node pool OS types | Linux | | Description | Causes a DNS fault available through [Chaos Mesh](https://chaos-mesh.org/docs/simulate-dns-chaos-on-kubernetes/) to run against your AKS cluster. Useful for re-creating incidents because of DNS failures. | | Prerequisites | The AKS cluster must [have Chaos Mesh deployed](chaos-studio-tutorial-aks-portal.md) and the [DNS service must be installed](https://chaos-mesh.org/docs/simulate-dns-chaos-on-kubernetes/#deploy-chaos-dns-service). | -| Urn | urn:csci:microsoft:azureKubernetesServiceChaosMesh:dnsChaos/2.1 | +| Urn | urn:csci:microsoft:azureKubernetesServiceChaosMesh:dnsChaos/2.2 | | Parameters (key, value) | | | jsonSpec | A JSON-formatted Chaos Mesh spec that uses the [DNSChaos kind](https://chaos-mesh.org/docs/simulate-dns-chaos-on-kubernetes/#create-experiments-using-the-yaml-file). You can use a YAML-to-JSON converter like [Convert YAML To JSON](https://www.convertjson.com/yaml-to-json.htm) to convert the Chaos Mesh YAML to JSON and minify it. Use single-quotes within the JSON or escape the quotes with a backslash character. Only include the YAML under the `jsonSpec` property. Don't include information like metadata and kind. Specifying duration within the `jsonSpec` isn't necessary, but it's used if available. 
| @@ -1280,7 +1280,7 @@ Currently, a maximum of 4 process names can be listed in the processNames parame "actions": [ { "type": "continuous", - "name": "urn:csci:microsoft:azureKubernetesServiceChaosMesh:dnsChaos/2.1", + "name": "urn:csci:microsoft:azureKubernetesServiceChaosMesh:dnsChaos/2.2", "parameters": [ { "key": "jsonSpec", From 69dd24f813751f7bd2a9839980c7d276dc4da766 Mon Sep 17 00:00:00 2001 From: Rigel <64812637+rsgel@users.noreply.github.com> Date: Tue, 15 Oct 2024 09:26:40 -0700 Subject: [PATCH 03/11] add blurb about targets --- articles/chaos-studio/chaos-studio-aks-authentication.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/articles/chaos-studio/chaos-studio-aks-authentication.md b/articles/chaos-studio/chaos-studio-aks-authentication.md index 83f8c3858c..e663f0733c 100644 --- a/articles/chaos-studio/chaos-studio-aks-authentication.md +++ b/articles/chaos-studio/chaos-studio-aks-authentication.md @@ -23,7 +23,11 @@ Chaos Studio previously only supported using Chaos Mesh with local accounts, but ## Updating targets -Before using the updated faults, you need to update the target, which represents your AKS cluster in Chaos Studio's resource model. You can do this in one of two ways: +Targets (`Microsoft.Chaos/targets`) represent another Azure resource -- in this case, an AKS cluster -- in Chaos Studio's resource model, so you can easily control whether or not a certain resource is enabled for fault injection and what faults can run against it. + +If you're onboarding an AKS cluster as a new Chaos Studio target within the Azure portal, the new fault versions will automatically be available. + +If you want to use the new fault version on an existing AKS target, you need to update the target. You can do this in one of two ways: - Disable and re-enable the target resource. 
- To do this in the Azure portal, visit the **Targets** pane in the Chaos Studio portal interface, select the relevant AKS cluster(s), and select **Disable targets**. Wait 1-2 minutes or for a confirmation notification, then select **Enable targets** > **Enable service-direct targets** and go through the Review & Create screen. - Update the enabled capabilities. From ce41acf7e2d752ef20f183ed68ebc30aec941e28 Mon Sep 17 00:00:00 2001 From: Rigel <64812637+rsgel@users.noreply.github.com> Date: Tue, 15 Oct 2024 09:31:17 -0700 Subject: [PATCH 04/11] change aks fault versions --- articles/chaos-studio/experiment-examples.md | 24 ++++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/articles/chaos-studio/experiment-examples.md b/articles/chaos-studio/experiment-examples.md index 1b3881dbff..2bf79dac14 100644 --- a/articles/chaos-studio/experiment-examples.md +++ b/articles/chaos-studio/experiment-examples.md @@ -19,7 +19,7 @@ Here's an example of where you would copy and paste the Azure portal parameter i [![Screenshot that shows Azure portal parameter location.](images/azure-portal-parameter-examples.png)](images/azure-portal-parameter-examples.png#lightbox) -To save one of the "experiment.json" examples shown below, simply type *nano experiment.json* into your cloud shell, copy and paste any of the below experiment examples, save it (ctrl+o), exit nano (ctrl+x) and run the following command: +To save one of the "experiment.json" examples shown below, simply type *nano experiment.json* into your Cloud Shell, copy and paste any of the below experiment examples, save it (ctrl+o), exit nano (ctrl+x) and run the following command: ```AzCLI az rest --method put --uri https://management.azure.com/subscriptions/6b052e15-03d3-4f17-b2e1-be7f07588291/resourceGroups/exampleRG/providers/Microsoft.Chaos/experiments/exampleExperiment?api-version=2024-01-01 ``` @@ -79,7 +79,7 @@ Azure Kubernetes Service (AKS) - Network Delay "value": 
"{\"action\":\"delay\",\"mode\":\"all\",\"selector\":{\"namespaces\":[\"default\"]},\"delay\":{\"latency\":\"200ms\",\"correlation\":\"100\",\"jitter\":\"0ms\"}}" } ], - "name": "urn:csci:microsoft:azureKubernetesServiceChaosMesh:networkChaos/2.1" + "name": "urn:csci:microsoft:azureKubernetesServiceChaosMesh:networkChaos/2.2" } ] } @@ -140,7 +140,7 @@ Azure Kubernetes Service (AKS) - Pod Failure "value": "{\"action\":\"pod-failure\",\"mode\":\"all\",\"duration\":\"600s\",\"selector\":{\"namespaces\":[\"autoinstrumentationdemo\"]}}}" } ], - "name": "urn:csci:microsoft:azureKubernetesServiceChaosMesh:podChaos/2.1" + "name": "urn:csci:microsoft:azureKubernetesServiceChaosMesh:podChaos/2.2" } ] } @@ -202,7 +202,7 @@ Azure Kubernetes Service (AKS) - Memory Stress "value": "{\"mode\":\"all\",\"selector\":{\"namespaces\":[\"autoinstrumentationdemo\"]},\"stressors\":{\"memory\":{\"workers\":4,\"size\":\"95%\"}}" } ], - "name": "urn:csci:microsoft:azureKubernetesServiceChaosMesh:stressChaos/2.1" + "name": "urn:csci:microsoft:azureKubernetesServiceChaosMesh:stressChaos/2.2" } ] } @@ -264,7 +264,7 @@ Azure Kubernetes Service (AKS) - CPU Stress "value": "{\"mode\":\"all\",\"selector\":{\"namespaces\":[\"autoinstrumentationdemo\"]},\"stressors\":{\"cpu\":{\"workers\":4,\"load\":95}}}" } ], - "name": "urn:csci:microsoft:azureKubernetesServiceChaosMesh:stressChaos/2.1" + "name": "urn:csci:microsoft:azureKubernetesServiceChaosMesh:stressChaos/2.2" } ] } @@ -324,7 +324,7 @@ Azure Kubernetes Service (AKS) - Network Emulation "value": "{\"action\":\"netem\",\"mode\":\"all\",\"selector\":{\"namespaces\":[\"default\"]},\"netem\":{\"latency\":\"100ms\",\"loss\":\"0.1\",\"correlation\":\"25\"}}" } ], - "name": "urn:csci:microsoft:azureKubernetesServiceChaosMesh:networkChaos/2.1" + "name": "urn:csci:microsoft:azureKubernetesServiceChaosMesh:networkChaos/2.2" } ] } @@ -385,7 +385,7 @@ Azure Kubernetes Service (AKS) - Network Partition "value": 
"{\"action\":\"partition\",\"mode\":\"all\",\"selector\":{\"namespaces\":[\"default\"]},\"partition\":{\"direction\":\"to\"}}" } ], - "name": "urn:csci:microsoft:azureKubernetesServiceChaosMesh:networkChaos/2.1" + "name": "urn:csci:microsoft:azureKubernetesServiceChaosMesh:networkChaos/2.2" } ] } @@ -446,7 +446,7 @@ Azure Kubernetes Service (AKS) - Network Bandwidth Limitation "value": "{\"action\":\"bandwidth\",\"mode\":\"all\",\"selector\":{\"namespaces\":[\"default\"]},\"bandwidth\":{\"rate\":\"1mbps\",\"limit\":\"50mb\",\"buffer\":\"10kb\",\"peakrate\":\"1mbps\",\"minburst\":\"0\"}}" } ], - "name": "urn:csci:microsoft:azureKubernetesServiceChaosMesh:networkChaos/2.1" + "name": "urn:csci:microsoft:azureKubernetesServiceChaosMesh:networkChaos/2.2" } ] } @@ -507,7 +507,7 @@ Azure Kubernetes Service (AKS) - Network Packet Re-order "value": "{\"action\":\"reorder\",\"mode\":\"all\",\"selector\":{\"namespaces\":[\"default\"]},\"reorder\":{\"gap\":\"5\",\"reorder\":\"25\",\"correlation\":\"50\"}}" } ], - "name": "urn:csci:microsoft:azureKubernetesServiceChaosMesh:networkChaos/2.1" + "name": "urn:csci:microsoft:azureKubernetesServiceChaosMesh:networkChaos/2.2" } ] } @@ -568,7 +568,7 @@ Azure Kubernetes Service (AKS) - Network Packet Loss "value": "{\"action\":\"loss\",\"mode\":\"all\",\"selector\":{\"namespaces\":[\"default\"]},\"loss\":{\"loss\":\"10\",\"correlation\":\"25\"}}" } ], - "name": "urn:csci:microsoft:azureKubernetesServiceChaosMesh:networkChaos/2.1" + "name": "urn:csci:microsoft:azureKubernetesServiceChaosMesh:networkChaos/2.2" } ] } @@ -629,7 +629,7 @@ Azure Kubernetes Service (AKS) - Network Packet Duplication "value": "{\"action\":\"duplicate\",\"mode\":\"all\",\"selector\":{\"namespaces\":[\"default\"]},\"duplicate\":{\"duplicate\":\"50\",\"correlation\":\"50\"}}" } ], - "name": "urn:csci:microsoft:azureKubernetesServiceChaosMesh:networkChaos/2.1" + "name": "urn:csci:microsoft:azureKubernetesServiceChaosMesh:networkChaos/2.2" } ] } @@ -690,7 +690,7 @@ 
Azure Kubernetes Service (AKS) - Network Packet Corruption "value": "{\"action\":\"corrupt\",\"mode\":\"all\",\"selector\":{\"namespaces\":[\"default\"]},\"corrupt\":{\"corrupt\":\"50\",\"correlation\":\"50\"}}" } ], - "name": "urn:csci:microsoft:azureKubernetesServiceChaosMesh:networkChaos/2.1" + "name": "urn:csci:microsoft:azureKubernetesServiceChaosMesh:networkChaos/2.2" } ] } From 631549f4707d0981cb86d8d973b63d84b50a27a8 Mon Sep 17 00:00:00 2001 From: Rigel <64812637+rsgel@users.noreply.github.com> Date: Tue, 15 Oct 2024 11:03:44 -0700 Subject: [PATCH 05/11] fixing feedback nits --- articles/chaos-studio/chaos-studio-aks-authentication.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/articles/chaos-studio/chaos-studio-aks-authentication.md b/articles/chaos-studio/chaos-studio-aks-authentication.md index e663f0733c..712ef88bf4 100644 --- a/articles/chaos-studio/chaos-studio-aks-authentication.md +++ b/articles/chaos-studio/chaos-studio-aks-authentication.md @@ -23,7 +23,7 @@ Chaos Studio previously only supported using Chaos Mesh with local accounts, but ## Updating targets -Targets (`Microsoft.Chaos/targets`) represent another Azure resource -- in this case, an AKS cluster -- in Chaos Studio's resource model, so you can easily control whether or not a certain resource is enabled for fault injection and what faults can run against it. +Targets (`Microsoft.Chaos/targets`) represent another Azure resource in Chaos Studio's resource model, so you can easily control whether or not a certain resource is enabled for fault injection and what faults can run against it. In this case, the target represents an AKS cluster that you want to affect. If you're onboarding an AKS cluster as a new Chaos Studio target within the Azure portal, the new fault versions will automatically be available. @@ -48,7 +48,7 @@ Follow the [Create a chaos experiment that uses a Chaos Mesh fault to kill AKS p ### Azure portal 1. 
Open an experiment that contains at least one AKS Chaos Mesh fault. 1. Select **Edit** on the fault and copy the `jsonSpec` parameter value to your clipboard. -1. Open the fault selection dropdown and select the version of your desired fault without the "(deprecated)" marking. +1. Open the fault selection dropdown and select the version of your desired fault without the `(deprecated)` marking. 1. Paste the `jsonSpec` from your clipboard into the parameter field. 1. Save the fault and the experiment. From 97547e2fe7789745c3938a2164857c42603c7d7d Mon Sep 17 00:00:00 2001 From: Rigel <64812637+rsgel@users.noreply.github.com> Date: Wed, 16 Oct 2024 09:44:07 -0700 Subject: [PATCH 06/11] small fixes --- articles/chaos-studio/chaos-studio-aks-authentication.md | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/articles/chaos-studio/chaos-studio-aks-authentication.md b/articles/chaos-studio/chaos-studio-aks-authentication.md index 712ef88bf4..4f6ed62045 100644 --- a/articles/chaos-studio/chaos-studio-aks-authentication.md +++ b/articles/chaos-studio/chaos-studio-aks-authentication.md @@ -45,14 +45,18 @@ Follow the [Create a chaos experiment that uses a Chaos Mesh fault to kill AKS p ## Updating an existing experiment -### Azure portal +Follow one of these two methods to update your existing experiment. + +### [Azure portal](#tab/azure-portal) + 1. Open an experiment that contains at least one AKS Chaos Mesh fault. 1. Select **Edit** on the fault and copy the `jsonSpec` parameter value to your clipboard. 1. Open the fault selection dropdown and select the version of your desired fault without the `(deprecated)` marking. 1. Paste the `jsonSpec` from your clipboard into the parameter field. 1. Save the fault and the experiment. -### Command-line +### [Command line](#tab/command-line) + 1. Use the [REST API](chaos-studio-samples-rest-api.md) to get the experiment JSON. 
```azurecli-interactive az rest --method get --url "https://management.azure.com/subscriptions/$SUBSCRIPTION_ID/resourceGroups/$RESOURCE_GROUP/providers/Microsoft.Chaos/experiments/$EXPERIMENT_NAME?api-version=2024-01-01" From 410722ded7c85e53e11387285d5db130ff17d41b Mon Sep 17 00:00:00 2001 From: Rigel Carlson <64812637+rsgel@users.noreply.github.com> Date: Fri, 18 Oct 2024 08:36:37 -0700 Subject: [PATCH 07/11] fix tabs --- articles/chaos-studio/chaos-studio-aks-authentication.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/articles/chaos-studio/chaos-studio-aks-authentication.md b/articles/chaos-studio/chaos-studio-aks-authentication.md index 4f6ed62045..4a4901df0a 100644 --- a/articles/chaos-studio/chaos-studio-aks-authentication.md +++ b/articles/chaos-studio/chaos-studio-aks-authentication.md @@ -124,6 +124,8 @@ Follow one of these two methods to update your existing experiment. az rest --method put --url "https://management.azure.com/subscriptions/$SUBSCRIPTION_ID/resourceGroups/$RESOURCE_GROUP/providers/Microsoft.Chaos/experiments/$EXPERIMENT_NAME?api-version=2024-01-01" --body @{experimentBody.json} ``` +--- + ## Permissions Chaos Studio needs permission to execute faults on your resources. 
From a3db5a5d24255feecb61d163327c903f47da7787 Mon Sep 17 00:00:00 2001 From: Rigel <64812637+rsgel@users.noreply.github.com> Date: Fri, 18 Oct 2024 09:01:18 -0700 Subject: [PATCH 08/11] note about local accounts per suggestion --- articles/chaos-studio/chaos-studio-aks-authentication.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/articles/chaos-studio/chaos-studio-aks-authentication.md b/articles/chaos-studio/chaos-studio-aks-authentication.md index 4a4901df0a..cd2e5ecb97 100644 --- a/articles/chaos-studio/chaos-studio-aks-authentication.md +++ b/articles/chaos-studio/chaos-studio-aks-authentication.md @@ -19,8 +19,12 @@ Azure Chaos Studio integrates with Chaos Mesh to run faults on Azure Kubernetes * Kubernetes local accounts are stored in the Kubernetes API server and can be used to authenticate and authorize requests to the cluster. Learn more about local accounts at this page: [Manage local accounts](/azure/aks/manage-local-accounts-managed-azure-ad). * AKS-Managed Microsoft Entra authentication allows you to sign in and manage permissions for your cluster using Microsoft Entra credentials and Azure RBAC. Learn how to [Enable AKS-Managed Microsoft Entra authentication](/azure/aks/enable-authentication-microsoft-entra-id). + > [!NOTE] +> Local account permissions grant access as long as the credentials are on the client machine, while AKS-Managed Microsoft Entra authentication allows more scoped assignment and management of permissions. Learn more about this best practice: [Best practices for cluster security and upgrades](/azure/aks/operator-best-practices-cluster-security?tabs=azure-cli). + Chaos Studio previously only supported using Chaos Mesh with local accounts, but Version 2.2 of all AKS faults now support both local accounts and Microsoft Entra authentication. 
+ ## Updating targets Targets (`Microsoft.Chaos/targets`) represent another Azure resource in Chaos Studio's resource model, so you can easily control whether or not a certain resource is enabled for fault injection and what faults can run against it. In this case, the target represents an AKS cluster that you want to affect. From 7655ccdf6fc3e1ba6e22845a2c42f1319258c076 Mon Sep 17 00:00:00 2001 From: Rigel <64812637+rsgel@users.noreply.github.com> Date: Fri, 18 Oct 2024 09:11:03 -0700 Subject: [PATCH 09/11] fix troubleshooting link --- articles/chaos-studio/troubleshooting.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/articles/chaos-studio/troubleshooting.md b/articles/chaos-studio/troubleshooting.md index 75f7be63ef..c3663e4b4c 100644 --- a/articles/chaos-studio/troubleshooting.md +++ b/articles/chaos-studio/troubleshooting.md @@ -154,7 +154,7 @@ There are several common errors you may encounter when using AKS Chaos Mesh faul | Error message | Suggested action | | --- | --- | -| Getting static credential is not allowed because this cluster is set to disable local accounts. | The AKS Chaos Mesh faults can use either Kubernetes local accounts or Microsoft Entra authentication as of version 2.2. Learn more about these authentication types here. | +| Getting static credential is not allowed because this cluster is set to disable local accounts. | The AKS Chaos Mesh faults can use either Kubernetes local accounts or Microsoft Entra authentication as of version 2.2. Learn how to migrate your experiments here: [Using Microsoft Entra authentication with Chaos Studio AKS faults](chaos-studio-aks-authentication.md). | | The Chaos Mesh experiment could not be started because the provided configuration was invalid | Ensure the `jsonSpec` contains all the required fields. 
| | Chaos Mesh version 'x.x.x' is not currently supported by Chaos Studio | Verify the installed version against the [Azure Chaos Studio version compatibility](chaos-studio-versions.md) page and submit a [feature request](https://feedback.azure.com/d365community/forum/18f8dc01-dc37-ec11-b6e6-000d3a9c7101) if the desired version isn't listed. | From a0e01379771f4b1335d8fe6f9b6d391e476bf1cd Mon Sep 17 00:00:00 2001 From: Rigel <64812637+rsgel@users.noreply.github.com> Date: Fri, 18 Oct 2024 10:14:38 -0700 Subject: [PATCH 10/11] address reviewer feedback --- articles/chaos-studio/TOC.yml | 2 +- .../chaos-studio-aks-authentication.md | 10 ++++---- .../chaos-studio-fault-library.md | 23 ++++++++++--------- 3 files changed, 18 insertions(+), 17 deletions(-) diff --git a/articles/chaos-studio/TOC.yml b/articles/chaos-studio/TOC.yml index 5026edf241..01b76db0e2 100644 --- a/articles/chaos-studio/TOC.yml +++ b/articles/chaos-studio/TOC.yml @@ -67,7 +67,7 @@ href: chaos-studio-tutorial-aks-portal.md - name: CLI href: chaos-studio-tutorial-aks-cli.md - - name: Using Microsoft Entra authentication with Chaos Mesh + - name: Use Microsoft Entra authentication with Chaos Mesh href: chaos-studio-aks-authentication.md - name: Dynamic targeting items: diff --git a/articles/chaos-studio/chaos-studio-aks-authentication.md b/articles/chaos-studio/chaos-studio-aks-authentication.md index cd2e5ecb97..62d7a60480 100644 --- a/articles/chaos-studio/chaos-studio-aks-authentication.md +++ b/articles/chaos-studio/chaos-studio-aks-authentication.md @@ -1,5 +1,5 @@ --- -title: Using Microsoft Entra authentication with Chaos Studio AKS faults +title: Use Microsoft Entra authentication with Chaos Studio AKS faults description: Learn about the different ways for Chaos Studio to authenticate with your AKS cluster. 
services: chaos-studio author: rsgel @@ -10,7 +10,7 @@ ms.reviewer: abbyweisberg ms.service: azure-chaos-studio --- -# Using Microsoft Entra authentication with Chaos Studio AKS faults +# Use Microsoft Entra authentication with Chaos Studio AKS faults ## Overview @@ -25,7 +25,7 @@ Azure Chaos Studio integrates with Chaos Mesh to run faults on Azure Kubernetes Chaos Studio previously only supported using Chaos Mesh with local accounts, but Version 2.2 of all AKS faults now support both local accounts and Microsoft Entra authentication. -## Updating targets +## Update targets Targets (`Microsoft.Chaos/targets`) represent another Azure resource in Chaos Studio's resource model, so you can easily control whether or not a certain resource is enabled for fault injection and what faults can run against it. In this case, the target represents an AKS cluster that you want to affect. @@ -39,7 +39,7 @@ If you want to use the new fault version on an existing AKS target, you need to If you're using the API or command-line, follow the instructions at [Create a chaos experiment that uses a Chaos Mesh fault with the Azure CLI](chaos-studio-tutorial-aks-cli.md#enable-chaos-studio-on-your-aks-cluster) to ensure the latest available capabilities are enabled. -## Creating a new experiment +## Create a new experiment When you create a new experiment that uses AKS Chaos Mesh faults in the Azure portal, you may see two versions of each fault, such as "AKS Chaos Mesh DNS Chaos" and "AKS Chaos Mesh DNS Chaos (deprecated)". Select the first option, not the deprecated option. @@ -47,7 +47,7 @@ If you don't see your AKS cluster as a possible target after selecting the fault Follow the [Create a chaos experiment that uses a Chaos Mesh fault to kill AKS pods with the Azure portal](chaos-studio-tutorial-aks-portal.md) tutorial to create an experiment. -## Updating an existing experiment +## Update an existing experiment Follow one of these two methods to update your existing experiment. 
diff --git a/articles/chaos-studio/chaos-studio-fault-library.md b/articles/chaos-studio/chaos-studio-fault-library.md index 376d893844..41875aad35 100644 --- a/articles/chaos-studio/chaos-studio-fault-library.md +++ b/articles/chaos-studio/chaos-studio-fault-library.md @@ -54,7 +54,7 @@ This section applies to the `Microsoft.Insights/autoscaleSettings` resource type | Fault name | Applicable scenarios | |------------|----------------------| -| [Disable Autoscale](#disable-autoscale) | Compute capacity loss (when used with VMSS Shutdown) | +| [Disable Autoscale](#disable-autoscale) | Compute capacity loss (when used with Virtual Machine Scale Set Shutdown) | ## Azure Kubernetes Service @@ -147,8 +147,8 @@ This section applies to the `Microsoft.Compute/virtualMachineScaleSets` resource | Fault name | Applicable scenarios | |------------|----------------------| -| [VMSS Shutdown](#vmss-shutdown-version-10) | Compute loss/disruption | -| [VMSS Shutdown (2.0)](#vmss-shutdown-version-20) | Compute loss/disruption (by Availability Zone) | +| [Virtual Machine Scale Set Shutdown](#virtual-machine-scale-set-shutdown-version-10) | Compute loss/disruption | +| [Virtual Machine Scale Set Shutdown (2.0)](#virtual-machine-scale-set-shutdown-version-20) | Compute loss/disruption (by Availability Zone) | ## Orchestration actions @@ -232,7 +232,7 @@ The parameters **destinationFilters** and **inboundDestinationFilters** use the | Capability name | NetworkDisconnectViaFirewall-1.0 | | Target type | Microsoft-Agent | | Supported OS types | Windows | -| Description | Applies a Windows firewall rule to block outbound traffic for specified port range and network block. | +| Description | Applies a Windows Firewall rule to block outbound traffic for specified port range and network block. | | Prerequisites | Agent must run as administrator. If the agent is installed as a VM extension, it runs as administrator by default. 
| | Urn | urn:csci:microsoft:agent:networkDisconnectViaFirewall/1.0 | | Fault type | Continuous. | @@ -408,7 +408,7 @@ The parameters **destinationFilters** and **inboundDestinationFilters** use the | Capability name | NetworkIsolation-1.0 | | Target type | Microsoft-Agent | | Supported OS types | Windows, Linux (outbound only) | -| Description | Fully isolate the virtual machine from network connections by dropping all IP-based inbound (on Windows) and outbound (on Windows and Linux) packets for the specified duration. At the end of the duration, network connections will be re-enabled. Because the agent depends on network traffic, this action cannot be cancelled and will run to the specified duration. | +| Description | Fully isolate the virtual machine from network connections by dropping all IP-based inbound (on Windows) and outbound (on Windows and Linux) packets for the specified duration. At the end of the duration, network connections will be re-enabled. Because the agent depends on network traffic, this action cannot be canceled and will run to the specified duration. | | Prerequisites | **Windows:** The agent must run as administrator, which happens by default if installed as a VM extension. | | | **Linux:** The `tc` (Traffic Control) package is used for network faults. If it isn't already installed, the agent automatically attempts to install it from the default package manager. | | Urn | urn:csci:microsoft:agent:networkIsolation/1.0 | @@ -435,7 +435,7 @@ The parameters **destinationFilters** and **inboundDestinationFilters** use the #### Limitations -* Because the agent depends on network traffic, **this action cannot be cancelled** and will run to the specified duration. Use with caution. +* Because the agent depends on network traffic, **this action cannot be canceled** and will run to the specified duration. Use with caution. * This fault currently only affects new connections. Existing active connections are unaffected. 
You can restart the service or process to force connections to break. * When running on Linux, this fault can only affect **outbound** traffic, not inbound traffic. The fault can affect **both inbound and outbound** traffic on Windows environments. @@ -786,7 +786,7 @@ These sample values produced ~100% disk pressure when tested on a `Standard_D2s_ | Capability name | KillProcess-1.0 | | Target type | Microsoft-Agent | | Supported OS types | Windows, Linux | -| Description | Kills all the **running** instances of a process that matches the process name sent in the fault parameters. Within the duration set for the fault action, a process is killed repetitively based on the value of the kill interval specified. This fault is a destructive fault where system admin would need to manually recover the process if self-healing is configured for it. Note that this fault will error when used on an empty name process, when used with an unspecifiec interval, or when we cannot find the target process name that we want to kill.| +| Description | Kills all the **running** instances of a process that matches the process name sent in the fault parameters. Within the duration set for the fault action, a process is killed repetitively based on the value of the kill interval specified. This fault is a destructive fault where a system admin would need to manually recover the process if self-healing isn't configured for it. Note that this fault will error when used on an empty name process, when used with an unspecified interval, or when we cannot find the target process name that we want to kill.| | Prerequisites | None. | | Urn | urn:csci:microsoft:agent:killProcess/1.0 | | Fault type | Continuous. | @@ -1887,7 +1887,8 @@ Currently, a maximum of 4 process names can be listed in the processNames parame * The Virtual Machine Redeploy operation is throttled within an interval of 10 hours.
If your experiment fails with a "Too many redeploy requests" error, wait for 10 hours to retry the experiment. -### VM Shutdown +### Virtual Machine Shutdown + | Property | Value | |-|-| | Capability name | Shutdown-1.0 | @@ -1923,11 +1924,11 @@ Currently, a maximum of 4 process names can be listed in the processNames parame ``` -### VMSS Shutdown +### Virtual Machine Scale Set Shutdown This fault has two available versions that you can use, Version 1.0 and Version 2.0. The main difference is that Version 2.0 allows you to filter by availability zones, only shutting down instances within a specified zone or zones. -#### VMSS Shutdown Version 1.0 +#### Virtual Machine Scale Set Shutdown Version 1.0 | Property | Value | |-|-| @@ -1968,7 +1969,7 @@ This fault has two available versions that you can use, Version 1.0 and Version } ``` -#### VMSS Shutdown Version 2.0 +#### Virtual Machine Scale Set Shutdown Version 2.0 | Property | Value | |-|-| From 7ecf6e0c6b32cac10ace34d1a91685f44c0c5428 Mon Sep 17 00:00:00 2001 From: Rigel <64812637+rsgel@users.noreply.github.com> Date: Fri, 18 Oct 2024 10:20:37 -0700 Subject: [PATCH 11/11] fix heading links --- articles/chaos-studio/chaos-studio-fault-library.md | 2 +- .../chaos-studio/chaos-studio-tutorial-dynamic-target-cli.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/articles/chaos-studio/chaos-studio-fault-library.md b/articles/chaos-studio/chaos-studio-fault-library.md index 41875aad35..5b33aaad2d 100644 --- a/articles/chaos-studio/chaos-studio-fault-library.md +++ b/articles/chaos-studio/chaos-studio-fault-library.md @@ -139,7 +139,7 @@ This section applies to the `Microsoft.Compute/virtualMachines` resource type. 
[ | Fault name | Applicable scenarios | |------------|----------------------| | [VM Redeploy](#vm-redeploy) | Compute disruption, maintenance events | -| [VM Shutdown](#vm-shutdown) | Compute loss/disruption | +| [VM Shutdown](#virtual-machine-shutdown) | Compute loss/disruption | ## Virtual Machine Scale Set diff --git a/articles/chaos-studio/chaos-studio-tutorial-dynamic-target-cli.md b/articles/chaos-studio/chaos-studio-tutorial-dynamic-target-cli.md index f9d5930b4c..6ed70a08ed 100644 --- a/articles/chaos-studio/chaos-studio-tutorial-dynamic-target-cli.md +++ b/articles/chaos-studio/chaos-studio-tutorial-dynamic-target-cli.md @@ -57,7 +57,7 @@ You've now successfully added your virtual machine scale set to Chaos Studio. Now you can create your experiment. A chaos experiment defines the actions you want to take against target resources. The actions are organized and run in sequential steps. The chaos experiment also defines the actions you want to take against branches, which run in parallel. -1. Formulate your experiment JSON starting with the following [Virtual Machine Scale Sets Shutdown 2.0](chaos-studio-fault-library.md#vmss-shutdown-version-20) JSON sample. Modify the JSON to correspond to the experiment you want to run by using the [Create Experiment API](/rest/api/chaosstudio/experiments/create-or-update) and the [fault library](chaos-studio-fault-library.md). At this time, dynamic targeting is only available with the Virtual Machine Scale Sets Shutdown 2.0 fault and can only filter on availability zones. +1. Formulate your experiment JSON starting with the following [Virtual Machine Scale Sets Shutdown 2.0](chaos-studio-fault-library.md#virtual-machine-scale-set-shutdown-version-20) JSON sample. Modify the JSON to correspond to the experiment you want to run by using the [Create Experiment API](/rest/api/chaosstudio/experiments/create-or-update) and the [fault library](chaos-studio-fault-library.md). 
At this time, dynamic targeting is only available with the Virtual Machine Scale Sets Shutdown 2.0 fault and can only filter on availability zones. - Use the `filter` element to configure the list of Azure availability zones to filter targets by. If you don't provide a `filter`, the fault shuts down all instances in the virtual machine scale set. - The experiment targets all Virtual Machine Scale Sets instances in the specified zones.