-
Notifications
You must be signed in to change notification settings - Fork 2k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
19 changed files
with
1,171 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
**/stress-test-resources.json | ||
Chart.lock | ||
charts/ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
target/ | ||
src/ | ||
README.md | ||
CHANGELOG.md |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
# Release History | ||
|
||
## 1.0.0-beta.1 (Unreleased) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
# Helm chart definition for the java-template stress test package. | ||
apiVersion: v2 | ||
name: java-template | ||
description: An example stress test chart for performing azure resource deployments | ||
# version is the chart version; appVersion is the version of the packaged application. | ||
version: 0.1.1 | ||
appVersion: v0.1 | ||
annotations: | ||
stressTest: 'true' # enable auto-discovery of this test via `find-all-stress-packages.ps1` | ||
# NOTE(review): presumably the namespace the test is deployed into - confirm against the deploy scripts. | ||
namespace: 'java-template' | ||
|
||
# Charts this chart depends on. | ||
dependencies: | ||
- name: stress-test-addons | ||
# Tilde range: accepts 0.3.x patch releases. | ||
version: ~0.3.0 | ||
# "@name" refers to a Helm repository by the name it was registered under locally. | ||
repository: "@stress-test-charts" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
# Stress test image for the azure-template-stress module. | ||
# Stage 1 (builder) compiles the module and its in-repo dependencies with Maven; | ||
# stage 2 copies the build output onto a JDK 11 runtime image. | ||
# The cache mount on the Maven RUN below requires BuildKit. | ||
ARG REGISTRY="azsdkengsys.azurecr.io" | ||
FROM ${REGISTRY}/java/jdk-mariner-mvn:jdk11-latest AS builder | ||
|
||
# Apply OS package updates and drop the package cache in the same layer. | ||
RUN yum -y update && yum clean all | ||
|
||
# Add necessary files to the image (WORKDIR creates /stress if it is missing) | ||
WORKDIR /stress | ||
COPY ./sdk/tools /stress/sdk/tools | ||
COPY ./eng /stress/eng | ||
COPY ./common /stress/common | ||
COPY ./sdk/parents /stress/sdk/parents | ||
COPY ./sdk/template /stress/sdk/template | ||
|
||
# Build dependencies and stress tests | ||
# /root/.m2 is a build cache mount, so downloaded Maven artifacts are reused across builds. | ||
RUN --mount=type=cache,target=/root/.m2 \ | ||
mvn -f /stress/eng/code-quality-reports/pom.xml clean install -Dcheckstyle.skip -Dgpg.skip -Dmaven.javadoc.skip -Drevapi.skip -Dspotbugs.skip -Djacoco.skip -DskipTests && \ | ||
mvn -f /stress/common/perf-test-core/pom.xml clean install -Dcheckstyle.skip -Dgpg.skip -Dmaven.javadoc.skip -Drevapi.skip -Dspotbugs.skip -Djacoco.skip -DskipTests && \ | ||
mvn -f /stress/sdk/parents/azure-perf-test-parent/pom.xml clean install -Dcheckstyle.skip -Dgpg.skip -Dmaven.javadoc.skip -Drevapi.skip -Dspotbugs.skip -Djacoco.skip -DskipTests && \ | ||
mvn -f /stress/sdk/tools/pom.xml clean install -Dcheckstyle.skip -Dgpg.skip -Dmaven.javadoc.skip -Drevapi.skip -Dspotbugs.skip -Djacoco.skip -DskipTests && \ | ||
mvn -f /stress/sdk/template/azure-template-stress/pom.xml clean install -Dcheckstyle.skip -Dgpg.skip -Dmaven.javadoc.skip -Drevapi.skip -Dspotbugs.skip -Djacoco.skip -DskipTests | ||
|
||
FROM mcr.microsoft.com/openjdk/jdk:11-mariner | ||
|
||
# Apply OS package updates and drop the package cache in the same layer. | ||
RUN yum -y update && yum clean all | ||
|
||
# Copy target files from builder image | ||
WORKDIR /app | ||
COPY --from=builder /stress/sdk/template/azure-template-stress/target . | ||
|
||
# Configure monitoring | ||
ARG APPLICATION_INSIGHTS_AGENT_VERSION=3.4.19 | ||
ARG AGENT_URL=https://github.com/microsoft/ApplicationInsights-Java/releases/download/${APPLICATION_INSIGHTS_AGENT_VERSION}/applicationinsights-agent-${APPLICATION_INSIGHTS_AGENT_VERSION}.jar | ||
# ADD (not COPY) is used here because the source is a remote URL. | ||
# NOTE(review): consider ADD --checksum=sha256:<digest> so the downloaded agent is verified. | ||
ADD ${AGENT_URL} ./applicationinsights-agent.jar | ||
|
||
# This is never executed (since job yaml overrides it) | ||
ENTRYPOINT ["bash"] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,228 @@ | ||
# Stress tests for Azure <Name> client library for Java | ||
|
||
This package contains a template project for stress tests and recommendations on how to create them for your library. | ||
|
||
## Getting started | ||
|
||
Check out [Azure SDK Stress Test Wiki][azure_sdk_stress_test] for general information about stress tests. | ||
|
||
### Prerequisites | ||
|
||
- [Java Development Kit (JDK)][jdk_link], version 8 or later. | ||
- [Maven][maven] | ||
- [Docker][docker] | ||
- [Kubectl][kubectl] | ||
- [Helm][helm] | ||
- [Azure CLI][azure_cli] | ||
- [Powershell 7.0+][powershell] | ||
|
||
### Deploy Stress Test | ||
|
||
cd into `azure-sdk-for-java` root folder and run command to deploy the package to cluster: | ||
|
||
```shell | ||
.\eng\common\scripts\stress-testing\deploy-stress-tests.ps1 -SearchDirectory .\sdk\<your service directory> | ||
``` | ||
|
||
### Check Status | ||
|
||
Only the most frequently used commands are listed below. See [Deploying A Stress Test][deploy_stress_test] for more details. | ||
|
||
List deployed packages: | ||
|
||
```shell | ||
helm list -n <stress test namespace> | ||
``` | ||
|
||
The namespace usually matches your username. | ||
|
||
Get stress test pods and status: | ||
|
||
```shell | ||
kubectl get pods -n <stress test namespace> | ||
``` | ||
|
||
Get stress test pod logs: | ||
|
||
```shell | ||
kubectl logs -n <stress test namespace> <stress test pod name> | ||
# Note that we may define multiple containers (for example, `fault-injector` and `main`) | ||
kubectl logs -n <stress test namespace> <stress test pod name> -c <container name> | ||
``` | ||
|
||
If stress test pod is in `Error` status, check logs from containers: | ||
|
||
```shell | ||
kubectl logs -n <stress test namespace> <stress test pod name> | ||
``` | ||
|
||
You may also get logs for specific containers: | ||
|
||
```shell | ||
kubectl logs -n <stress test namespace> <stress test pod name> -c <container-name> | ||
``` | ||
|
||
Stop and remove deployed package: | ||
|
||
```shell | ||
helm uninstall <stress test name> -n <stress test namespace> | ||
``` | ||
|
||
## Key concepts | ||
|
||
### Project Structure | ||
|
||
See [Layout][stress_test_layout] section for details. | ||
|
||
Below is the current structure of project: | ||
``` | ||
. | ||
├── src/ # Test code | ||
├── templates/ # A directory of helm templates that will generate Kubernetes manifest files. | ||
├── workbooks/ # A directory of Azure Monitor workbooks for analyzing stress test results. | ||
├── Chart.yaml # A YAML file containing information about the helm chart and its dependencies | ||
├── scenarios-matrix.yaml # A YAML file containing configuration and custom values for stress test(s) | ||
├── Dockerfile # A Dockerfile for building the stress test image | ||
├── stress-test-resources.bicep # An Azure Bicep for deploying stress test azure resources | ||
├── pom.xml | ||
└── README.md | ||
``` | ||
|
||
### How to create your own tests | ||
|
||
- Start with [Azure SDK stress Wiki](https://aka.ms/azsdk/stress) to learn about stress tests. | ||
- Copy `src/main/java/com/azure/sdk/template/stress` folder to your service folder. | ||
- Update the code | ||
- Update `pom.xml` to add dependencies on your service. | ||
- Implement your first stress test instead of `HttpGet` and make sure to update `StressTestOptions` to your needs. | ||
- Update configuration | ||
- update `stress-test-resources.bicep` to create resources required for your stress test | ||
- change chart `name` (in `Chart.yaml`) to match your service name. Please keep `java-` prefix. | ||
- update `templates/job.yaml` | ||
- remove `server` container | ||
- replace occurrences of `java-template` to match name in the `Chart.yaml` | ||
- update test parameters in `test` container | ||
- define scenarios and parameters in `scenarios-matrix.yaml` | ||
|
||
Now you're ready to run tests with `.\eng\common\scripts\stress-testing\deploy-stress-tests.ps1 -SearchDirectory .\sdk\<your service directory>`. | ||
See [Deploying A Stress Test][deploy_stress_test] for more details. | ||
|
||
Let's see how we can check test results. | ||
|
||
### Checking test results | ||
|
||
#### Stress Test Dashboard | ||
|
||
General-purpose stress test dashboard is available at https://aka.ms/azsdk/stress/dashboard. It shows: | ||
- Pod status events | ||
- CPU and memory utilization of the stress test pods | ||
- Container logs and events | ||
|
||
Stress test dashboard does not know about local stress test runs. | ||
|
||
#### Application Insights | ||
|
||
Application Insights agent brings rich monitoring experience including: | ||
- resource utilization metrics (CPU, memory, GC, threads, etc.) | ||
- live metrics, performance overview, etc | ||
- distributed tracing and dependency calls (HTTP, Azure SDK calls) | ||
- exceptions and logs | ||
- profiling in production | ||
|
||
Application Insights is useful to: | ||
- monitor and compare throughput and latency across runs | ||
- investigate issues and find bottlenecks | ||
|
||
Application Insights is available for local runs (as long as you provide `-javaagent` option and make sure connection string is configured). | ||
It's also possible to use Azure Monitor Profiler for Java inside the stress test ApplicationInsights resource to capture JFR profiles. | ||
|
||
### Logging | ||
|
||
We use [logback.xml][logback_xml] to configure the logging. By default, the stress test run on cluster will output | ||
`WARN` level log which you may adjust based on your needs. | ||
You may also control the verbosity of logs that go to Application Insights - see [Application Insights logging configuration][application-insights-logging] for more details. | ||
|
||
Since logs are hard to query and are extremely verbose (in case of high-scale stress tests), we're relying on metrics and workbooks for test result analysis. | ||
See also [Logging in Azure SDK][logging-azure-sdk]. | ||
|
||
### Metrics | ||
|
||
While some Azure SDKs provide custom metrics, we're going to collect generic test metrics and build queries/workbooks on top of them, | ||
so it's important to reuse the same metric across different tests whenever possible. | ||
|
||
We need just one generic metric for basic analysis - the one that measures duration of one test execution (with additional dimensions). | ||
It's implemented in `com.azure.sdk.template.stress.util.TelemetryHelper` and has the following semantic: | ||
- name: `test.run.duration` - it is used in the stress workbook, so make sure to use the same name when applicable | ||
- unit: seconds | ||
- customDimensions: | ||
- `error.type` - The low-cardinality type of error describing what happened (eg. exception class name). | ||
|
||
The metric should measure exactly one test operation, so we'll be able to derive the key performance indicators from it such as: | ||
- throughput (rate of operations per period of time) | ||
- duration of one operation | ||
- error rate (how frequently errors of different types occur) | ||
|
||
Each metric collected with OpenTelemetry (directly or via Application Insights) also has the following dimensions: | ||
- `cloud_RoleName` - in case of stress tests, it matches test name and run id (`{{ .Release.Name }}-{{ .Stress.BaseName }}` in helm chart). | ||
- `cloud_RoleInstance` - in case of k8s it matches pod name and identifies the test container. | ||
|
||
When running multiple test containers, make sure to assign different role instances to them by setting `APPLICATIONINSIGHTS_ROLE_INSTANCE` environment variable. | ||
(e.g. `{{ .Stress.BaseName }}-consumer` and `{{ .Stress.BaseName }}-producer`). This would allow you to distinguish telemetry coming from different containers. | ||
You may additionally consider assigning different `APPLICATIONINSIGHTS_ROLE_NAME`. | ||
In any case, you may need to adjust the workbook to accommodate those changes and break down metrics by role instance. | ||
|
||
In addition to `test.run.duration`, we're also collecting: | ||
- [reactor schedulers metrics](https://github.com/reactor/reactor-core/blob/main/docs/asciidoc/metrics.adoc) | ||
- JVM metrics measured by ApplicationInsights agent: | ||
- Normalized CPU percentage (based on [OperatingSystemMXBean](https://docs.oracle.com/javase/8/docs/api/java/lang/management/OperatingSystemMXBean.html) and [RuntimeMXBean](https://docs.oracle.com/javase/8/docs/api/java/lang/management/RuntimeMXBean.html)) | ||
Note: if 0.5 CPU is configured for pod, maximum normalized CPU percentage for it would be 50%. | ||
- GC time (based on [GarbageCollectorMXBean](https://docs.oracle.com/javase/8/docs/api/java/lang/management/GarbageCollectorMXBean.html)) | ||
- Heap memory usage (based on [MemoryMXBean](https://docs.oracle.com/javase/8/docs/api/java/lang/management/MemoryMXBean.html)) | ||
- Thread count (based on [ThreadMXBean](https://docs.oracle.com/javase/8/docs/api/java/lang/management/ThreadMXBean.html)) | ||
|
||
### Stress test workbook | ||
|
||
[Stress test workbook](https://ms.portal.azure.com/#@microsoft.onmicrosoft.com/resource/subscriptions/faa080af-c1d8-40ad-9cce-e1a450ca5b57/resourceGroups/rg-stress-cluster-pg/providers/Microsoft.Insights/components/stress-pg-ai-s7b6dif73rup6/workbooks) | ||
shows a summary of a test run. | ||
|
||
First, select a time range and run from the list, then check the report: | ||
- `Test summary` contains key test parameters and key counters (total number of operations, errors, etc.) | ||
- Test operation success rate, latency and error rate | ||
- CPU and memory utilization, number of threads and time spent in GC | ||
- Warnings, errors, and exceptions in logs. Note that logs and traces are sampled (at a 1% rate), so you won't see every error there | ||
|
||
Since you're changing the chart name, you would need to update the workbook to use `java-your-service-name` instead of `java-template`. | ||
Then you'd need to create a new workbook for your service, follow | ||
[Azure Monitor workbook documentation](https://learn.microsoft.com/azure/azure-monitor/visualize/workbooks-create-workbook) for more details. | ||
Then you can import json file from `workbooks` folder. | ||
|
||
## Writing useful tests | ||
|
||
Stress tests are intended to detect reliability and resiliency issues: | ||
- bugs in retry policy | ||
- graceful degradation under high load and transient failures | ||
- memory leaks, thread pool starvation, etc | ||
|
||
While we don't have a generic solution for fault injection, check out [Chaos mesh](https://github.com/Azure/azure-sdk-tools/blob/main/tools/stress-cluster/chaos/README.md#chaos-manifest), | ||
[Azure Chaos Studio](https://azure.microsoft.com/products/chaos-studio), and [Http Fault injector](https://github.com/Azure/azure-sdk-tools/tree/main/tools/http-fault-injector). | ||
|
||
Even without fault injection, by applying maximum load to the service, we can detect memory leaks, extensive allocations, | ||
thread pool issues, or other performance issues in the code. | ||
|
||
So make sure to configure resource limits and apply the maximum load you can get under them. | ||
|
||
<!-- links --> | ||
[azure_sdk_stress_test]: https://aka.ms/azsdk/stress | ||
[jdk_link]: https://docs.microsoft.com/java/azure/jdk/?view=azure-java-stable | ||
[maven]: https://maven.apache.org/ | ||
[docker]: https://docs.docker.com/get-docker/ | ||
[kubectl]: https://kubernetes.io/docs/tasks/tools/#kubectl | ||
[helm]: https://helm.sh/docs/intro/install/ | ||
[azure_cli]: https://docs.microsoft.com/cli/azure/install-azure-cli | ||
[powershell]: https://docs.microsoft.com/powershell/scripting/install/installing-powershell?view=powershell-7 | ||
[enable_application_insights]: https://learn.microsoft.com/en-us/azure/azure-monitor/app/opentelemetry-enable?tabs=java#enable-azure-monitor-application-insights | ||
[logback_xml]: https://github.com/Azure/azure-sdk-for-java/blob/main/sdk/servicebus/azure-messaging-servicebus-stress/src/main/resources/logback.xml | ||
[deploy_stress_test]: https://github.com/Azure/azure-sdk-tools/blob/main/tools/stress-cluster/chaos/README.md#deploying-a-stress-test | ||
[stress_test_layout]: https://github.com/Azure/azure-sdk-tools/blob/main/tools/stress-cluster/chaos/README.md#layout | ||
[application-insights-logging]: https://learn.microsoft.com/en-us/azure/azure-monitor/app/java-standalone-config#autocollected-logging | ||
[logging-azure-sdk]: https://github.com/Azure/azure-sdk-for-java/wiki/Logging-in-Azure-SDK |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,99 @@ | ||
<?xml version="1.0" encoding="UTF-8"?> | ||
|
||
<project xmlns="http://maven.apache.org/POM/4.0.0" | ||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> | ||
|
||
<modelVersion>4.0.0</modelVersion> | ||
|
||
<groupId>com.azure</groupId> | ||
<artifactId>azure-template-stress</artifactId> | ||
<version>1.0.0-beta.1</version> <!-- {x-version-update;com.azure:azure-template-perf;current} --> | ||
<packaging>jar</packaging> | ||
|
||
<properties> | ||
<maven.compiler.source>1.8</maven.compiler.source> | ||
<maven.compiler.target>1.8</maven.compiler.target> | ||
<doclint>all,-missing</doclint> | ||
</properties> | ||
|
||
<dependencies> | ||
<dependency> | ||
<groupId>com.azure</groupId> | ||
<artifactId>perf-test-core</artifactId> | ||
<version>1.0.0-beta.1</version> <!-- {x-version-update;com.azure:perf-test-core;current} --> | ||
</dependency> | ||
|
||
<!-- HTTP clients --> | ||
<dependency> | ||
<groupId>com.azure</groupId> | ||
<artifactId>azure-core-http-netty</artifactId> | ||
<version>1.13.10</version> <!-- {x-version-update;com.azure:azure-core-http-netty;dependency} --> | ||
</dependency> | ||
<dependency> | ||
<groupId>com.azure</groupId> | ||
<artifactId>azure-core-http-okhttp</artifactId> | ||
<version>1.11.17</version> <!-- {x-version-update;com.azure:azure-core-http-okhttp;dependency} --> | ||
</dependency> | ||
<dependency> | ||
<groupId>com.azure</groupId> | ||
<artifactId>azure-core-http-jdk-httpclient</artifactId> | ||
<version>1.0.0-beta.9</version> <!-- {x-version-update;com.azure:azure-core-http-jdk-httpclient;dependency} --> | ||
</dependency> | ||
<!-- logging, tracing, metrics --> | ||
<dependency> | ||
<groupId>ch.qos.logback</groupId> | ||
<artifactId>logback-classic</artifactId> | ||
<version>1.2.12</version> <!-- {x-version-update;ch.qos.logback:logback-classic;external_dependency} --> | ||
</dependency> | ||
<dependency> | ||
<groupId>io.opentelemetry</groupId> | ||
<artifactId>opentelemetry-api</artifactId> | ||
<version>1.32.0</version> <!-- {x-version-update;io.opentelemetry:opentelemetry-api;external_dependency} --> | ||
</dependency> | ||
<dependency> | ||
<groupId>io.micrometer</groupId> | ||
<artifactId>micrometer-core</artifactId> | ||
<version>1.9.17</version> <!-- {x-version-update;io.micrometer:micrometer-core;external_dependency} --> | ||
</dependency> | ||
</dependencies> | ||
|
||
<!-- Build configuration: packages the stress test and its dependencies into a single runnable jar. --> | ||
<build> | ||
<plugins> | ||
<plugin> | ||
<!-- we need shade plugin to merge META-INF properly into the uber jar --> | ||
<groupId>org.apache.maven.plugins</groupId> | ||
<artifactId>maven-shade-plugin</artifactId> | ||
<version>3.3.0</version> <!-- {x-version-update;org.apache.maven.plugins:maven-shade-plugin;external_dependency} --> | ||
<executions> | ||
<execution> | ||
<phase>package</phase> | ||
<goals> | ||
<goal>shade</goal> | ||
</goals> | ||
<configuration> | ||
<transformers> | ||
<!-- Sets the Main-Class manifest entry so the shaded jar can be started with `java -jar` --> | ||
<transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer"> | ||
<mainClass>com.azure.sdk.template.stress.App</mainClass> | ||
</transformer> | ||
<!--Transforms META-INF/services (essential if you relocate com.azure classes)--> | ||
<transformer implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer"/> | ||
</transformers> | ||
<!-- Shaded jar is named <artifactId>-<version>-jar-with-dependencies --> | ||
<finalName>${project.artifactId}-${project.version}-jar-with-dependencies</finalName> | ||
<!-- Applied to every artifact (*:*): drop bundled Maven metadata and jar signature files, | ||
since signatures from the original jars would not match the merged jar --> | ||
<filters> | ||
<filter> | ||
<artifact>*:*</artifact> | ||
<excludes> | ||
<exclude>META-INF/maven/**</exclude> | ||
<exclude>META-INF/*.SF</exclude> | ||
<exclude>META-INF/*.DSA</exclude> | ||
<exclude>META-INF/*.RSA</exclude> | ||
</excludes> | ||
</filter> | ||
</filters> | ||
</configuration> | ||
</execution> | ||
</executions> | ||
</plugin> | ||
</plugins> | ||
</build> | ||
</project> |
Oops, something went wrong.