diff --git a/.vscode/cspell.json b/.vscode/cspell.json index c5308851b4d96..5607c73ba17df 100644 --- a/.vscode/cspell.json +++ b/.vscode/cspell.json @@ -53,6 +53,7 @@ "*.parquet", "**/sdk/**/src/main/java/**/implementation/**", "**/*-perf/**", + "**/*-stress/**", "**/*-tests/**", "**/*-test-*/**", "**/resourcemanager/**", diff --git a/common/perf-test-core/src/main/java/com/azure/perf/test/core/ApiPerfTestBase.java b/common/perf-test-core/src/main/java/com/azure/perf/test/core/ApiPerfTestBase.java index 4e2052edaf2a2..805fe02673a26 100644 --- a/common/perf-test-core/src/main/java/com/azure/perf/test/core/ApiPerfTestBase.java +++ b/common/perf-test-core/src/main/java/com/azure/perf/test/core/ApiPerfTestBase.java @@ -130,9 +130,12 @@ private static HttpClient createHttpClient(PerfStressOptions options) { } else { httpClientProvider = VertxAsyncHttpClientProvider.class; } - } else { - httpClientProvider = getHttpclientProvider(httpClientType); } + + if (httpClientProvider == null) { + httpClientProvider = getHttpClientProvider(httpClientType); + } + try { return httpClientProvider.getDeclaredConstructor().newInstance().createInstance(); } catch (Throwable e) { @@ -141,7 +144,7 @@ private static HttpClient createHttpClient(PerfStressOptions options) { } @SuppressWarnings("unchecked") - private static Class getHttpclientProvider(PerfStressOptions.HttpClientType httpClientType) { + private static Class getHttpClientProvider(PerfStressOptions.HttpClientType httpClientType) { String providerClassName = httpClientType.toString(); try { Class provider = Class.forName(providerClassName, false, ApiPerfTestBase.class.getClassLoader()); diff --git a/eng/.docsettings.yml b/eng/.docsettings.yml index 079223646bccb..ed465cca306fe 100644 --- a/eng/.docsettings.yml +++ b/eng/.docsettings.yml @@ -17,6 +17,7 @@ omitted_paths: - "sdk/*/mgmt-*/*" - "sdk/*/swagger/*" - "sdk/*/codegen/*" + - sdk/*/*-stress/* - doc/* - eng/* - common/smoke-tests/* @@ -123,7 +124,7 @@ known_content_issues: - 
['sdk/cosmos/azure-cosmos-spark_3-1_2-12/README.md', '#3113'] - ['sdk/cosmos/azure-cosmos-spark_3-2_2-12/README.md', '#3113'] - ['sdk/cosmos/azure-cosmos-spark_3-3_2-12/README.md', '#3113'] - - ['sdk/cosmos/azure-cosmos-spark_3-4_2-12/README.md', '#3113'] + - ['sdk/cosmos/azure-cosmos-spark_3-4_2-12/README.md', '#3113'] - ['sdk/cosmos/azure-cosmos-spark_3_2-12/dev/README.md', '#3113'] - ['sdk/cosmos/azure-cosmos-spark_3_2-12/docs/catalog-api.md', '#3113'] - ['sdk/cosmos/azure-cosmos-spark_3_2-12/docs/configuration-reference.md', '#3113'] diff --git a/eng/code-quality-reports/src/main/resources/checkstyle/checkstyle-suppressions.xml b/eng/code-quality-reports/src/main/resources/checkstyle/checkstyle-suppressions.xml index b166f7a44d269..39bbdce718982 100755 --- a/eng/code-quality-reports/src/main/resources/checkstyle/checkstyle-suppressions.xml +++ b/eng/code-quality-reports/src/main/resources/checkstyle/checkstyle-suppressions.xml @@ -95,6 +95,7 @@ + + + + diff --git a/eng/versioning/version_client.txt b/eng/versioning/version_client.txt index c025e51f74f4e..7f651d4648c1c 100644 --- a/eng/versioning/version_client.txt +++ b/eng/versioning/version_client.txt @@ -192,6 +192,7 @@ com.azure:azure-storage-internal-avro;12.10.1;12.11.0-beta.1 com.azure:azure-storage-perf;1.0.0-beta.1;1.0.0-beta.1 com.azure:azure-storage-queue;12.20.1;12.21.0-beta.1 com.azure:azure-template-perf;1.0.0-beta.1;1.0.0-beta.1 +com.azure:azure-template-stress;1.0.0-beta.1;1.0.0-beta.1 com.azure:azure-verticals-agrifood-farming;1.0.0-beta.3;1.0.0-beta.4 com.azure:azure-xml;1.0.0-beta.2;1.0.0-beta.3 com.azure:perf-test-core;1.0.0-beta.1;1.0.0-beta.1 diff --git a/sdk/template/azure-template-stress/.gitignore b/sdk/template/azure-template-stress/.gitignore new file mode 100644 index 0000000000000..56647ec6534b8 --- /dev/null +++ b/sdk/template/azure-template-stress/.gitignore @@ -0,0 +1,3 @@ +**/stress-test-resources.json +Chart.lock +charts/ \ No newline at end of file diff --git 
a/sdk/template/azure-template-stress/.helmignore b/sdk/template/azure-template-stress/.helmignore new file mode 100644 index 0000000000000..a6f2989376b25 --- /dev/null +++ b/sdk/template/azure-template-stress/.helmignore @@ -0,0 +1,4 @@ +target/ +src/ +README.md +CHANGELOG.md diff --git a/sdk/template/azure-template-stress/CHANGELOG.md b/sdk/template/azure-template-stress/CHANGELOG.md new file mode 100644 index 0000000000000..4144f75694a03 --- /dev/null +++ b/sdk/template/azure-template-stress/CHANGELOG.md @@ -0,0 +1,3 @@ +# Release History + +## 1.0.0-beta.1 (Unreleased) diff --git a/sdk/template/azure-template-stress/Chart.yaml b/sdk/template/azure-template-stress/Chart.yaml new file mode 100644 index 0000000000000..633e991e4ac29 --- /dev/null +++ b/sdk/template/azure-template-stress/Chart.yaml @@ -0,0 +1,13 @@ +apiVersion: v2 +name: java-template +description: An example stress test chart for performing azure resource deployments +version: 0.1.1 +appVersion: v0.1 +annotations: + stressTest: 'false' # change it to true. This enables auto-discovery of this test via `find-all-stress-packages.ps1` + namespace: 'java-template' + +dependencies: +- name: stress-test-addons + version: ~0.3.0 + repository: "@stress-test-charts" diff --git a/sdk/template/azure-template-stress/README.md b/sdk/template/azure-template-stress/README.md new file mode 100644 index 0000000000000..d277df42f047a --- /dev/null +++ b/sdk/template/azure-template-stress/README.md @@ -0,0 +1,257 @@ +# Stress tests for Azure client library for Java + +This package contains template project for stress tests and recommendations on how to create them for your library. + +## Getting started + +Check out [Azure SDK Stress Test Wiki][azure_sdk_stress_test] for general information about stress tests. + +### Prerequisites + +- [Java Development Kit (JDK)][jdk_link], version 8 or later. 
+- [Maven][maven] +- [Docker][docker] +- [Kubectl][kubectl] +- [Helm][helm] +- [Azure CLI][azure_cli] +- [Powershell 7.0+][powershell] + +### Deploy Stress Test + +cd into `azure-sdk-for-java` root folder and run command to deploy the package to cluster: + +```shell +./eng/common/scripts/stress-testing/deploy-stress-tests.ps1 -SearchDirectory ./sdk/ +``` + +### Check Status + +Only the most frequently used commands are listed below. See [Deploying A Stress Test][deploy_stress_test] for more details. + +List deployed packages: + +```shell +helm list -n +``` + +the namespace usually matches your username. + +Get stress test pods and status: + +```shell +kubectl get pods -n +``` + +To get readable metadata for pods and/or containers use + +```shell +kubectl describe pod -n -c +``` + +Get stress test pod logs: + +```shell +kubectl logs -n +# Note that we may define multiple containers (for example, `fault-injector` and `main`) +kubectl logs -n -c +``` + +If stress test pod is in `Error` status, check logs from containers: + +```shell +kubectl logs -n +``` + +You may also get logs for specific containers: + +```shell +kubectl logs -n -c +``` + +Stop and remove deployed package: + +```shell +helm uninstall -n +``` + +### Other useful commands + +Execute commands in the container: + +```shell +kubectl exec --stdin --tty -n -c -- /bin/bash +```` + +### Share data from within the container + +Stress containers run with `$DEBUG_SHARE` environment variable set to the location of the shared folder. You can put anything you want to share there and access it - check out https://aka.ms/azsdk/stress/fileshare. + +## Key concepts + +### Project Structure + +See [Layout][stress_test_layout] section for details. + +Below is the current structure of project: +``` +. +├── src/ # Test code +├── templates/ # A directory of helm templates that will generate Kubernetes manifest files. +├── workbooks/ # A directory of Azure Monitor workbooks for analyzing stress test results. 
+├── Chart.yaml # A YAML file containing information about the helm chart and its dependencies +├── scenarios-matrix.yaml # A YAML file containing configuration and custom values for stress test(s) +├── Dockerfile # A Dockerfile for building the stress test image +├── stress-test-resources.bicep # An Azure Bicep for deploying stress test azure resources +├── pom.xml +└── README.md +``` + +### How to create your own tests + +Start with [Azure SDK stress Wiki](https://aka.ms/azsdk/stress) to learn about stress tests. + +1. Copy `src/main/java/com/azure/sdk/template/azure-template-stress` folder to your service folder. +2. Update the code + - Update `pom.xml` to change artifact name and add dependencies on your service. + - Implement your first stress test instead of `HttpGet` and make sure to update `StressOptions` to include important parameters for your tests. + +Now you can run stress tests locally. Remaining steps are required to run tests on a stress cluster. + +3. Update `dockerfiles` to build your service artifacts and any dependencies of current version. +4. Describe Azure resources necessary for your tests in `stress-test-resources.bicep` +5. Update `Chart.yaml`: + - change chart `name` to include your service name. Please keep `java-` prefix. + - change `annotations.stressTest` to `true` to enable auto-discovery +6. Update `templates/job.yaml` + - remove `server` container as you probably don't need it + - replace occurrences of `java-template` to match name in the `Chart.yaml` + - update test parameters for `test` container, feel free to rename the container as you see fit +7. Define scenarios and parameters in `scenarios-matrix.yaml` + +Now you're ready to run tests with `./eng/common/scripts/stress-testing/deploy-stress-tests.ps1 -SearchDirectory ./sdk/`. +See [Deploying A Stress Test][deploy_stress_test] for more details. + +Let's see how we can check test results. 
+ +### Checking test results + +#### Stress Test Dashboard + +General-purpose stress test dashboard is available at https://aka.ms/azsdk/stress/dashboard. It shows: +- Pod status events +- CPU and memory utilization of the stress test pods +- Container logs and events + +Stress test dashboard does not know about local stress test runs. + +#### Application Insights + +Stress test template comes with OpenTelemetry and rich monitoring experience including: +- resource utilization metrics (CPU, memory, GC, threads, etc.) +- live metrics, performance overview, etc +- distributed tracing and dependency calls (HTTP, Azure SDK calls) +- exceptions and logs +- profiling in production + +The telemetry is sent to Application Insights where it's useful to: +- monitor and compare throughput and latency across runs +- investigate issues and find bottlenecks + +Application Insights is available for local runs (as long as you provide `APPLICATIONINSIGHTS_CONNECTION_STRING` environment variable). + +You may choose to use [ApplicationInsights Java agent](https://learn.microsoft.com/azure/azure-monitor/app/opentelemetry-enable?tabs=java#install-the-client-library) if +your test throughput (and amount of telemetry it generates) is relatively low. +Since the agent does a lot of things, it might create some noise during performance analysis and micro-optimizations. + +### Logging + +We use [logback.xml][logback_xml] to configure the logging. By default, the stress test run on cluster will output +`WARN` level log which you may adjust based on your needs. +You may also control the verbosity of logs that go to Application Insights - see [OpenTelemetry logback appender][opentelemetry-logback] for more details. + +Since logs are hard to query and are extremely verbose (in case of high-scale stress tests), we're relying on metrics and workbooks for test result analysis. + +See also [Logging in Azure SDK][logging-azure-sdk]. 
+ +### Metrics + +While some Azure SDKs provide custom metrics, we're going to collect generic test metrics and build queries/workbooks on top of them, +so it's important to reuse the same metric across different tests whenever possible. + +We need just one generic metric for basic analysis - the one that measures duration of one test execution (with additional dimensions). +It's implemented in `com.azure.sdk.template.stress.util.TelemetryHelper` and has the following semantics: +- name: `test.run.duration` - it is used in the stress workbook, so make sure to use the same name when applicable +- unit: seconds +- customDimensions: + - `error.type` - The low-cardinality type of error describing what happened (e.g. exception class name). + +The metric should measure exactly one test operation, so we'll be able to derive the key performance indicators from it such as: +- throughput (rate of operations per period of time) +- duration of one operation +- error rate (how frequently errors of different types occur) + +Each metric collected with OpenTelemetry (and exported to Application Insights) also has the following dimensions: +- `cloud_RoleName` - in case of stress tests, it matches value of `otel.service.name` property configured in `Chart.yaml` to `{{ .Release.Name }}-{{ .Stress.BaseName }}`. +- `cloud_RoleInstance` - in case of k8s it matches pod name and is auto-detected. + +When running multiple test containers, make sure to assign different role instances to them, for example use `{{ .Stress.BaseName }}-consumer` and `{{ .Stress.BaseName }}-producer`. +This would allow you to distinguish telemetry coming from different containers. + +You would need to adjust the workbook to accommodate those changes. 
+ +In addition to `test.run.duration`, we're also collecting: +- [JVM metrics](/~https://github.com/open-telemetry/opentelemetry-java-instrumentation/blob/main/instrumentation/runtime-telemetry/runtime-telemetry-java8/library/README.md) measured by OpenTelemetry: + - CPU and memory usage + - GC stats + - Thread count + - Class stats + - See [JVM metrics semantic conventions for the details](/~https://github.com/open-telemetry/semantic-conventions/blob/main/docs/runtime/jvm-metrics.md) + +You can also enable [reactor schedulers metrics](/~https://github.com/reactor/reactor-core/blob/main/docs/asciidoc/metrics.adoc) collection by installing `micrometer-core` and +[OpenTelemetry micrometer bridge](/~https://github.com/open-telemetry/opentelemetry-java-instrumentation/tree/main/instrumentation/micrometer/micrometer-1.5/library). + +### Stress test workbook + +[Stress test workbook](https://ms.portal.azure.com/#@microsoft.onmicrosoft.com/resource/subscriptions/faa080af-c1d8-40ad-9cce-e1a450ca5b57/resourceGroups/rg-stress-cluster-pg/providers/Microsoft.Insights/components/stress-pg-ai-s7b6dif73rup6/workbooks) +shows a summary of a test run. + +First, select a time range and run from the list, then check the report: +- `Test summary` contains key test parameters and key counters (total number of operations, errors, etc.) +- Test operation success rate, latency and error rate +- CPU and memory utilization, number of threads and time spent in GC +- Warnings, errors, and exceptions in logs. Note logs and traces are sampled (at a 1% rate), so you won't see every error there + +Since you're changing the chart name, you would need to update the workbook to use `java-your-service-name` instead of `java-template`. +Then you'd need to create a new workbook for your service, follow +[Azure Monitor workbook documentation](https://learn.microsoft.com/azure/azure-monitor/visualize/workbooks-create-workbook) for more details. +Then you can import json file from `workbooks` folder. 
+ +## Writing useful tests + +Stress tests are intended to detect reliability and resiliency issues: +- bugs in retry policy +- graceful degradation under high load and transient failures +- memory and connection leaks, thread pool starvation, etc + +To explore fault injection options, check out [Chaos mesh](/~https://github.com/Azure/azure-sdk-tools/blob/main/tools/stress-cluster/chaos/README.md#chaos-manifest) and [Http Fault injector](/~https://github.com/Azure/azure-sdk-tools/tree/main/tools/http-fault-injector). + +> Note: [Azure Chaos Studio](https://azure.microsoft.com/products/chaos-studio) is not currently supported by the stress test infra. + +Even without fault injection, by applying maximum load to the service, we can detect memory leaks, extensive allocations, +thread pool issues, or other performance issues in the code. So make sure to configure resource limits and apply the maximum load you can get under them. + + +[azure_sdk_stress_test]: https://aka.ms/azsdk/stress +[jdk_link]: https://docs.microsoft.com/java/azure/jdk/?view=azure-java-stable +[maven]: https://maven.apache.org/ +[docker]: https://docs.docker.com/get-docker/ +[kubectl]: https://kubernetes.io/docs/tasks/tools/#kubectl +[helm]: https://helm.sh/docs/intro/install/ +[azure_cli]: https://docs.microsoft.com/cli/azure/install-azure-cli +[powershell]: https://docs.microsoft.com/powershell/scripting/install/installing-powershell?view=powershell-7 +[enable_application_insights]: https://learn.microsoft.com/en-us/azure/azure-monitor/app/opentelemetry-enable?tabs=java#enable-azure-monitor-application-insights +[logback_xml]: /~https://github.com/Azure/azure-sdk-for-java/blob/main/sdk/servicebus/azure-messaging-servicebus-stress/src/main/resources/logback.xml +[deploy_stress_test]: /~https://github.com/Azure/azure-sdk-tools/blob/main/tools/stress-cluster/chaos/README.md#deploying-a-stress-test +[stress_test_layout]: 
/~https://github.com/Azure/azure-sdk-tools/blob/main/tools/stress-cluster/chaos/README.md#layout +[opentelemetry-logback]: /~https://github.com/open-telemetry/opentelemetry-java-instrumentation/tree/main/instrumentation/logback/logback-appender-1.0/library +[logging-azure-sdk]: /~https://github.com/Azure/azure-sdk-for-java/wiki/Logging-in-Azure-SDK diff --git a/sdk/template/azure-template-stress/dockerfiles/java11 b/sdk/template/azure-template-stress/dockerfiles/java11 new file mode 100644 index 0000000000000..f9dad4d3e89c8 --- /dev/null +++ b/sdk/template/azure-template-stress/dockerfiles/java11 @@ -0,0 +1,41 @@ +ARG REGISTRY="azsdkengsys.azurecr.io" +ARG JRE_VERSION="11" +FROM ${REGISTRY}/java/jdk-mariner-mvn:jdk11-latest as builder + +# Do not remove this line. Update ensures container images do not get flagged for out of date and vulnerable distro packages. +RUN yum -y update + +# Add necessary files to the image +RUN mkdir /stress +WORKDIR /stress +ADD ./sdk/tools /stress/sdk/tools +ADD ./eng /stress/eng +ADD ./common /stress/common +ADD ./sdk/parents /stress/sdk/parents +ADD ./sdk/template /stress/sdk/template + +ARG SKIP_CHECKS="-Dcheckstyle.skip -Dgpg.skip -Dmaven.javadoc.skip -Drevapi.skip -Dspotbugs.skip -Djacoco.skip -DskipTests -Dcodesnippet.skip" + +# Build dependencies and stress tests +RUN --mount=type=cache,target=/root/.m2 \ +mvn -f /stress/eng/code-quality-reports/pom.xml clean install ${SKIP_CHECKS} && \ +mvn -f /stress/common/perf-test-core/pom.xml clean install ${SKIP_CHECKS} && \ +mvn -f /stress/sdk/parents/azure-perf-test-parent/pom.xml clean install ${SKIP_CHECKS} && \ +mvn -f /stress/sdk/tools/pom.xml clean install -Dcheckstyle.skip ${SKIP_CHECKS} && \ +mvn -f /stress/sdk/template/azure-template-stress/pom.xml clean install ${SKIP_CHECKS} + +FROM mcr.microsoft.com/openjdk/jdk:${JRE_VERSION}-mariner + +# Do not remove this line. Update ensures container images do not get flagged for out of date and vulnerable distro packages. 
+RUN yum -y update + +# Copy target files from builder image +WORKDIR /app +COPY --from=builder /stress/sdk/template/azure-template-stress/target . + +# Import test server self-signed certificate +COPY --from=builder /stress/sdk/template/azure-template-stress/src/main/resources/simplehttpserver.crt ./simplehttpserver.crt +RUN keytool -import -alias test -file ./simplehttpserver.crt -keystore ${JAVA_HOME}/lib/security/cacerts -noprompt -keypass changeit -storepass changeit + +# This is never executed (since job yaml overrides it) +ENTRYPOINT ["bash"] diff --git a/sdk/template/azure-template-stress/dockerfiles/java21 b/sdk/template/azure-template-stress/dockerfiles/java21 new file mode 100644 index 0000000000000..1f39610363127 --- /dev/null +++ b/sdk/template/azure-template-stress/dockerfiles/java21 @@ -0,0 +1,41 @@ +ARG REGISTRY="azsdkengsys.azurecr.io" +ARG JRE_VERSION="21" +FROM ${REGISTRY}/java/jdk-mariner-mvn:jdk11-latest as builder + +# Do not remove this line. Update ensures container images do not get flagged for out of date and vulnerable distro packages. 
+RUN yum -y update + +# Add necessary files to the image +RUN mkdir /stress +WORKDIR /stress +ADD ./sdk/tools /stress/sdk/tools +ADD ./eng /stress/eng +ADD ./common /stress/common +ADD ./sdk/parents /stress/sdk/parents +ADD ./sdk/template /stress/sdk/template + +ARG SKIP_CHECKS="-Dcheckstyle.skip -Dgpg.skip -Dmaven.javadoc.skip -Drevapi.skip -Dspotbugs.skip -Djacoco.skip -DskipTests -Dcodesnippet.skip" + +# Build dependencies and stress tests +RUN --mount=type=cache,target=/root/.m2 \ +mvn -f /stress/eng/code-quality-reports/pom.xml clean install ${SKIP_CHECKS} && \ +mvn -f /stress/common/perf-test-core/pom.xml clean install ${SKIP_CHECKS} && \ +mvn -f /stress/sdk/parents/azure-perf-test-parent/pom.xml clean install ${SKIP_CHECKS} && \ +mvn -f /stress/sdk/tools/pom.xml clean install -Dcheckstyle.skip ${SKIP_CHECKS} && \ +mvn -f /stress/sdk/template/azure-template-stress/pom.xml clean install ${SKIP_CHECKS} + +FROM mcr.microsoft.com/openjdk/jdk:${JRE_VERSION}-mariner + +# Do not remove this line. Update ensures container images do not get flagged for out of date and vulnerable distro packages. +RUN yum -y update + +# Copy target files from builder image +WORKDIR /app +COPY --from=builder /stress/sdk/template/azure-template-stress/target . 
+ +# Import test server self-signed certificate +COPY --from=builder /stress/sdk/template/azure-template-stress/src/main/resources/simplehttpserver.crt ./simplehttpserver.crt +RUN keytool -import -alias test -file ./simplehttpserver.crt -keystore ${JAVA_HOME}/lib/security/cacerts -noprompt -keypass changeit -storepass changeit + +# This is never executed (since job yaml overrides it) +ENTRYPOINT ["bash"] diff --git a/sdk/template/azure-template-stress/pom.xml b/sdk/template/azure-template-stress/pom.xml new file mode 100644 index 0000000000000..e5fb4646537eb --- /dev/null +++ b/sdk/template/azure-template-stress/pom.xml @@ -0,0 +1,109 @@ + + + + + 4.0.0 + + + com.azure + azure-client-sdk-parent + 1.7.0 + ../../parents/azure-client-sdk-parent + + + com.azure + azure-template-stress + 1.0.0-beta.1 + jar + + + 1.8 + 1.8 + all,-missing + + + + + com.azure + perf-test-core + 1.0.0-beta.1 + + + + com.azure + azure-monitor-opentelemetry-exporter + 1.0.0-beta.15 + + + io.opentelemetry.instrumentation + opentelemetry-runtime-telemetry-java8 + 1.32.0-alpha + + + io.opentelemetry.instrumentation + opentelemetry-logback-appender-1.0 + 1.32.0-alpha + + + ch.qos.logback + logback-classic + 1.3.12 + + + + + + + org.apache.maven.plugins + maven-enforcer-plugin + 3.0.0-M3 + + + + + io.opentelemetry.instrumentation:opentelemetry-runtime-telemetry-java8:[1.32.0-alpha] + io.opentelemetry.instrumentation:opentelemetry-logback-appender-1.0:[1.32.0-alpha] + ch.qos.logback:logback-classic:[1.3.12] + + + + + + + + org.apache.maven.plugins + maven-shade-plugin + 3.3.0 + + + package + + shade + + + + + com.azure.sdk.template.stress.App + + + + ${project.artifactId}-${project.version}-jar-with-dependencies + + + *:* + + META-INF/maven/** + META-INF/*.SF + META-INF/*.DSA + META-INF/*.RSA + + + + + + + + + + diff --git a/sdk/template/azure-template-stress/scenarios-matrix.yaml b/sdk/template/azure-template-stress/scenarios-matrix.yaml new file mode 100644 index 0000000000000..cf0493da3c987 --- 
/dev/null +++ b/sdk/template/azure-template-stress/scenarios-matrix.yaml @@ -0,0 +1,61 @@ +matrix: + image: + - dockerfiles/java11 + - dockerfiles/java21 + scenarios: + netty-sync-get: + imageBuildDir: ..\..\..\ + testDurationMin: 15 + testScenario: httpget + concurrency: 75 + sync: true + httpClient: netty + netty-async-get: + imageBuildDir: ..\..\..\ + testDurationMin: 15 + testScenario: httpget + concurrency: 75 + sync: false + httpClient: netty + okhttp-sync-get: + imageBuildDir: ..\..\..\ + testDurationMin: 15 + testScenario: httpget + concurrency: 75 + sync: true + httpClient: okhttp + okhttp-async-get: + imageBuildDir: ..\..\..\ + testDurationMin: 15 + testScenario: httpget + concurrency: 75 + sync: false + httpClient: okhttp + jdk-sync-get: + imageBuildDir: ..\..\..\ + testDurationMin: 15 + testScenario: httpget + concurrency: 75 + sync: true + httpClient: jdk + jdk-async-get: + imageBuildDir: ..\..\..\ + testDurationMin: 15 + testScenario: httpget + concurrency: 75 + sync: false + httpClient: jdk + vertx-sync-get: + imageBuildDir: ..\..\..\ + testDurationMin: 15 + testScenario: httpget + concurrency: 75 + sync: true + httpClient: vertx + vertx-async-get: + imageBuildDir: ..\..\..\ + testDurationMin: 15 + testScenario: httpget + concurrency: 75 + sync: false + httpClient: vertx diff --git a/sdk/template/azure-template-stress/src/main/java/com/azure/sdk/template/stress/App.java b/sdk/template/azure-template-stress/src/main/java/com/azure/sdk/template/stress/App.java new file mode 100644 index 0000000000000..21f95476c4602 --- /dev/null +++ b/sdk/template/azure-template-stress/src/main/java/com/azure/sdk/template/stress/App.java @@ -0,0 +1,26 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+ +package com.azure.sdk.template.stress; + +import com.azure.perf.test.core.PerfStressProgram; +import com.azure.sdk.template.stress.util.TelemetryHelper; + +/** + * Stress test application + */ +public class App { + + /** + * Main method to invoke other stress tests. + * @param args the input arguments + */ + public static void main(String[] args) { + TelemetryHelper.init(); + + PerfStressProgram.run(new Class[]{ + HttpGet.class, + // add other stress tests here + }, args); + } +} diff --git a/sdk/template/azure-template-stress/src/main/java/com/azure/sdk/template/stress/HttpGet.java b/sdk/template/azure-template-stress/src/main/java/com/azure/sdk/template/stress/HttpGet.java new file mode 100644 index 0000000000000..68ebbc419331e --- /dev/null +++ b/sdk/template/azure-template-stress/src/main/java/com/azure/sdk/template/stress/HttpGet.java @@ -0,0 +1,88 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +package com.azure.sdk.template.stress; + +import com.azure.core.http.HttpMethod; +import com.azure.core.http.HttpPipeline; +import com.azure.core.http.HttpPipelineBuilder; +import com.azure.core.http.HttpRequest; +import com.azure.core.http.HttpResponse; +import com.azure.core.http.policy.HttpLogDetailLevel; +import com.azure.core.http.policy.HttpLogOptions; +import com.azure.core.http.policy.HttpLoggingPolicy; +import com.azure.core.http.policy.HttpPipelinePolicy; +import com.azure.core.http.policy.RetryPolicy; +import com.azure.core.util.Context; +import com.azure.core.util.logging.ClientLogger; +import com.azure.sdk.template.stress.util.TelemetryHelper; +import reactor.core.publisher.Mono; + +import java.net.MalformedURLException; +import java.net.URL; +import java.util.ArrayList; + +/** + * Performance test for simple HTTP GET against test server. 
+ */ +public class HttpGet extends ScenarioBase { + // there will be multiple instances of scenario + private static final TelemetryHelper TELEMETRY_HELPER = new TelemetryHelper(HttpGet.class); + private static final ClientLogger LOGGER = new ClientLogger(HttpGet.class); + private final HttpPipeline pipeline; + private final URL url; + + /** + * Creates an instance of performance test. + * @param options stress test options + */ + public HttpGet(StressOptions options) { + super(options, TELEMETRY_HELPER); + pipeline = getPipelineBuilder().build(); + try { + url = new URL(options.getServiceEndpoint()); + } catch (MalformedURLException ex) { + throw LOGGER.logThrowableAsError(new IllegalArgumentException("'url' must be a valid URL.", ex)); + } + } + + @Override + public void run() { + TELEMETRY_HELPER.instrumentRun(this::runInternal); + } + + private void runInternal() { + HttpRequest request = new HttpRequest(HttpMethod.GET, url); + // no need to handle exceptions here, they will be handled (and recorded) by the telemetry helper + try (HttpResponse response = pipeline.sendSync(request, Context.NONE)) { + response.buffer().close(); + } + } + + @Override + public Mono runAsync() { + return TELEMETRY_HELPER.instrumentRunAsync(runInternalAsync()); + } + + private Mono runInternalAsync() { + HttpRequest request = new HttpRequest(HttpMethod.GET, url); + // no need to handle exceptions here, they will be handled (and recorded) by the telemetry helper + return Mono.usingWhen(pipeline.send(request), + response -> response.getBody().then(), + response -> Mono.fromRunnable(response::close)); + } + + private HttpPipelineBuilder getPipelineBuilder() { + HttpLogOptions logOptions = new HttpLogOptions() + .setLogLevel(HttpLogDetailLevel.HEADERS); + + ArrayList policies = new ArrayList<>(); + + policies.add(new RetryPolicy()); + policies.add(new HttpLoggingPolicy(logOptions)); + + return new HttpPipelineBuilder() + .httpClient(super.httpClient) + .policies(policies.toArray(new 
HttpPipelinePolicy[0])); + } +} diff --git a/sdk/template/azure-template-stress/src/main/java/com/azure/sdk/template/stress/ScenarioBase.java b/sdk/template/azure-template-stress/src/main/java/com/azure/sdk/template/stress/ScenarioBase.java new file mode 100644 index 0000000000000..501592ab43ddb --- /dev/null +++ b/sdk/template/azure-template-stress/src/main/java/com/azure/sdk/template/stress/ScenarioBase.java @@ -0,0 +1,40 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +package com.azure.sdk.template.stress; + +import com.azure.perf.test.core.PerfStressTest; +import com.azure.sdk.template.stress.util.TelemetryHelper; +import reactor.core.publisher.Mono; + +import java.time.Instant; + +/** + * Performance test for getting messages. + */ +public abstract class ScenarioBase extends PerfStressTest { + private final TelemetryHelper telemetryHelper; + private final Instant startTime = Instant.now(); + /** + * Creates a stress test. + * + * @param options Performance test configuration options. + * @param telemetryHelper Telemetry helper to monitor test execution and record stats. + */ + public ScenarioBase(TOptions options, TelemetryHelper telemetryHelper) { + super(options); + this.telemetryHelper = telemetryHelper; + } + + @Override + public Mono globalSetupAsync() { + telemetryHelper.recordStart(options); + return super.globalSetupAsync(); + } + + @Override + public Mono globalCleanupAsync() { + telemetryHelper.recordEnd(startTime); + return super.globalCleanupAsync(); + } +} diff --git a/sdk/template/azure-template-stress/src/main/java/com/azure/sdk/template/stress/StressOptions.java b/sdk/template/azure-template-stress/src/main/java/com/azure/sdk/template/stress/StressOptions.java new file mode 100644 index 0000000000000..25582969383fb --- /dev/null +++ b/sdk/template/azure-template-stress/src/main/java/com/azure/sdk/template/stress/StressOptions.java @@ -0,0 +1,25 @@ +// Copyright (c) Microsoft Corporation. 
All rights reserved. +// Licensed under the MIT License. + +package com.azure.sdk.template.stress; + +import com.azure.perf.test.core.PerfStressOptions; +import com.beust.jcommander.Parameter; + +/** + * Options to be used by your stress tests. +*/ +public class StressOptions extends PerfStressOptions { + @Parameter(names = { "--endpoint" }, description = "Service endpoint") + private String serviceEndpoint; + + /** + * Gets the service endpoint. + * @return the service endpoint. + */ + public String getServiceEndpoint() { + return serviceEndpoint; + } + + // When adding new test parameters, consider adding them to TelemetryHelper.recordStart() +} diff --git a/sdk/template/azure-template-stress/src/main/java/com/azure/sdk/template/stress/package-info.java b/sdk/template/azure-template-stress/src/main/java/com/azure/sdk/template/stress/package-info.java new file mode 100644 index 0000000000000..a613a3f8da07e --- /dev/null +++ b/sdk/template/azure-template-stress/src/main/java/com/azure/sdk/template/stress/package-info.java @@ -0,0 +1,7 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +/** + * Contains classes for stress tests. + */ +package com.azure.sdk.template.stress; diff --git a/sdk/template/azure-template-stress/src/main/java/com/azure/sdk/template/stress/util/TelemetryHelper.java b/sdk/template/azure-template-stress/src/main/java/com/azure/sdk/template/stress/util/TelemetryHelper.java new file mode 100644 index 0000000000000..4967c6f36101b --- /dev/null +++ b/sdk/template/azure-template-stress/src/main/java/com/azure/sdk/template/stress/util/TelemetryHelper.java @@ -0,0 +1,246 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+ +package com.azure.sdk.template.stress.util; + +import com.azure.core.http.HttpClientProvider; +import com.azure.core.util.logging.ClientLogger; +import com.azure.monitor.opentelemetry.exporter.AzureMonitorExporterBuilder; +import com.azure.sdk.template.stress.StressOptions; +import io.opentelemetry.api.GlobalOpenTelemetry; +import io.opentelemetry.api.OpenTelemetry; +import io.opentelemetry.api.common.AttributeKey; +import io.opentelemetry.api.common.Attributes; +import io.opentelemetry.api.metrics.DoubleHistogram; +import io.opentelemetry.api.metrics.Meter; +import io.opentelemetry.api.trace.Span; +import io.opentelemetry.api.trace.SpanKind; +import io.opentelemetry.api.trace.StatusCode; +import io.opentelemetry.api.trace.Tracer; +import io.opentelemetry.context.Context; +import io.opentelemetry.context.Scope; +import io.opentelemetry.instrumentation.logback.appender.v1_0.OpenTelemetryAppender; +import io.opentelemetry.instrumentation.runtimemetrics.java8.Classes; +import io.opentelemetry.instrumentation.runtimemetrics.java8.Cpu; +import io.opentelemetry.instrumentation.runtimemetrics.java8.GarbageCollector; +import io.opentelemetry.instrumentation.runtimemetrics.java8.MemoryPools; +import io.opentelemetry.instrumentation.runtimemetrics.java8.Threads; +import io.opentelemetry.sdk.autoconfigure.AutoConfiguredOpenTelemetrySdk; +import io.opentelemetry.sdk.autoconfigure.AutoConfiguredOpenTelemetrySdkBuilder; +import io.opentelemetry.sdk.trace.data.LinkData; +import io.opentelemetry.sdk.trace.samplers.Sampler; +import io.opentelemetry.sdk.trace.samplers.SamplingResult; +import reactor.core.Exceptions; +import reactor.core.publisher.Mono; +import reactor.core.scheduler.Schedulers; + +import java.time.Instant; +import java.util.List; +import java.util.concurrent.TimeoutException; + +/** + * Telemetry helper is used to monitor test execution and record stats. 
+ */ +public class TelemetryHelper { + private final Tracer tracer; + private final ClientLogger logger; + private static final AttributeKey SCENARIO_NAME_ATTRIBUTE = AttributeKey.stringKey("scenario_name"); + private static final AttributeKey ERROR_TYPE_ATTRIBUTE = AttributeKey.stringKey("error.type"); + private static final AttributeKey SAMPLE_IN_ATTRIBUTE = AttributeKey.booleanKey("sample.in"); + private final String scenarioName; + private final Meter meter; + private final DoubleHistogram runDuration; + private final Attributes commonAttributes; + private final Attributes canceledAttributes; + + static { + // enables micrometer metrics from Reactor schedulers allowing to monitor thread pool usage and starvation + Schedulers.enableMetrics(); + } + + /** + * Creates an instance of telemetry helper. + * @param scenarioClass the scenario class + */ + public TelemetryHelper(Class scenarioClass) { + this.scenarioName = scenarioClass.getName(); + this.tracer = GlobalOpenTelemetry.getTracer(scenarioName); + this.meter = GlobalOpenTelemetry.getMeter(scenarioName); + this.logger = new ClientLogger(scenarioName); + this.runDuration = meter.histogramBuilder("test.run.duration") + .setUnit("s") + .build(); + this.commonAttributes = Attributes.of(SCENARIO_NAME_ATTRIBUTE, scenarioName); + this.canceledAttributes = Attributes.of(SCENARIO_NAME_ATTRIBUTE, scenarioName, ERROR_TYPE_ATTRIBUTE, "cancelled"); + } + + /** + * Initializes telemetry helper: sets up Azure Monitor exporter, enables JVM metrics collection. 
+ */ + public static void init() { + AutoConfiguredOpenTelemetrySdkBuilder sdkBuilder = AutoConfiguredOpenTelemetrySdk.builder(); + String applicationInsightsConnectionString = System.getenv("APPLICATIONINSIGHTS_CONNECTION_STRING"); + if (applicationInsightsConnectionString != null) { + new AzureMonitorExporterBuilder() + .connectionString(applicationInsightsConnectionString) + .install(sdkBuilder); + } + + OpenTelemetry otel = sdkBuilder + // in case of multi-container test, customize instance id to distinguish telemetry from different containers + //.addResourceCustomizer((resource, props) -> resource.toBuilder().put(AttributeKey.stringKey("service.instance.id"), "container-name-1").build()) + .addSamplerCustomizer((sampler, props) -> new Sampler() { + @Override + public SamplingResult shouldSample(Context parentContext, String traceId, String name, SpanKind spanKind, Attributes attributes, List parentLinks) { + if (Boolean.TRUE.equals(attributes.get(SAMPLE_IN_ATTRIBUTE))) { + return SamplingResult.recordAndSample(); + } + return sampler.shouldSample(parentContext, traceId, name, spanKind, attributes, parentLinks); + } + + @Override + public String getDescription() { + return sampler.getDescription(); + } + }) + .setResultAsGlobal() + .build() + .getOpenTelemetrySdk(); + Classes.registerObservers(otel); + Cpu.registerObservers(otel); + MemoryPools.registerObservers(otel); + Threads.registerObservers(otel); + GarbageCollector.registerObservers(otel); + OpenTelemetryAppender.install(otel); + } + + /** + * Instruments a runnable: records runnable duration along with the status (success, error, cancellation), + * @param oneRun the runnable to instrument + */ + @SuppressWarnings("try") + public void instrumentRun(Runnable oneRun) { + Instant start = Instant.now(); + Span span = tracer.spanBuilder("run").startSpan(); + try (Scope s = span.makeCurrent()) { + oneRun.run(); + trackSuccess(start, span); + } catch (Throwable e) { + if (e.getMessage().contains("Timeout on 
blocking read") || e instanceof InterruptedException || e instanceof TimeoutException) { + trackCancellation(start, span); + } else { + trackFailure(start, e, span); + } + } + } + + /** + * Instruments a Mono: records mono duration along with the status (success, error, cancellation), + * @param runAsync the mono to instrument + * @return the instrumented mono + */ + @SuppressWarnings("try") + public Mono instrumentRunAsync(Mono runAsync) { + return Mono.defer(() -> { + Instant start = Instant.now(); + Span span = tracer.spanBuilder("runAsync").startSpan(); + try (Scope s = span.makeCurrent()) { + return runAsync.doOnError(e -> trackFailure(start, e, span)) + .doOnCancel(() -> trackCancellation(start, span)) + .doOnSuccess(v -> trackSuccess(start, span)) + .contextWrite(reactor.util.context.Context.of(com.azure.core.util.tracing.Tracer.PARENT_TRACE_CONTEXT_KEY, io.opentelemetry.context.Context.current())); + } + }); + } + + private void trackSuccess(Instant start, Span span) { + logger.atInfo() + .log("run ended"); + + runDuration.record(getDuration(start), commonAttributes); + span.end(); + } + + private void trackCancellation(Instant start, Span span) { + logger.atWarning() + .addKeyValue("error.type", "cancelled") + .log("run ended"); + + runDuration.record(getDuration(start), canceledAttributes); + span.setAttribute(ERROR_TYPE_ATTRIBUTE, "cancelled"); + span.setStatus(StatusCode.ERROR); + span.end(); + } + + private void trackFailure(Instant start, Throwable e, Span span) { + Throwable unwrapped = Exceptions.unwrap(e); + + span.recordException(unwrapped); + span.setStatus(StatusCode.ERROR, unwrapped.getMessage()); + + String errorType = unwrapped.getClass().getName(); + logger.atError() + .addKeyValue("error.type", errorType) + .log("run ended", unwrapped); + + Attributes attributes = Attributes.of(SCENARIO_NAME_ATTRIBUTE, scenarioName, ERROR_TYPE_ATTRIBUTE, errorType); + runDuration.record(getDuration(start), attributes); + span.end(); + } + + /** + * Records 
an event representing the start of a test along with test options. + * @param options test parameters + */ + public void recordStart(StressOptions options) { + String libraryPackageVersion = "unknown"; + try { + Class libraryPackage = Class.forName(HttpClientProvider.class.getName()); + libraryPackageVersion = libraryPackage.getPackage().getImplementationVersion(); + if (libraryPackageVersion == null) { + libraryPackageVersion = "null"; + } + } catch (ClassNotFoundException e) { + logger.atWarning() + .addKeyValue("class", HttpClientProvider.class.getName()) + .log("Could not determine azure-core version, HttpClientProvider class is not found", e); + } + + Span before = startSampledInSpan("before run"); + before.setAttribute(AttributeKey.longKey("durationSec"), options.getDuration()); + before.setAttribute(AttributeKey.stringKey("scenarioName"), scenarioName); + before.setAttribute(AttributeKey.longKey("concurrency"), options.getParallel()); + before.setAttribute(AttributeKey.stringKey("libraryPackageVersion"), libraryPackageVersion); + before.setAttribute(AttributeKey.booleanKey("sync"), options.isSync()); + before.setAttribute(AttributeKey.longKey("size"), options.getSize()); + before.setAttribute(AttributeKey.stringKey("hostname"), System.getenv().get("HOSTNAME")); + before.setAttribute(AttributeKey.stringKey("serviceEndpoint"), options.getServiceEndpoint()); + before.setAttribute(AttributeKey.stringKey("httpClientProvider"), options.getHttpClient().toString()); + before.setAttribute(AttributeKey.stringKey("jreVersion"), System.getProperty("java.version")); + before.setAttribute(AttributeKey.stringKey("jreVendor"), System.getProperty("java.vendor")); + before.end(); + } + + /** + * Records an event representing the end of the test. 
+ * @param startTime the start time of the test + */ + public void recordEnd(Instant startTime) { + Span after = startSampledInSpan("after run"); + after.setAttribute(AttributeKey.longKey("durationMs"), Instant.now().toEpochMilli() - startTime.toEpochMilli()); + after.end(); + } + + + private Span startSampledInSpan(String name) { + return tracer.spanBuilder(name) + // guarantee that we have before/after spans sampled in + // and record duration/result of the test + .setAttribute(SAMPLE_IN_ATTRIBUTE, true) + .startSpan(); + } + + private static double getDuration(Instant start) { + return Math.max(0d, Instant.now().toEpochMilli() - start.toEpochMilli()) / 1000d; + } +} diff --git a/sdk/template/azure-template-stress/src/main/java/com/azure/sdk/template/stress/util/package-info.java b/sdk/template/azure-template-stress/src/main/java/com/azure/sdk/template/stress/util/package-info.java new file mode 100644 index 0000000000000..4849c64846bfe --- /dev/null +++ b/sdk/template/azure-template-stress/src/main/java/com/azure/sdk/template/stress/util/package-info.java @@ -0,0 +1,7 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +/** + * Contains classes for stress tests utils. 
+ */ +package com.azure.sdk.template.stress.util; diff --git a/sdk/template/azure-template-stress/src/main/resources/logback.xml b/sdk/template/azure-template-stress/src/main/resources/logback.xml new file mode 100644 index 0000000000000..a4643c0ce304c --- /dev/null +++ b/sdk/template/azure-template-stress/src/main/resources/logback.xml @@ -0,0 +1,16 @@ + + + + + %d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n + + + + + + + + + + diff --git a/sdk/template/azure-template-stress/src/main/resources/simplehttpserver.crt b/sdk/template/azure-template-stress/src/main/resources/simplehttpserver.crt new file mode 100644 index 0000000000000..1fb68bb575ee3 --- /dev/null +++ b/sdk/template/azure-template-stress/src/main/resources/simplehttpserver.crt @@ -0,0 +1,19 @@ +-----BEGIN CERTIFICATE----- +MIIDDTCCAfWgAwIBAgIJAJUP82tfx9kZMA0GCSqGSIb3DQEBCwUAMBQxEjAQBgNV +BAMTCWxvY2FsaG9zdDAeFw0yNDAxMDQyMzExMzdaFw0yNTAxMDMyMzExMzdaMBQx +EjAQBgNVBAMTCWxvY2FsaG9zdDCCASIwDQYJKoZIhvcNAQEBBQADggEPADCCAQoC +ggEBAM4xzCbl8XG120Ns3zunVMjOeldEWarequhVaUMAC9Yx+6VpVLLpH8qwSFS8 +Cwj9ePtd5m+BulfPCZV0sfUgijfG53kov+O3ri7uFxR5mpO3JRlCITEnIJ+S0AJ5 +bbrPW285PgFQzzSIE7zT449A1mIv0aZOxsv+Tl2UHTZeCD7+fEMQBeMoxy5eE1Tl +Jejq0Anm2DJJsBG11pB2ehVhCec4N91LkSCVFywzuQT1A/QJPfNzHotdbSxYqbzy +F4laJJUxWRcgtedCSjYjt4//fkkq0sZOgCJfQvCV2loEf+6cGzYgkM3sdSICqdIj +uHZ0BJJBD6gcKfbQARLhlMjBYkkCAwEAAaNiMGAwDAYDVR0TAQH/BAIwADAOBgNV +HQ8BAf8EBAMCBaAwFgYDVR0lAQH/BAwwCgYIKwYBBQUHAwEwFwYDVR0RAQH/BA0w +C4IJbG9jYWxob3N0MA8GCisGAQQBgjdUAQEEAQIwDQYJKoZIhvcNAQELBQADggEB +AL3uWKII2qUpny9wxc43NYAEyjaMnSUrMoWs15bc94ikjMbWYOnCUtfpdspfM71P +Wsu4Xcb+BBxK0gzEq46nkC5g1712hgae/+PxKf4DmarB1YT7nWM9jVhYCyL+VhfQ +7B7QX7Qp0sXx6JjtbuJnKNRVjS4Rtn3O6fnF6EGlmxz7X3KJ/odQAmUHkUwuALom +f4qVRREJtDNrOzFVEo9mKZNv+S3duCco3gNLeDlFqT01Ph7P+qiqmEUN/6rUrB8A +1cvldYM829wP5izqgSPGnA6UjIh1BFnsThJoNit1IFVUhrmbwrujzjNj+N6SDxcM +NAsWNoChEk2kINynt0Pk2ww= +-----END CERTIFICATE----- \ No newline at end of file diff --git 
a/sdk/template/azure-template-stress/stress-test-resources.bicep b/sdk/template/azure-template-stress/stress-test-resources.bicep new file mode 100644 index 0000000000000..a9cb5bc8f989b --- /dev/null +++ b/sdk/template/azure-template-stress/stress-test-resources.bicep @@ -0,0 +1,2 @@ +@description('The base resource name.') +param baseName string = resourceGroup().name diff --git a/sdk/template/azure-template-stress/templates/job.yaml b/sdk/template/azure-template-stress/templates/job.yaml new file mode 100644 index 0000000000000..7399453b6bf30 --- /dev/null +++ b/sdk/template/azure-template-stress/templates/job.yaml @@ -0,0 +1,55 @@ +{{- include "stress-test-addons.deploy-job-template.from-pod" (list . "stress.java-template") -}} +{{- define "stress.java-template" -}} +metadata: + labels: + testName: "{{ .Release.Name }}" +spec: + containers: + # simple and fast .NET Core HTTP server to run tests against + # When writing real stress test, you probably won't need it and would use corresponding Azure Service. + - name: server + image: stresspgs7b6dif73rup6.azurecr.io/stress/simplehttpserver + imagePullPolicy: Always + command: ['sh', '-c'] + args: + - | + set -a && + export ASPNETCORE_URLS="http://localhost:8080;https://localhost:8081" && + export Test__DurationInSec={{ mul ( add .Stress.testDurationMin 1) 60 }} && + dotnet /app/dotnet_simple.dll + resources: + limits: + memory: "400Mi" + cpu: "2" + {{- include "stress-test-addons.container-env" . 
| nindent 6 }} + - name: test + image: {{ .Stress.imageTag }} + imagePullPolicy: Always + command: ['sh', '-c'] + args: + - | + set -a && + source $ENV_FILE && + java \ + -Dotel.service.name={{ .Release.Name }}-{{ .Stress.BaseName }} \ + -Dotel.traces.sampler=traceidratio \ + -Dotel.traces.sampler.arg=0.00001 \ + -XX:InitialRAMPercentage=75.0 \ + -XX:MaxRAMPercentage=75.0 \ + -Dreactor.schedulers.defaultBoundedElasticSize={{ max 20 .Stress.concurrency }} \ + -jar /app/azure-template-stress-1.0.0-beta.1-jar-with-dependencies.jar \ + {{ .Stress.testScenario }} \ + --parallel {{ .Stress.concurrency }} \ + --duration {{ mul .Stress.testDurationMin 60 }} \ + --endpoint https://localhost:8081 \ + {{ ternary "--sync" "" .Stress.sync }} \ + --http-client {{ default "netty" .Stress.httpClient }} \ + --warmup 0 + # add your test parameters here + resources: + # make sure to configure resource limits for your test + limits: + memory: "1Gi" + cpu: "1" + {{- include "stress-test-addons.container-env" . | nindent 6 }} +{{- end -}} diff --git a/sdk/template/azure-template-stress/workbooks/runDetails.json b/sdk/template/azure-template-stress/workbooks/runDetails.json new file mode 100644 index 0000000000000..25e2ae28f648f --- /dev/null +++ b/sdk/template/azure-template-stress/workbooks/runDetails.json @@ -0,0 +1,310 @@ +{ + "version": "Notebook/1.0", + "items": [ + { + "type": 1, + "content": { + "json": "## Template workbook for java stress tests.\n\nSelect the run from the following list." 
+ }, + "name": "text - 2" + }, + { + "type": 9, + "content": { + "version": "KqlParameterItem/1.0", + "parameters": [ + { + "id": "ab5bb927-f8a4-4491-8621-d300820d2ff3", + "version": "KqlParameterItem/1.0", + "name": "timeRange", + "label": "Time Range", + "type": 4, + "typeSettings": { + "selectableValues": [ + { + "durationMs": 900000 + }, + { + "durationMs": 1800000 + }, + { + "durationMs": 3600000 + }, + { + "durationMs": 14400000 + }, + { + "durationMs": 43200000 + }, + { + "durationMs": 86400000 + }, + { + "durationMs": 172800000 + }, + { + "durationMs": 259200000 + }, + { + "durationMs": 604800000 + } + ], + "allowCustom": true + }, + "value": { + "durationMs": 86400000 + } + }, + { + "id": "1b563bbe-70e0-48e6-ae33-d71d97ab8332", + "version": "KqlParameterItem/1.0", + "name": "runId", + "label": "Pod name", + "type": 2, + "isRequired": true, + "query": "dependencies\r\n| where timestamp >= {timeRange:start} and timestamp <= {timeRange:end}\r\n| where cloud_RoleName startswith \"java-template\" and name == \"before run\"\r\n| extend runId = tostring(split(cloud_RoleName, \"-\")[2])\r\n| summarize start=min(timestamp) by runId, pod = tostring(customDimensions[\"hostname\"])\r\n| order by start desc\r\n| project runId, pod", + "typeSettings": { + "additionalResourceOptions": [], + "showDefault": false + }, + "queryType": 0, + "resourceType": "microsoft.insights/components", + "value": null + } + ], + "style": "pills", + "queryType": 0, + "resourceType": "microsoft.insights/components" + }, + "customWidth": "30", + "name": "parameters - 2", + "styleSettings": { + "maxWidth": "30" + } + }, + { + "type": 3, + "content": { + "version": "KqlItem/1.0", + "query": "let runId = \"{runId}\";\r\nlet roleName = strcat(\"java-template-\", runId);\r\nlet metrics = customMetrics\r\n| where timestamp >= {timeRange:start} and timestamp <= {timeRange:end}\r\n| where cloud_RoleName startswith \"java-template\" and cloud_RoleName endswith \"{runId}\";\r\nlet testSpans = 
dependencies\r\n| where timestamp >= {timeRange:start} and timestamp <= {timeRange:end}\r\n| where cloud_RoleName startswith \"java-template\" and cloud_RoleName endswith \"{runId}\";\r\nlet errors = metrics\r\n| where name == \"test.run.duration\"\r\n| extend errorType = tostring(customDimensions[\"error.type\"])\r\n| summarize error_by_type=sum(valueCount) by errorType\r\n| summarize test_errors=make_bag(bag_pack(errorType, error_by_type))\r\n| evaluate narrow();\r\nlet runs = metrics \r\n| where name == \"test.run.duration\" \r\n| summarize successful_runs=sumif(valueCount, customDimensions[\"error.type\"] == \"\"), total_runs=sum(valueCount)\r\n| evaluate narrow();\r\nlet parameters = testSpans \r\n| where name == \"before run\"\r\n| project params_pod=customDimensions[\"hostname\"], params_scenarioName=customDimensions[\"scenarioName\"], params_durationSec=customDimensions[\"durationSec\"], params_concurrency=customDimensions[\"concurrency\"], params_sync=customDimensions[\"sync\"], params_httpClient=customDimensions[\"httpClientProvider\"], params_JRE=strcat(tostring(customDimensions[\"jreVendor\"]), \" \", tostring(customDimensions[\"jreVersion\"]))\r\n| evaluate narrow();\r\nlet actualDuration = metrics\r\n| where name == \"test.run.duration\"\r\n| summarize maxTs = max(timestamp), minTs = min(timestamp)\r\n| project actual_durationSec=(maxTs-minTs)/1s\r\n| evaluate narrow();\r\nlet avgThroughput = metrics \r\n| where name == \"test.run.duration\" \r\n| summarize throughputPerMin=sum(valueCount) by bin(timestamp, 1m) // in case AppInsights ingestion drops something\r\n| summarize avg_throughtputPerSec=avg(throughputPerMin/60)\r\n| evaluate narrow();\r\nparameters \r\n| union runs, errors, actualDuration, avgThroughput\r\n| project Property = Column, Value\r\n", + "size": 0, + "showAnalytics": true, + "title": "Test summary", + "noDataMessageStyle": 5, + "queryType": 0, + "resourceType": "microsoft.insights/components", + "gridSettings": { + "sortBy": [ + { 
+ "itemKey": "Property", + "sortOrder": 1 + } + ] + }, + "sortBy": [ + { + "itemKey": "Property", + "sortOrder": 1 + } + ] + }, + "customWidth": "30", + "name": "query - 9", + "styleSettings": { + "maxWidth": "30" + } + }, + { + "type": 3, + "content": { + "version": "KqlItem/1.0", + "query": "let runs = customMetrics \r\n| where timestamp >= {timeRange:start} and timestamp <= {timeRange:end}\r\n| where cloud_RoleName startswith \"java-template\" \r\n| extend runId = tostring(split(cloud_RoleName, \"-\")[2])\r\n| summarize start=min(timestamp), end=max(timestamp) by runId\r\n| project start, duration= end-start, runId;\r\nlet runSpans = dependencies\r\n| where timestamp >= {timeRange:start} and timestamp <= {timeRange:end}\r\n| where cloud_RoleName startswith \"java-template\" and name == \"before run\"\r\n| extend runId = tostring(split(cloud_RoleName, \"-\")[2])\r\n| distinct runId, pod=tostring(customDimensions[\"hostname\"]);\r\nruns \r\n| join kind = innerunique runSpans on runId\r\n| order by start desc\r\n| project-away runId1\r\n", + "size": 0, + "title": "Runs in {timeRange:label}", + "noDataMessageStyle": 5, + "queryType": 0, + "resourceType": "microsoft.insights/components", + "gridSettings": { + "sortBy": [ + { + "itemKey": "start", + "sortOrder": 2 + } + ] + }, + "sortBy": [ + { + "itemKey": "start", + "sortOrder": 2 + } + ] + }, + "customWidth": "40", + "name": "query - 8", + "styleSettings": { + "maxWidth": "40" + } + }, + { + "type": 3, + "content": { + "version": "KqlItem/1.0", + "query": "customMetrics\r\n| where timestamp >= {timeRange:start} and timestamp <= {timeRange:end}\r\n| where name == \"test.run.duration\" and cloud_RoleName startswith \"java-template\" and cloud_RoleName endswith \"{runId}\"\r\n| where customDimensions[\"error.type\"] == \"\"\r\n| summarize successful_runs=sum(valueCount) by bin(timestamp, 1m)\r\n| render timechart", + "size": 0, + "aggregation": 3, + "title": "Test run success rate (per minute)", + 
"noDataMessageStyle": 5, + "queryType": 0, + "resourceType": "microsoft.insights/components" + }, + "customWidth": "30", + "name": "query - 3", + "styleSettings": { + "maxWidth": "30", + "showBorder": true + } + }, + { + "type": 3, + "content": { + "version": "KqlItem/1.0", + "query": "customMetrics\r\n| where timestamp >= {timeRange:start} and timestamp <= {timeRange:end}\r\n| where name == \"test.run.duration\"\r\n| where cloud_RoleName startswith \"java-template\" and cloud_RoleName endswith \"{runId}\"\r\n| where customDimensions[\"error.type\"] == \"\"\r\n| summarize avg_duration = avg(valueSum/valueCount) * 1000 by bin(timestamp, 1m)\r\n| render timechart", + "size": 0, + "aggregation": 3, + "title": "Duration of successful operation (ms)", + "noDataMessageStyle": 5, + "queryType": 0, + "resourceType": "microsoft.insights/components" + }, + "customWidth": "30", + "name": "query - 5", + "styleSettings": { + "maxWidth": "30", + "showBorder": true + } + }, + { + "type": 3, + "content": { + "version": "KqlItem/1.0", + "query": "customMetrics\r\n| where timestamp >= {timeRange:start} and timestamp <= {timeRange:end}\r\n| where cloud_RoleName startswith \"java-template\" and cloud_RoleName endswith \"{runId}\"\r\n| where name == \"test.run.duration\"\r\n| extend status = tostring(customDimensions[\"error.type\"])\r\n| where status != \"\"\r\n| summarize test_errors = sum(valueCount) by status, bin(timestamp, 1m)\r\n| render linechart", + "size": 0, + "aggregation": 3, + "title": "Error rate (per minute)", + "queryType": 0, + "resourceType": "microsoft.insights/components" + }, + "customWidth": "30", + "name": "query - 3 - Copy", + "styleSettings": { + "maxWidth": "30", + "showBorder": true + } + }, + { + "type": 3, + "content": { + "version": "KqlItem/1.0", + "query": "customMetrics\r\n| where timestamp >= {timeRange:start} and timestamp <= {timeRange:end}\r\n| where cloud_RoleName startswith \"java-template\" and cloud_RoleName endswith \"{runId}\"\r\n| where 
name == \"process.runtime.jvm.memory.usage\" and customDimensions[\"type\"]==\"heap\"\r\n| summarize heap_memory_used=sum(valueSum/valueCount) by bin(timestamp, 1m)\r\n| render areachart", + "size": 0, + "aggregation": 3, + "title": "Heap memory used (MB)", + "queryType": 0, + "resourceType": "microsoft.insights/components" + }, + "customWidth": "30", + "name": "query - 8", + "styleSettings": { + "maxWidth": "30" + } + }, + { + "type": 3, + "content": { + "version": "KqlItem/1.0", + "query": "customMetrics\r\n| where timestamp >= {timeRange:start} and timestamp <= {timeRange:end}\r\n| where cloud_RoleName startswith \"java-template\" and cloud_RoleName endswith \"{runId}\"\r\n| where name == \"process.runtime.jvm.cpu.utilization\"\r\n| summarize cpu_time_percent=avg(value) * 100 by bin(timestamp, 1m)\r\n| render timechart\r\n", + "size": 0, + "aggregation": 3, + "title": "CPU %", + "queryType": 0, + "resourceType": "microsoft.insights/components" + }, + "customWidth": "30", + "name": "query - 9", + "styleSettings": { + "maxWidth": "30" + } + }, + { + "type": 3, + "content": { + "version": "KqlItem/1.0", + "query": "customMetrics\r\n| where timestamp >= {timeRange:start} and timestamp <= {timeRange:end}\r\n| where cloud_RoleName startswith \"java-template\" and cloud_RoleName endswith \"{runId}\"\r\n| where name == \"process.runtime.jvm.threads.count\"\r\n| summarize max_thread_count=max(valueMax) by bin(timestamp, 1m)\r\n| render timechart\r\n", + "size": 0, + "aggregation": 3, + "title": "Thread count", + "noDataMessageStyle": 5, + "queryType": 0, + "resourceType": "microsoft.insights/components" + }, + "customWidth": "15", + "name": "query - 11", + "styleSettings": { + "maxWidth": "15" + } + }, + { + "type": 3, + "content": { + "version": "KqlItem/1.0", + "query": "customMetrics\r\n| where timestamp >= {timeRange:start} and timestamp <= {timeRange:end}\r\n| where cloud_RoleName startswith \"java-template\" and cloud_RoleName endswith \"{runId}\"\r\n| where name 
== \"process.runtime.jvm.gc.duration\" \r\n| extend gc_type=tostring(customDimensions[\"gc\"])\r\n| summarize gc_percentage=sum(valueSum) / 60 * 100 by gc_type, bin(timestamp, 1m)\r\n| render timechart\r\n", + "size": 0, + "aggregation": 3, + "title": "% of time spent in GC", + "queryType": 0, + "resourceType": "microsoft.insights/components" + }, + "customWidth": "15", + "name": "query - 11", + "styleSettings": { + "maxWidth": "15" + } + }, + { + "type": 3, + "content": { + "version": "KqlItem/1.0", + "query": "traces \r\n| union exceptions\r\n| where timestamp >= {timeRange:start} and timestamp <= {timeRange:end}\r\n| where cloud_RoleName startswith \"java-template\" and cloud_RoleName endswith \"{runId}\" and severityLevel > 1\r\n| extend category = tostring(customDimensions[\"LoggerName\"])\r\n| extend logOrExceptionMessage = coalesce(message, outerMessage) \r\n| extend message = case(logOrExceptionMessage startswith \"{\\\"az.sdk.message\", azSdkContext=parse_json(logOrExceptionMessage)[\"az.sdk.message\"], substring(logOrExceptionMessage, 0, 48))\r\n| project timestamp, category, message, severity = case(severityLevel == 2, \"Warning\", severityLevel == 3, \"Error\", severityLevel == 1, \"Info\", \"\")\r\n| summarize occurences = count() by severity, category, message\r\n| order by occurences desc\r\n", + "size": 0, + "title": "Warnings and errors in logs (sampled, 0.001%)", + "queryType": 0, + "resourceType": "microsoft.insights/components", + "sortBy": [] + }, + "customWidth": "60", + "name": "query - 6", + "styleSettings": { + "maxWidth": "60", + "showBorder": true + } + }, + { + "type": 3, + "content": { + "version": "KqlItem/1.0", + "query": "traces \r\n| union exceptions\r\n| where timestamp >= {timeRange:start} and timestamp <= {timeRange:end}\r\n| where cloud_RoleName startswith \"java-template\" and cloud_RoleName endswith \"{runId}\" and severityLevel > 1\r\n| extend severity = case(severityLevel == 2, \"Warning\", severityLevel == 3, \"Error\", 
\"\")\r\n| summarize warnings = countif(severityLevel==2), errors = countif(severityLevel==3) by bin(timestamp, 1m)\r\n| render timechart\r\n", + "size": 0, + "title": "Errors and warnings in logs (sampled, 0.001%) over time", + "queryType": 0, + "resourceType": "microsoft.insights/components" + }, + "customWidth": "30", + "name": "query - 12", + "styleSettings": { + "maxWidth": "30", + "showBorder": true + } + } + ], + "$schema": "https://github.com/Microsoft/Application-Insights-Workbooks/blob/master/schema/workbook.json" +}