Commit e8da3d9: Calculate confidence interval for benchmark measurements (#6950)

SiarheiFedartsou authored Jun 21, 2024
Parent: d9ce9cf
Showing 6 changed files with 556 additions and 412 deletions.
.github/workflows/osrm-backend.yml (27 changes: 16 additions & 11 deletions)
@@ -700,15 +700,6 @@ jobs:
           mkdir -p $HOME/.ccache
           ccache --zero-stats
           ccache --max-size=256M
-      - name: Build PR Branch
-        run: |
-          mkdir -p pr/build
-          cd pr/build
-          cmake -DENABLE_CONAN=ON -DCMAKE_BUILD_TYPE=Release ..
-          make -j$(nproc)
-          make -j$(nproc) benchmarks
-          cd ..
-          make -C test/data
       - name: Checkout Base Branch
         uses: actions/checkout@v4
         with:
@@ -723,9 +714,23 @@ jobs:
           make -j$(nproc) benchmarks
           cd ..
           make -C test/data
-      - name: Run Benchmarks
+      - name: Build PR Branch
+        run: |
+          mkdir -p pr/build
+          cd pr/build
+          cmake -DENABLE_CONAN=ON -DCMAKE_BUILD_TYPE=Release ..
+          make -j$(nproc)
+          make -j$(nproc) benchmarks
+          cd ..
+          make -C test/data
+      - name: Run PR Benchmarks
         run: |
-          ./pr/scripts/ci/run_benchmarks.sh base pr
+          ./pr/scripts/ci/run_benchmarks.sh -f $(pwd)/pr -r $(pwd)/pr_results -s $(pwd)/pr -b $(pwd)/pr/build -o ~/data.osm.pbf -g ~/gps_traces.csv
+      - name: Run Base Benchmarks
+        run: |
+          # we intentionally use scripts from PR branch to be able to update them and see results in the same PR
+          ./pr/scripts/ci/run_benchmarks.sh -f $(pwd)/base -r $(pwd)/base_results -s $(pwd)/pr -b $(pwd)/base/build -o ~/data.osm.pbf -g ~/gps_traces.csv
       - name: Post Benchmark Results
         run: |
           python3 pr/scripts/ci/post_benchmark_results.py base_results pr_results
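The reworked workflow builds both branches, runs the same benchmark driver against each with the new flags, and hands the two result folders to post_benchmark_results.py, which is not rendered in this view. Purely as a hypothetical sketch of a consumer of that layout (names and output format invented here, not the actual script), the final comparison step amounts to something like:

# Hypothetical sketch only: the real post_benchmark_results.py is not shown in this
# diff. It just illustrates the folder layout the workflow produces, where each
# benchmark writes a "<name>.bench" text file into base_results/ and pr_results/.
import os
import sys

def collect(folder):
    # map benchmark file name -> its recorded output
    results = {}
    for name in sorted(os.listdir(folder)):
        if name.endswith('.bench'):
            with open(os.path.join(folder, name)) as f:
                results[name] = f.read().strip()
    return results

if __name__ == '__main__':
    base, pr = collect(sys.argv[1]), collect(sys.argv[2])
    for name in sorted(set(base) | set(pr)):
        print(f"### {name}")
        print(f"base: {base.get(name, '<missing>')}")
        print(f"pr:   {pr.get(name, '<missing>')}")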
scripts/ci/e2e_benchmark.py (72 changes: 54 additions & 18 deletions)
@@ -1,5 +1,4 @@
 import requests
-import sys
 import random
 from collections import defaultdict
 import os
@@ -8,12 +7,13 @@
 import time
 import argparse
 
+
 class BenchmarkRunner:
-    def __init__(self):
+    def __init__(self, gps_traces_file_path):
         self.coordinates = []
         self.tracks = defaultdict(list)
 
-        gps_traces_file_path = os.path.expanduser('~/gps_traces.csv')
+        gps_traces_file_path = os.path.expanduser(gps_traces_file_path)
         with open(gps_traces_file_path, 'r') as file:
             reader = csv.DictReader(file)
             for row in reader:
@@ -36,10 +36,9 @@ def run(self, benchmark_name, host, num_requests, warmup_requests=50):
             response = requests.get(url)
             end_time = time.time()
             if response.status_code != 200:
-                if benchmark_name == 'match':
-                    code = response.json()['code']
-                    if code == 'NoSegment' or code == 'NoMatch':
-                        continue
+                code = response.json()['code']
+                if code in ['NoSegment', 'NoMatch', 'NoRoute', 'NoTrips']:
+                    continue
                 raise Exception(f"Error: {response.status_code} {response.text}")
             times.append((end_time - start_time) * 1000)  # convert to ms
 
@@ -54,7 +53,7 @@ def make_url(self, host, benchmark_name):
             end_coord = f"{end[1]:.6f},{end[0]:.6f}"
             return f"{host}/route/v1/driving/{start_coord};{end_coord}?overview=full&steps=true"
         elif benchmark_name == 'table':
-            num_coords = random.randint(3, 100)
+            num_coords = random.randint(3, 12)
             selected_coords = random.sample(self.coordinates, num_coords)
             coords_str = ";".join([f"{coord[1]:.6f},{coord[0]:.6f}" for coord in selected_coords])
             return f"{host}/table/v1/driving/{coords_str}"
@@ -77,26 +76,63 @@ def make_url(self, host, benchmark_name):
         else:
             raise Exception(f"Unknown benchmark: {benchmark_name}")
 
+def bootstrap_confidence_interval(data, num_samples=1000, confidence_level=0.95):
+    means = []
+    for _ in range(num_samples):
+        sample = np.random.choice(data, size=len(data), replace=True)
+        means.append(np.mean(sample))
+    lower_bound = np.percentile(means, (1 - confidence_level) / 2 * 100)
+    upper_bound = np.percentile(means, (1 + confidence_level) / 2 * 100)
+    mean = np.mean(means)
+    return mean, lower_bound, upper_bound
+
+def calculate_confidence_interval(data):
+    mean, lower, upper = bootstrap_confidence_interval(data)
+    min_value = np.min(data)
+    return mean, (upper - lower) / 2, min_value
+
+
 def main():
     parser = argparse.ArgumentParser(description='Run GPS benchmark tests.')
     parser.add_argument('--host', type=str, required=True, help='Host URL')
     parser.add_argument('--method', type=str, required=True, choices=['route', 'table', 'match', 'nearest', 'trip'], help='Benchmark method')
     parser.add_argument('--num_requests', type=int, required=True, help='Number of requests to perform')
+    parser.add_argument('--iterations', type=int, required=True, help='Number of iterations to run the benchmark')
+    parser.add_argument('--gps_traces_file_path', type=str, required=True, help='Path to the GPS traces file')
 
     args = parser.parse_args()
 
+    random.seed(42)
+    np.random.seed(42)
+
+    runner = BenchmarkRunner(args.gps_traces_file_path)
+
+    all_times = []
+    for _ in range(args.iterations):
+        random.seed(42)
+        times = runner.run(args.method, args.host, args.num_requests)
+        all_times.append(times)
+    all_times = np.asarray(all_times)
+
+    assert all_times.shape == (args.iterations, all_times.shape[1])
+
 
-    runner = BenchmarkRunner()
-    times = runner.run(args.method, args.host, args.num_requests)
+    total_time, total_ci, total_best = calculate_confidence_interval(np.sum(all_times, axis=1))
+    ops_per_sec, ops_per_sec_ci, ops_per_sec_best = calculate_confidence_interval(float(all_times.shape[1]) / np.sum(all_times / 1000, axis=1))
+    min_time, min_ci, _ = calculate_confidence_interval(np.min(all_times, axis=1))
+    mean_time, mean_ci, _ = calculate_confidence_interval(np.mean(all_times, axis=1))
+    median_time, median_ci, _ = calculate_confidence_interval(np.median(all_times, axis=1))
+    perc_95_time, perc_95_ci, _ = calculate_confidence_interval(np.percentile(all_times, 95, axis=1))
+    perc_99_time, perc_99_ci, _ = calculate_confidence_interval(np.percentile(all_times, 99, axis=1))
+    max_time, max_ci, _ = calculate_confidence_interval(np.max(all_times, axis=1))
 
-    print(f'Total: {np.sum(times)}ms')
-    print(f"Min time: {np.min(times)}ms")
-    print(f"Mean time: {np.mean(times)}ms")
-    print(f"Median time: {np.median(times)}ms")
-    print(f"95th percentile: {np.percentile(times, 95)}ms")
-    print(f"99th percentile: {np.percentile(times, 99)}ms")
-    print(f"Max time: {np.max(times)}ms")
+    print(f'Ops: {ops_per_sec:.2f} ± {ops_per_sec_ci:.2f} ops/s. Best: {ops_per_sec_best:.2f} ops/s')
+    print(f'Total: {total_time:.2f}ms ± {total_ci:.2f}ms. Best: {total_best:.2f}ms')
+    print(f"Min time: {min_time:.2f}ms ± {min_ci:.2f}ms")
+    print(f"Mean time: {mean_time:.2f}ms ± {mean_ci:.2f}ms")
+    print(f"Median time: {median_time:.2f}ms ± {median_ci:.2f}ms")
+    print(f"95th percentile: {perc_95_time:.2f}ms ± {perc_95_ci:.2f}ms")
+    print(f"99th percentile: {perc_99_time:.2f}ms ± {perc_99_ci:.2f}ms")
+    print(f"Max time: {max_time:.2f}ms ± {max_ci:.2f}ms")
 
 if __name__ == '__main__':
     main()
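The statistical core of the change is plain bootstrap resampling: each aggregate statistic (total, mean, p95, and so on) is computed once per iteration, those per-iteration values are resampled with replacement num_samples times, and the 2.5th and 97.5th percentiles of the resampled means bound a 95% confidence interval whose half-width becomes the ± figure. A self-contained demo of the same computation, with invented timing data:

# Demo only: same bootstrap as in the commit, run on five invented
# per-iteration totals (milliseconds).
import numpy as np

def bootstrap_confidence_interval(data, num_samples=1000, confidence_level=0.95):
    means = []
    for _ in range(num_samples):
        # resample the per-iteration values with replacement
        sample = np.random.choice(data, size=len(data), replace=True)
        means.append(np.mean(sample))
    lower = np.percentile(means, (1 - confidence_level) / 2 * 100)  # 2.5th percentile
    upper = np.percentile(means, (1 + confidence_level) / 2 * 100)  # 97.5th percentile
    return np.mean(means), lower, upper

np.random.seed(42)
totals = np.array([1210.0, 1195.5, 1232.1, 1201.7, 1188.9])
mean, lower, upper = bootstrap_confidence_interval(totals)
print(f"Total: {mean:.2f}ms ± {(upper - lower) / 2:.2f}ms")  # same "x ± y" format the script prints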
scripts/ci/run_benchmarks.sh (121 changes: 85 additions & 36 deletions)
@@ -1,72 +1,121 @@
 #!/bin/bash
 set -eou pipefail
 
+function usage {
+    echo "Usage: $0 -f <folder> -r <results_folder> -s <scripts_folder> -b <binaries_folder> -o <osm_pbf> -g <gps_traces>"
+    exit 1
+}
+
+while getopts ":f:r:s:b:o:g:" opt; do
+  case $opt in
+    f) FOLDER="$OPTARG"
+    ;;
+    r) RESULTS_FOLDER="$OPTARG"
+    ;;
+    s) SCRIPTS_FOLDER="$OPTARG"
+    ;;
+    b) BINARIES_FOLDER="$OPTARG"
+    ;;
+    o) OSM_PBF="$OPTARG"
+    ;;
+    g) GPS_TRACES="$OPTARG"
+    ;;
+    \?) echo "Invalid option -$OPTARG" >&2
+    usage
+    ;;
+    :) echo "Option -$OPTARG requires an argument." >&2
+    usage
+    ;;
+  esac
+done
+
+if [ -z "${FOLDER:-}" ] || [ -z "${RESULTS_FOLDER:-}" ] || [ -z "${SCRIPTS_FOLDER:-}" ] || [ -z "${BINARIES_FOLDER:-}" ] || [ -z "${OSM_PBF:-}" ] || [ -z "${GPS_TRACES:-}" ]; then
+    usage
+fi
+
 function measure_peak_ram_and_time {
     COMMAND=$1
     OUTPUT_FILE=$2
 
-    OUTPUT=$(/usr/bin/time -f "%e %M" $COMMAND 2>&1 | tail -n 1)
-
-    TIME=$(echo $OUTPUT | awk '{print $1}')
-    PEAK_RAM_KB=$(echo $OUTPUT | awk '{print $2}')
-    PEAK_RAM_MB=$(echo "scale=2; $PEAK_RAM_KB / 1024" | bc)
-    echo "Time: ${TIME}s Peak RAM: ${PEAK_RAM_MB}MB" > $OUTPUT_FILE
+    if [ "$(uname)" == "Darwin" ]; then
+        # on macOS time has different parameters, so simply run command on macOS
+        $COMMAND > /dev/null 2>&1
+    else
+        OUTPUT=$(/usr/bin/time -f "%e %M" $COMMAND 2>&1 | tail -n 1)
+
+        TIME=$(echo $OUTPUT | awk '{print $1}')
+        PEAK_RAM_KB=$(echo $OUTPUT | awk '{print $2}')
+        PEAK_RAM_MB=$(echo "scale=2; $PEAK_RAM_KB / 1024" | bc)
+        echo "Time: ${TIME}s Peak RAM: ${PEAK_RAM_MB}MB" > $OUTPUT_FILE
+    fi
 }
 
 function run_benchmarks_for_folder {
-    echo "Running benchmarks for $1"
-
-    FOLDER=$1
-    RESULTS_FOLDER=$2
-    SCRIPTS_FOLDER=$3
-
     mkdir -p $RESULTS_FOLDER
 
-    BENCHMARKS_FOLDER="$FOLDER/build/src/benchmarks"
-
-    ./$BENCHMARKS_FOLDER/match-bench "./$FOLDER/test/data/mld/monaco.osrm" mld > "$RESULTS_FOLDER/match_mld.bench"
-    ./$BENCHMARKS_FOLDER/match-bench "./$FOLDER/test/data/ch/monaco.osrm" ch > "$RESULTS_FOLDER/match_ch.bench"
-    ./$BENCHMARKS_FOLDER/route-bench "./$FOLDER/test/data/mld/monaco.osrm" mld > "$RESULTS_FOLDER/route_mld.bench"
-    ./$BENCHMARKS_FOLDER/route-bench "./$FOLDER/test/data/ch/monaco.osrm" ch > "$RESULTS_FOLDER/route_ch.bench"
-    ./$BENCHMARKS_FOLDER/alias-bench > "$RESULTS_FOLDER/alias.bench"
-    ./$BENCHMARKS_FOLDER/json-render-bench "./$FOLDER/src/benchmarks/portugal_to_korea.json" > "$RESULTS_FOLDER/json-render.bench"
-    ./$BENCHMARKS_FOLDER/packedvector-bench > "$RESULTS_FOLDER/packedvector.bench"
-    ./$BENCHMARKS_FOLDER/rtree-bench "./$FOLDER/test/data/monaco.osrm.ramIndex" "./$FOLDER/test/data/monaco.osrm.fileIndex" "./$FOLDER/test/data/monaco.osrm.nbg_nodes" > "$RESULTS_FOLDER/rtree.bench"
-
-    BINARIES_FOLDER="$FOLDER/build"
-
-    cp ~/data.osm.pbf $FOLDER
-
+    BENCHMARKS_FOLDER="$BINARIES_FOLDER/src/benchmarks"
+    echo "Running match-bench MLD"
+    $BENCHMARKS_FOLDER/match-bench "$FOLDER/test/data/mld/monaco.osrm" mld > "$RESULTS_FOLDER/match_mld.bench"
+    echo "Running match-bench CH"
+    $BENCHMARKS_FOLDER/match-bench "$FOLDER/test/data/ch/monaco.osrm" ch > "$RESULTS_FOLDER/match_ch.bench"
+    echo "Running route-bench MLD"
+    $BENCHMARKS_FOLDER/route-bench "$FOLDER/test/data/mld/monaco.osrm" mld > "$RESULTS_FOLDER/route_mld.bench"
+    echo "Running route-bench CH"
+    $BENCHMARKS_FOLDER/route-bench "$FOLDER/test/data/ch/monaco.osrm" ch > "$RESULTS_FOLDER/route_ch.bench"
+    echo "Running alias"
+    $BENCHMARKS_FOLDER/alias-bench > "$RESULTS_FOLDER/alias.bench"
+    echo "Running json-render-bench"
+    $BENCHMARKS_FOLDER/json-render-bench "$FOLDER/src/benchmarks/portugal_to_korea.json" > "$RESULTS_FOLDER/json-render.bench"
+    echo "Running packedvector-bench"
+    $BENCHMARKS_FOLDER/packedvector-bench > "$RESULTS_FOLDER/packedvector.bench"
+    echo "Running rtree-bench"
+    $BENCHMARKS_FOLDER/rtree-bench "$FOLDER/test/data/monaco.osrm.ramIndex" "$FOLDER/test/data/monaco.osrm.fileIndex" "$FOLDER/test/data/monaco.osrm.nbg_nodes" > "$RESULTS_FOLDER/rtree.bench"
+
+    cp -rf $OSM_PBF $FOLDER/data.osm.pbf
+
+    echo "Running osrm-extract"
     measure_peak_ram_and_time "$BINARIES_FOLDER/osrm-extract -p $FOLDER/profiles/car.lua $FOLDER/data.osm.pbf" "$RESULTS_FOLDER/osrm_extract.bench"
+    echo "Running osrm-partition"
     measure_peak_ram_and_time "$BINARIES_FOLDER/osrm-partition $FOLDER/data.osrm" "$RESULTS_FOLDER/osrm_partition.bench"
+    echo "Running osrm-customize"
     measure_peak_ram_and_time "$BINARIES_FOLDER/osrm-customize $FOLDER/data.osrm" "$RESULTS_FOLDER/osrm_customize.bench"
+    echo "Running osrm-contract"
     measure_peak_ram_and_time "$BINARIES_FOLDER/osrm-contract $FOLDER/data.osrm" "$RESULTS_FOLDER/osrm_contract.bench"
 
-    for BENCH in nearest table trip route match; do
-        ./$BENCHMARKS_FOLDER/bench "$FOLDER/data.osrm" mld ~/gps_traces.csv ${BENCH} > "$RESULTS_FOLDER/random_${BENCH}_mld.bench" || true
-        ./$BENCHMARKS_FOLDER/bench "$FOLDER/data.osrm" ch ~/gps_traces.csv ${BENCH} > "$RESULTS_FOLDER/random_${BENCH}_ch.bench" || true
+    for ALGORITHM in ch mld; do
+        for BENCH in nearest table trip route match; do
+            echo "Running random $BENCH $ALGORITHM"
+            START=$(date +%s.%N)
+            $BENCHMARKS_FOLDER/bench "$FOLDER/data.osrm" $ALGORITHM $GPS_TRACES ${BENCH} > "$RESULTS_FOLDER/random_${BENCH}_${ALGORITHM}.bench" 5 || true
+            END=$(date +%s.%N)
+            DIFF=$(echo "$END - $START" | bc)
+            echo "Took: ${DIFF}s"
+        done
     done
 
-
     for ALGORITHM in ch mld; do
-        $BINARIES_FOLDER/osrm-routed --algorithm $ALGORITHM $FOLDER/data.osrm &
+        $BINARIES_FOLDER/osrm-routed --algorithm $ALGORITHM $FOLDER/data.osrm > /dev/null 2>&1 &
         OSRM_ROUTED_PID=$!
 
         # wait for osrm-routed to start
-        if ! curl --retry-delay 3 --retry 10 --retry-all-errors "http://127.0.0.1:5000/route/v1/driving/13.388860,52.517037;13.385983,52.496891?steps=true"; then
+        if ! curl --retry-delay 3 --retry 10 --retry-all-errors "http://127.0.0.1:5000/route/v1/driving/13.388860,52.517037;13.385983,52.496891?steps=true" > /dev/null 2>&1; then
            echo "osrm-routed failed to start for algorithm $ALGORITHM"
            kill -9 $OSRM_ROUTED_PID
            continue
        fi
 
        for METHOD in route nearest trip table match; do
-            python3 $SCRIPTS_FOLDER/scripts/ci/e2e_benchmark.py --host http://localhost:5000 --method $METHOD --num_requests 1000 > $RESULTS_FOLDER/e2e_${METHOD}_${ALGORITHM}.bench
+            echo "Running e2e benchmark for $METHOD $ALGORITHM"
+            START=$(date +%s.%N)
+            python3 $SCRIPTS_FOLDER/scripts/ci/e2e_benchmark.py --host http://localhost:5000 --method $METHOD --iterations 5 --num_requests 1000 --gps_traces_file_path $GPS_TRACES > $RESULTS_FOLDER/e2e_${METHOD}_${ALGORITHM}.bench
+            END=$(date +%s.%N)
+            DIFF=$(echo "$END - $START" | bc)
+            echo "Took: ${DIFF}s"
        done
 
        kill -9 $OSRM_ROUTED_PID
    done
 }
 
-run_benchmarks_for_folder $1 "${1}_results" $2
-run_benchmarks_for_folder $2 "${2}_results" $2
+run_benchmarks_for_folder
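A note on measure_peak_ram_and_time: it depends on GNU time's -f "%e %M" format (wall-clock seconds and peak resident set size in KB), which the BSD time shipped with macOS does not support, hence the Darwin branch that simply runs the command unmeasured. For illustration only (not part of the commit), the equivalent measurement can be done in Python with the POSIX resource module:

# Illustration only: the same peak-RAM-and-time measurement the bash helper
# performs with GNU time's "%e %M", done via the POSIX resource module.
# Note: ru_maxrss is reported in KB on Linux but in bytes on macOS.
import resource
import subprocess
import sys
import time

def measure_peak_ram_and_time(command):
    start = time.monotonic()
    # run the child to completion, discarding its output like "> /dev/null"
    subprocess.run(command, check=True, stdout=subprocess.DEVNULL)
    elapsed = time.monotonic() - start
    usage = resource.getrusage(resource.RUSAGE_CHILDREN)  # cumulative over children
    peak_ram_kb = usage.ru_maxrss if sys.platform != 'darwin' else usage.ru_maxrss / 1024
    print(f"Time: {elapsed:.2f}s Peak RAM: {peak_ram_kb / 1024:.2f}MB")

measure_peak_ram_and_time(['sleep', '1'])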
