From e8d255b82d8ecb75ff87b420d8ff456ceb8bafb3 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Mon, 10 Feb 2025 12:55:09 +0100
Subject: [PATCH 1/8] tests: avoid spurious reconfigures in Workload

---
 test_runner/fixtures/workload.py       | 9 ++++++++-
 test_runner/regress/test_compaction.py | 1 +
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/test_runner/fixtures/workload.py b/test_runner/fixtures/workload.py
index eea0ec2b95c3..1947a9c3fbe0 100644
--- a/test_runner/fixtures/workload.py
+++ b/test_runner/fixtures/workload.py
@@ -53,6 +53,8 @@ def __init__(
         self._endpoint: Endpoint | None = None
         self._endpoint_opts = endpoint_opts or {}
 
+        self._configured_pageserver: int | None = None
+
     def branch(
         self,
         timeline_id: TimelineId,
@@ -92,8 +94,12 @@ def endpoint(self, pageserver_id: int | None = None) -> Endpoint:
                     **self._endpoint_opts,
                 )
                 self._endpoint.start(pageserver_id=pageserver_id)
+                self._configured_pageserver = pageserver_id
             else:
-                self._endpoint.reconfigure(pageserver_id=pageserver_id)
+                if self._configured_pageserver != pageserver_id:
+                    self._configured_pageserver = pageserver_id
+                    self._endpoint.reconfigure(pageserver_id=pageserver_id)
+                    self._endpoint_config = pageserver_id
 
         connstring = self._endpoint.safe_psql(
             "SELECT setting FROM pg_settings WHERE name='neon.pageserver_connstring'"
@@ -122,6 +128,7 @@ def init(self, pageserver_id: int | None = None, allow_recreate=False):
 
     def write_rows(self, n: int, pageserver_id: int | None = None, upload: bool = True):
         endpoint = self.endpoint(pageserver_id)
+
         start = self.expect_rows
         end = start + n - 1
         self.expect_rows += n
diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py
index f3347b594e32..cbcf79157b13 100644
--- a/test_runner/regress/test_compaction.py
+++ b/test_runner/regress/test_compaction.py
@@ -692,6 +692,7 @@ def test_pageserver_compaction_circuit_breaker(neon_env_builder: NeonEnvBuilder)
     workload.write_rows(1024, upload=False)
     workload.write_rows(1024, upload=False)
     workload.write_rows(1024, upload=False)
+    env.pageserver.http_client().timeline_checkpoint(env.initial_tenant, env.initial_timeline)
 
     def assert_broken():
         env.pageserver.assert_log_contains(BROKEN_LOG)

From 991524828e830764d1e9c49862c242e58d8bb385 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Mon, 10 Feb 2025 13:14:26 +0100
Subject: [PATCH 2/8] tests: fix test_pageserver_compaction_circuit_breaker

---
 test_runner/regress/test_compaction.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py
index cbcf79157b13..f10872590c9b 100644
--- a/test_runner/regress/test_compaction.py
+++ b/test_runner/regress/test_compaction.py
@@ -689,10 +689,7 @@ def test_pageserver_compaction_circuit_breaker(neon_env_builder: NeonEnvBuilder)
     env.pageserver.http_client().configure_failpoints((FAILPOINT, "return"))
 
     # Write some data to trigger compaction
-    workload.write_rows(1024, upload=False)
-    workload.write_rows(1024, upload=False)
-    workload.write_rows(1024, upload=False)
-    env.pageserver.http_client().timeline_checkpoint(env.initial_tenant, env.initial_timeline)
+    workload.write_rows(32768, upload=False)
 
     def assert_broken():
         env.pageserver.assert_log_contains(BROKEN_LOG)

From bed23e4cdc0777176c60e7b69695746933c39e1c Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Mon, 10 Feb 2025 13:24:11 +0100
Subject: [PATCH 3/8] tests: write more data in test_sharding_smoke

---
 test_runner/regress/test_sharding.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py
index 6f8070e2ba15..37effc37ee26 100644
--- a/test_runner/regress/test_sharding.py
+++ b/test_runner/regress/test_sharding.py
@@ -91,7 +91,7 @@ def get_sizes():
     workload.init()
 
     sizes_before = get_sizes()
-    workload.write_rows(256)
+    workload.write_rows(32768)
 
     # Test that we can read data back from a sharded tenant
     workload.validate()

From 6901934e1da7668dde872dddb30856eeb024f6b8 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Tue, 11 Feb 2025 14:01:25 +0100
Subject: [PATCH 4/8] tests: register Workload for notifications in
 test_sharding_split_failures

---
 test_runner/regress/test_sharding.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py
index 37effc37ee26..78e73dc3ddee 100644
--- a/test_runner/regress/test_sharding.py
+++ b/test_runner/regress/test_sharding.py
@@ -1368,6 +1368,7 @@ def test_sharding_split_failures(
     workload = Workload(env, tenant_id, timeline_id)
     workload.init()
     workload.write_rows(100)
+    compute_reconfigure_listener.register_workload(workload)
 
     # Put the environment into a failing state (exact meaning depends on `failure`)
     failure.apply(env)

From fe80ddcbb372721833283f9339fd27f417331371 Mon Sep 17 00:00:00 2001
From: Alexey Kondratov <kondratov.aleksey@gmail.com>
Date: Fri, 7 Feb 2025 19:32:15 +0100
Subject: [PATCH 5/8] Fix test_sharding_backpressure

---
 test_runner/fixtures/neon_cli.py      | 3 +++
 test_runner/fixtures/neon_fixtures.py | 4 ++++
 test_runner/regress/test_sharding.py  | 3 +++
 3 files changed, 10 insertions(+)

diff --git a/test_runner/fixtures/neon_cli.py b/test_runner/fixtures/neon_cli.py
index 33d422c590ed..9f13cf753254 100644
--- a/test_runner/fixtures/neon_cli.py
+++ b/test_runner/fixtures/neon_cli.py
@@ -486,6 +486,7 @@ def endpoint_create(
         lsn: Lsn | None = None,
         pageserver_id: int | None = None,
         allow_multiple=False,
+        update_catalog: bool = False,
     ) -> subprocess.CompletedProcess[str]:
         args = [
             "endpoint",
@@ -511,6 +512,8 @@ def endpoint_create(
             args.extend(["--pageserver-id", str(pageserver_id)])
         if allow_multiple:
             args.extend(["--allow-multiple"])
+        if update_catalog:
+            args.extend(["--update-catalog"])
 
         res = self.raw_cli(args)
         res.check_returncode()
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 3d3a445b9718..8ef3e3226d8b 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -3846,6 +3846,7 @@ def create(
         config_lines: list[str] | None = None,
         pageserver_id: int | None = None,
         allow_multiple: bool = False,
+        update_catalog: bool = False,
     ) -> Self:
         """
         Create a new Postgres endpoint.
@@ -3870,6 +3871,7 @@ def create(
             pg_version=self.env.pg_version,
             pageserver_id=pageserver_id,
             allow_multiple=allow_multiple,
+            update_catalog=update_catalog,
         )
         path = Path("endpoints") / self.endpoint_id / "pgdata"
         self.pgdata_dir = self.env.repo_dir / path
@@ -4283,6 +4285,7 @@ def create(
         hot_standby: bool = False,
         config_lines: list[str] | None = None,
         pageserver_id: int | None = None,
+        update_catalog: bool = False,
     ) -> Endpoint:
         ep = Endpoint(
             self.env,
@@ -4303,6 +4306,7 @@ def create(
             hot_standby=hot_standby,
             config_lines=config_lines,
             pageserver_id=pageserver_id,
+            update_catalog=update_catalog,
         )
 
     def stop_all(self, fail_on_error=True) -> Self:
diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py
index 78e73dc3ddee..b111c1c0310a 100644
--- a/test_runner/regress/test_sharding.py
+++ b/test_runner/regress/test_sharding.py
@@ -1547,6 +1547,9 @@ def shards_info():
                 # Tip: set to 100MB to make the test fail
                 "max_replication_write_lag=1MB",
             ],
+            # We need `neon` extension for calling backpressure functions,
+            # this flag instructs `compute_ctl` to pre-install it.
+            "update_catalog": True,
         },
     )
     workload.init()

From 0bdf38f3b2b996b8723cf71fd5b5796f7d078011 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Tue, 11 Feb 2025 19:01:49 +0100
Subject: [PATCH 6/8] tests: extend log allow list for sharding GC issue

---
 test_runner/regress/test_sharding.py         | 7 +++++--
 test_runner/regress/test_storage_scrubber.py | 7 +++++--
 2 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py
index b111c1c0310a..51dd3c76259c 100644
--- a/test_runner/regress/test_sharding.py
+++ b/test_runner/regress/test_sharding.py
@@ -1819,6 +1819,9 @@ def test_sharding_gc(
         # This is not okay, but it's not a scrubber bug: it's a pageserver issue that is exposed by
         # the specific pattern of aggressive checkpointing+image layer generation + GC that this test does.
         # TODO: remove when /~https://github.com/neondatabase/neon/issues/10720 is fixed
-        ps.allowed_errors.append(
-            ".*could not find data for key 020000000000000000000000000000000000.*"
+        ps.allowed_errors.extend(
+            [
+                ".*could not find data for key 020000000000000000000000000000000000.*",
+                ".*could not ingest record.*",
+            ]
         )
diff --git a/test_runner/regress/test_storage_scrubber.py b/test_runner/regress/test_storage_scrubber.py
index 46038ccbbb49..b8253fb125be 100644
--- a/test_runner/regress/test_storage_scrubber.py
+++ b/test_runner/regress/test_storage_scrubber.py
@@ -316,8 +316,11 @@ def test_scrubber_physical_gc_ancestors(neon_env_builder: NeonEnvBuilder, shard_
         # This is not okay, but it's not a scrubber bug: it's a pageserver issue that is exposed by
         # the specific pattern of aggressive checkpointing+image layer generation + GC that this test does.
         # TODO: remove when /~https://github.com/neondatabase/neon/issues/10720 is fixed
-        ps.allowed_errors.append(
-            ".*could not find data for key 020000000000000000000000000000000000.*"
+        ps.allowed_errors.extend(
+            [
+                ".*could not find data for key 020000000000000000000000000000000000.*",
+                ".*could not ingest record.*",
+            ]
         )
 
 

From f26cdf198ff9064abcfd6fc486b3240edb2ed1ba Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Tue, 11 Feb 2025 19:07:30 +0100
Subject: [PATCH 7/8] write even more data in test_sharding_smoke

---
 test_runner/regress/test_sharding.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py
index 51dd3c76259c..891087369045 100644
--- a/test_runner/regress/test_sharding.py
+++ b/test_runner/regress/test_sharding.py
@@ -91,7 +91,7 @@ def get_sizes():
     workload.init()
 
     sizes_before = get_sizes()
-    workload.write_rows(32768)
+    workload.write_rows(65536)
 
     # Test that we can read data back from a sharded tenant
     workload.validate()

From c765718a738312cebe031cbf2091d67a3b9717a8 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Wed, 12 Feb 2025 11:00:23 +0100
Subject: [PATCH 8/8] tests: wait for compute to ack reconfigures in
 ComputeReconfigure

---
 test_runner/fixtures/compute_reconfigure.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/test_runner/fixtures/compute_reconfigure.py b/test_runner/fixtures/compute_reconfigure.py
index 33f01f80fbb9..425abef93504 100644
--- a/test_runner/fixtures/compute_reconfigure.py
+++ b/test_runner/fixtures/compute_reconfigure.py
@@ -69,7 +69,10 @@ def handler(request: Request) -> Response:
             # This causes the endpoint to query storage controller for its location, which
             # is redundant since we already have it here, but this avoids extending the
             # neon_local CLI to take full lists of locations
-            reconfigure_threads.submit(lambda workload=workload: workload.reconfigure())  # type: ignore[misc]
+            fut = reconfigure_threads.submit(lambda workload=workload: workload.reconfigure())  # type: ignore[misc]
+
+            # To satisfy semantics of notify-attach API, we must wait for the change to be applied before returning 200
+            fut.result()
 
         return Response(status=200)