From e8d255b82d8ecb75ff87b420d8ff456ceb8bafb3 Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 10 Feb 2025 12:55:09 +0100 Subject: [PATCH 1/8] tests: avoid spurious reconfigures in Workload --- test_runner/fixtures/workload.py | 9 ++++++++- test_runner/regress/test_compaction.py | 1 + 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/test_runner/fixtures/workload.py b/test_runner/fixtures/workload.py index eea0ec2b95c3..1947a9c3fbe0 100644 --- a/test_runner/fixtures/workload.py +++ b/test_runner/fixtures/workload.py @@ -53,6 +53,8 @@ def __init__( self._endpoint: Endpoint | None = None self._endpoint_opts = endpoint_opts or {} + self._configured_pageserver: int | None = None + def branch( self, timeline_id: TimelineId, @@ -92,8 +94,12 @@ def endpoint(self, pageserver_id: int | None = None) -> Endpoint: **self._endpoint_opts, ) self._endpoint.start(pageserver_id=pageserver_id) + self._configured_pageserver = pageserver_id else: - self._endpoint.reconfigure(pageserver_id=pageserver_id) + if self._configured_pageserver != pageserver_id: + self._configured_pageserver = pageserver_id + self._endpoint.reconfigure(pageserver_id=pageserver_id) + self._endpoint_config = pageserver_id connstring = self._endpoint.safe_psql( "SELECT setting FROM pg_settings WHERE name='neon.pageserver_connstring'" @@ -122,6 +128,7 @@ def init(self, pageserver_id: int | None = None, allow_recreate=False): def write_rows(self, n: int, pageserver_id: int | None = None, upload: bool = True): endpoint = self.endpoint(pageserver_id) + start = self.expect_rows end = start + n - 1 self.expect_rows += n diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py index f3347b594e32..cbcf79157b13 100644 --- a/test_runner/regress/test_compaction.py +++ b/test_runner/regress/test_compaction.py @@ -692,6 +692,7 @@ def test_pageserver_compaction_circuit_breaker(neon_env_builder: NeonEnvBuilder) workload.write_rows(1024, upload=False) workload.write_rows(1024, upload=False) workload.write_rows(1024, upload=False) + env.pageserver.http_client().timeline_checkpoint(env.initial_tenant, env.initial_timeline) def assert_broken(): env.pageserver.assert_log_contains(BROKEN_LOG) From 991524828e830764d1e9c49862c242e58d8bb385 Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 10 Feb 2025 13:14:26 +0100 Subject: [PATCH 2/8] tests: fix test_pageserver_compaction_circuit_breaker --- test_runner/regress/test_compaction.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py index cbcf79157b13..f10872590c9b 100644 --- a/test_runner/regress/test_compaction.py +++ b/test_runner/regress/test_compaction.py @@ -689,10 +689,7 @@ def test_pageserver_compaction_circuit_breaker(neon_env_builder: NeonEnvBuilder) env.pageserver.http_client().configure_failpoints((FAILPOINT, "return")) # Write some data to trigger compaction - workload.write_rows(1024, upload=False) - workload.write_rows(1024, upload=False) - workload.write_rows(1024, upload=False) - env.pageserver.http_client().timeline_checkpoint(env.initial_tenant, env.initial_timeline) + workload.write_rows(32768, upload=False) def assert_broken(): env.pageserver.assert_log_contains(BROKEN_LOG) From bed23e4cdc0777176c60e7b69695746933c39e1c Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 10 Feb 2025 13:24:11 +0100 Subject: [PATCH 3/8] tests: write more data in test_sharding_smoke --- test_runner/regress/test_sharding.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index 6f8070e2ba15..37effc37ee26 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -91,7 +91,7 @@ def get_sizes(): workload.init() sizes_before = get_sizes() - workload.write_rows(256) + workload.write_rows(32768) # Test that we can read data back from a sharded tenant workload.validate() From 6901934e1da7668dde872dddb30856eeb024f6b8 Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 11 Feb 2025 14:01:25 +0100 Subject: [PATCH 4/8] tests: register Workload for notifications in test_sharding_split_failures --- test_runner/regress/test_sharding.py | 1 + 1 file changed, 1 insertion(+) diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index 37effc37ee26..78e73dc3ddee 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -1368,6 +1368,7 @@ def test_sharding_split_failures( workload = Workload(env, tenant_id, timeline_id) workload.init() workload.write_rows(100) + compute_reconfigure_listener.register_workload(workload) # Put the environment into a failing state (exact meaning depends on `failure`) failure.apply(env) From fe80ddcbb372721833283f9339fd27f417331371 Mon Sep 17 00:00:00 2001 From: Alexey Kondratov Date: Fri, 7 Feb 2025 19:32:15 +0100 Subject: [PATCH 5/8] Fix test_sharding_backpressure --- test_runner/fixtures/neon_cli.py | 3 +++ test_runner/fixtures/neon_fixtures.py | 4 ++++ test_runner/regress/test_sharding.py | 3 +++ 3 files changed, 10 insertions(+) diff --git a/test_runner/fixtures/neon_cli.py b/test_runner/fixtures/neon_cli.py index 33d422c590ed..9f13cf753254 100644 --- a/test_runner/fixtures/neon_cli.py +++ b/test_runner/fixtures/neon_cli.py @@ -486,6 +486,7 @@ def endpoint_create( lsn: Lsn | None = None, pageserver_id: int | None = None, allow_multiple=False, + update_catalog: bool = False, ) -> subprocess.CompletedProcess[str]: args = [ "endpoint", @@ -511,6 +512,8 @@ def endpoint_create( args.extend(["--pageserver-id", str(pageserver_id)]) if allow_multiple: args.extend(["--allow-multiple"]) + if update_catalog: + args.extend(["--update-catalog"]) res = self.raw_cli(args) res.check_returncode() diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 3d3a445b9718..8ef3e3226d8b 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -3846,6 +3846,7 @@ def create( config_lines: list[str] | None = None, pageserver_id: int | None = None, allow_multiple: bool = False, + update_catalog: bool = False, ) -> Self: """ Create a new Postgres endpoint. @@ -3870,6 +3871,7 @@ def create( pg_version=self.env.pg_version, pageserver_id=pageserver_id, allow_multiple=allow_multiple, + update_catalog=update_catalog, ) path = Path("endpoints") / self.endpoint_id / "pgdata" self.pgdata_dir = self.env.repo_dir / path @@ -4283,6 +4285,7 @@ def create( hot_standby: bool = False, config_lines: list[str] | None = None, pageserver_id: int | None = None, + update_catalog: bool = False, ) -> Endpoint: ep = Endpoint( self.env, @@ -4303,6 +4306,7 @@ def create( hot_standby=hot_standby, config_lines=config_lines, pageserver_id=pageserver_id, + update_catalog=update_catalog, ) def stop_all(self, fail_on_error=True) -> Self: diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index 78e73dc3ddee..b111c1c0310a 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -1547,6 +1547,9 @@ def shards_info(): # Tip: set to 100MB to make the test fail "max_replication_write_lag=1MB", ], + # We need `neon` extension for calling backpressure functions, + # this flag instructs `compute_ctl` to pre-install it. + "update_catalog": True, }, ) workload.init() From 0bdf38f3b2b996b8723cf71fd5b5796f7d078011 Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 11 Feb 2025 19:01:49 +0100 Subject: [PATCH 6/8] tests: extend log allow list for sharding GC issue --- test_runner/regress/test_sharding.py | 7 +++++-- test_runner/regress/test_storage_scrubber.py | 7 +++++-- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index b111c1c0310a..51dd3c76259c 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -1819,6 +1819,9 @@ def test_sharding_gc( # This is not okay, but it's not a scrubber bug: it's a pageserver issue that is exposed by # the specific pattern of aggressive checkpointing+image layer generation + GC that this test does. # TODO: remove when /~https://github.com/neondatabase/neon/issues/10720 is fixed - ps.allowed_errors.append( - ".*could not find data for key 020000000000000000000000000000000000.*" + ps.allowed_errors.extend( + [ + ".*could not find data for key 020000000000000000000000000000000000.*", + ".*could not ingest record.*", + ] ) diff --git a/test_runner/regress/test_storage_scrubber.py b/test_runner/regress/test_storage_scrubber.py index 46038ccbbb49..b8253fb125be 100644 --- a/test_runner/regress/test_storage_scrubber.py +++ b/test_runner/regress/test_storage_scrubber.py @@ -316,8 +316,11 @@ def test_scrubber_physical_gc_ancestors(neon_env_builder: NeonEnvBuilder, shard_ # This is not okay, but it's not a scrubber bug: it's a pageserver issue that is exposed by # the specific pattern of aggressive checkpointing+image layer generation + GC that this test does. # TODO: remove when /~https://github.com/neondatabase/neon/issues/10720 is fixed - ps.allowed_errors.append( - ".*could not find data for key 020000000000000000000000000000000000.*" + ps.allowed_errors.extend( + [ + ".*could not find data for key 020000000000000000000000000000000000.*", + ".*could not ingest record.*", + ] ) From f26cdf198ff9064abcfd6fc486b3240edb2ed1ba Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 11 Feb 2025 19:07:30 +0100 Subject: [PATCH 7/8] write even more data in test_sharding_smoke --- test_runner/regress/test_sharding.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index 51dd3c76259c..891087369045 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -91,7 +91,7 @@ def get_sizes(): workload.init() sizes_before = get_sizes() - workload.write_rows(32768) + workload.write_rows(65536) # Test that we can read data back from a sharded tenant workload.validate() From c765718a738312cebe031cbf2091d67a3b9717a8 Mon Sep 17 00:00:00 2001 From: John Spray Date: Wed, 12 Feb 2025 11:00:23 +0100 Subject: [PATCH 8/8] tests: wait for compute to ack reconfigures in ComputeReconfigure --- test_runner/fixtures/compute_reconfigure.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/test_runner/fixtures/compute_reconfigure.py b/test_runner/fixtures/compute_reconfigure.py index 33f01f80fbb9..425abef93504 100644 --- a/test_runner/fixtures/compute_reconfigure.py +++ b/test_runner/fixtures/compute_reconfigure.py @@ -69,7 +69,10 @@ def handler(request: Request) -> Response: # This causes the endpoint to query storage controller for its location, which # is redundant since we already have it here, but this avoids extending the # neon_local CLI to take full lists of locations - reconfigure_threads.submit(lambda workload=workload: workload.reconfigure()) # type: ignore[misc] + fut = reconfigure_threads.submit(lambda workload=workload: workload.reconfigure()) # type: ignore[misc] + + # To satisfy semantics of notify-attach API, we must wait for the change to be applied before returning 200 + fut.result() return Response(status=200)