Skip to content

Commit

Permalink
PS/Prefetch: Use a timeout for reading data from TCP (#10834)
Browse files Browse the repository at this point in the history
This reduces pressure on OS TCP buffers, reducing flush times in other
systems like PageServer.

## Problem

## Summary of changes
  • Loading branch information
MMeent authored Feb 27, 2025
1 parent ad37199 commit a283eda
Show file tree
Hide file tree
Showing 5 changed files with 211 additions and 26 deletions.
21 changes: 18 additions & 3 deletions pgxn/neon/libpagestore.c
Original file line number Diff line number Diff line change
Expand Up @@ -1099,6 +1099,10 @@ pageserver_try_receive(shardno_t shard_no)
{
neon_shard_log(shard_no, LOG, "pageserver_receive: disconnect due to failure while parsing response");
pageserver_disconnect(shard_no);
/*
* Malformed responses from PageServer are a reason to raise
* errors and cancel transactions.
*/
PG_RE_THROW();
}
PG_END_TRY();
Expand All @@ -1122,7 +1126,8 @@ pageserver_try_receive(shardno_t shard_no)
char *msg = pchomp(PQerrorMessage(pageserver_conn));

pageserver_disconnect(shard_no);
neon_shard_log(shard_no, ERROR, "pageserver_receive disconnect: could not read COPY data: %s", msg);
neon_shard_log(shard_no, LOG, "pageserver_receive disconnect: could not read COPY data: %s", msg);
resp = NULL;
}
else
{
Expand Down Expand Up @@ -1321,6 +1326,16 @@ pg_init_libpagestore(void)
PGC_USERSET,
0, /* no flags required */
NULL, (GucIntAssignHook) &readahead_buffer_resize, NULL);
DefineCustomIntVariable("neon.readahead_getpage_pull_timeout",
"readahead response pull timeout",
"Time between active tries to pull data from the "
"PageStream connection when we have pages which "
"were read ahead but not yet received.",
&readahead_getpage_pull_timeout_ms,
0, 0, 5 * 60 * 1000,
PGC_USERSET,
GUC_UNIT_MS,
NULL, NULL, NULL);
DefineCustomIntVariable("neon.protocol_version",
"Version of compute<->page server protocol",
NULL,
Expand All @@ -1334,7 +1349,7 @@ pg_init_libpagestore(void)

DefineCustomIntVariable("neon.pageserver_response_log_timeout",
"pageserver response log timeout",
"If the pageserver doesn't respond to a request within this timeout,"
"If the pageserver doesn't respond to a request within this timeout, "
"a message is printed to the log.",
&pageserver_response_log_timeout,
10000, 100, INT_MAX,
Expand All @@ -1344,7 +1359,7 @@ pg_init_libpagestore(void)

DefineCustomIntVariable("neon.pageserver_response_disconnect_timeout",
"pageserver response diconnect timeout",
"If the pageserver doesn't respond to a request within this timeout,"
"If the pageserver doesn't respond to a request within this timeout, "
"disconnect and reconnect.",
&pageserver_response_disconnect_timeout,
120000, 100, INT_MAX,
Expand Down
1 change: 1 addition & 0 deletions pgxn/neon/neon.c
Original file line number Diff line number Diff line change
Expand Up @@ -437,6 +437,7 @@ _PG_init(void)

pg_init_libpagestore();
pg_init_walproposer();
pagestore_smgr_init();
Custom_XLogReaderRoutines = NeonOnDemandXLogReaderRoutines;

InitUnstableExtensionsSupport();
Expand Down
2 changes: 2 additions & 0 deletions pgxn/neon/neon.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ extern char *neon_tenant;
extern char *wal_acceptors_list;
extern int wal_acceptor_reconnect_timeout;
extern int wal_acceptor_connection_timeout;
extern int readahead_getpage_pull_timeout_ms;

#if PG_MAJORVERSION_NUM >= 17
extern uint32 WAIT_EVENT_NEON_LFC_MAINTENANCE;
Expand Down Expand Up @@ -49,6 +50,7 @@ extern uint32 WAIT_EVENT_NEON_WAL_DL;

extern void pg_init_libpagestore(void);
extern void pg_init_walproposer(void);
extern void pagestore_smgr_init(void);

extern uint64 BackpressureThrottlingTime(void);
extern void SetNeonCurrentClusterSize(uint64 size);
Expand Down
6 changes: 5 additions & 1 deletion pgxn/neon/pagestore_client.h
Original file line number Diff line number Diff line change
Expand Up @@ -209,7 +209,11 @@ typedef struct
NeonResponse *(*receive) (shardno_t shard_no);
/*
* Try to get the next response from the TCP buffers, if any.
* Returns NULL when the data is not yet available.
* Returns NULL when the data is not yet available.
*
* This will raise errors only for malformed responses (we can't put them
* back into the connection). All other error conditions are soft errors and
* return NULL as "no response available".
*/
NeonResponse *(*try_receive) (shardno_t shard_no);
/*
Expand Down
Loading

1 comment on commit a283eda

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

7896 tests run: 7506 passed, 0 failed, 390 skipped (full report)


Flaky tests (2)

Postgres 17

Code coverage* (full report)

  • functions: 32.8% (8641 of 26359 functions)
  • lines: 48.7% (73209 of 150476 lines)

* collected from Rust tests only


The comment gets automatically updated with the latest test results
a283eda at 2025-02-27T16:27:54.010Z :recycle:

Please sign in to comment.