From f604dd8a43c84071099cfb4090f5e8e99a3d1d1a Mon Sep 17 00:00:00 2001 From: Benjamin Wang Date: Mon, 20 Jan 2025 18:25:42 +0000 Subject: [PATCH] add comment to clarify the etcd shutting down workflow Signed-off-by: Benjamin Wang --- server/embed/etcd.go | 48 +++++++++++++++++++++++++++++++++++++++----- 1 file changed, 43 insertions(+), 5 deletions(-) diff --git a/server/embed/etcd.go b/server/embed/etcd.go index 5f8f7e33d96..3caddd9c553 100644 --- a/server/embed/etcd.go +++ b/server/embed/etcd.go @@ -81,10 +81,16 @@ type Etcd struct { cfg Config stopc chan struct{} - errc chan error + // errc is used to receive error from sub goroutines (including + // client handler, peer handler and metrics handler). It's closed + // after all these sub goroutines exit (checked via `wg`). Writers + // should avoid writing after `stopc` is closed by selecting on + // reading from `stopc`. + errc chan error closeOnce sync.Once - wg sync.WaitGroup + // wg is used to track the lifecycle of all sub goroutines. + wg sync.WaitGroup } type peerListener struct { @@ -388,6 +394,24 @@ func (e *Etcd) Config() Config { // Close gracefully shuts down all servers/listeners. // Client requests will be terminated with request timeout. // After timeout, enforce remaning requests be closed immediately. +// +// The rough workflow to shut down etcd: +// 1. close the `stopc` channel, so that all error handlers (child +// goroutines) won't send back any errors anymore; +// 2. close all client and metrics listeners, so that etcd server +// stops receiving any new connection immediately; +// 3. stop the http and grpc servers gracefully, within request timeout; +// 4. call the cancel function to close the gateway context, so that +// all gateway connections are closed. +// 5. stop etcd server gracefully, and ensure the main raft loop +// goroutine is stopped; +// 6. stop all peer listeners, so that it stops receives peer connections +// and messages (wait up to 1-second); +// 7. wait for all child goroutines (i.e. client handlers, peer handlers +// and metrics handlers) to exit; +// 8. close the `errc` channel to release the resource. Note that it's only +// safe to close the `errc` after step 7 above is done, otherwise the +// child goroutines may send errors back to already closed `errc` channel. func (e *Etcd) Close() { fields := []zap.Field{ zap.String("name", e.cfg.Name), @@ -407,10 +431,14 @@ func (e *Etcd) Close() { lg.Sync() }() + // 1. close the `stopc` channel, so that all error handlers (child + // goroutines) won't send back any errors anymore; e.closeOnce.Do(func() { close(e.stopc) }) + // 2. close all client and metrics listeners, so that etcd server + // stops receiving any new connection immediately; for i := range e.Clients { if e.Clients[i] != nil { e.Clients[i].Close() @@ -421,7 +449,7 @@ func (e *Etcd) Close() { e.metricsListeners[i].Close() } - // close client requests with request timeout + // 3. stop the http and grpc servers gracefully, within request timeout; timeout := 2 * time.Second if e.Server != nil { timeout = e.Server.Cfg.ReqTimeout() @@ -434,6 +462,8 @@ func (e *Etcd) Close() { } } + // 4. call the cancel function to close the gateway context, so that + // all gateway connections are closed. for _, sctx := range e.sctxs { sctx.cancel() } @@ -443,12 +473,14 @@ func (e *Etcd) Close() { e.tracingExporterShutdown() } - // close rafthttp transports + // 5. stop etcd server gracefully, and ensure the main raft loop + // goroutine is stopped; if e.Server != nil { e.Server.Stop() } - // close all idle connections in peer handler (wait up to 1-second) + // 6. stop all peer listeners, so that it stops receives peer connections + // and messages (wait up to 1-second); for i := range e.Peers { if e.Peers[i] != nil && e.Peers[i].close != nil { ctx, cancel := context.WithTimeout(context.Background(), time.Second) @@ -457,7 +489,13 @@ func (e *Etcd) Close() { } } if e.errc != nil { + // 7. wait for all child goroutines (i.e. client handlers, peer handlers + // and metrics handlers) to exit; e.wg.Wait() + + // 8. close the `errc` channel to release the resource. Note that it's only + // safe to close the `errc` after step 7 above is done, otherwise the + // child goroutines may send errors back to already closed `errc` channel. close(e.errc) } }