Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add etcd snapshot and restore #2118

Closed
Closed
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions pkg/cli/cmds/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import (
"github.com/rancher/k3s/pkg/version"
"github.com/rancher/spur/cli"
"github.com/rancher/spur/cli/altsrc"
"time"
galal-hussein marked this conversation as resolved.
Show resolved Hide resolved
)

const (
Expand Down Expand Up @@ -54,6 +55,9 @@ type Server struct {
ClusterInit bool
ClusterReset bool
EncryptSecrets bool
SnapshotDir string
SnapshotInterval time.Duration
RestorePath string
}

var ServerConfig Server
Expand Down Expand Up @@ -201,6 +205,22 @@ func NewServerCommand(action func(*cli.Context) error) *cli.Command {
Destination: &ServerConfig.DatastoreKeyFile,
EnvVars: []string{version.ProgramUpper + "_DATASTORE_KEYFILE"},
},
&cli.DurationFlag{
Name: "snapshot-interval",
Usage: "(db) snapshot interval time",
Destination: &ServerConfig.SnapshotInterval,
Value: 5 * time.Minute,
},
&cli.StringFlag{
Name: "snapshot-dir",
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's too bad we didn't decide earlier if things are -path or -dir. I wanted to complain that we're adding two paths, but one is called Path and one is called Dir, but we're already inconsistent on this for other config settings. Should the snapshot-restore-path point at a single file?

Usage: "(db) directory to save db snapshots",
Destination: &ServerConfig.SnapshotDir,
},
&cli.StringFlag{
Name: "snapshot-restore-path",
Usage: "(db) Snapshot restore path",
Destination: &ServerConfig.RestorePath,
},
&cli.StringFlag{
Name: "default-local-storage-path",
Usage: "(storage) Default local storage path for local provisioner storage class",
Expand Down
3 changes: 3 additions & 0 deletions pkg/cli/server/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,9 @@ func run(app *cli.Context, cfg *cmds.Server) error {
serverConfig.ControlConfig.ClusterInit = cfg.ClusterInit
serverConfig.ControlConfig.ClusterReset = cfg.ClusterReset
serverConfig.ControlConfig.EncryptSecrets = cfg.EncryptSecrets
serverConfig.ControlConfig.SnapshotInterval = cfg.SnapshotInterval
serverConfig.ControlConfig.SnapshotDir = cfg.SnapshotDir
serverConfig.ControlConfig.RestorePath = cfg.RestorePath

if serverConfig.ControlConfig.SupervisorPort == 0 {
serverConfig.ControlConfig.SupervisorPort = serverConfig.ControlConfig.HTTPSPort
Expand Down
4 changes: 4 additions & 0 deletions pkg/cluster/managed.go
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,10 @@ func (c *Cluster) start(ctx context.Context) error {
return c.managedDB.Reset(ctx, c.clientAccessInfo)
}

if c.config.RestorePath != "" {
return c.managedDB.Restore(ctx)
}

return c.managedDB.Start(ctx, c.clientAccessInfo)
}

Expand Down
1 change: 1 addition & 0 deletions pkg/cluster/managed/drivers.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ type Driver interface {
Reset(ctx context.Context, clientAccessInfo *clientaccess.Info) error
Start(ctx context.Context, clientAccessInfo *clientaccess.Info) error
Test(ctx context.Context, clientAccessInfo *clientaccess.Info) error
Restore(ctx context.Context) error
EndpointName() string
}

Expand Down
4 changes: 4 additions & 0 deletions pkg/daemons/config/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import (
"net/http"
"sort"
"strings"
"time"

"github.com/rancher/kine/pkg/endpoint"
"github.com/rancher/wrangler-api/pkg/generated/controllers/core"
Expand Down Expand Up @@ -127,6 +128,9 @@ type Control struct {
EncryptSecrets bool
TLSMinVersion uint16
TLSCipherSuites []uint16
SnapshotDir string
SnapshotInterval time.Duration
RestorePath string

BindAddress string
SANs []string
Expand Down
92 changes: 92 additions & 0 deletions pkg/etcd/etcd.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ import (
"net/url"
"os"
"path/filepath"
"strconv"
"strings"
"time"

Expand All @@ -23,6 +24,7 @@ import (
"github.com/rancher/k3s/pkg/daemons/executor"
"github.com/sirupsen/logrus"
etcd "go.etcd.io/etcd/clientv3"
"go.etcd.io/etcd/clientv3/snapshot"
"go.etcd.io/etcd/etcdserver/etcdserverpb"
utilnet "k8s.io/apimachinery/pkg/util/net"
)
Expand Down Expand Up @@ -89,6 +91,25 @@ func nameFile(config *config.Control) string {
return filepath.Join(dataDir(config), "name")
}

func snapshotDir(config *config.Control) (string, error) {
if config.SnapshotDir == "" {
// we have to create the snapshot dir if we are using
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So we create the default dir if it doesn't exist? What happens if the user specifies a nonexistent path? Do we want to create that also, or just fail when trying to save the snapshot?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd prefer we fail in that case. I wouldn't want the software creating n+1 nested dir structures.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah same here, maybe check here that it's a writable directory?

// default snapshot dir if it doesn't exist
defaultSnapshotDir := filepath.Join(config.DataDir, "db", "snapshots")
if s, err := os.Stat(defaultSnapshotDir); err == nil && s.IsDir() {
return defaultSnapshotDir, nil
} else if os.IsNotExist(err) {
if err := os.MkdirAll(defaultSnapshotDir, 0755); err != nil {
return "", err
}
return defaultSnapshotDir, nil
} else {
return "", err
}
}
return config.SnapshotDir, nil
}

func (e *ETCD) IsInitialized(ctx context.Context, config *config.Control) (bool, error) {
if s, err := os.Stat(walDir(config)); err == nil && s.IsDir() {
return true, nil
Expand Down Expand Up @@ -120,6 +141,45 @@ func (e *ETCD) Reset(ctx context.Context, clientAccessInfo *clientaccess.Info) e
return e.newCluster(ctx, true)
}

// Restore rebuilds the local etcd data directory from the snapshot file at
// e.config.RestorePath and then bootstraps a cluster from it via newCluster.
//
// The existing data directory is first moved aside to "<dataDir>-old". If that
// "-old" directory is already present, a previous restore is assumed to have
// completed: an informational message is logged and the process exits with
// status 0 so the operator can restart without --snapshot-restore-path.
//
// An error is returned when the "-old" path exists but is not a directory,
// when no restore path is configured, when the snapshot file cannot be
// stat'ed, or when moving the data directory or restoring the snapshot fails.
// If the snapshot restore itself fails, the original data directory is moved
// back so that a later start does not mistake the failed attempt for a
// completed restore.
func (e *ETCD) Restore(ctx context.Context) error {
	// check the old etcd data dir
	oldDataDir := dataDir(e.config) + "-old"
	s, err := os.Stat(oldDataDir)
	switch {
	case err == nil && s.IsDir():
		logrus.Infof("etcd already restored from a snapshot, restart without --snapshot-restore-path flag now. Backup and delete ${datadir}/server/db on each peer etcd server and rejoin the nodes")
		os.Exit(0)
	case err == nil:
		// The path exists but is a regular file (or similar). Previously this
		// fell through to "return err" with a nil error and silently did nothing.
		return fmt.Errorf("etcd restore: %s exists but is not a directory", oldDataDir)
	case !os.IsNotExist(err):
		return err
	}

	if e.config.RestorePath == "" {
		return errors.New("no etcd restore path was specified")
	}
	// make sure snapshot exists before restoration
	if _, err := os.Stat(e.config.RestorePath); err != nil {
		return err
	}
	// move the data directory out of the way; its presence marks a finished restore
	if err := os.Rename(dataDir(e.config), oldDataDir); err != nil {
		return err
	}

	sManager := snapshot.NewV3(nil)
	if err := sManager.Restore(snapshot.RestoreConfig{
		SnapshotPath:   e.config.RestorePath,
		Name:           e.name,
		OutputDataDir:  dataDir(e.config),
		OutputWALDir:   walDir(e.config),
		PeerURLs:       []string{e.peerURL()},
		InitialCluster: fmt.Sprintf("%s=%s", e.name, e.peerURL()),
	}); err != nil {
		// Roll back: anything now at dataDir was written by the failed restore,
		// because the original was renamed away just above.
		// NOTE(review): assumes walDir lives underneath dataDir, so removing
		// dataDir also clears partial WAL output; confirm against walDir().
		if rmErr := os.RemoveAll(dataDir(e.config)); rmErr != nil {
			logrus.Errorf("failed to clean up partially restored etcd data dir %s: %v", dataDir(e.config), rmErr)
		} else if mvErr := os.Rename(oldDataDir, dataDir(e.config)); mvErr != nil {
			logrus.Errorf("failed to move %s back to %s after failed restore: %v", oldDataDir, dataDir(e.config), mvErr)
		}
		return err
	}

	if err := e.setName(); err != nil {
		return err
	}

	return e.newCluster(ctx, true)
}

func (e *ETCD) Start(ctx context.Context, clientAccessInfo *clientaccess.Info) error {
existingCluster, err := e.IsInitialized(ctx, e.config)
if err != nil {
Expand All @@ -130,6 +190,8 @@ func (e *ETCD) Start(ctx context.Context, clientAccessInfo *clientaccess.Info) e
Register(ctx, e, e.config.Runtime.Core.Core().V1().Node())
return nil
}
// starting snapshot thread
go e.Snapshot(ctx)

if existingCluster {
opt, err := executor.CurrentETCDOptions()
Expand Down Expand Up @@ -480,3 +542,33 @@ func (e *ETCD) clientURLs(ctx context.Context, clientAccessInfo *clientaccess.In
}
return clientURLs, memberList, nil
}

func (e *ETCD) Snapshot(ctx context.Context) {
briandowns marked this conversation as resolved.
Show resolved Hide resolved
ticker := time.NewTicker(e.config.SnapshotInterval)
defer ticker.Stop()
for range ticker.C {
snapshotTime := <-ticker.C
logrus.Infof("Taking etcd snapshot at %s", snapshotTime.String())
sManager := snapshot.NewV3(nil)
tlsConfig, err := toTLSConfig(e.runtime)
if err != nil {
logrus.Errorf("failed to get tls config for etcd: %v", err)
continue
}
etcdConfig := etcd.Config{
Endpoints: []string{"https://127.0.0.1:2379"},
TLS: tlsConfig,
Context: ctx,
}
snapshotDir, err := snapshotDir(e.config)
if err != nil {
logrus.Errorf("failed to get the snapshot dir: %v", err)
}
snapshotPath := filepath.Join(snapshotDir, "etcd-snapshot"+strconv.Itoa(int(snapshotTime.Unix())))

if err := sManager.Save(ctx, etcdConfig, snapshotPath); err != nil {
logrus.Errorf("failed to save snapshot %s: %v", snapshotPath, err)
continue
}
}
}
16 changes: 16 additions & 0 deletions vendor/go.etcd.io/etcd/clientv3/snapshot/doc.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

35 changes: 35 additions & 0 deletions vendor/go.etcd.io/etcd/clientv3/snapshot/util.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading