Skip to content

Commit

Permalink
Add machine UpToDate condition to KCP
Browse files Browse the repository at this point in the history
# Conflicts:
#	controlplane/kubeadm/api/v1beta1/v1beta2_condition_consts.go
  • Loading branch information
fabriziopandini committed Nov 8, 2024
1 parent bf61b1e commit 7c8ba1d
Show file tree
Hide file tree
Showing 11 changed files with 624 additions and 117 deletions.
8 changes: 8 additions & 0 deletions api/v1beta1/machine_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,15 @@ const (
const (
// MachineUpToDateV1Beta2Condition is true if the Machine spec matches the spec of the Machine's owner resource, e.g. KubeadmControlPlane or MachineDeployment.
// The Machine's owner (e.g. MachineDeployment) is authoritative to set their owned Machine's UpToDate conditions based on its current spec.
// NOTE: The Machine's owner might use this condition to surface also other use cases when Machine is considered not up to date, e.g. when MachineDeployment spec.rolloutAfter
// is expired and the Machine needs to be rolled out.
MachineUpToDateV1Beta2Condition = "UpToDate"

// MachineUpToDateV1Beta2Reason surface when a Machine spec matches the spec of the Machine's owner resource, e.g. KubeadmControlPlane or MachineDeployment.
MachineUpToDateV1Beta2Reason = "UpToDate"

// MachineNotUpToDateV1Beta2Reason surface when a Machine spec does not match the spec of the Machine's owner resource, e.g. KubeadmControlPlane or MachineDeployment.
MachineNotUpToDateV1Beta2Reason = "NotUpToDate"
)

// Machine's BootstrapConfigReady condition and corresponding reasons that will be used in v1Beta2 API version.
Expand Down
90 changes: 46 additions & 44 deletions controlplane/kubeadm/internal/control_plane.go
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,10 @@ type ControlPlane struct {
Machines collections.Machines
machinesPatchHelpers map[string]*patch.Helper

machinesNotUptoDate collections.Machines
machinesNotUptoDateLogMessages map[string][]string
machinesNotUptoDateConditionMessages map[string][]string

// reconciliationTime is the time of the current reconciliation, and should be used for all "now" calculations
reconciliationTime metav1.Time

Expand Down Expand Up @@ -97,15 +101,35 @@ func NewControlPlane(ctx context.Context, managementCluster ManagementCluster, c
patchHelpers[machine.Name] = patchHelper
}

// Select machines that should be rolled out because of an outdated configuration or because rolloutAfter/Before expired.
reconciliationTime := metav1.Now()
machinesNotUptoDate := make(collections.Machines, len(ownedMachines))
machinesNotUptoDateLogMessages := map[string][]string{}
machinesNotUptoDateConditionMessages := map[string][]string{}
for _, m := range ownedMachines {
upToDate, logMessages, conditionMessages, err := UpToDate(m, kcp, &reconciliationTime, infraObjects, kubeadmConfigs)
if err != nil {
return nil, err
}
if !upToDate {
machinesNotUptoDate.Insert(m)
machinesNotUptoDateLogMessages[m.Name] = logMessages
machinesNotUptoDateConditionMessages[m.Name] = conditionMessages
}
}

return &ControlPlane{
KCP: kcp,
Cluster: cluster,
Machines: ownedMachines,
machinesPatchHelpers: patchHelpers,
KubeadmConfigs: kubeadmConfigs,
InfraResources: infraObjects,
reconciliationTime: metav1.Now(),
managementCluster: managementCluster,
KCP: kcp,
Cluster: cluster,
Machines: ownedMachines,
machinesPatchHelpers: patchHelpers,
machinesNotUptoDate: machinesNotUptoDate,
machinesNotUptoDateLogMessages: machinesNotUptoDateLogMessages,
machinesNotUptoDateConditionMessages: machinesNotUptoDateConditionMessages,
KubeadmConfigs: kubeadmConfigs,
InfraResources: infraObjects,
reconciliationTime: reconciliationTime,
managementCluster: managementCluster,
}, nil
}

Expand Down Expand Up @@ -152,16 +176,12 @@ func (c *ControlPlane) FailureDomainWithMostMachines(ctx context.Context, machin
return failuredomains.PickMost(ctx, c.Cluster.Status.FailureDomains.FilterControlPlane(), c.Machines, machines)
}

// NextFailureDomainForScaleUp returns the failure domain with the fewest number of up-to-date machines.
// NextFailureDomainForScaleUp returns the failure domain with the fewest number of up-to-date, not deleted machines.
func (c *ControlPlane) NextFailureDomainForScaleUp(ctx context.Context) (*string, error) {
if len(c.Cluster.Status.FailureDomains.FilterControlPlane()) == 0 {
return nil, nil
}
upToDateMachines, err := c.UpToDateMachines()
if err != nil {
return nil, errors.Wrapf(err, "failed to determine next failure domain for scale up")
}
return failuredomains.PickFewest(ctx, c.FailureDomains().FilterControlPlane(), upToDateMachines), nil
return failuredomains.PickFewest(ctx, c.FailureDomains().FilterControlPlane(), c.UpToDateMachines().Filter(collections.Not(collections.HasDeletionTimestamp))), nil
}

// InitialControlPlaneConfig returns a new KubeadmConfigSpec that is to be used for an initializing control plane.
Expand Down Expand Up @@ -198,40 +218,21 @@ func (c *ControlPlane) GetKubeadmConfig(machineName string) (*bootstrapv1.Kubead
}

// MachinesNeedingRollout return a list of machines that need to be rolled out.
func (c *ControlPlane) MachinesNeedingRollout() (collections.Machines, map[string]string, error) {
// Ignore machines to be deleted.
machines := c.Machines.Filter(collections.Not(collections.HasDeletionTimestamp))
func (c *ControlPlane) MachinesNeedingRollout() (collections.Machines, map[string][]string) {
// Note: Machines already deleted are dropped because they will be replaced by new machines after deletion completes.
return c.machinesNotUptoDate.Filter(collections.Not(collections.HasDeletionTimestamp)), c.machinesNotUptoDateLogMessages
}

// Return machines if they are scheduled for rollout or if with an outdated configuration.
machinesNeedingRollout := make(collections.Machines, len(machines))
rolloutReasons := map[string]string{}
for _, m := range machines {
reason, needsRollout, err := NeedsRollout(&c.reconciliationTime, c.KCP.Spec.RolloutAfter, c.KCP.Spec.RolloutBefore, c.InfraResources, c.KubeadmConfigs, c.KCP, m)
if err != nil {
return nil, nil, err
}
if needsRollout {
machinesNeedingRollout.Insert(m)
rolloutReasons[m.Name] = reason
}
}
return machinesNeedingRollout, rolloutReasons, nil
// NotUpToDateMachines return a list of machines that are not up to date with the control
// plane's configuration.
func (c *ControlPlane) NotUpToDateMachines() (collections.Machines, map[string][]string) {
return c.machinesNotUptoDate, c.machinesNotUptoDateConditionMessages
}

// UpToDateMachines returns the machines that are up to date with the control
// plane's configuration and therefore do not require rollout.
func (c *ControlPlane) UpToDateMachines() (collections.Machines, error) {
upToDateMachines := make(collections.Machines, len(c.Machines))
for _, m := range c.Machines {
_, needsRollout, err := NeedsRollout(&c.reconciliationTime, c.KCP.Spec.RolloutAfter, c.KCP.Spec.RolloutBefore, c.InfraResources, c.KubeadmConfigs, c.KCP, m)
if err != nil {
return nil, err
}
if !needsRollout {
upToDateMachines.Insert(m)
}
}
return upToDateMachines, nil
// plane's configuration.
func (c *ControlPlane) UpToDateMachines() collections.Machines {
return c.Machines.Difference(c.machinesNotUptoDate)
}

// getInfraResources fetches the external infrastructure resource for each machine in the collection and returns a map of machine.Name -> infraResource.
Expand Down Expand Up @@ -316,6 +317,7 @@ func (c *ControlPlane) PatchMachines(ctx context.Context) error {
controlplanev1.MachineEtcdPodHealthyCondition,
controlplanev1.MachineEtcdMemberHealthyCondition,
}}, patch.WithOwnedV1Beta2Conditions{Conditions: []string{
clusterv1.MachineUpToDateV1Beta2Condition,
controlplanev1.KubeadmControlPlaneMachineAPIServerPodHealthyV1Beta2Condition,
controlplanev1.KubeadmControlPlaneMachineControllerManagerPodHealthyV1Beta2Condition,
controlplanev1.KubeadmControlPlaneMachineSchedulerPodHealthyV1Beta2Condition,
Expand Down
77 changes: 75 additions & 2 deletions controlplane/kubeadm/internal/control_plane_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ import (
. "github.com/onsi/gomega"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/utils/ptr"

clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
controlplanev1 "sigs.k8s.io/cluster-api/controlplane/kubeadm/api/v1beta1"
Expand All @@ -30,8 +31,6 @@ import (
)

func TestControlPlane(t *testing.T) {
g := NewWithT(t)

t.Run("Failure domains", func(t *testing.T) {
controlPlane := &ControlPlane{
KCP: &controlplanev1.KubeadmControlPlane{},
Expand All @@ -53,14 +52,88 @@ func TestControlPlane(t *testing.T) {
}

t.Run("With all machines in known failure domain, should return the FD with most number of machines", func(*testing.T) {
g := NewWithT(t)
g.Expect(*controlPlane.FailureDomainWithMostMachines(ctx, controlPlane.Machines)).To(Equal("two"))
})

t.Run("With some machines in non defined failure domains", func(*testing.T) {
g := NewWithT(t)
controlPlane.Machines.Insert(machine("machine-5", withFailureDomain("unknown")))
g.Expect(*controlPlane.FailureDomainWithMostMachines(ctx, controlPlane.Machines)).To(Equal("unknown"))
})
})

t.Run("MachinesUpToDate", func(t *testing.T) {
g := NewWithT(t)
cluster := &clusterv1.Cluster{
Status: clusterv1.ClusterStatus{
FailureDomains: clusterv1.FailureDomains{
"one": failureDomain(true),
"two": failureDomain(true),
"three": failureDomain(true),
},
},
}
kcp := &controlplanev1.KubeadmControlPlane{
Spec: controlplanev1.KubeadmControlPlaneSpec{
Version: "v1.31.0",
},
}
machines := collections.Machines{
"machine-1": &clusterv1.Machine{
ObjectMeta: metav1.ObjectMeta{Name: "m1"},
Spec: clusterv1.MachineSpec{
Version: ptr.To("v1.31.0"),
FailureDomain: ptr.To("one"),
InfrastructureRef: corev1.ObjectReference{Kind: "GenericInfrastructureMachine", APIVersion: "infrastructure.cluster.x-k8s.io/v1beta1", Name: "m1"},
}},
"machine-2": &clusterv1.Machine{
ObjectMeta: metav1.ObjectMeta{Name: "m2"},
Spec: clusterv1.MachineSpec{
Version: ptr.To("v1.29.0"), // not up-to-date
FailureDomain: ptr.To("two"),
InfrastructureRef: corev1.ObjectReference{Kind: "GenericInfrastructureMachine", APIVersion: "infrastructure.cluster.x-k8s.io/v1beta1", Name: "m2"},
}},
"machine-3": &clusterv1.Machine{
ObjectMeta: metav1.ObjectMeta{Name: "m3", DeletionTimestamp: ptr.To(metav1.Now())}, // deleted
Spec: clusterv1.MachineSpec{
Version: ptr.To("v1.29.3"), // not up-to-date
FailureDomain: ptr.To("three"),
InfrastructureRef: corev1.ObjectReference{Kind: "GenericInfrastructureMachine", APIVersion: "infrastructure.cluster.x-k8s.io/v1beta1", Name: "m3"},
}},
"machine-4": &clusterv1.Machine{
ObjectMeta: metav1.ObjectMeta{Name: "m4", DeletionTimestamp: ptr.To(metav1.Now())}, // deleted
Spec: clusterv1.MachineSpec{
Version: ptr.To("v1.31.0"),
FailureDomain: ptr.To("two"),
InfrastructureRef: corev1.ObjectReference{Kind: "GenericInfrastructureMachine", APIVersion: "infrastructure.cluster.x-k8s.io/v1beta1", Name: "m1"},
}},
}
controlPlane, err := NewControlPlane(ctx, nil, env.GetClient(), cluster, kcp, machines)
g.Expect(err).NotTo(HaveOccurred())

g.Expect(controlPlane.Machines).To(HaveLen(4))

machinesNotUptoDate, machinesNotUptoDateConditionMessages := controlPlane.NotUpToDateMachines()
g.Expect(machinesNotUptoDate.Names()).To(ConsistOf("m2", "m3"))
g.Expect(machinesNotUptoDateConditionMessages).To(HaveLen(2))
g.Expect(machinesNotUptoDateConditionMessages).To(HaveKeyWithValue("m2", []string{"Version v1.29.0, v1.31.0 required"}))
g.Expect(machinesNotUptoDateConditionMessages).To(HaveKeyWithValue("m3", []string{"Version v1.29.3, v1.31.0 required"}))

machinesNeedingRollout, machinesNotUptoDateLogMessages := controlPlane.MachinesNeedingRollout()
g.Expect(machinesNeedingRollout.Names()).To(ConsistOf("m2"))
g.Expect(machinesNotUptoDateLogMessages).To(HaveLen(2))
g.Expect(machinesNotUptoDateLogMessages).To(HaveKeyWithValue("m2", []string{"Machine version \"v1.29.0\" is not equal to KCP version \"v1.31.0\""}))
g.Expect(machinesNotUptoDateLogMessages).To(HaveKeyWithValue("m3", []string{"Machine version \"v1.29.3\" is not equal to KCP version \"v1.31.0\""}))

upToDateMachines := controlPlane.UpToDateMachines()
g.Expect(upToDateMachines).To(HaveLen(2))
g.Expect(upToDateMachines.Names()).To(ConsistOf("m1", "m4"))

fd, err := controlPlane.NextFailureDomainForScaleUp(ctx)
g.Expect(err).NotTo(HaveOccurred())
g.Expect(fd).To(Equal(ptr.To("two"))) // deleted up-to-date machines should not be counted when picking the next failure domain for scale up
})
}

func TestHasMachinesToBeRemediated(t *testing.T) {
Expand Down
Loading

0 comments on commit 7c8ba1d

Please sign in to comment.