Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Prepare runbook scripts for backup being enabled #9998

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion docs/runbook/scripts/reduce-citus-disk-size.sh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#!/usr/bin/env bash
set -euox pipefail
set -euo pipefail

source ./utils.sh

Expand Down
101 changes: 8 additions & 93 deletions docs/runbook/scripts/restore-stackgres-backup.sh
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,8 @@ function createSGShardedClusterConfigWithRestore() {
"downloadDiskConcurrency": 1
}
}
EOF)
EOF
)
CLUSTER_CONFIG=$(echo "${sourceConfig}" | \
jq --argjson initialDataConfig "${initialDataConfig}" '.spec.initialData=$initialDataConfig')
log "Created SGShardedCluster configuration to restore backup ${BACKUP_TO_RESTORE}"
Expand Down Expand Up @@ -144,75 +145,6 @@ function findShardedCluster() {
doContinue
}

# Re-applies the credentials held in Kubernetes secrets to the restored database.
#
# Usernames and passwords are read from two secrets: the one named "${CLUSTER}"
# (superuser / replication / authenticator entries) and the one named
# "${HELM_RELEASE_NAME}-passwords" (HEDERA_MIRROR_* entries). A SQL script is
# built that resets each user's password and upserts matching rows into
# pg_dist_authinfo, and it is piped to psql on every pod labelled
# app=StackGresCluster,role=master. Finally checkCitusMetadataSyncStatus is
# invoked for the current namespace.
#
# Globals:   CLUSTER, HELM_RELEASE_NAME, CURRENT_NAMESPACE (read)
# Arguments: none
# NOTE(review): secret values are interpolated into SQL verbatim; a value
# containing a single quote would break the statements — confirm the secrets
# can never contain one.
function fixClusterAuth() {
  # Declared separately from the assignment: `local var=$(cmd)` always reports
  # the exit status of `local`, which would hide a failed secret lookup.
  local sgPasswords
  sgPasswords=$(kubectl get secret "${CLUSTER}" -o json |
    ksd |
    jq -r '.stringData')
  local superuserUsername=$(echo "${sgPasswords}" | jq -r '.["superuser-username"]')
  local superuserPassword=$(echo "${sgPasswords}" | jq -r '.["superuser-password"]')
  local replicationUsername=$(echo "${sgPasswords}" | jq -r '.["replication-username"]')
  local replicationPassword=$(echo "${sgPasswords}" | jq -r '.["replication-password"]')
  local authenticatorUsername=$(echo "${sgPasswords}" | jq -r '.["authenticator-username"]')
  local authenticatorPassword=$(echo "${sgPasswords}" | jq -r '.["authenticator-password"]')

  # Mirror Node Passwords
  local mirrorNodePasswords
  mirrorNodePasswords=$(kubectl get secret "${HELM_RELEASE_NAME}-passwords" -o json |
    ksd |
    jq -r '.stringData')
  local graphqlUsername=$(echo "${mirrorNodePasswords}" | jq -r '.HEDERA_MIRROR_GRAPHQL_DB_USERNAME')
  local graphqlPassword=$(echo "${mirrorNodePasswords}" | jq -r '.HEDERA_MIRROR_GRAPHQL_DB_PASSWORD')
  local grpcUsername=$(echo "${mirrorNodePasswords}" | jq -r '.HEDERA_MIRROR_GRPC_DB_USERNAME')
  local grpcPassword=$(echo "${mirrorNodePasswords}" | jq -r '.HEDERA_MIRROR_GRPC_DB_PASSWORD')
  local importerUsername=$(echo "${mirrorNodePasswords}" | jq -r '.HEDERA_MIRROR_IMPORTER_DB_USERNAME')
  local importerPassword=$(echo "${mirrorNodePasswords}" | jq -r '.HEDERA_MIRROR_IMPORTER_DB_PASSWORD')
  local ownerUsername=$(echo "${mirrorNodePasswords}" | jq -r '.HEDERA_MIRROR_IMPORTER_DB_OWNER')
  local ownerPassword=$(echo "${mirrorNodePasswords}" | jq -r '.HEDERA_MIRROR_IMPORTER_DB_OWNERPASSWORD')
  local restUsername=$(echo "${mirrorNodePasswords}" | jq -r '.HEDERA_MIRROR_REST_DB_USERNAME')
  local restPassword=$(echo "${mirrorNodePasswords}" | jq -r '.HEDERA_MIRROR_REST_DB_PASSWORD')
  local restJavaUsername=$(echo "${mirrorNodePasswords}" | jq -r '.HEDERA_MIRROR_RESTJAVA_DB_USERNAME')
  local restJavaPassword=$(echo "${mirrorNodePasswords}" | jq -r '.HEDERA_MIRROR_RESTJAVA_DB_PASSWORD')
  local rosettaUsername=$(echo "${mirrorNodePasswords}" | jq -r '.HEDERA_MIRROR_ROSETTA_DB_USERNAME')
  local rosettaPassword=$(echo "${mirrorNodePasswords}" | jq -r '.HEDERA_MIRROR_ROSETTA_DB_PASSWORD')
  local web3Username=$(echo "${mirrorNodePasswords}" | jq -r '.HEDERA_MIRROR_WEB3_DB_USERNAME')
  local web3Password=$(echo "${mirrorNodePasswords}" | jq -r '.HEDERA_MIRROR_WEB3_DB_PASSWORD')
  local dbName=$(echo "${mirrorNodePasswords}" | jq -r '.HEDERA_MIRROR_IMPORTER_DB_NAME')

  # The heredoc terminator must be the only text on its line, so the closing
  # parenthesis of the command substitution lives on the following line.
  local sql
  sql=$(
    cat <<EOF
alter user ${superuserUsername} with password '${superuserPassword}';
alter user ${graphqlUsername} with password '${graphqlPassword}';
alter user ${grpcUsername} with password '${grpcPassword}';
alter user ${importerUsername} with password '${importerPassword}';
alter user ${ownerUsername} with password '${ownerPassword}';
alter user ${restUsername} with password '${restPassword}';
alter user ${restJavaUsername} with password '${restJavaPassword}';
alter user ${rosettaUsername} with password '${rosettaPassword}';
alter user ${web3Username} with password '${web3Password}';
alter user ${replicationUsername} with password '${replicationPassword}';
alter user ${authenticatorUsername} with password '${authenticatorPassword}';

\c ${dbName}
insert into pg_dist_authinfo(nodeid, rolename, authinfo)
values (0, '${superuserUsername}', 'password=${superuserPassword}'),
(0, '${graphqlUsername}', 'password=${graphqlPassword}'),
(0, '${grpcUsername}', 'password=${grpcPassword}'),
(0, '${importerUsername}', 'password=${importerPassword}'),
(0, '${ownerUsername}', 'password=${ownerPassword}'),
(0, '${restUsername}', 'password=${restPassword}'),
(0, '${restJavaUsername}', 'password=${restJavaPassword}'),
(0, '${rosettaUsername}', 'password=${rosettaPassword}'),
(0, '${web3Username}', 'password=${web3Password}') on conflict (nodeid, rolename)
do
update set authinfo = excluded.authinfo;
EOF
  )

  log "Fixing passwords and pg_dist_authinfo for all pods in the cluster"
  local pod
  for pod in $(kubectl get pods -l 'app=StackGresCluster,role=master' -o name); do
    log "Updating passwords and pg_dist_authinfo for ${pod}"
    # printf never interprets the backslash in the "\c" line, whereas echo
    # does so when the xpg_echo shell option is enabled.
    printf '%s\n' "${sql}" |
      kubectl exec -i "${pod}" -c postgres-util -- psql -U "${superuserUsername}" -f -
  done

  checkCitusMetadataSyncStatus "${CURRENT_NAMESPACE}"
}

function getSnapshotHandle() {
local sgBackup=$1

Expand Down Expand Up @@ -249,7 +181,7 @@ function pickShardedBackup() {
echo "WARNING!!! You are about to restore an older backup, all later backups have to be removed before proceeding"
doContinue
count=$((backupIndex-1))
kubectl delete sgshardedbackups $(echo "${allBackups[@]:0:${count}}" | sed 's/[:TZ0-9\-]\+\///g')
kubectl delete sgshardedbackups $(echo "${allBackups[@]:0:${count}}" | sed -E 's/[:TZ0-9\-]+\///g')
log "Deleted ${count} most recent SGShardedBackups"
fi

Expand Down Expand Up @@ -286,8 +218,8 @@ function recreateManagedCluster() {
flux resume helmrelease -n "${CURRENT_NAMESPACE}" "${HELM_RELEASE_NAME}" --timeout 30m
fi

waitForStackGresClusterPods
fixClusterAuth
unpauseCitus "${CURRENT_NAMESPACE}"
updateStackgresCreds "${CLUSTER}" "${CURRENT_NAMESPACE}"
routeTraffic "${CURRENT_NAMESPACE}"
log "SGShardedCluster ${CLUSTER} is ready"
}
Expand All @@ -301,8 +233,9 @@ function restoreBackup() {
log "Creating SGShardedCluster with the restore configuration"
echo "${CLUSTER_CONFIG}" | kubectl apply -f -

waitForStackGresClusterPods
fixClusterAuth
unpauseCitus "${CURRENT_NAMESPACE}"
updateStackgresCreds "${CLUSTER}" "${CURRENT_NAMESPACE}"
checkCitusMetadataSyncStatus "${CURRENT_NAMESPACE}"
checkCoordinatorReplica

# Once again remove ownerReferences since in restore they will get updated with new owners
Expand Down Expand Up @@ -363,24 +296,6 @@ function swapPv() {
kubectl wait --for=jsonpath='{.status.phase}'=Bound pvc "${pvcs[@]}" --timeout=-1s
}

# Blocks until the sharded cluster's StatefulSets exist and report all replicas ready.
#
# Steps:
#   1. Poll once per second until the SGShardedCluster "${CLUSTER}" can be described.
#   2. Poll once per second until the number of StatefulSets labelled
#      app=StackGresCluster equals .spec.shards.clusters + 1
#      (the extra one is presumably the coordinator — TODO confirm).
#   3. For every such StatefulSet, wait without timeout until
#      .status.readyReplicas equals its .spec.replicas.
#
# Globals:   CLUSTER (read)
# Arguments: none
function waitForStackGresClusterPods() {
  # Keep all working variables function-scoped so they cannot clobber
  # identically named variables elsewhere in the script.
  local shardClusters expectedTotal currentTotal statefulSet expectedReplicas

  log "Waiting for all StackGresCluster StatefulSets to be created"
  while ! kubectl describe sgshardedclusters "${CLUSTER}" >/dev/null 2>&1; do
    sleep 1
  done

  shardClusters=$(kubectl get sgshardedclusters "${CLUSTER}" -o jsonpath='{.spec.shards.clusters}')
  expectedTotal=$((shardClusters + 1))
  while true; do
    currentTotal=$(kubectl get sts -l 'app=StackGresCluster' -o name | wc -l)
    # Numeric comparison: tolerates the leading blanks some wc implementations print.
    if [[ "${currentTotal}" -eq "${expectedTotal}" ]]; then
      break
    fi
    sleep 1
  done

  log "Waiting for all StackGresCluster pods to be ready"
  for statefulSet in $(kubectl get sts -l 'app=StackGresCluster' -o name); do
    expectedReplicas=$(kubectl get "${statefulSet}" -o jsonpath='{.spec.replicas}')
    kubectl wait --for=jsonpath='{.status.readyReplicas}'="${expectedReplicas}" "${statefulSet}" --timeout=-1s
  done
}

CURRENT_NAMESPACE=$(kubectl config view --minify --output 'jsonpath={..namespace}')

prepare
Expand Down
145 changes: 68 additions & 77 deletions docs/runbook/scripts/restore-volume-snapshot.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,10 @@ set -euo pipefail
source ./utils.sh

REPLACE_DISKS="${REPLACE_DISKS:-true}"
STACKGRES_MINIO_ROOT="${STACKGRES_MINIO_ROOT:-sgbackups.stackgres.io}"
ZFS_POOL_NAME="${ZFS_POOL_NAME:-zfspv-pool}"

function configureAndValidate() {
CURRENT_CONTEXT=$(kubectl config current-context)
GCP_PROJECT="$(readUserInput "Enter GCP Project for target: ")"
if [[ -z "${GCP_PROJECT}" ]]; then
log "GCP_PROJECT is not set and is required. Exiting"
Expand Down Expand Up @@ -120,6 +120,7 @@ function configureAndValidate() {
function prepareDiskReplacement() {
for namespace in "${NAMESPACES[@]}"; do
unrouteTraffic "${namespace}"
kubectl delete sgshardedbackups.stackgres.io -n "${namespace}" --all
pauseCitus "${namespace}"
done

Expand Down Expand Up @@ -159,7 +160,16 @@ function renameZfsVolumes() {
log "Snapshot pvc ${ZFS_POOL_NAME}/${snapshotPvcVolumeName} already matches pvc ${ZFS_POOL_NAME}/${pvcVolumeName}"
fi
done
kubectl_common exec "${podInfo}" -c openebs-zfs-plugin -- zfs list

local zfsSnapshots="$(kubectl_common exec "${podInfo}" -c openebs-zfs-plugin -- bash -c 'zfs list -H -o name -t snapshot')"

if [[ -z "${zfsSnapshots}" ]]; then
log "No snapshots found for pool ${ZFS_POOL_NAME} and node ${nodeId}"
else
log "Deleting snapshots ${zfsSnapshots}"
kubectl_common exec "${podInfo}" -c openebs-zfs-plugin -- bash -c "echo \"${zfsSnapshots}\" | xargs -n1 zfs destroy"
fi
kubectl_common exec "${podInfo}" -c openebs-zfs-plugin -- zfs list -t filesystem,snapshot
done
log "ZFS datasets renamed"
}
Expand Down Expand Up @@ -188,6 +198,33 @@ function replaceDisks() {
renameZfsVolumes
}

# Removes the backup configuration from a sharded cluster and deletes the
# files stored for that namespace in each referenced Minio bucket.
#
# Flow:
#   - Look up the first pod labelled app.kubernetes.io/name=minio via
#     kubectl_common; skip everything when none is found.
#   - Read .spec.configurations.backups from the SGShardedCluster; skip when absent.
#   - Patch the SGShardedCluster to remove /spec/configurations/backups.
#   - For every sgObjectStorage referenced by the removed configuration, resolve
#     its .spec.s3Compatible.bucket and, after operator confirmation (doContinue),
#     recursively delete
#     <MINIO_DATA_DIR>/<bucket>/<STACKGRES_MINIO_ROOT>/<namespace> inside the Minio pod.
#
# Globals:   STACKGRES_MINIO_ROOT (read)
# Arguments: $1 - namespace of the sharded cluster
#            $2 - name of the SGShardedCluster resource
function cleanupBackupStorage() {
  local namespace="${1}"
  local shardedClusterName="${2}"
  # Declarations are kept apart from the command substitutions below so the
  # exit status of `local` cannot mask a failed lookup.
  local minioPod backups minioDataPath minioBucket pathToDelete backupStorage

  minioPod=$(kubectl_common get pods -l 'app.kubernetes.io/name=minio' -o json | jq -r '.items[0].metadata.name')
  if [[ -z "${minioPod}" || "${minioPod}" == "null" ]]; then
    echo "Minio pod not found. Skipping cleanup"
    return
  fi

  backups=$(kubectl get sgshardedclusters.stackgres.io -n "${namespace}" "${shardedClusterName}" -o json | jq -r '.spec.configurations.backups')
  if [[ "${backups}" == "null" ]]; then
    echo "No backup configuration found for sharded cluster ${shardedClusterName} in namespace ${namespace}. Skipping cleanup"
    return
  fi

  kubectl patch sgshardedclusters.stackgres.io "${shardedClusterName}" -n "${namespace}" --type='json' -p '[{"op": "remove", "path": "/spec/configurations/backups"}]'

  # Single quotes are intentional: the variable is expanded by the shell inside the pod.
  minioDataPath=$(kubectl_common exec "${minioPod}" -- sh -c 'echo $MINIO_DATA_DIR') || minioDataPath=""
  if [[ -z "${minioDataPath}" ]]; then
    # Without the data directory the delete path would start at "/", so never
    # run the recursive delete in that case.
    echo "Unable to determine MINIO_DATA_DIR for pod ${minioPod}. Skipping cleanup of backup files"
    return
  fi

  # Word splitting is relied on here: each line printed by jq is one object storage name.
  local backupStorages=($(echo "${backups}" | jq -r '.[].sgObjectStorage'))
  for backupStorage in "${backupStorages[@]}"; do
    minioBucket=$(kubectl get sgObjectStorage.stackgres.io -n "${namespace}" "${backupStorage}" -o json | jq -r '.spec.s3Compatible.bucket') || minioBucket=""
    if [[ -z "${minioBucket}" || "${minioBucket}" == "null" ]]; then
      echo "No bucket found for object storage ${backupStorage} in namespace ${namespace}. Skipping"
      continue
    fi

    pathToDelete="${minioDataPath}/${minioBucket}/${STACKGRES_MINIO_ROOT}/${namespace}"
    echo "Cleaning up wal files in minio bucket ${minioBucket}. Will delete all files at path ${pathToDelete}"
    doContinue
    kubectl_common exec "${minioPod}" -- mc rm --recursive --force "${pathToDelete}"
  done
}

function configureShardedClusterResource() {
local pvcsInNamespace="${1}"
local shardedClusterName="${2}"
Expand All @@ -201,23 +238,37 @@ function configureShardedClusterResource() {
sort_by(.citusCluster.citusGroup, .citusCluster.podName)|
to_entries|
map({index: .key, pods: {persistentVolume: {size: .value.snapshotPvcSize}}})')
local shardedClusterPatch=$(echo "${workerPvcOverrides}" |
jq -r --arg coordinatorPvcSize "${coordinatorPvcSize}" \
'{

log "Patching sharded cluster ${shardedClusterName} in namespace ${namespace}"
local shardedCluster=$(kubectl get sgshardedclusters.stackgres.io -n "${namespace}" "${shardedClusterName}" -o json)
local shardedClusterPatch=$(echo "${shardedCluster} ${workerPvcOverrides}" |
jq -s --arg COORDINATOR_PVC_SIZE "${coordinatorPvcSize}" \
'.[0] as $cluster |
.[1] as $overrides |
$cluster |
if(.spec.configurations | has("backups"))
then .spec.configurations.backups | map(del(.paths))
else
[]
end |
{
spec: {
configurations: {
backups: (.)
},
coordinator: {
pods: {
persistentVolume: {
size: $coordinatorPvcSize
size: $COORDINATOR_PVC_SIZE
}
}
},
shards: {
overrides: (.)
overrides: $overrides
}
}
}')
log "Patching sharded cluster ${shardedClusterName} in namespace ${namespace} with ${shardedClusterPatch}"
}
}')
cleanupBackupStorage "${namespace}" "${shardedClusterName}"
kubectl patch sgshardedclusters.stackgres.io -n "${namespace}" "${shardedClusterName}" --type merge -p "${shardedClusterPatch}"
log "
**** IMPORTANT ****
Expand All @@ -234,55 +285,11 @@ function configureShardedClusterResource() {
function markAndConfigurePrimaries() {
local pvcsInNamespace="${1}"
local shardedClusterName="${2}"
local namespace="${3}"

# Stackgres Passwords
local primaryCoordinator=$(echo "${pvcsInNamespace}" |
jq -r 'map(select(.snapshotPrimary and .citusCluster.isCoordinator))|first')
local sgPasswordsSecretName=$(echo "${primaryCoordinator}" | jq -r '.citusCluster.clusterName')
local sgPasswords=$(kubectl get secret -n "${namespace}" "${sgPasswordsSecretName}" -o json |
ksd |
jq -r '.stringData')
local superuserUsername=$(echo "${sgPasswords}" | jq -r '.["superuser-username"]')
local superuserPassword=$(echo "${sgPasswords}" | jq -r '.["superuser-password"]')
local replicationUsername=$(echo "${sgPasswords}" | jq -r '.["replication-username"]')
local replicationPassword=$(echo "${sgPasswords}" | jq -r '.["replication-password"]')
local authenticatorUsername=$(echo "${sgPasswords}" | jq -r '.["authenticator-username"]')
local authenticatorPassword=$(echo "${sgPasswords}" | jq -r '.["authenticator-password"]')

# Mirror Node Passwords
local mirrorNodePasswords=$(kubectl get secret -n "${namespace}" "${HELM_RELEASE_NAME}-passwords" -o json |
ksd |
jq -r '.stringData')
local graphqlUsername=$(echo "${mirrorNodePasswords}" | jq -r '.HEDERA_MIRROR_GRAPHQL_DB_USERNAME')
local graphqlPassword=$(echo "${mirrorNodePasswords}" | jq -r '.HEDERA_MIRROR_GRAPHQL_DB_PASSWORD')
local grpcUsername=$(echo "${mirrorNodePasswords}" | jq -r '.HEDERA_MIRROR_GRPC_DB_USERNAME')
local grpcPassword=$(echo "${mirrorNodePasswords}" | jq -r '.HEDERA_MIRROR_GRPC_DB_PASSWORD')
local importerUsername=$(echo "${mirrorNodePasswords}" | jq -r '.HEDERA_MIRROR_IMPORTER_DB_USERNAME')
local importerPassword=$(echo "${mirrorNodePasswords}" | jq -r '.HEDERA_MIRROR_IMPORTER_DB_PASSWORD')
local ownerUsername=$(echo "${mirrorNodePasswords}" | jq -r '.HEDERA_MIRROR_IMPORTER_DB_OWNER')
local ownerPassword=$(echo "${mirrorNodePasswords}" | jq -r '.HEDERA_MIRROR_IMPORTER_DB_OWNERPASSWORD')
local restUsername=$(echo "${mirrorNodePasswords}" | jq -r '.HEDERA_MIRROR_REST_DB_USERNAME')
local restPassword=$(echo "${mirrorNodePasswords}" | jq -r '.HEDERA_MIRROR_REST_DB_PASSWORD')
local restJavaUsername=$(echo "${mirrorNodePasswords}" | jq -r '.HEDERA_MIRROR_RESTJAVA_DB_USERNAME')
local restJavaPassword=$(echo "${mirrorNodePasswords}" | jq -r '.HEDERA_MIRROR_RESTJAVA_DB_PASSWORD')
local rosettaUsername=$(echo "${mirrorNodePasswords}" | jq -r '.HEDERA_MIRROR_ROSETTA_DB_USERNAME')
local rosettaPassword=$(echo "${mirrorNodePasswords}" | jq -r '.HEDERA_MIRROR_ROSETTA_DB_PASSWORD')
local web3Username=$(echo "${mirrorNodePasswords}" | jq -r '.HEDERA_MIRROR_WEB3_DB_USERNAME')
local web3Password=$(echo "${mirrorNodePasswords}" | jq -r '.HEDERA_MIRROR_WEB3_DB_PASSWORD')
local dbName=$(echo "${mirrorNodePasswords}" | jq -r '.HEDERA_MIRROR_IMPORTER_DB_NAME')
local updatePasswordsSql=$(cat <<EOF
alter user ${superuserUsername} with password '${superuserPassword}';
alter user ${graphqlUsername} with password '${graphqlPassword}';
alter user ${grpcUsername} with password '${grpcPassword}';
alter user ${importerUsername} with password '${importerPassword}';
alter user ${ownerUsername} with password '${ownerPassword}';
alter user ${restUsername} with password '${restPassword}';
alter user ${restJavaUsername} with password '${restJavaPassword}';
alter user ${rosettaUsername} with password '${rosettaPassword}';
alter user ${web3Username} with password '${web3Password}';
alter user ${replicationUsername} with password '${replicationPassword}';
alter user ${authenticatorUsername} with password '${authenticatorPassword}';
EOF)


local clusterGroups=$(echo "${pvcsInNamespace}" |
jq -r 'group_by(.citusCluster.clusterName)|
Expand Down Expand Up @@ -332,26 +339,10 @@ group ${citusGroup}. Will failover"
fi
log "Patching cluster ${clusterName} in namespace ${namespace} with ${clusterPatch}"
kubectl patch sgclusters.stackgres.io -n "${namespace}" "${clusterName}" --type merge -p "${clusterPatch}"

log "Executing sql command for cluster ${clusterName}: ${updatePasswordsSql}"
kubectl exec -n "${namespace}" "${primaryPod}" -c postgres-util -- \
psql -U "${superuserUsername}" -c "${updatePasswordsSql}"

kubectl exec -n "${namespace}" "${primaryPod}" -c postgres-util \
-- psql -U "${superuserUsername}" -d "${dbName}" -c \
"insert into pg_dist_authinfo(nodeid, rolename, authinfo)
values (0, '${superuserUsername}', 'password=${superuserPassword}'),
(0, '${graphqlUsername}', 'password=${graphqlPassword}'),
(0, '${grpcUsername}', 'password=${grpcPassword}'),
(0, '${importerUsername}', 'password=${importerPassword}'),
(0, '${ownerUsername}', 'password=${ownerPassword}'),
(0, '${restUsername}', 'password=${restPassword}'),
(0, '${restJavaUsername}', 'password=${restJavaPassword}'),
(0, '${rosettaUsername}', 'password=${rosettaPassword}'),
(0, '${web3Username}', 'password=${web3Password}') on conflict (nodeid, rolename)
do
update set authinfo = excluded.authinfo;"
done

waitForPatroniMasters "${namespace}"
updateStackgresCreds "${shardedClusterName}" "${namespace}"
}

function patchCitusClusters() {
Expand All @@ -375,8 +366,8 @@ function patchCitusClusters() {
local shardedClusterName=$(echo "${pvcsInNamespace}" | jq -r '.[0].citusCluster.shardedClusterName')

configureShardedClusterResource "${pvcsInNamespace}" "${shardedClusterName}" "${namespace}"
unpauseCitus "${namespace}" "true"
markAndConfigurePrimaries "${pvcsInNamespace}" "${shardedClusterName}"
unpauseCitus "${namespace}" "true" "false"
markAndConfigurePrimaries "${pvcsInNamespace}" "${shardedClusterName}" "${namespace}"
routeTraffic "${namespace}"
done
}
Expand Down
Loading
Loading