内容简介:本文风哥教程参考Linux官方文档、Red Hat Enterprise Linux官方文档、Ansible Automation Platform官方文档、Docker官方文档、Kubernetes官方文档和Podman官方文档等内容,更多学习教程公众号风哥教程itpux_com详细介绍了相关技术的配置和使用方法。
风哥提示:本文档介绍Kubernetes集群灾难恢复实战案例。
Part01-etcd数据恢复
1.1 etcd备份恢复
[root@k8s-master ~]# cat > /usr/local/bin/etcd-backup.sh << 'EOF'
#!/bin/bash
# etcd-backup.sh
# Purpose: take an etcd v3 snapshot and prune backups older than 7 days.
# from:www.itpux.com.qq113257174.wx:itpux-com
# web: http://www.fgedu.net.cn
set -euo pipefail

BACKUP_DIR="/backup/etcd"
DATE=$(date +%Y%m%d-%H%M%S)
ETCD_ENDPOINT="https://127.0.0.1:2379"
ETCD_CACERT="/etc/kubernetes/pki/etcd/ca.crt"
ETCD_CERT="/etc/kubernetes/pki/etcd/server.crt"
ETCD_KEY="/etc/kubernetes/pki/etcd/server.key"

mkdir -p "$BACKUP_DIR"

# ETCDCTL_API=3 forces the v3 API; snapshot save requires it on older etcdctl.
ETCDCTL_API=3 etcdctl snapshot save "$BACKUP_DIR/etcd-snapshot-$DATE.db" \
  --endpoints="$ETCD_ENDPOINT" \
  --cacert="$ETCD_CACERT" \
  --cert="$ETCD_CERT" \
  --key="$ETCD_KEY"

# Keep only the last 7 days of snapshots.
find "$BACKUP_DIR" -name "etcd-snapshot-*.db" -mtime +7 -delete

echo "etcd backup completed: $BACKUP_DIR/etcd-snapshot-$DATE.db"
EOF
[root@k8s-master ~]# chmod +x /usr/local/bin/etcd-backup.sh
# Schedule the backup every 4 hours via cron.
[root@k8s-master ~]# cat > /etc/cron.d/etcd-backup << 'EOF'
0 */4 * * * root /usr/local/bin/etcd-backup.sh >> /var/log/etcd-backup.log 2>&1
EOF
# 模拟etcd数据损坏恢复
[root@k8s-master ~]# systemctl stop kubelet
[root@k8s-master ~]# mv /var/lib/etcd /var/lib/etcd.corrupted
# 恢复etcd数据
[root@k8s-master ~]# ETCDCTL_API=3 etcdctl snapshot restore /backup/etcd/etcd-snapshot-20260404.db \
--data-dir=/var/lib/etcd \
--name=k8s-master \
--initial-cluster=k8s-master=https://192.168.1.100:2380 \
--initial-cluster-token=etcd-cluster \
--initial-advertise-peer-urls=https://192.168.1.100:2380
2026-04-04 23:00:00.000000 I | mvcc: restore compact to 12345678
2026-04-04 23:00:00.000000 I | etcdserver: starting member abc123 in cluster etcd-cluster
2026-04-04 23:00:00.000000 I | etcdserver: set the initial cluster version to 3.5
2026-04-04 23:00:00.000000 I | etcdserver: starting server… [member: abc123, revision: 12345678, applied: 12345678, lease: 0]
2026-04-04 23:00:00.000000 I | etcdserver: published {Name:k8s-master ClientURLs:[https://192.168.1.100:2379]} to cluster
[root@k8s-master ~]# chown -R etcd:etcd /var/lib/etcd
[root@k8s-master ~]# systemctl start kubelet
# 验证恢复结果
[root@k8s-master ~]# kubectl get nodes
NAME STATUS ROLES AGE VERSION
k8s-master Ready control-plane 100d v1.28.3
k8s-node1 Ready
k8s-node2 Ready
Part02-控制平面恢复
2.1 Master节点恢复
[root@k8s-master ~]# kubeadm init phase certs all --config /backup/kubernetes-backup/kubeadm-config.yaml
[certs] Using certificateDir folder "/etc/kubernetes/pki"
[certs] Generating "ca" certificate and key
[certs] Generating "apiserver" certificate and key
[certs] apiserver serving cert is signed for DNS names [k8s-master kubernetes kubernetes.default kubernetes.default.svc kubernetes.default.svc.cluster.local] and IPs [10.96.0.1 192.168.1.100]
[certs] Generating "apiserver-kubelet-client" certificate and key
[certs] Generating "front-proxy-ca" certificate and key
[certs] Generating "front-proxy-client" certificate and key
[certs] Generating "etcd/ca" certificate and key
[certs] Generating "etcd/server" certificate and key
[certs] Generating "etcd/peer" certificate and key
[certs] Generating "etcd/healthcheck-client" certificate and key
[certs] Generating "apiserver-etcd-client" certificate and key
[certs] Generating "sa" key and public key
# 恢复kubeconfig
[root@k8s-master ~]# kubeadm init phase kubeconfig all --config /backup/kubernetes-backup/kubeadm-config.yaml
[kubeconfig] Using kubeconfig folder "/etc/kubernetes"
[kubeconfig] Writing "admin.conf" kubeconfig file
[kubeconfig] Writing "kubelet.conf" kubeconfig file
[kubeconfig] Writing "controller-manager.conf" kubeconfig file
[kubeconfig] Writing "scheduler.conf" kubeconfig file
# 恢复控制平面静态Pod
[root@k8s-master ~]# kubeadm init phase control-plane all --config /backup/kubernetes-backup/kubeadm-config.yaml
[control-plane] Using manifest folder "/etc/kubernetes/manifests"
[control-plane] Creating static Pod manifest for "kube-apiserver"
[control-plane] Creating static Pod manifest for "kube-controller-manager"
[control-plane] Creating static Pod manifest for "kube-scheduler"
# 等待控制平面就绪
[root@k8s-master ~]# kubectl wait --for=condition=Ready pods -n kube-system -l component=kube-apiserver --timeout=300s
pod/kube-apiserver-k8s-master condition met
[root@k8s-master ~]# kubectl wait --for=condition=Ready pods -n kube-system -l component=kube-controller-manager --timeout=300s
pod/kube-controller-manager-k8s-master condition met
[root@k8s-master ~]# kubectl wait --for=condition=Ready pods -n kube-system -l component=kube-scheduler --timeout=300s
pod/kube-scheduler-k8s-master condition met
Part03-应用恢复
3.1 应用数据恢复
[root@k8s-master ~]# velero backup get
NAME STATUS ERRORS WARNINGS CREATED EXPIRES STORAGE LOCATION SELECTOR
fgedu-daily Completed 0 0 2026-04-04 00:00:00 +0800 CST 29d default
# 查看备份详情
[root@k8s-master ~]# velero backup describe fgedu-daily
Name: fgedu-daily
Namespace: velero
Labels: velero.io/storage-location=default
Annotations: velero.io/source-cluster-k8s-gitversion=v1.28.3
velero.io/source-cluster-k8s-major-version=1
velero.io/source-cluster-k8s-minor-version=28
Phase: Completed
Namespaces:
Included: *
Excluded:
Resources:
Included: *
Excluded:
Cluster-scoped: auto
Backup Volumes:
Included: *
Excluded:
# 恢复特定命名空间
[root@k8s-master ~]# velero restore create --from-backup fgedu-daily --include-namespaces fgedu-prod
Restore request “fgedu-daily-20260404230000” submitted successfully.
Run `velero restore describe fgedu-daily-20260404230000` for more details.
# 查看恢复状态
[root@k8s-master ~]# velero restore describe fgedu-daily-20260404230000
Name: fgedu-daily-20260404230000
Namespace: velero
Labels:
Annotations:
Phase: Completed
Total items to be restored: 25
Items restored: 25
Started: 2026-04-04 23:00:00 +0800 CST
Completed: 2026-04-04 23:01:00 +0800 CST
Warnings:
Velero:
Cluster:
Namespaces:
fgedu-prod: could not restore, ConfigMap “kube-root-ca.crt” already exists. Warning: the in-cluster version is different from the backed up version.
# 验证恢复的应用
[root@k8s-master ~]# kubectl get all -n fgedu-prod
NAME READY STATUS RESTARTS AGE
pod/fgedu-app-abc12-xyz789 1/1 Running 0 5m
pod/fgedu-app-abc12-abc12 1/1 Running 0 5m
pod/fgedu-app-abc12-def34 1/1 Running 0 5m
NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE
service/fgedu-app-svc ClusterIP 10.96.100.100
NAME READY UP-TO-DATE AVAILABLE AGE
deployment.apps/fgedu-app 3/3 3 3 5m
Part04-灾难恢复演练
4.1 演练脚本
[root@k8s-master ~]# cat > /usr/local/bin/dr-drill.sh << 'EOF' #!/bin/bash # dr-drill.sh # from:www.itpux.com.qq113257174.wx:itpux-com # web: http://www.fgedu.net.cn DR_NAMESPACE="fgedu-dr-test" BACKUP_NAME="dr-test-$(date +%Y%m%d-%H%M%S)" echo "=== Kubernetes灾难恢复演练 ===" echo "开始时间: $(date)" # 1. 创建测试命名空间 echo "1. 创建测试命名空间..." kubectl create namespace $DR_NAMESPACE # 2. 部署测试应用 echo "2. 部署测试应用..." kubectl apply -f - <
- 定期备份etcd数据
- 使用Velero备份应用
- 定期进行恢复演练
- 保留多个备份版本
- 文档化恢复流程
本文由风哥教程整理发布,仅用于学习测试使用,转载注明出处:http://www.fgedu.net.cn/10327.html
