内容简介:本文风哥教程参考Linux官方文档、Red Hat Enterprise Linux官方文档、Ansible Automation Platform官方文档、Docker官方文档、Kubernetes官方文档和Podman官方文档等内容,详细介绍了相关技术的配置和使用方法。
风哥提示:
本文档介绍存储灾难恢复的规划和实施方法。
Part01-灾难恢复规划
1.1 灾难恢复策略
[root@dr-server ~]# cat > /root/dr-plan.txt << 'EOF'
灾难恢复规划
============
1. RTO/RPO定义
- RTO(恢复时间目标):系统恢复所需时间
- RPO(恢复点目标):可接受的数据丢失量
2. 灾难恢复级别
- 级别1:实时复制,RTO<1小时,RPO=0
- 级别2:异步复制,RTO<4小时,RPO<1小时
- 级别3:定期备份,RTO<24小时,RPO<24小时
3. 灾难恢复架构
- 同城双活:两个数据中心同时提供服务
- 同城灾备:主备模式,备用数据中心待命
- 异地灾备:跨地域数据复制
4. 故障切换流程
- 故障检测
- 业务评估
- 决策切换
- 执行切换
- 业务验证
5. 回切流程
- 数据同步
- 业务验证
- 决策回切
- 执行回切
EOF
# 创建灾难恢复目录
[root@dr-server ~]# mkdir -p /dr/{scripts,logs,config}
1.2 数据复制配置
[root@primary ~]# dnf install -y drbd drbd-utils
# 配置DRBD资源
[root@primary ~]# cat > /etc/drbd.d/r0.res << 'EOF'
resource r0 {
protocol C;
meta-disk internal;
device /dev/drbd0;
disk /dev/sdb1;
net {
cram-hmac-alg sha1;
shared-secret "drbd-secret";
}
on primary.fgedu.net.cn {
address 192.168.1.10:7789;
}
on secondary.fgedu.net.cn {
address 192.168.1.11:7789;
}
}
EOF
# 初始化DRBD
[root@primary ~]# drbdadm create-md r0
initializing activity log
initializing bitmap (320 KB)
New drbd meta data block successfully created.
[root@secondary ~]# drbdadm create-md r0
initializing activity log
initializing bitmap (320 KB)
New drbd meta data block successfully created.
# 启动DRBD
[root@primary ~]# drbdadm up r0
[root@secondary ~]# drbdadm up r0
# 设置主节点
[root@primary ~]# drbdadm primary --force r0
# 查看状态
[root@primary ~]# drbdadm status r0
r0 role:Primary
disk:UpToDate
secondary.fgedu.net.cn role:Secondary
replication:Established peer-disk:UpToDate
# 格式化并挂载
[root@primary ~]# mkfs.xfs /dev/drbd0
[root@primary ~]# mount /dev/drbd0 /data
# 测试数据同步
[root@primary ~]# echo "test data" > /data/test.txt
[root@primary ~]# cat /data/test.txt
test data
# 切换到备节点
[root@primary ~]# umount /data
[root@primary ~]# drbdadm secondary r0
[root@secondary ~]# drbdadm primary r0
[root@secondary ~]# mount /dev/drbd0 /data
[root@secondary ~]# cat /data/test.txt
test data
Part02-故障切换实施
2.1 自动故障切换
[root@primary ~]# dnf install -y pacemaker corosync pcs
# 设置密码
[root@primary ~]# echo "hacluster" | passwd --stdin hacluster
Changing password for user hacluster.
passwd: all authentication tokens updated successfully.
# 启动服务
[root@primary ~]# systemctl enable --now pcsd
Created symlink /etc/systemd/system/multi-user.target.wants/pcsd.service → /usr/lib/systemd/system/pcsd.service.
# 认证节点
[root@primary ~]# pcs host auth primary secondary -u hacluster -p hacluster
primary: Authorized
secondary: Authorized
# 创建集群
[root@primary ~]# pcs cluster setup dr-cluster primary secondary
Destroying cluster on nodes: primary, secondary…
primary: Stopping Cluster (pacemaker)…
primary: Stopping Cluster (corosync)…
secondary: Stopping Cluster (pacemaker)…
secondary: Stopping Cluster (corosync)…
primary: Successfully destroyed cluster
secondary: Successfully destroyed cluster
Sending 'pacemaker Remote auth' to 'primary', 'secondary'
primary: successful distribution of the file 'pacemaker Remote auth'
secondary: successful distribution of the file 'pacemaker Remote auth'
Sending cluster config files to the nodes…
primary: Succeeded
secondary: Succeeded
Synchronizing pcsd SSL certificates on nodes primary, secondary…
primary: Success
secondary: Success
Restarting pcsd on the nodes in order to reload the SSL certificates…
primary: Success
secondary: Success
# 启动集群
[root@primary ~]# pcs cluster start --all
primary: Starting Cluster (corosync)…
secondary: Starting Cluster (corosync)…
primary: Starting Cluster (pacemaker)…
secondary: Starting Cluster (pacemaker)…
# 查看集群状态
[root@primary ~]# pcs status cluster
Cluster Status:
Cluster Summary:
* Stack: corosync
* Current DC: primary (version 2.1.2-4.el9) – partition with quorum
* Last updated: Fri Apr 4 22:20:00 2026
* Last change: Fri Apr 4 22:19:00 2026 by hacluster via crmd on primary
* 2 nodes configured
* 0 resource instances configured
# 禁用STONITH
[root@primary ~]# pcs property set stonith-enabled=false
# 创建DRBD资源
[root@primary ~]# pcs resource create drbd_r0 ocf:linbit:drbd drbd_resource=r0 op monitor interval=30s
# 创建主从资源
[root@primary ~]# pcs resource promotable drbd_r0 promoted-max=1 promoted-node-max=1 clone-max=2 clone-node-max=1 notify=true
# 创建文件系统资源
[root@primary ~]# pcs resource create fs_data ocf:heartbeat:Filesystem device="/dev/drbd0" directory="/data" fstype="xfs"
# 设置约束
[root@primary ~]# pcs constraint colocation add fs_data with Promoted drbd_r0-clone INFINITY
[root@primary ~]# pcs constraint order promote drbd_r0-clone then start fs_data
# 创建虚拟IP资源
[root@primary ~]# pcs resource create vip ocf:heartbeat:IPaddr2 ip=192.168.1.100 cidr_netmask=24 op monitor interval=30s
# 设置约束
[root@primary ~]# pcs constraint colocation add vip with fs_data INFINITY
[root@primary ~]# pcs constraint order fs_data then vip
# 查看资源状态
[root@primary ~]# pcs status resources
* Clone Set: drbd_r0-clone [drbd_r0] (promotable):
* Promoted: [ primary ]
* Unpromoted: [ secondary ]
* fs_data (ocf:heartbeat:Filesystem): Started primary
* vip (ocf:heartbeat:IPaddr2): Started primary
2.2 手动故障切换
[root@primary ~]# pcs cluster standby primary
[root@primary ~]# pcs status resources
* Clone Set: drbd_r0-clone [drbd_r0] (promotable):
* Promoted: [ secondary ]
* Unpromoted: [ primary ]
* fs_data (ocf:heartbeat:Filesystem): Started secondary
* vip (ocf:heartbeat:IPaddr2): Started secondary
# 恢复节点
[root@primary ~]# pcs cluster unstandby primary
[root@primary ~]# pcs status resources
* Clone Set: drbd_r0-clone [drbd_r0] (promotable):
* Promoted: [ primary ]
* Unpromoted: [ secondary ]
* fs_data (ocf:heartbeat:Filesystem): Started primary
* vip (ocf:heartbeat:IPaddr2): Started primary
# 创建故障切换脚本
[root@primary ~]# cat > /dr/scripts/failover.sh << 'EOF'
#!/bin/bash
# failover.sh — manual failover/failback helper for the Pacemaker dr-cluster.
#
# Usage: failover.sh {failover|failback|status} [node]
#   failover <node> — put <node> into standby so resources move to its peer
#   failback <node> — take <node> out of standby so resources may move back
#   status          — show current resource placement
#
# ${1:-} / ${2:-} avoid "unbound variable" errors if run under `set -u`.
ACTION=${1:-}
NODE=${2:-}
case "$ACTION" in
failover)
# Guard: pcs would otherwise be called with an empty node name.
if [ -z "$NODE" ]; then
echo "Usage: $0 failover <node>" >&2
exit 1
fi
echo "Failing over from $NODE..."
pcs cluster standby "$NODE"
# Give the cluster a moment to migrate resources before reporting.
sleep 5
pcs status resources
;;
failback)
if [ -z "$NODE" ]; then
echo "Usage: $0 failback <node>" >&2
exit 1
fi
echo "Failing back to $NODE..."
pcs cluster unstandby "$NODE"
sleep 5
pcs status resources
;;
status)
pcs status resources
;;
*)
# Unknown or missing action: print usage to stderr, fail.
echo "Usage: $0 {failover|failback|status} [node]" >&2
exit 1
;;
esac
EOF
[root@primary ~]# chmod +x /dr/scripts/failover.sh
# 测试故障切换
[root@primary ~]# /dr/scripts/failover.sh failover primary
Failing over from primary...
primary: successfully placed node in standby mode
* Clone Set: drbd_r0-clone [drbd_r0] (promotable):
* Promoted: [ secondary ]
* Unpromoted: [ primary ]
* fs_data (ocf:heartbeat:Filesystem): Started secondary
* vip (ocf:heartbeat:IPaddr2): Started secondary
- 制定详细的灾难恢复计划
- 定期进行灾难恢复演练
- 配置自动故障切换
- 保留多个恢复点
- 验证数据完整性
本文由风哥教程整理发布,仅用于学习测试使用,转载注明出处:http://www.fgedu.net.cn/10327.html
