IT教程FG415-容灾系统监控

内容大纲

1. 容灾系统监控概述
2. 监控架构设计
3. 数据复制监控
4. 故障切换监控
5. 网络链路监控
6. 存储系统监控
7. 应用服务监控
8. 告警管理
9. 容灾演练监控
10. 最佳实践

1. 容灾系统监控概述

容灾系统监控是保障业务连续性的关键环节，它通过实时监控容灾系统的各个组件，确保在灾难发生时能够快速、准确地切换到备用系统。容灾监控需要覆盖数据复制、网络链路、存储系统、应用服务等多个层面。

容灾系统监控的核心目标包括：

实时监控数据复制状态和延迟
监控主备站点之间的网络连通性
监控存储系统的健康状态
监控应用服务的可用性
验证故障切换机制的有效性
确保RPO和RTO目标的达成

更多学习教程www.fgedu.net.cn

2. 监控架构设计

2.1 监控系统架构

# 容灾监控系统架构部署
# 在主站点部署监控服务器
# NOTE(review): "--network monitoring" needs that user-defined network to exist
# beforehand (docker network create monitoring) - confirm it is created elsewhere.
$ docker run -d --name prometheus-main \
  --network monitoring \
  -p 9090:9090 \
  -v /data/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml \
  prom/prometheus:latest

# Deploy the monitoring server at the DR site (published on host port 9091)
$ docker run -d --name prometheus-dr \
  --network monitoring \
  -p 9091:9090 \
  -v /data/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml \
  prom/prometheus:latest

# Deploy Grafana for visualization
$ docker run -d --name grafana \
  --network monitoring \
  -p 3000:3000 \
  -v /data/grafana:/var/lib/grafana \
  grafana/grafana:latest

输出结果如下：
CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES
a1b2c3d4e5f6 prom/prometheus:latest "/bin/prometheus --c…" 5 seconds ago Up 4 seconds 0.0.0.0:9090->9090/tcp prometheus-main
f6e5d4c3b2a1 prom/prometheus:latest "/bin/prometheus --c…" 3 seconds ago Up 2 seconds 0.0.0.0:9091->9090/tcp prometheus-dr
1a2b3c4d5e6f grafana/grafana:latest "/run.sh" 1 second ago Up 1 second 0.0.0.0:3000->3000/tcp grafana

2.2 监控指标配置

# 配置Prometheus监控指标
$ cat > /data/prometheus/prometheus.yml << 'EOF'
# Prometheus configuration for the disaster-recovery (DR) monitoring setup.
global:
  scrape_interval: 15s
  evaluation_interval: 15s
  external_labels:
    monitor: 'dr-monitor'

# Alertmanager instance that receives fired alerts
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - alertmanager:9093

# Alert rule files, resolved relative to this configuration file
rule_files:
  - "dr_alert_rules.yml"

scrape_configs:
  - job_name: 'prometheus'
    static_configs:
      - targets: ['fgedudb:9090']

  # Host metrics, labelled by site
  - job_name: 'node-exporter-main'
    static_configs:
      - targets: ['main-site:9100']
        labels:
          site: 'main'

  - job_name: 'node-exporter-dr'
    static_configs:
      - targets: ['dr-site:9100']
        labels:
          site: 'dr'

  - job_name: 'mysql-replication'
    static_configs:
      - targets: ['mysql-exporter:9104']

  - job_name: 'storage-replication'
    static_configs:
      - targets: ['storage-exporter:9101']

  - job_name: 'network-monitor'
    static_configs:
      - targets: ['network-exporter:9105']
EOF

3. 数据复制监控

3.1 数据库复制监控

# MySQL主从复制监控脚本
$ cat > /usr/local/bin/mysql_replication_monitor.sh << 'EOF'
#!/bin/bash
# Print MySQL replication metrics for the main and DR databases in the
# Prometheus text format: IO/SQL thread state and lag on the DR replica,
# plus the binary log position on the main database.

# MySQL connection settings (named DB_* so the USER environment variable is
# left untouched).
MAIN_HOST="main-db.fgedu.net.cn"
DR_HOST="dr-db.fgedu.net.cn"
DB_USER="monitor"
DB_PASS="monitor_password"
# NOTE(review): consider moving the credentials into a client option file
# (e.g. ~/.my.cnf) instead of hard-coding them here.

# Print the value of the first "<key>: <value>" line read from stdin.
field() {
  awk -v key="$1:" '$1 == key { print $2; exit }'
}

# Print 1 when the argument is exactly "Yes", otherwise 0.
yes_flag() {
  if [[ "$1" == "Yes" ]]; then
    echo 1
  else
    echo 0
  fi
}

# Main database status
MAIN_STATUS=$(mysql -h "$MAIN_HOST" -u "$DB_USER" -p"$DB_PASS" \
  -e 'SHOW MASTER STATUS\G' 2>/dev/null)
MAIN_POSITION=$(field Position <<< "$MAIN_STATUS")

# DR replica status
DR_STATUS=$(mysql -h "$DR_HOST" -u "$DB_USER" -p"$DB_PASS" \
  -e 'SHOW SLAVE STATUS\G' 2>/dev/null)
DR_IO_RUNNING=$(field Slave_IO_Running <<< "$DR_STATUS")
DR_SQL_RUNNING=$(field Slave_SQL_Running <<< "$DR_STATUS")
DR_SECONDS_BEHIND=$(field Seconds_Behind_Master <<< "$DR_STATUS")

# A sample value has to be numeric: a missing value is reported as 0, and
# anything non-numeric (MySQL prints NULL here) becomes NaN.
case "$DR_SECONDS_BEHIND" in
  '') DR_SECONDS_BEHIND=0 ;;
  *[!0-9]*) DR_SECONDS_BEHIND=NaN ;;
esac

# Emit the metrics
echo "mysql_replication_io_running{site=\"dr\"} $(yes_flag "$DR_IO_RUNNING")"
echo "mysql_replication_sql_running{site=\"dr\"} $(yes_flag "$DR_SQL_RUNNING")"
echo "mysql_replication_seconds_behind{site=\"dr\"} $DR_SECONDS_BEHIND"
echo "mysql_replication_main_position{site=\"main\"} ${MAIN_POSITION:-0}"
EOF

$ chmod +x /usr/local/bin/mysql_replication_monitor.sh

输出结果如下：
mysql_replication_io_running{site="dr"} 1
mysql_replication_sql_running{site="dr"} 1
mysql_replication_seconds_behind{site="dr"} 0
mysql_replication_main_position{site="main"} 1234567

3.2 存储复制监控

# 存储复制状态监控
# 查看存储复制状态
$ ssh storage-main "replication status show"

# 输出结果
Replication Pair: MAIN_TO_DR
Status: Active
Direction: MAIN -> DR
Progress: 100%
Lag: 0 seconds
Throughput: 125 MB/s
Last Sync: 2026-04-03 10:30:45

# 监控存储复制延迟
$ cat > /usr/local/bin/storage_replication_monitor.sh << 'EOF'
#!/bin/bash
# Print storage replication lag, progress and throughput for the main -> DR
# pair in the Prometheus text format, parsed from the output of
# "replication status show" on storage-main.

# Print the first value of the first "<key>: <value> ..." line read from stdin.
field() {
  awk -v key="$1:" '$1 == key { print $2; exit }'
}

# BatchMode/ConnectTimeout keep an unattended run from hanging on a password
# prompt or an unreachable array.
STATUS=$(ssh -o BatchMode=yes -o ConnectTimeout=5 storage-main \
  "replication status show" 2>/dev/null)

LAG=$(field Lag <<< "$STATUS")
PROGRESS=$(field Progress <<< "$STATUS" | tr -d '%')
THROUGHPUT=$(field Throughput <<< "$STATUS")

# Emit the metrics (0 when a value could not be read)
echo "storage_replication_lag_seconds{pair=\"main_to_dr\"} ${LAG:-0}"
echo "storage_replication_progress_percent{pair=\"main_to_dr\"} ${PROGRESS:-0}"
echo "storage_replication_throughput_mbps{pair=\"main_to_dr\"} ${THROUGHPUT:-0}"
EOF

$ chmod +x /usr/local/bin/storage_replication_monitor.sh

输出结果如下：
storage_replication_lag_seconds{pair="main_to_dr"} 0
storage_replication_progress_percent{pair="main_to_dr"} 100
storage_replication_throughput_mbps{pair="main_to_dr"} 125

学习交流加群风哥QQ113257174

4. 故障切换监控

4.1 故障检测机制

# 故障检测脚本
$ cat > /usr/local/bin/failover_monitor.sh << 'EOF'
#!/bin/bash
# Probe the main and DR sites and print their status in the Prometheus text
# format. When both the main site's network and application checks fail, the
# script reports dr_failover_required 1 and mails the administrator.

MAIN_SITE_IP="192.168.1.100"
DR_SITE_IP="192.168.2.100"

# Succeeds when the main site answers ping.
check_main_site() {
  ping -c 3 -W 2 "$MAIN_SITE_IP" > /dev/null 2>&1
}

# Succeeds when the main site's health endpoint returns a success status.
# Nothing is written to stdout: the caller captures this function's output,
# so printing the HTTP code here would end up inside the metric value.
check_main_app() {
  curl -f -s -o /dev/null --max-time 5 "http://$MAIN_SITE_IP:8080/health"
}

# Succeeds when the DR site answers ping.
check_dr_site() {
  ping -c 3 -W 2 "$DR_SITE_IP" > /dev/null 2>&1
}

# Run the given check and print 1 on success, 0 on failure.
as_metric() {
  if "$@"; then
    echo 1
  else
    echo 0
  fi
}

# Main monitoring logic
MAIN_NETWORK=$(as_metric check_main_site)
MAIN_APP=$(as_metric check_main_app)
DR_NETWORK=$(as_metric check_dr_site)

# Emit the metrics
echo "dr_main_site_network_status $MAIN_NETWORK"
echo "dr_main_site_fgapp_status $MAIN_APP"
echo "dr_site_network_status $DR_NETWORK"

# Failover is required only when both main-site checks failed
if [ "$MAIN_NETWORK" -eq 0 ] && [ "$MAIN_APP" -eq 0 ]; then
  echo "dr_failover_required 1"
  # Send the alert mail
  echo "主站点不可用，需要故障切换" | mail -s "容灾告警" admin@fgedu.net.cn
else
  echo "dr_failover_required 0"
fi
EOF

$ chmod +x /usr/local/bin/failover_monitor.sh

输出结果如下：
dr_main_site_network_status 1
dr_main_site_fgapp_status 1
dr_site_network_status 1
dr_failover_required 0

4.2 切换状态监控

# 监控故障切换状态
$ cat > /usr/local/bin/failover_status.sh << 'EOF'
#!/bin/bash
# Report which site is currently active, how long ago the newest failover
# log was written, and whether a failover to the DR site has completed.

# Currently active site; "main" when the marker file cannot be read
CURRENT_SITE=$(cat /etc/dr-site.conf 2>/dev/null || echo "main")

# Newest failover log, found with a glob and -nt instead of parsing ls output
LAST_FAILOVER=""
for log_file in /var/log/dr/failover*.log; do
  [[ -e "$log_file" ]] || continue   # the glob matched nothing
  if [[ -z "$LAST_FAILOVER" || "$log_file" -nt "$LAST_FAILOVER" ]]; then
    LAST_FAILOVER=$log_file
  fi
done

# Seconds since that log was last modified (0 when there is no log)
if [ -n "$LAST_FAILOVER" ]; then
  FAILOVER_TIME=$(stat -c %Y "$LAST_FAILOVER")
  CURRENT_TIME=$(date +%s)
  TIME_SINCE_FAILOVER=$((CURRENT_TIME - FAILOVER_TIME))
else
  TIME_SINCE_FAILOVER=0
fi

# Emit the metrics
echo "dr_current_site{site=\"$CURRENT_SITE\"} 1"
echo "dr_failover_time_seconds $TIME_SINCE_FAILOVER"

# While running on the DR site, the failover counts as complete only when the
# application health endpoint answers 200; on any other site 1 is reported.
if [ "$CURRENT_SITE" = "dr" ]; then
  DR_APP_STATUS=$(curl -f -s -o /dev/null --max-time 5 \
    -w "%{http_code}" http://fgedudb:8080/health)
  if [ "$DR_APP_STATUS" = "200" ]; then
    echo "dr_failover_complete 1"
  else
    echo "dr_failover_complete 0"
  fi
else
  echo "dr_failover_complete 1"
fi
EOF

$ chmod +x /usr/local/bin/failover_status.sh

输出结果如下：
dr_current_site{site="main"} 1
dr_failover_time_seconds 0
dr_failover_complete 1

5. 网络链路监控

5.1 网络连通性监控

# 网络连通性监控脚本
$ cat > /usr/local/bin/network_monitor.sh << 'EOF'
#!/bin/bash
# Measure latency, packet loss and bandwidth towards the main and DR sites
# and print them in the Prometheus text format.

# Main and DR site addresses
MAIN_SITE="192.168.1.100"
DR_SITE="192.168.2.100"

# Ping a host once and print "<avg_rtt_ms> <loss_percent>". A field stays
# empty when ping did not report it, so the caller's :-0 default applies.
# Both figures come from a single run instead of one ping run per metric.
probe_ping() {
  ping -c 10 -q "$1" 2>/dev/null | awk '
    /packet loss/ {
      for (i = 1; i <= NF; i++) {
        if ($i ~ /%$/) { loss = $i; sub(/%/, "", loss) }
      }
    }
    /min\/avg\/max/ { split($0, part, "/"); avg = part[5] }
    END { print avg, loss }
  '
}

# Print the sender-side bandwidth (Mbit/s) reported by iperf3 for a host.
# NOTE(review): field 7 assumes the usual "[  5] ... Mbits/sec ... sender"
# layout of the summary line - confirm against the installed iperf3.
probe_bandwidth() {
  iperf3 -c "$1" -t 5 -f m 2>/dev/null | grep sender | awk '{print $7}'
}

# Latency and packet loss
read -r MAIN_LATENCY MAIN_PACKET_LOSS < <(probe_ping "$MAIN_SITE")
read -r DR_LATENCY DR_PACKET_LOSS < <(probe_ping "$DR_SITE")

# Bandwidth
MAIN_BANDWIDTH=$(probe_bandwidth "$MAIN_SITE")
DR_BANDWIDTH=$(probe_bandwidth "$DR_SITE")

# Emit the metrics (0 when a value could not be measured)
echo "dr_network_latency_ms{site=\"main\"} ${MAIN_LATENCY:-0}"
echo "dr_network_latency_ms{site=\"dr\"} ${DR_LATENCY:-0}"
echo "dr_network_packet_loss_percent{site=\"main\"} ${MAIN_PACKET_LOSS:-0}"
echo "dr_network_packet_loss_percent{site=\"dr\"} ${DR_PACKET_LOSS:-0}"
echo "dr_network_bandwidth_mbps{site=\"main\"} ${MAIN_BANDWIDTH:-0}"
echo "dr_network_bandwidth_mbps{site=\"dr\"} ${DR_BANDWIDTH:-0}"
EOF

$ chmod +x /usr/local/bin/network_monitor.sh

输出结果如下：
dr_network_latency_ms{site="main"} 0.523
dr_network_latency_ms{site="dr"} 15.234
dr_network_packet_loss_percent{site="main"} 0
dr_network_packet_loss_percent{site="dr"} 0
dr_network_bandwidth_mbps{site="main"} 945.2
dr_network_bandwidth_mbps{site="dr"} 892.5

5.2 专线监控

# 专线状态监控
$ cat > /usr/local/bin/line_monitor.sh << 'EOF'
#!/bin/bash
# Report status and latency of the primary and backup dedicated lines, and
# which of them currently carries the default route.

PRIMARY_GATEWAY="192.168.10.1"
BACKUP_GATEWAY="192.168.20.1"

# Print status (1 = reachable) and average latency for one line.
# Arguments: $1 - line name used as the metric label
#            $2 - IP address to ping
check_line() {
  local line_name=$1
  local line_ip=$2
  local status=0
  local latency=0
  local ping_output

  # A single ping run gives both reachability (exit status) and the rtt line
  if ping_output=$(ping -c 5 -W 2 "$line_ip" 2>/dev/null); then
    status=1
    latency=$(tail -1 <<< "$ping_output" | awk -F '/' '{print $5}')
  fi

  echo "dr_line_status{line=\"$line_name\"} $status"
  echo "dr_line_latency_ms{line=\"$line_name\"} ${latency:-0}"
}

# Primary line
check_line "primary" "$PRIMARY_GATEWAY"

# Backup line
check_line "backup" "$BACKUP_GATEWAY"

# Gateway of the first default route decides which line is in use
CURRENT_LINE=$(ip route show default | awk '$1 == "default" { print $3; exit }')
if [ "$CURRENT_LINE" = "$PRIMARY_GATEWAY" ]; then
  echo "dr_current_line{line=\"primary\"} 1"
elif [ "$CURRENT_LINE" = "$BACKUP_GATEWAY" ]; then
  echo "dr_current_line{line=\"backup\"} 1"
else
  # Neither known gateway: do not claim the backup line is active
  echo "dr_current_line{line=\"unknown\"} 1"
fi
EOF

$ chmod +x /usr/local/bin/line_monitor.sh

输出结果如下：
dr_line_status{line="primary"} 1
dr_line_latency_ms{line="primary"} 2.345
dr_line_status{line="backup"} 1
dr_line_latency_ms{line="backup"} 5.678
dr_current_line{line="primary"} 1

风哥提示：网络链路监控是容灾系统的重要组成部分，需要实时监控主备站点之间的网络连通性、延迟和带宽，确保数据复制和故障切换的顺利进行。

6. 存储系统监控

6.1 存储容量监控

# 存储容量监控脚本
$ cat > /usr/local/bin/storage_capacity_monitor.sh << 'EOF'
#!/bin/bash
# Report capacity of /data on the main and DR storage hosts in the Prometheus
# text format. Sizes are read in 1K blocks and converted to TiB, so the result
# stays numeric whatever unit "df -h" would have chosen.

# Print "<total_tb> <used_tb> <avail_tb> <usage_percent>" for /data on a host.
# Prints nothing when the host cannot be reached.
read_capacity() {
  ssh -o BatchMode=yes -o ConnectTimeout=5 "$1" "df -Pk /data" 2>/dev/null |
    awk -v kb_per_tb=1073741824 'NR == 2 {
      printf "%.1f %.1f %.1f %d\n",
        $2 / kb_per_tb, $3 / kb_per_tb, $4 / kb_per_tb, $5 + 0
    }'
}

# Main site storage
read -r MAIN_TOTAL MAIN_USED MAIN_AVAIL MAIN_USAGE < <(read_capacity storage-main)

# DR site storage
read -r DR_TOTAL DR_USED DR_AVAIL DR_USAGE < <(read_capacity storage-dr)

# Emit the metrics (0 when a host could not be queried)
echo "dr_storage_total_tb{site=\"main\"} ${MAIN_TOTAL:-0}"
echo "dr_storage_used_tb{site=\"main\"} ${MAIN_USED:-0}"
echo "dr_storage_available_tb{site=\"main\"} ${MAIN_AVAIL:-0}"
echo "dr_storage_usage_percent{site=\"main\"} ${MAIN_USAGE:-0}"
echo "dr_storage_total_tb{site=\"dr\"} ${DR_TOTAL:-0}"
echo "dr_storage_used_tb{site=\"dr\"} ${DR_USED:-0}"
echo "dr_storage_available_tb{site=\"dr\"} ${DR_AVAIL:-0}"
echo "dr_storage_usage_percent{site=\"dr\"} ${DR_USAGE:-0}"
EOF

$ chmod +x /usr/local/bin/storage_capacity_monitor.sh

输出结果如下：
dr_storage_total_tb{site="main"} 50.0
dr_storage_used_tb{site="main"} 25.5
dr_storage_available_tb{site="main"} 24.5
dr_storage_usage_percent{site="main"} 51
dr_storage_total_tb{site="dr"} 50.0
dr_storage_used_tb{site="dr"} 25.5
dr_storage_available_tb{site="dr"} 24.5
dr_storage_usage_percent{site="dr"} 51

6.2 存储性能监控

# 存储性能监控
$ cat > /usr/local/bin/storage_performance_monitor.sh << 'EOF'
#!/bin/bash
# Report storage IOPS, throughput and latency of the main and DR storage
# hosts in the Prometheus text format.

# Print "<iops> <throughput> <latency>" taken from one iostat run on a host.
# One ssh session per host supplies all three values, instead of a separate
# 5-second iostat run per metric.
# NOTE(review): columns 4, 6 and 10 of "iostat -x" mean different things in
# different sysstat versions - confirm them on the storage hosts.
read_io_stats() {
  ssh -o BatchMode=yes -o ConnectTimeout=5 "$1" \
    "iostat -x 1 5 | grep -A1 Device | tail -1 | awk '{print \$4, \$6, \$10}'" \
    2>/dev/null
}

read -r MAIN_IOPS MAIN_THROUGHPUT MAIN_LATENCY < <(read_io_stats storage-main)
read -r DR_IOPS DR_THROUGHPUT DR_LATENCY < <(read_io_stats storage-dr)

# Emit the metrics (0 when a value could not be read)
echo "dr_storage_iops{site=\"main\"} ${MAIN_IOPS:-0}"
echo "dr_storage_iops{site=\"dr\"} ${DR_IOPS:-0}"
echo "dr_storage_throughput_kbps{site=\"main\"} ${MAIN_THROUGHPUT:-0}"
echo "dr_storage_throughput_kbps{site=\"dr\"} ${DR_THROUGHPUT:-0}"
echo "dr_storage_latency_ms{site=\"main\"} ${MAIN_LATENCY:-0}"
echo "dr_storage_latency_ms{site=\"dr\"} ${DR_LATENCY:-0}"
EOF

$ chmod +x /usr/local/bin/storage_performance_monitor.sh

输出结果如下：
dr_storage_iops{site="main"} 1523.45
dr_storage_iops{site="dr"} 1489.23
dr_storage_throughput_kbps{site="main"} 125678.90
dr_storage_throughput_kbps{site="dr"} 123456.78
dr_storage_latency_ms{site="main"} 0.52
dr_storage_latency_ms{site="dr"} 0.48

7. 应用服务监控

7.1 应用健康检查

# 应用健康检查脚本
$ cat > /usr/local/bin/app_health_monitor.sh << 'EOF'
#!/bin/bash
# Check each application endpoint and print dr_fgapp_health_status per app
# (1 = healthy, 0 = unhealthy) in the Prometheus text format.

# Application name -> endpoint. http* URLs must answer 200; tcp:// entries
# only need to accept a TCP connection.
declare -A APPS=(
  ["web-app"]="http://fgedudb:8080/health"
  ["api-service"]="http://fgedudb:9090/health"
  ["database"]="tcp://fgedudb:3306"
  ["cache"]="tcp://fgedudb:6379"
)

for app in "${!APPS[@]}"; do
  url="${APPS[$app]}"
  healthy=0

  if [[ $url == http* ]]; then
    # HTTP health check
    STATUS=$(curl -f -s -o /dev/null --max-time 5 -w "%{http_code}" "$url" 2>/dev/null)
    if [ "$STATUS" = "200" ]; then
      healthy=1
    fi
  elif [[ $url == tcp* ]]; then
    # TCP health check: split "tcp://host:port" with parameter expansion
    endpoint=${url#*://}
    HOST=${endpoint%%:*}
    PORT=${endpoint##*:}
    if nc -z -w 3 "$HOST" "$PORT" 2>/dev/null; then
      healthy=1
    fi
  else
    # Unknown scheme: no metric is emitted for this entry
    continue
  fi

  echo "dr_fgapp_health_status{app=\"$app\"} $healthy"
done
EOF

$ chmod +x /usr/local/bin/app_health_monitor.sh

输出结果如下：
dr_fgapp_health_status{app="web-app"} 1
dr_fgapp_health_status{app="api-service"} 1
dr_fgapp_health_status{app="database"} 1
dr_fgapp_health_status{app="cache"} 1

7.2 应用性能监控

# 应用性能监控
$ cat > /usr/local/bin/app_performance_monitor.sh << 'EOF'
#!/bin/bash
# Report application response time, request count, error rate and the CPU and
# memory usage of the application's Java process in the Prometheus text format.

# Sum every sample of one metric found in exposition text on stdin.
# Comment lines (# HELP / # TYPE) and metrics that merely share the prefix are
# skipped; prints 0 when the metric is absent.
sum_metric() {
  awk -v name="$1" '
    /^#/ { next }
    {
      metric = $0
      sub(/[{ \t].*$/, "", metric)
      if (metric != name) next
      rest = $0
      sub(/^[^{ \t]+(\{.*\})?[ \t]+/, "", rest)   # drop the name and label set
      split(rest, part, /[ \t]+/)                  # part[1] is the sample value
      total += part[1]
    }
    END { print total + 0 }
  '
}

# Response time of a test request
RESPONSE_TIME=$(curl -o /dev/null -s --max-time 10 -w '%{time_total}\n' http://fgedudb:8080/api/test)

# Fetch the metrics page once and reuse it for both counters
METRICS=$(curl -s --max-time 10 http://fgedudb:8080/metrics)

# Request count
THROUGHPUT=$(sum_metric http_requests_total <<< "$METRICS")

# Error rate as a percentage of all requests. The metric name promises a
# percentage, so the failed-request counter is divided by the total.
FAILED=$(sum_metric http_requests_failed_total <<< "$METRICS")
ERROR_RATE=$(awk -v failed="$FAILED" -v total="$THROUGHPUT" \
  'BEGIN { printf "%.2f\n", (total > 0 ? failed * 100 / total : 0) }')

# CPU and memory usage of the first matching application process
APP_CPU=0
APP_MEM=0
APP_PID=$(pgrep -f 'java.*app.jar' | head -n 1)
if [[ -n "$APP_PID" ]]; then
  read -r APP_CPU APP_MEM < <(ps -o %cpu= -o %mem= -p "$APP_PID")
fi

# Emit the metrics (0 when a value could not be measured)
echo "dr_fgapp_response_time_seconds ${RESPONSE_TIME:-0}"
echo "dr_fgapp_throughput_requests ${THROUGHPUT:-0}"
echo "dr_fgapp_error_rate_percent ${ERROR_RATE:-0}"
echo "dr_fgapp_cpu_usage_percent ${APP_CPU:-0}"
echo "dr_fgapp_memory_usage_percent ${APP_MEM:-0}"
EOF

$ chmod +x /usr/local/bin/app_performance_monitor.sh

输出结果如下：
dr_fgapp_response_time_seconds 0.125
dr_fgapp_throughput_requests 15234
dr_fgapp_error_rate_percent 0.02
dr_fgapp_cpu_usage_percent 45.2
dr_fgapp_memory_usage_percent 62.5

更多学习教程公众号风哥教程itpux_com

8. 告警管理

8.1 告警规则配置

# 配置容灾告警规则
$ cat > /data/prometheus/dr_alert_rules.yml << 'EOF'
# Alert rules for the disaster-recovery (DR) monitoring metrics.
groups:
  - name: dr_alerts
    rules:
      # Replication lag too high
      - alert: DataReplicationLagHigh
        expr: mysql_replication_seconds_behind > 30
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "数据复制延迟过高"
          description: "数据复制延迟 {{ $value }} 秒，超过阈值30秒"

      # Replication stopped (IO or SQL thread not running)
      - alert: DataReplicationStopped
        expr: mysql_replication_io_running == 0 or mysql_replication_sql_running == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "数据复制已停止"
          description: "数据复制IO或SQL线程已停止，请立即检查"

      # Main site unavailable (both network and application checks fail)
      - alert: MainSiteDown
        expr: dr_main_site_network_status == 0 and dr_main_site_fgapp_status == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "主站点不可用"
          description: "主站点网络和应用均不可用，需要故障切换"

      # Storage usage too high
      - alert: StorageCapacityHigh
        expr: dr_storage_usage_percent > 85
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "存储容量使用率过高"
          description: "存储容量使用率 {{ $value }}%，超过阈值85%"

      # Network latency to the DR site too high
      - alert: NetworkLatencyHigh
        expr: dr_network_latency_ms{site="dr"} > 50
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "网络延迟过高"
          description: "备站点网络延迟 {{ $value }}ms，超过阈值50ms"

      # Application service unavailable
      - alert: ApplicationDown
        expr: dr_fgapp_health_status == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "应用服务不可用"
          description: "应用服务 {{ $labels.app }} 不可用"
EOF

8.2 告警通知配置

# 配置Alertmanager告警通知
$ cat > /data/alertmanager/alertmanager.yml << 'EOF'
# Alertmanager configuration: mail settings, routing by severity, receivers.
global:
  resolve_timeout: 5m
  smtp_smarthost: 'smtp.fgedu.net.cn:587'
  smtp_from: 'alertmanager@fgedu.net.cn'
  smtp_auth_username: 'alertmanager@fgedu.net.cn'
  # NOTE(review): placeholder value - replace it before use
  smtp_auth_password: 'password'

route:
  group_by: ['alertname', 'severity']
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 4h
  # Default receiver for alerts matching none of the routes below
  receiver: 'dr-team'
  routes:
    - match:
        severity: critical
      receiver: 'dr-critical'
    - match:
        severity: warning
      receiver: 'dr-warning'

receivers:
  - name: 'dr-team'
    email_configs:
      - to: 'dr-team@fgedu.net.cn'
        send_resolved: true

  # Critical alerts also go to the manager and to a webhook
  - name: 'dr-critical'
    email_configs:
      - to: 'dr-team@fgedu.net.cn,manager@fgedu.net.cn'
        send_resolved: true
    webhook_configs:
      - url: 'http://webhook-server:5000/alert'
        send_resolved: true

  - name: 'dr-warning'
    email_configs:
      - to: 'dr-team@fgedu.net.cn'
        send_resolved: true

# A critical alert mutes the warning with the same alertname and instance
inhibit_rules:
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname', 'instance']
EOF

9. 容灾演练监控

9.1 演练状态监控

# 容灾演练监控脚本
$ cat > /usr/local/bin/dr_drill_monitor.sh << 'EOF'
#!/bin/bash
# Report the state of a DR drill (running flag, duration, progress, current
# step) in the Prometheus text format, read from "<key>: <value>" lines in
# the drill status file.

# Drill status file
DRILL_STATUS_FILE="/var/log/dr/drill_status.txt"

# Print the value of the first line whose first field is exactly "<key>:".
# Matching the whole field keeps e.g. "type:" from matching another key that
# merely ends in it.
status_field() {
  awk -v key="$1:" '$1 == key { print $2; exit }' "$DRILL_STATUS_FILE"
}

# Read the drill state; fall back to idle defaults for anything missing
DRILL_STATUS=""
DRILL_START=""
DRILL_TYPE=""
if [ -f "$DRILL_STATUS_FILE" ]; then
  DRILL_STATUS=$(status_field status)
  DRILL_START=$(status_field start_time)
  DRILL_TYPE=$(status_field type)
fi
DRILL_STATUS=${DRILL_STATUS:-idle}
DRILL_START=${DRILL_START:-0}
DRILL_TYPE=${DRILL_TYPE:-none}

# Running flag and duration. The duration is computed only when the start
# time is a plain number, so a missing or malformed value cannot break the
# arithmetic.
# NOTE(review): assumes start_time is stored as epoch seconds - confirm.
DRILL_RUNNING=0
DRILL_DURATION=0
if [ "$DRILL_STATUS" = "running" ]; then
  DRILL_RUNNING=1
  if [[ "$DRILL_START" =~ ^[0-9]+$ ]]; then
    CURRENT_TIME=$(date +%s)
    DRILL_DURATION=$((CURRENT_TIME - 10#$DRILL_START))
  fi
fi

# Emit the metrics
echo "dr_drill_status{type=\"$DRILL_TYPE\"} $DRILL_RUNNING"
echo "dr_drill_duration_seconds $DRILL_DURATION"

# Progress and current step are reported only when the status file exists
if [ -f "$DRILL_STATUS_FILE" ]; then
  DRILL_PROGRESS=$(status_field progress | tr -d '%')
  echo "dr_drill_progress_percent ${DRILL_PROGRESS:-0}"

  DRILL_STEP=$(status_field current_step)
  echo "dr_drill_current_step ${DRILL_STEP:-0}"
fi
EOF

$ chmod +x /usr/local/bin/dr_drill_monitor.sh

输出结果如下：
dr_drill_status{type="tabletop"} 1
dr_drill_duration_seconds 1800
dr_drill_progress_percent 75
dr_drill_current_step 3

9.2 演练结果验证

# 演练结果验证脚本
$ cat > /usr/local/bin/dr_drill_validation.sh << 'EOF'
#!/bin/bash
# Validate the outcome of a DR drill and print the results in the Prometheus
# text format: data consistency, application availability and RTO compliance.

# RTO target in seconds
RTO_TARGET_SECONDS=300

# Report 1 when the DR replica shows Seconds_Behind_Master of 0 or NULL.
# NOTE(review): NULL is accepted as consistent, but MySQL also prints NULL
# when the replication threads are stopped - confirm this is intended.
validate_data_consistency() {
  local sync_status
  sync_status=$(mysql -h dr-db -e 'SHOW SLAVE STATUS\G' 2>/dev/null |
    awk '$1 == "Seconds_Behind_Master:" { print $2; exit }')

  if [ "$sync_status" = "0" ] || [ "$sync_status" = "NULL" ]; then
    echo "dr_drill_data_consistency 1"
  else
    echo "dr_drill_data_consistency 0"
  fi
}

# Report 1 when the application health endpoint answers 200.
validate_fgapp_availability() {
  local app_status
  app_status=$(curl -f -s -o /dev/null --max-time 5 \
    -w "%{http_code}" http://fgedudb:8080/health)

  if [ "$app_status" = "200" ]; then
    echo "dr_drill_fgapp_availability 1"
  else
    echo "dr_drill_fgapp_availability 0"
  fi
}

# Compute the RTO from the last failover_start / failover_end entries in the
# drill log and compare it with the target.
# NOTE(review): assumes the first field of those log lines is an epoch
# timestamp in seconds - confirm against the log format.
validate_rto() {
  local failover_start failover_end rto
  failover_start=$(grep "failover_start" /var/log/dr/drill.log 2>/dev/null |
    tail -1 | awk '{print $1}')
  failover_end=$(grep "failover_end" /var/log/dr/drill.log 2>/dev/null |
    tail -1 | awk '{print $1}')

  # Both timestamps must be plain numbers before doing arithmetic on them
  if [[ "$failover_start" =~ ^[0-9]+$ && "$failover_end" =~ ^[0-9]+$ ]]; then
    rto=$((10#$failover_end - 10#$failover_start))
    echo "dr_drill_rto_seconds $rto"

    if [ "$rto" -le "$RTO_TARGET_SECONDS" ]; then
      echo "dr_drill_rto_compliance 1"
    else
      echo "dr_drill_rto_compliance 0"
    fi
  else
    echo "dr_drill_rto_seconds 0"
    echo "dr_drill_rto_compliance 0"
  fi
}

# Run the validations
validate_data_consistency
validate_fgapp_availability
validate_rto
EOF

$ chmod +x /usr/local/bin/dr_drill_validation.sh

输出结果如下：
dr_drill_data_consistency 1
dr_drill_fgapp_availability 1
dr_drill_rto_seconds 245
dr_drill_rto_compliance 1

author:www.itpux.com

10. 最佳实践

10.1 监控系统设计原则

生产环境风哥建议：
– 监控系统应独立于生产系统部署，避免单点故障
– 采用多层次监控架构，包括基础设施、平台和应用层
– 监控数据应实时采集和存储，支持历史数据分析
– 告警规则应根据业务重要性分级设置
– 定期验证监控系统的有效性

10.2 监控指标选择

# 关键监控指标清单
# 1. 数据复制指标
– 复制延迟（Seconds_Behind_Master）
– 复制状态（IO_Running, SQL_Running）
– 复制进度（Progress）
– 复制吞吐量（Throughput）

# 2. 网络指标
– 网络连通性（Connectivity）
– 网络延迟（Latency）
– 网络丢包率（Packet Loss）
– 网络带宽（Bandwidth）

# 3. 存储指标
– 存储容量使用率（Usage Percent）
– 存储IOPS（IOPS）
– 存储吞吐量（Throughput）
– 存储延迟（Latency）

# 4. 应用指标
– 应用健康状态（Health Status）
– 应用响应时间（Response Time）
– 应用吞吐量（Throughput）
– 应用错误率（Error Rate）

# 5. 容灾指标
– RPO（Recovery Point Objective）
– RTO（Recovery Time Objective）
– 故障切换状态（Failover Status）
– 演练结果（Drill Results）

10.3 告警管理最佳实践

生产环境风哥建议：
– 告警应分级管理，区分严重程度和响应时间
– 避免告警风暴，合理设置告警阈值和持续时间
– 告警通知应多渠道发送，包括邮件、短信、电话等
– 告警应包含详细的上下文信息，便于快速定位问题
– 定期审查和优化告警规则，减少误报和漏报

10.4 监控数据管理

# 监控数据管理脚本
$ cat > /usr/local/bin/monitor_data_management.sh << 'EOF'
#!/bin/bash
# Housekeeping for the DR monitoring stack: remove expired files and back up
# the monitoring configuration.

# Retention period in days
RETENTION_DAYS=90

# Delete monitoring files older than RETENTION_DAYS.
clean_old_data() {
  echo "开始清理 $RETENTION_DAYS 天前的监控数据..."

  # Prometheus TSDB files are not deleted here: removing individual files
  # from /data/prometheus/data by age can leave blocks incomplete. Set the
  # retention on the server instead (--storage.tsdb.retention.time=90d).

  # Grafana data
  # NOTE(review): confirm this directory holds only disposable files; an
  # age-based delete would also remove a long-unmodified database file.
  if [[ -d /data/grafana/data ]]; then
    find /data/grafana/data -type f -mtime +"$RETENTION_DAYS" -delete
  fi

  # DR log files
  if [[ -d /var/log/dr ]]; then
    find /var/log/dr -type f -name "*.log" -mtime +"$RETENTION_DAYS" -delete
  fi

  echo "监控数据清理完成"
}

# Copy the Prometheus configuration and Grafana dashboards into a dated
# backup directory. Returns non-zero when the directory cannot be created.
backup_monitor_data() {
  local backup_dir
  backup_dir="/backup/dr-monitor-$(date +%Y%m%d)"
  mkdir -p "$backup_dir" || return 1

  echo "开始备份监控数据到 $backup_dir..."

  # Prometheus configuration; the glob also covers dr_alert_rules.yml, so the
  # alert rules need no separate copy
  cp -r /data/prometheus/*.yml "$backup_dir"/

  # Grafana dashboards
  cp -r /data/grafana/dashboards "$backup_dir"/

  echo "监控数据备份完成"
}

# Run the maintenance tasks
clean_old_data
backup_monitor_data
EOF

$ chmod +x /usr/local/bin/monitor_data_management.sh

输出结果如下：
开始清理 90 天前的监控数据...
监控数据清理完成
开始备份监控数据到 /backup/dr-monitor-20260403...
监控数据备份完成

10.5 监控系统维护

# 监控系统健康检查
$ cat > /usr/local/bin/monitor_health_check.sh << 'EOF'
#!/bin/bash
# Health check of the monitoring stack itself: Prometheus, Grafana,
# Alertmanager and scrape results, printed in the Prometheus text format.

# Print "<metric> 1" when the URL answers with a success status, else
# "<metric> 0".
# Arguments: $1 - metric name, $2 - URL to probe
report_endpoint() {
  local metric=$1
  local url=$2
  if curl -f -s --max-time 5 "$url" > /dev/null; then
    echo "$metric 1"
  else
    echo "$metric 0"
  fi
}

# Prometheus health endpoint
check_prometheus() {
  report_endpoint prometheus_status "http://fgedudb:9090/-/healthy"
}

# Grafana health endpoint
check_grafana() {
  report_endpoint grafana_status "http://fgedudb:3000/api/health"
}

# Alertmanager health endpoint
check_alertmanager() {
  report_endpoint alertmanager_status "http://fgedudb:9093/-/healthy"
}

# Report 1 when the first series returned for the "up" query has value 1.
# The URL is quoted so the shell never treats the "?" as a glob character.
check_data_collection() {
  local first_up
  first_up=$(curl -s --max-time 5 "http://fgedudb:9090/api/v1/query?query=up" |
    jq -r '.data.result[0].value[1]')

  if [ "$first_up" = "1" ]; then
    echo "data_collection_status 1"
  else
    echo "data_collection_status 0"
  fi
}

# Run the health checks
check_prometheus
check_grafana
check_alertmanager
check_data_collection
EOF

$ chmod +x /usr/local/bin/monitor_health_check.sh

输出结果如下：
prometheus_status 1
grafana_status 1
alertmanager_status 1
data_collection_status 1

10.6 容灾监控仪表板

# 创建Grafana仪表板配置
$ cat > /data/grafana/dashboards/dr-monitoring.json << 'EOF'
{
  "dashboard": {
    "title": "容灾系统监控仪表板",
    "panels": [
      {
        "title": "数据复制状态",
        "type": "graph",
        "targets": [
          {
            "expr": "mysql_replication_seconds_behind",
            "legendFormat": "复制延迟(秒)"
          }
        ]
      },
      {
        "title": "网络延迟",
        "type": "graph",
        "targets": [
          {
            "expr": "dr_network_latency_ms",
            "legendFormat": "{{site}}"
          }
        ]
      },
      {
        "title": "存储使用率",
        "type": "gauge",
        "targets": [
          {
            "expr": "dr_storage_usage_percent",
            "legendFormat": "{{site}}"
          }
        ]
      },
      {
        "title": "应用健康状态",
        "type": "stat",
        "targets": [
          {
            "expr": "dr_fgapp_health_status",
            "legendFormat": "{{app}}"
          }
        ]
      },
      {
        "title": "故障切换状态",
        "type": "stat",
        "targets": [
          {
            "expr": "dr_failover_required",
            "legendFormat": "需要切换"
          }
        ]
      }
    ]
  }
}
EOF

生产环境风哥建议：
– 定期进行容灾演练，验证监控系统的有效性
– 监控系统本身应具备高可用性，避免单点故障
– 监控数据应定期备份，确保数据安全
– 监控指标应根据业务需求持续优化和调整
– 告警规则应定期审查，确保告警的准确性和有效性

本文由风哥教程整理发布,仅用于学习测试使用,转载注明出处:http://www.fgedu.net.cn/10327.html