Linux教程FG157-存储监控与告警

内容大纲

Part01-存储监控概述
Part02-磁盘I/O监控
Part03-存储空间监控
Part04-磁盘健康监控
Part05-告警配置
Part06-实战案例

内容简介：本文风哥教程参考Linux官方文档、Red Hat Enterprise Linux官方文档、Ansible Automation Platform官方文档、Docker官方文档、Kubernetes官方文档和Podman官方文档等内容，详细介绍了相关技术的配置和使用方法。

1. 存储监控概述

存储监控是实时监控存储系统的性能、空间和健康状态，及时发现潜在问题。有效的监控可以帮助管理员预防故障，提高系统可靠性。

更多学习教程公众号风哥教程itpux_com

# 存储监控的关键指标
# I/O性能：读写速率、IOPS、响应时间
# 存储空间：使用率、可用空间、inode使用率
# 磁盘健康：SMART状态、错误率、温度
# 阵列状态：RAID状态、重建进度、磁盘状态

2. 磁盘I/O监控

使用iostat和iotop命令监控磁盘I/O性能。

学习交流加群风哥QQ113257174

# 磁盘I/O监控

# 1. 查看磁盘I/O统计
[root@localhost ~]# iostat -x 1 3
Linux 5.14.0-362.el9.x86_64 (localhost.localdomain) 04/03/2026 _x86_64_ (4 CPU)

avg-cpu: %user %nice %system %iowait %steal %idle
2.50 0.00 1.25 15.00 0.00 81.25

Device r/s w/s rkB/s wkB/s rrqm/s wrqm/s %rrqm %wrqm r_await w_await aqu-sz rareq-sz wareq-sz svctm %util
sda 50.00 100.00 200.00 400.00 5.00 10.00 10.00 10.00 10.00 20.00 0.50 4.00 4.00 5.00 75.00
sdb 30.00 50.00 120.00 200.00 3.00 5.00 10.00 10.00 15.00 25.00 0.40 4.00 4.00 6.00 48.00

# 2. 查看实时I/O进程
[root@localhost ~]# iotop -o -b -n 3
Total DISK READ: 0.00 B/s | Total DISK WRITE: 200.00 K/s
PID PRIO USER DISK READ DISK WRITE SWAPIN IO> COMMAND
1234 be/3 root 0.00 B/s 200.00 K/s 0.00 % 0.00 % [jbd2/sda1-8]
5678 be/4 mysql 0.00 B/s 150.00 K/s 0.00 % 0.00 % mysqld
9012 be/4 nginx 0.00 B/s 50.00 K/s 0.00 % 0.00 % nginx

# 3. 查看块设备统计
[root@localhost ~]# lsblk -o NAME,SIZE,TYPE,FSTYPE,MOUNTPOINT,ROTA
NAME SIZE TYPE FSTYPE MOUNTPOINT ROTA
sda 500G disk 1
├─sda1 1G part vfat /boot 1
├─sda2 499G part LVM2 1
└─sda3 500M part swap 1
sdb 1T disk xfs /data 1
sdc 2T disk ext4 /backup 1

# 4. 查看I/O调度器
[root@localhost ~]# cat /sys/block/sda/queue/scheduler
none [mq-deadline] kyber bfq

# 5. 查看设备队列深度
[root@localhost ~]# cat /sys/block/sda/queue/nr_requests
256

3. 存储空间监控

使用df和du命令监控存储空间使用情况。

# 存储空间监控

# 1. 查看文件系统使用情况
[root@localhost ~]# df -h
Filesystem Size Used Avail Use% Mounted on
devtmpfs 3.9G 0 3.9G 0% /dev
tmpfs 3.9G 0 3.9G 0% /dev/shm
tmpfs 3.9G 8.0M 3.9G 1% /run
/dev/mapper/vg0-root 50G 15G 35G 30% /
/dev/sda1 976M 150M 826M 16% /boot
/dev/sdb1 1.0T 500G 500G 50% /data
/dev/sdc1 2.0T 1.5T 500G 75% /backup

# 2. 查看inode使用情况
[root@localhost ~]# df -i
Filesystem Inodes IUsed IFree IUse% Mounted on
devtmpfs 1000000 400 999600 1% /dev
tmpfs 1000000 1 1000000 1% /dev/shm
tmpfs 1000000 700 999300 1% /run
/dev/mapper/vg0-root 13107200 500000 12607200 4% /
/dev/sda1 262144 1000 261144 1% /boot
/dev/sdb1 52428800 2000000 50428800 4% /data
/dev/sdc1 104857600 80000000 24857600 77% /backup

# 3. 查看目录大小
[root@localhost ~]# du -sh /var/log
2.5G /var/log

# 4. 查看最大的目录
[root@localhost ~]# du -h --max-depth=1 /var | sort -hr
2.5G /var
1.8G /var/lib
500M /var/log
200M /var/cache
100M /var/spool

# 5. 查找大文件
[root@localhost ~]# find /var/log -type f -size +100M -exec ls -lh {} \;
-rw-r--r-- 1 root root 200M Apr 3 10:00 /var/log/messages
-rw-r--r-- 1 root root 150M Apr 3 10:00 /var/log/secure
-rw-r--r-- 1 root root 120M Apr 3 10:00 /var/log/audit/audit.log

from PG视频:www.itpux.com

4. 磁盘健康监控

使用smartctl监控磁盘健康状态。

# 磁盘健康监控

# 1. 查看磁盘健康状态
[root@localhost ~]# smartctl -H /dev/sda
smartctl 7.3 2022-02-28 r5335 [x86_64-linux-5.14.0] (local build)
Copyright (C) 2002-22, Bruce Allen, Christian Franke, www.smartmontools.org

=== START OF READ SMART DATA SECTION ===
SMART overall-health self-assessment test result: PASSED

# 2. 查看磁盘温度
[root@localhost ~]# smartctl -A /dev/sda | grep "Temperature"
194 Temperature_Celsius 0x0022 100 100 000 Old_age Always - 35 (Min/Max 20/45)

# 3. 查看磁盘错误日志
[root@localhost ~]# smartctl -l error /dev/sda
SMART Error Log Version: 1
No Errors Logged

# 4. 查看磁盘自检日志
[root@localhost ~]# smartctl -l selftest /dev/sda
SMART Self-test log structure revision number 1
Num Test_Description Status Remaining LifeTime(hours) LBA_of_first_error
# 1 Short offline Completed without error 00% 1234 -
# 2 Short offline Completed without error 00% 1200 -
# 3 Extended offline Completed without error 00% 1150 -

# 5. 查看磁盘属性
[root@localhost ~]# smartctl -A /dev/sda | grep -E "ID|Attribute|Raw"
ID# ATTRIBUTE_NAME FLAGS VALUE WORST THRESH FAIL RAW_VALUE
1 Raw_Read_Error_Rate -O-RC- 200 200 051 - 0
3 Spin_Up_Time -O--C- 150 150 021 - 4500
4 Start_Stop_Count -O--C- 100 100 000 - 50
5 Reallocated_Sector_Ct -O--C- 200 200 140 - 0
9 Power_On_Hours -O--C- 100 100 000 - 1234
10 Spin_Retry_Count -O--C- 100 100 051 - 0
12 Power_Cycle_Count -O--C- 100 100 000 - 50
192 Power-Off_Retract_Count -O--C- 200 200 000 - 20
193 Load_Cycle_Count -O--C- 100 100 000 - 500
194 Temperature_Celsius -O-RC- 100 100 000 - 35
196 Reallocated_Event_Count -O--C- 200 200 000 - 0
197 Current_Pending_Sector -O--C- 200 200 000 - 0
198 Offline_Uncorrectable -O--C- 100 100 000 - 0
199 UDMA_CRC_Error_Count -O-RC- 200 200 000 - 0

5. 告警配置

配置存储告警脚本和邮件通知。

学习交流加群风哥微信: itpux-com

# 告警配置

# 1. 创建磁盘使用率告警脚本
[root@localhost ~]# cat > /usr/local/bin/disk-alert.sh << 'EOF' #!/bin/bash # script.sh # from:www.itpux.com.qq113257174.wx:itpux-com # web: http://www.fgedu.net.cn # 设置告警阈值 WARNING_THRESHOLD=80 CRITICAL_THRESHOLD=90 # 获取磁盘使用率 DISK_USAGE=$(df -h | grep -vE '^Filesystem|tmpfs|cdrom' | awk '{print $5}' | sed 's/%//g' | sort -rn | head -1) # 检查是否超过阈值 if [ "$DISK_USAGE" -ge "$CRITICAL_THRESHOLD" ]; then echo "CRITICAL: Disk usage is ${DISK_USAGE}%" echo "Disk usage is ${DISK_USAGE}%" | mail -s "CRITICAL: Disk usage alert" admin@fgedu.net.cn elif [ "$DISK_USAGE" -ge "$WARNING_THRESHOLD" ]; then echo "WARNING: Disk usage is ${DISK_USAGE}%" echo "Disk usage is ${DISK_USAGE}%" | mail -s "WARNING: Disk usage alert" admin@fgedu.net.cn else echo "OK: Disk usage is ${DISK_USAGE}%" fi EOF # 2. 设置脚本执行权限 [root@localhost ~]# chmod +x /usr/local/bin/disk-alert.sh # 3. 创建SMART告警脚本 [root@localhost ~]# cat > /usr/local/bin/smart-alert.sh << 'EOF' #!/bin/bash # 检查磁盘SMART状态 DISKS="/dev/sda /dev/sdb /dev/sdc" for disk in $DISKS; do SMART_STATUS=$(smartctl -H $disk | grep "SMART overall-health self-assessment test result" | awk '{print $6}') if [ "$SMART_STATUS" != "PASSED" ]; then echo "CRITICAL: SMART status for $disk is $SMART_STATUS" echo "SMART status for $disk is $SMART_STATUS" | mail -s "CRITICAL: SMART alert for $disk" admin@fgedu.net.cn fi done EOF # 4. 设置脚本执行权限 [root@localhost ~]# chmod +x /usr/local/bin/smart-alert.sh # 5. 创建RAID告警脚本 [root@localhost ~]# cat > /usr/local/bin/raid-alert.sh << 'EOF' #!/bin/bash # 检查RAID状态 RAID_DEVICES=$(mdadm --detail --scan | awk '{print $2}') for device in $RAID_DEVICES; do RAID_STATUS=$(cat /proc/mdstat | grep $device | awk '{print $4}') if [ "$RAID_STATUS" != "[UU]" ] && [ "$RAID_STATUS" != "[UUUU]" ]; then echo "CRITICAL: RAID status for $device is $RAID_STATUS" echo "RAID status for $device is $RAID_STATUS" | mail -s "CRITICAL: RAID alert for $device" admin@fgedu.net.cn fi done EOF # 6. 
设置脚本执行权限 [root@localhost ~]# chmod +x /usr/local/bin/raid-alert.sh # 7. 配置定时任务 [root@localhost ~]# cat > /etc/cron.d/storage-monitor << 'EOF' # 存储监控定时任务 */5 * * * * root /usr/local/bin/disk-alert.sh */30 * * * * root /usr/local/bin/smart-alert.sh */15 * * * * root /usr/local/bin/raid-alert.sh EOF # 8. 重启cron服务 [root@localhost ~]# systemctl restart crond [root@localhost ~]# systemctl status crond ● crond.service - Command Scheduler Loaded: loaded (/usr/lib/systemd/system/crond.service; enabled; preset: enabled) Active: active (running) since Fri 2026-04-03 10:00:00 CST; 30min ago Main PID: 1234 (crond) Tasks: 1 (limit: 4915) Memory: 2.5M CPU: 10ms CGroup: /system.slice/crond.service └─1234 /usr/sbin/crond -n # 9. 测试告警脚本 [root@localhost ~]# /usr/local/bin/disk-alert.sh OK: Disk usage is 50% # 10. 查看cron日志 [root@localhost ~]# tail -f /var/log/cron Apr 3 10:30:01 localhost CROND[5678]: (root) CMD (/usr/local/bin/disk-alert.sh) Apr 3 10:30:01 localhost CROND[5679]: (root) CMD (/usr/local/bin/smart-alert.sh) Apr 3 10:30:01 localhost CROND[5680]: (root) CMD (/usr/local/bin/raid-alert.sh)

6. 实战案例

配置完整的存储监控和告警系统。

# 实战案例：配置存储监控和告警系统

# 1. 创建综合监控脚本
[root@localhost ~]# cat > /usr/local/bin/storage-monitor.sh << 'EOF'
#!/bin/bash
# script.sh
# from:www.itpux.com.qq113257174.wx:itpux-com
# web: http://www.fgedu.net.cn
#
# Comprehensive storage monitoring script.

LOG_FILE="/var/log/storage-monitor.log"
ALERT_EMAIL="admin@fgedu.net.cn"

# Append a timestamped entry to LOG_FILE.
# Arguments: $1 - message text
log() {
  # Quoted so the redirect target stays one word if the path changes.
  echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" >> "$LOG_FILE"
}

#######################################
# Disk usage monitoring: alert on filesystems that are nearly full.
# Globals:   ALERT_EMAIL (read)
# Arguments: none
# Outputs:   one log() entry and one mail per filesystem whose usage is at
#            or above the warning (80%) or critical (90%) threshold
#######################################
check_disk_usage() {
  local -r warning_threshold=80
  local -r critical_threshold=90
  local usage mount

  # -P keeps each filesystem on a single line even when the device name is
  # long; the filter drops the header row and tmpfs/devtmpfs/cdrom entries.
  df -P | grep -vE '^Filesystem|tmpfs|cdrom' |
    while read -r _ _ _ _ usage mount; do
      usage=${usage%\%}
      # Skip rows whose capacity column is not a plain integer (e.g. "-").
      [[ "$usage" =~ ^[0-9]+$ ]] || continue

      if (( usage >= critical_threshold )); then
        log "CRITICAL: Disk usage for $mount is ${usage}%"
        echo "Disk usage for $mount is ${usage}%" |
          mail -s "CRITICAL: Disk usage alert" "$ALERT_EMAIL"
      elif (( usage >= warning_threshold )); then
        log "WARNING: Disk usage for $mount is ${usage}%"
        echo "Disk usage for $mount is ${usage}%" |
          mail -s "WARNING: Disk usage alert" "$ALERT_EMAIL"
      fi
    done
}

#######################################
# SMART monitoring: alert on every listed disk whose overall health verdict
# is not a pass.
# Globals:   ALERT_EMAIL (read)
# Arguments: none
# Outputs:   one log() entry and one mail per unhealthy or unreadable disk
#######################################
check_smart() {
  local -r disks=(/dev/sda /dev/sdb /dev/sdc)
  local disk status

  for disk in "${disks[@]}"; do
    # ATA/NVMe devices print "...self-assessment test result: PASSED";
    # SCSI/SAS devices print "SMART Health Status: OK".
    status=$(smartctl -H "$disk" 2>/dev/null |
      awk -F': *' '/SMART overall-health self-assessment test result|SMART Health Status/ {print $2; exit}')

    if [[ "$status" != "PASSED" && "$status" != "OK" ]]; then
      # No verdict at all (missing disk, smartctl failure) is still an
      # alert, but gets a readable placeholder instead of a blank.
      status=${status:-UNKNOWN}
      log "CRITICAL: SMART status for $disk is $status"
      echo "SMART status for $disk is $status" |
        mail -s "CRITICAL: SMART alert for $disk" "$ALERT_EMAIL"
    fi
  done
}

#######################################
# RAID monitoring: alert on every md array that is not fully healthy.
# Globals:   ALERT_EMAIL (read)
#            MDSTAT_FILE (read, optional) - status file to parse;
#            defaults to /proc/mdstat
# Arguments: none
# Outputs:   one log() entry and one mail per degraded or non-active array
#######################################
check_raid() {
  local -r mdstat=${MDSTAT_FILE:-/proc/mdstat}
  # A healthy member map consists only of "U" flags, e.g. [UU] or [UUUU].
  local -r healthy_re='^\[U+\]$'
  local device status

  # Hosts without md support have no status file; nothing to check.
  [[ -f "$mdstat" ]] || return 0

  # Each array starts with an "mdX : <state> ..." line; the member map is
  # printed on a following line, and "_" marks a failed or missing member.
  # Arrays that are not "active" are reported with their state instead.
  awk '
    /^md/ {
      dev = $1
      if ($3 != "active") { print dev, $3; dev = "" }
      next
    }
    dev != "" && match($0, /\[[U_]+\]/) {
      print dev, substr($0, RSTART, RLENGTH)
      dev = ""
    }
  ' "$mdstat" |
    while read -r device status; do
      if [[ ! "$status" =~ $healthy_re ]]; then
        log "CRITICAL: RAID status for $device is $status"
        echo "RAID status for $device is $status" |
          mail -s "CRITICAL: RAID alert for $device" "$ALERT_EMAIL"
      fi
    done
}

#######################################
# I/O performance monitoring: log devices whose utilization is 90% or more.
# Globals:   none
# Arguments: none
# Outputs:   one log() entry per busy device
#######################################
check_io_performance() {
  local -r util_threshold=90
  local device util

  # iostat prints two reports: the first is the average since boot, the
  # second covers the 1-second sample. Resetting the row buffer at every
  # "Device" header keeps only the last report. The %util column is found
  # by name because its position differs between sysstat versions, and the
  # comparison is done in awk because the values are decimals.
  # LC_ALL=C forces "." as the decimal separator.
  LC_ALL=C iostat -x 1 2 |
    awk -v limit="$util_threshold" '
      /^Device/ {
        rows = 0
        col = 0
        for (i = 1; i <= NF; i++) if ($i == "%util") col = i
        in_table = 1
        next
      }
      NF == 0 { in_table = 0; next }
      in_table && col { rows++; name[rows] = $1; busy[rows] = $col }
      END {
        for (i = 1; i <= rows; i++)
          if (busy[i] + 0 >= limit) print name[i], busy[i]
      }
    ' |
    while read -r device util; do
      log "WARNING: I/O utilization for $device is ${util}%"
    done
}

# Entry point: run every check once, bracketed by start and end log entries.
main() {
  log "Starting storage monitoring…"
  check_disk_usage
  check_smart
  check_raid
  check_io_performance
  log "Storage monitoring completed."
}

# Run the entry point.
main
EOF

# 2. 设置脚本执行权限
[root@localhost ~]# chmod +x /usr/local/bin/storage-monitor.sh

# 3. 创建日志文件
[root@localhost ~]# touch /var/log/storage-monitor.log
[root@localhost ~]# chmod 644 /var/log/storage-monitor.log

# 4. 配置定时任务
[root@localhost ~]# echo "*/10 * * * * root /usr/local/bin/storage-monitor.sh" > /etc/cron.d/storage-monitor

# 5. 重启cron服务
[root@localhost ~]# systemctl restart crond

# 6. 测试监控脚本
[root@localhost ~]# /usr/local/bin/storage-monitor.sh

# 7. 查看日志
[root@localhost ~]# tail -f /var/log/storage-monitor.log
[2026-04-03 10:30:00] Starting storage monitoring…
[2026-04-03 10:30:00] WARNING: Disk usage for /backup is 75%
[2026-04-03 10:30:00] Storage monitoring completed.

# 8. 配置日志轮转
[root@localhost ~]# cat > /etc/logrotate.d/storage-monitor << 'EOF'
/var/log/storage-monitor.log {
    daily
    rotate 7
    compress
    delaycompress
    missingok
    notifempty
    create 0644 root root
}
EOF

# 9. 测试日志轮转
[root@localhost ~]# logrotate -f /etc/logrotate.d/storage-monitor

# 10. 验证配置
[root@localhost ~]# ls -lh /var/log/storage-monitor.log*
-rw-r--r-- 1 root root 1.2K Apr 3 10:30 /var/log/storage-monitor.log
-rw-r--r-- 1 root root 500 Apr 3 10:00 /var/log/storage-monitor.log.1

提示

风哥提示：

定期检查监控日志，确保告警系统正常工作。建议将告警邮件发送到多个接收者，确保及时响应。对于关键系统，可以考虑使用专业的监控工具如Zabbix、Prometheus等。