内容大纲
内容简介:本文风哥教程参考Linux官方文档、Red Hat Enterprise Linux官方文档、Ansible Automation Platform官方文档、Docker官方文档、Kubernetes官方文档和Podman官方文档等内容,详细介绍了相关技术的配置和使用方法。
1. 性能监控告警概述
性能监控告警是通过监控系统性能指标,设置告警规则,在性能异常时及时通知管理员的过程。
学习交流加群风哥QQ113257174
# 监控指标配置:配置CPU、内存、磁盘I/O、网络等监控指标
# 告警规则配置:设置告警阈值和告警条件
# 告警通知配置:配置邮件、短信、钉钉等告警通知方式
# 监控脚本实现:编写监控脚本收集性能数据
# 告警日志管理:记录告警日志和告警历史
# 告警处理流程:制定告警处理流程和应急预案
2. 监控指标配置
from PG视频:www.itpux.com
配置系统性能监控指标。
# 1. 创建监控目录并写入监控指标配置文件(目录必须先存在,否则重定向写入会失败)
[root@localhost ~]# mkdir -p /etc/monitoring
[root@localhost ~]# cat > /etc/monitoring/metrics.conf << 'EOF'
# 监控指标配置文件
# CPU监控指标
CPU_USAGE_THRESHOLD=80
CPU_LOAD_THRESHOLD=5
CPU_CONTEXT_SWITCH_THRESHOLD=10000
# 内存监控指标
MEMORY_USAGE_THRESHOLD=80
SWAP_USAGE_THRESHOLD=50
CACHE_USAGE_THRESHOLD=2048000
# 磁盘I/O监控指标
DISK_UTIL_THRESHOLD=80
IO_WAIT_THRESHOLD=10
QUEUE_LENGTH_THRESHOLD=5
# 网络监控指标
NETWORK_UTIL_THRESHOLD=80
NETWORK_RX_THRESHOLD=10000
NETWORK_TX_THRESHOLD=10000
# 磁盘空间监控指标
DISK_USAGE_THRESHOLD=80
INODE_USAGE_THRESHOLD=80
# 进程监控指标
PROCESS_COUNT_THRESHOLD=500
ZOMBIE_PROCESS_THRESHOLD=10
EOF
# 2. 创建监控目录
[root@localhost ~]# mkdir -p /etc/monitoring
[root@localhost ~]# chmod 755 /etc/monitoring
# 3. 设置配置文件权限
[root@localhost ~]# chmod 644 /etc/monitoring/metrics.conf
# 4. 验证配置文件
[root@localhost ~]# cat /etc/monitoring/metrics.conf
# 监控指标配置文件
# CPU监控指标
CPU_USAGE_THRESHOLD=80
CPU_LOAD_THRESHOLD=5
CPU_CONTEXT_SWITCH_THRESHOLD=10000
# 内存监控指标
MEMORY_USAGE_THRESHOLD=80
SWAP_USAGE_THRESHOLD=50
CACHE_USAGE_THRESHOLD=2048000
# 磁盘I/O监控指标
DISK_UTIL_THRESHOLD=80
IO_WAIT_THRESHOLD=10
QUEUE_LENGTH_THRESHOLD=5
# 网络监控指标
NETWORK_UTIL_THRESHOLD=80
NETWORK_RX_THRESHOLD=10000
NETWORK_TX_THRESHOLD=10000
# 磁盘空间监控指标
DISK_USAGE_THRESHOLD=80
INODE_USAGE_THRESHOLD=80
# 进程监控指标
PROCESS_COUNT_THRESHOLD=500
ZOMBIE_PROCESS_THRESHOLD=10
3. 告警规则配置
配置告警规则和告警条件。
# 1. 创建告警规则配置文件
[root@localhost ~]# cat > /etc/monitoring/alerts.conf << 'EOF'
# 告警规则配置文件
# CPU告警规则
ALERT_CPU_USAGE=true
ALERT_CPU_LOAD=true
ALERT_CPU_CONTEXT_SWITCH=true
# 内存告警规则
ALERT_MEMORY_USAGE=true
ALERT_SWAP_USAGE=true
ALERT_CACHE_USAGE=false
# 磁盘I/O告警规则
ALERT_DISK_UTIL=true
ALERT_IO_WAIT=true
ALERT_QUEUE_LENGTH=true
# 网络告警规则
ALERT_NETWORK_UTIL=true
ALERT_NETWORK_RX=true
ALERT_NETWORK_TX=true
# 磁盘空间告警规则
ALERT_DISK_USAGE=true
ALERT_INODE_USAGE=true
# 进程告警规则
ALERT_PROCESS_COUNT=true
ALERT_ZOMBIE_PROCESS=true
EOF
# 2. 设置配置文件权限
[root@localhost ~]# chmod 644 /etc/monitoring/alerts.conf
# 3. 验证配置文件
[root@localhost ~]# cat /etc/monitoring/alerts.conf
# 告警规则配置文件
# CPU告警规则
ALERT_CPU_USAGE=true
ALERT_CPU_LOAD=true
ALERT_CPU_CONTEXT_SWITCH=true
# 内存告警规则
ALERT_MEMORY_USAGE=true
ALERT_SWAP_USAGE=true
ALERT_CACHE_USAGE=false
# 磁盘I/O告警规则
ALERT_DISK_UTIL=true
ALERT_IO_WAIT=true
ALERT_QUEUE_LENGTH=true
# 网络告警规则
ALERT_NETWORK_UTIL=true
ALERT_NETWORK_RX=true
ALERT_NETWORK_TX=true
# 磁盘空间告警规则
ALERT_DISK_USAGE=true
ALERT_INODE_USAGE=true
# 进程告警规则
ALERT_PROCESS_COUNT=true
ALERT_ZOMBIE_PROCESS=true
4. 告警通知配置
配置告警通知方式。
# 1. 创建告警通知配置文件
[root@localhost ~]# cat > /etc/monitoring/notifications.conf << 'EOF'
# 告警通知配置文件
# 邮件通知配置
MAIL_ENABLED=true
MAIL_SMTP_SERVER=smtp.fgedu.net.cn
MAIL_SMTP_PORT=587
MAIL_SMTP_USER=monitoring@fgedu.net.cn
MAIL_SMTP_PASSWORD=yourpassword
MAIL_FROM=monitoring@fgedu.net.cn
MAIL_TO=admin@fgedu.net.cn
# 钉钉通知配置
DINGTALK_ENABLED=true
DINGTALK_WEBHOOK=https://oapi.dingtalk.com/robot/send?access_token=yourtoken
DINGTALK_SECRET=yoursecret
# 企业微信通知配置
WECHAT_ENABLED=false
WECHAT_WEBHOOK=https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=yourkey
# 短信通知配置
SMS_ENABLED=false
SMS_API_URL=https://sms.fgedu.net.cn/api/send
SMS_API_KEY=yourapikey
SMS_PHONE=13800138000
EOF
# 2. 设置配置文件权限
[root@localhost ~]# chmod 600 /etc/monitoring/notifications.conf
# 3. 验证配置文件
[root@localhost ~]# cat /etc/monitoring/notifications.conf
# 告警通知配置文件
# 邮件通知配置
MAIL_ENABLED=true
MAIL_SMTP_SERVER=smtp.fgedu.net.cn
MAIL_SMTP_PORT=587
MAIL_SMTP_USER=monitoring@fgedu.net.cn
MAIL_SMTP_PASSWORD=yourpassword
MAIL_FROM=monitoring@fgedu.net.cn
MAIL_TO=admin@fgedu.net.cn
# 钉钉通知配置
DINGTALK_ENABLED=true
DINGTALK_WEBHOOK=https://oapi.dingtalk.com/robot/send?access_token=yourtoken
DINGTALK_SECRET=yoursecret
# 企业微信通知配置
WECHAT_ENABLED=false
WECHAT_WEBHOOK=https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=yourkey
# 短信通知配置
SMS_ENABLED=false
SMS_API_URL=https://sms.fgedu.net.cn/api/send
SMS_API_KEY=yourapikey
SMS_PHONE=13800138000
5. 监控脚本实现
编写监控脚本收集性能数据并发送告警。
学习交流加群风哥微信: itpux-com
# 1. 创建监控脚本
[root@localhost ~]# cat > /usr/local/bin/monitor.sh << 'EOF'
#!/bin/bash
# monitor.sh
# from:www.itpux.com.qq113257174.wx:itpux-com
# web: http://www.fgedu.net.cn
# Monitoring script: sources the three configuration files below, runs the
# checks enabled by the ALERT_* flags, and records results in two log files.
# Configuration files sourced by load_config (KEY=VALUE shell assignments)
METRICS_CONF="/etc/monitoring/metrics.conf"
ALERTS_CONF="/etc/monitoring/alerts.conf"
NOTIFICATIONS_CONF="/etc/monitoring/notifications.conf"
# Run log appended to by log()
LOG_FILE="/var/log/monitoring.log"
# Alert history appended to by alert_log()
ALERT_LOG="/var/log/alerts.log"
# Append one timestamped line to the run log.
# Globals:   LOG_FILE (read)
# Arguments: $1 - message text
log() {
  local stamp
  stamp=$(date '+%Y-%m-%d %H:%M:%S')
  # The redirect target is quoted so a log path containing spaces still works.
  printf '[%s] %s\n' "$stamp" "$1" >> "$LOG_FILE"
}
# Append one timestamped line to the alert history log.
# Globals:   ALERT_LOG (read)
# Arguments: $1 - alert message text
alert_log() {
  local stamp
  # Plain ASCII quotes are required here: with typographic quotes the date
  # format is split into two invalid operands and the timestamp comes out empty.
  stamp=$(date '+%Y-%m-%d %H:%M:%S')
  printf '[%s] %s\n' "$stamp" "$1" >> "$ALERT_LOG"
}
# Source the metrics, alert-rule and notification settings into this shell.
# Globals: METRICS_CONF, ALERTS_CONF, NOTIFICATIONS_CONF (read)
# Exits:   status 1, after logging an error, if any of the three files is missing
load_config() {
  # ASCII quotes matter: typographic quotes become part of the file name,
  # so the -f tests could never succeed.
  if [ -f "$METRICS_CONF" ]; then
    # shellcheck source=/dev/null
    source "$METRICS_CONF"
  else
    log "ERROR: Metrics config file not found: $METRICS_CONF"
    exit 1
  fi
  if [ -f "$ALERTS_CONF" ]; then
    # shellcheck source=/dev/null
    source "$ALERTS_CONF"
  else
    log "ERROR: Alerts config file not found: $ALERTS_CONF"
    exit 1
  fi
  if [ -f "$NOTIFICATIONS_CONF" ]; then
    # shellcheck source=/dev/null
    source "$NOTIFICATIONS_CONF"
  else
    log "ERROR: Notifications config file not found: $NOTIFICATIONS_CONF"
    exit 1
  fi
}
# Raise an alert when the CPU usage figure reported by top exceeds the threshold.
# Globals: ALERT_CPU_USAGE, CPU_USAGE_THRESHOLD (read)
#          CPU_USAGE, CPU_USAGE_INT, ALERT_MSG (written)
# Returns: 1 when no usable reading could be obtained, otherwise 0
check_cpu_usage() {
  if [ "$ALERT_CPU_USAGE" = "true" ]; then
    # LC_ALL=C keeps the decimal separator a dot so the fraction can be cut off below.
    CPU_USAGE=$(LC_ALL=C top -bn1 | grep "Cpu(s)" | awk '{print $2}' | sed 's/us,//')
    CPU_USAGE_INT=${CPU_USAGE%.*}
    # An empty or non-numeric reading would make the -gt test below fail with
    # "integer expression expected"; report it once and skip the comparison.
    if ! [[ "$CPU_USAGE_INT" =~ ^[0-9]+$ ]]; then
      log "ERROR: Unable to read CPU usage from top"
      return 1
    fi
    if [ "$CPU_USAGE_INT" -gt "$CPU_USAGE_THRESHOLD" ]; then
      ALERT_MSG="WARNING: CPU usage is ${CPU_USAGE}% (threshold: ${CPU_USAGE_THRESHOLD}%)"
      log "$ALERT_MSG"
      alert_log "$ALERT_MSG"
      send_alert "$ALERT_MSG"
    fi
  fi
}
# Raise an alert when used/total memory (as a percentage) exceeds the threshold.
# Globals: ALERT_MEMORY_USAGE, MEMORY_USAGE_THRESHOLD (read)
#          MEMORY_USAGE, ALERT_MSG (written)
# Returns: 1 when no usable reading could be obtained, otherwise 0
check_memory_usage() {
  if [ "$ALERT_MEMORY_USAGE" = "true" ]; then
    # LC_ALL=C forces the English "Mem:" row label; a localized free prints a
    # translated label that the pattern would not match.
    MEMORY_USAGE=$(LC_ALL=C free | awk '/^Mem:/ && $2 > 0 {printf "%.0f", $3 / $2 * 100.0}')
    # Guard against an empty reading before the integer comparison.
    if ! [[ "$MEMORY_USAGE" =~ ^[0-9]+$ ]]; then
      log "ERROR: Unable to read memory usage from free"
      return 1
    fi
    if [ "$MEMORY_USAGE" -gt "$MEMORY_USAGE_THRESHOLD" ]; then
      ALERT_MSG="WARNING: Memory usage is ${MEMORY_USAGE}% (threshold: ${MEMORY_USAGE_THRESHOLD}%)"
      log "$ALERT_MSG"
      alert_log "$ALERT_MSG"
      send_alert "$ALERT_MSG"
    fi
  fi
}
# Raise an alert when a disk's %util from iostat exceeds the threshold.
# Globals: ALERT_DISK_UTIL, DISK_UTIL_THRESHOLD (read)
#          DISK_DEVICE (read, optional; device name, defaults to sda)
#          DISK_UTIL, DISK_UTIL_INT, ALERT_MSG (written)
# Returns: 1 when no usable reading could be obtained, otherwise 0
check_disk_io() {
  if [ "$ALERT_DISK_UTIL" = "true" ]; then
    local device=${DISK_DEVICE:-sda}
    # %util is taken as the last column ($NF) because the number of columns
    # differs between sysstat releases, so a fixed index such as $14 is unreliable.
    # The last matching row is the second (interval) report, not the since-boot one.
    DISK_UTIL=$(LC_ALL=C iostat -x 1 2 \
      | awk -v dev="$device" '$1 == dev {util = $NF} END {print util}')
    DISK_UTIL_INT=${DISK_UTIL%.*}
    # Empty when iostat is missing or the device is not listed.
    if ! [[ "$DISK_UTIL_INT" =~ ^[0-9]+$ ]]; then
      log "ERROR: Unable to read disk utilization for ${device} from iostat"
      return 1
    fi
    if [ "$DISK_UTIL_INT" -gt "$DISK_UTIL_THRESHOLD" ]; then
      ALERT_MSG="WARNING: Disk utilization is ${DISK_UTIL}% (threshold: ${DISK_UTIL_THRESHOLD}%)"
      log "$ALERT_MSG"
      alert_log "$ALERT_MSG"
      send_alert "$ALERT_MSG"
    fi
  fi
}
# Raise an alert when an interface's %ifutil from sar exceeds the threshold.
# Globals: ALERT_NETWORK_UTIL, NETWORK_UTIL_THRESHOLD (read)
#          NETWORK_INTERFACE (read, optional; interface name, defaults to eth0)
#          NET_UTIL, NET_UTIL_INT, ALERT_MSG (written)
# Returns: 1 when no usable reading could be obtained, otherwise 0
check_network() {
  if [ "$ALERT_NETWORK_UTIL" = "true" ]; then
    local iface=${NETWORK_INTERFACE:-eth0}
    # The %ifutil column is located from the "Average:" header row instead of
    # a fixed index: field 8 of that row is txcmp/s, not utilization.
    # LC_ALL=C keeps the "Average:" label and the decimal point untranslated.
    NET_UTIL=$(LC_ALL=C sar -n DEV 1 2 | awk -v dev="$iface" '
      $1 == "Average:" && $2 == "IFACE" {
        for (i = 3; i <= NF; i++) if ($i == "%ifutil") col = i
        next
      }
      $1 == "Average:" && $2 == dev && col { print $col }
    ')
    NET_UTIL_INT=${NET_UTIL%.*}
    # Empty when sar is missing, the interface is not listed, or this sysstat
    # build prints no %ifutil column.
    if ! [[ "$NET_UTIL_INT" =~ ^[0-9]+$ ]]; then
      log "ERROR: Unable to read network utilization for ${iface} from sar"
      return 1
    fi
    if [ "$NET_UTIL_INT" -gt "$NETWORK_UTIL_THRESHOLD" ]; then
      ALERT_MSG="WARNING: Network utilization is ${NET_UTIL}% (threshold: ${NETWORK_UTIL_THRESHOLD}%)"
      log "$ALERT_MSG"
      alert_log "$ALERT_MSG"
      send_alert "$ALERT_MSG"
    fi
  fi
}
# Raise an alert when the root filesystem's used-space percentage exceeds the threshold.
# Globals: ALERT_DISK_USAGE, DISK_USAGE_THRESHOLD (read)
#          DISK_USAGE, ALERT_MSG (written)
# Returns: 1 when no usable reading could be obtained, otherwise 0
check_disk_space() {
  if [ "$ALERT_DISK_USAGE" = "true" ]; then
    # -P gives one line per filesystem; without it a long device name can wrap
    # onto its own line and shift the capacity out of field 5.
    DISK_USAGE=$(df -P / | awk 'NR == 2 {sub(/%/, "", $5); print $5}')
    # Guard against an empty reading before the integer comparison.
    if ! [[ "$DISK_USAGE" =~ ^[0-9]+$ ]]; then
      log "ERROR: Unable to read disk usage for / from df"
      return 1
    fi
    if [ "$DISK_USAGE" -gt "$DISK_USAGE_THRESHOLD" ]; then
      ALERT_MSG="WARNING: Disk usage is ${DISK_USAGE}% (threshold: ${DISK_USAGE_THRESHOLD}%)"
      log "$ALERT_MSG"
      alert_log "$ALERT_MSG"
      send_alert "$ALERT_MSG"
    fi
  fi
}
# Deliver an alert through every enabled notification channel.
# Globals:   MAIL_ENABLED, MAIL_TO, DINGTALK_ENABLED, DINGTALK_WEBHOOK (read)
# Arguments: $1 - alert message text
# Each channel logs success or failure; a failed channel does not stop the others.
send_alert() {
  local ALERT_MSG=$1
  local json_msg
  local -a recipients
  # Email channel
  if [ "$MAIL_ENABLED" = "true" ]; then
    # Split MAIL_TO on whitespace so several recipients still work, without
    # exposing the value to glob expansion.
    read -r -a recipients <<< "$MAIL_TO"
    if printf '%s\n' "$ALERT_MSG" | mail -s "System Alert" "${recipients[@]}"; then
      log "Email alert sent to $MAIL_TO"
    else
      log "ERROR: Failed to send email alert to $MAIL_TO"
    fi
  fi
  # DingTalk channel
  # NOTE(review): DINGTALK_SECRET is configured but not used here; a robot with
  # signature verification enabled would presumably reject this request — confirm.
  if [ "$DINGTALK_ENABLED" = "true" ]; then
    # Escape backslashes and double quotes so the message cannot break the JSON body.
    json_msg=$(printf '%s' "$ALERT_MSG" | sed -e 's/\\/\\\\/g' -e 's/"/\\"/g')
    # --max-time keeps an unreachable webhook from stalling the monitoring run.
    if curl -sS --max-time 10 -X POST "$DINGTALK_WEBHOOK" \
      -H 'Content-Type: application/json' \
      -d "{\"msgtype\": \"text\", \"text\": {\"content\": \"${json_msg}\"}}"; then
      log "DingTalk alert sent"
    else
      log "ERROR: Failed to send DingTalk alert"
    fi
  fi
}
# Entry point: load the configuration, then run every check in sequence.
# A progress line is logged before each step so the run log shows how far
# a run got.
main() {
  # ASCII quotes keep each message a single argument; with typographic quotes
  # the text is split on spaces and log() only receives the first word.
  log "Starting monitoring..."
  log "Loading configuration files..."
  load_config
  log "Checking CPU usage..."
  check_cpu_usage
  log "Checking memory usage..."
  check_memory_usage
  log "Checking disk I/O..."
  check_disk_io
  log "Checking network..."
  check_network
  log "Checking disk space..."
  check_disk_space
  log "Monitoring completed."
}
# Run the main function
main
EOF
# 2. 设置脚本执行权限
[root@localhost ~]# chmod +x /usr/local/bin/monitor.sh
# 3. 创建日志文件
[root@localhost ~]# touch /var/log/monitoring.log
[root@localhost ~]# touch /var/log/alerts.log
[root@localhost ~]# chmod 644 /var/log/monitoring.log
[root@localhost ~]# chmod 644 /var/log/alerts.log
# 4. 测试监控脚本
[root@localhost ~]# /usr/local/bin/monitor.sh
# 5. 查看日志
[root@localhost ~]# tail -f /var/log/monitoring.log
[2026-04-03 10:00:00] Starting monitoring...
[2026-04-03 10:00:01] Loading configuration files...
[2026-04-03 10:00:02] Checking CPU usage...
[2026-04-03 10:00:03] Checking memory usage...
[2026-04-03 10:00:04] Checking disk I/O...
[2026-04-03 10:00:05] Checking network...
[2026-04-03 10:00:06] Checking disk space...
[2026-04-03 10:00:07] Monitoring completed.
6. 实战案例
性能监控告警实战案例。
更多视频教程www.fgedu.net.cn
# 1. 创建监控服务
[root@localhost ~]# cat > /etc/systemd/system/monitoring.service << 'EOF'
# One-shot unit that performs a single monitoring run; started periodically
# by monitoring.timer.
[Unit]
Description=System Monitoring
After=network.target
[Service]
Type=oneshot
ExecStart=/usr/local/bin/monitor.sh
# RemainAfterExit is deliberately not set: with it the unit would stay
# "active" after the first run, and a timer does not restart a unit that is
# already active, so the periodic runs would never happen.
[Install]
WantedBy=multi-user.target
EOF
# 2. 创建监控定时器
[root@localhost ~]# cat > /etc/systemd/system/monitoring.timer << 'EOF'
# Timer unit; it activates the service unit of the same name (monitoring.service).
[Unit]
Description=Run monitoring every 5 minutes
[Timer]
# Fire at minute 0, 5, 10, ... of every hour
OnCalendar=*:0/5
# Record the last trigger time on disk so a run missed while the timer was
# inactive is triggered as soon as the timer is started again
Persistent=true
[Install]
WantedBy=timers.target
EOF
# 3. 启用并启动监控定时器
[root@localhost ~]# systemctl enable --now monitoring.timer
Created symlink /etc/systemd/system/timers.target.wants/monitoring.timer → /etc/systemd/system/monitoring.timer.
# 4. 查看定时器状态
[root@localhost ~]# systemctl status monitoring.timer
● monitoring.timer - Run monitoring every 5 minutes
Loaded: loaded (/etc/systemd/system/monitoring.timer; enabled; preset: disabled)
Active: active (waiting) since Fri 2026-04-03 10:00:00 CST; 5s ago
Trigger: Fri 2026-04-03 10:05:00 CST; 4min 55s left
Triggers: ● monitoring.service
Apr 03 10:00:00 localhost systemd[1]: Started Run monitoring every 5 minutes.
# 5. 查看服务状态
[root@localhost ~]# systemctl status monitoring.service
● monitoring.service - System Monitoring
Loaded: loaded (/etc/systemd/system/monitoring.service; disabled; preset: disabled)
Active: inactive (dead) since Fri 2026-04-03 10:00:00 CST; 5s ago
Process: 12345 ExecStart=/usr/local/bin/monitor.sh (code=exited, status=0/SUCCESS)
Main PID: 12345 (code=exited, status=0/SUCCESS)
Apr 03 10:00:00 localhost systemd[1]: Starting System Monitoring...
Apr 03 10:00:00 localhost monitor.sh[12345]: [2026-04-03 10:00:00] Starting monitoring...
Apr 03 10:00:00 localhost monitor.sh[12345]: [2026-04-03 10:00:01] Loading configuration files...
Apr 03 10:00:00 localhost monitor.sh[12345]: [2026-04-03 10:00:02] Checking CPU usage...
Apr 03 10:00:00 localhost monitor.sh[12345]: [2026-04-03 10:00:03] Checking memory usage...
Apr 03 10:00:00 localhost monitor.sh[12345]: [2026-04-03 10:00:04] Checking disk I/O...
Apr 03 10:00:00 localhost monitor.sh[12345]: [2026-04-03 10:00:05] Checking network...
Apr 03 10:00:00 localhost monitor.sh[12345]: [2026-04-03 10:00:06] Checking disk space...
Apr 03 10:00:00 localhost monitor.sh[12345]: [2026-04-03 10:00:07] Monitoring completed.
Apr 03 10:00:00 localhost systemd[1]: Finished System Monitoring.
# 6. 查看定时器列表
[root@localhost ~]# systemctl list-timers
NEXT LEFT LAST PASSED UNIT ACTIVATES
Fri 2026-04-03 10:05:00 CST 4min 55s left Fri 2026-04-03 10:00:00 CST 5s ago monitoring.timer monitoring.service
Fri 2026-04-03 10:15:00 CST 14min 55s left Fri 2026-04-03 10:00:00 CST 5s ago systemd-tmpfiles-clean.timer systemd-tmpfiles-clean.service
2 timers listed.
# 7. 手动触发监控
[root@localhost ~]# systemctl start monitoring.service
# 8. 查看告警日志
[root@localhost ~]# tail -f /var/log/alerts.log
[2026-04-03 10:00:00] WARNING: CPU usage is 85% (threshold: 80%)
[2026-04-03 10:00:00] WARNING: Memory usage is 85% (threshold: 80%)
[2026-04-03 10:00:00] WARNING: Disk usage is 85% (threshold: 80%)
# 9. 创建告警统计脚本
[root@localhost ~]# cat > /usr/local/bin/alert-stats.sh << 'EOF'
#!/bin/bash
# alert-stats.sh
# from:www.itpux.com.qq113257174.wx:itpux-com
# web: http://www.fgedu.net.cn
# Alert statistics script: counts the lines of the alert log, in total and
# per alert category, then writes and prints a summary report.
ALERT_LOG="/var/log/alerts.log"
STATS_FILE="/var/log/alert-stats.txt"
# Stop early with a clear message instead of producing a report of empty counts.
if [ ! -r "$ALERT_LOG" ]; then
  printf 'ERROR: alert log not readable: %s\n' "$ALERT_LOG" >&2
  exit 1
fi
# Total number of alert lines
ALERT_COUNT=$(wc -l < "$ALERT_LOG")
# Per-category counts; grep -c prints 0 when nothing matches
CPU_ALERTS=$(grep -c "CPU usage" "$ALERT_LOG")
MEMORY_ALERTS=$(grep -c "Memory usage" "$ALERT_LOG")
DISK_IO_ALERTS=$(grep -c "Disk utilization" "$ALERT_LOG")
NETWORK_ALERTS=$(grep -c "Network utilization" "$ALERT_LOG")
DISK_SPACE_ALERTS=$(grep -c "Disk usage" "$ALERT_LOG")
# Write the report. The delimiter must not be EOF: this script is itself
# created through "cat > ... << 'EOF'", and a line consisting of EOF inside it
# would end that outer here-document early and truncate the script.
cat > "$STATS_FILE" << REPORT
=== Alert Statistics Report ===
Generated: $(date '+%Y-%m-%d %H:%M:%S')
Total Alerts: $ALERT_COUNT
CPU Alerts: $CPU_ALERTS
Memory Alerts: $MEMORY_ALERTS
Disk I/O Alerts: $DISK_IO_ALERTS
Network Alerts: $NETWORK_ALERTS
Disk Space Alerts: $DISK_SPACE_ALERTS
REPORT
echo "Alert statistics report saved to $STATS_FILE"
cat "$STATS_FILE"
EOF
# 10. 设置脚本执行权限
[root@localhost ~]# chmod +x /usr/local/bin/alert-stats.sh
# 11. 运行告警统计
[root@localhost ~]# /usr/local/bin/alert-stats.sh
Alert statistics report saved to /var/log/alert-stats.txt
=== Alert Statistics Report ===
Generated: 2026-04-03 10:00:00
Total Alerts: 3
CPU Alerts: 1
Memory Alerts: 1
Disk I/O Alerts: 0
Network Alerts: 0
Disk Space Alerts: 1
提示
风哥提示:
性能监控告警需要根据实际应用场景配置合适的监控指标和告警阈值。建议定期检查告警日志,分析告警原因,及时处理告警事件。对于频繁告警的指标,需要调整告警阈值或优化系统性能。
更多学习教程公众号风哥教程itpux_com
