1. 首页 > Linux教程 > 正文

Linux教程FG176-性能监控告警

内容大纲

内容简介:本文风哥教程参考Linux官方文档、Red Hat Enterprise Linux官方文档等内容,详细介绍了Linux性能监控告警的配置和使用方法,包括监控指标配置、告警规则配置、告警通知配置、监控脚本实现,以及通过systemd定时器定期执行监控的实战案例。

1. 性能监控告警概述

性能监控告警是通过监控系统性能指标,设置告警规则,在性能异常时及时通知管理员的过程。

学习交流加群风哥QQ113257174

# 性能监控告警的主要内容
# 监控指标配置:配置CPU、内存、磁盘I/O、网络等监控指标
# 告警规则配置:设置告警阈值和告警条件
# 告警通知配置:配置邮件、短信、钉钉等告警通知方式
# 监控脚本实现:编写监控脚本收集性能数据
# 告警日志管理:记录告警日志和告警历史
# 告警处理流程:制定告警处理流程和应急预案

2. 监控指标配置

from PG视频:www.itpux.com

配置系统性能监控指标。

# 监控指标配置

# 1. Create the monitoring directory first; the redirect in step 2 fails
#    if /etc/monitoring does not exist yet
[root@localhost ~]# mkdir -p /etc/monitoring
[root@localhost ~]# chmod 755 /etc/monitoring

# 2. Create the metrics threshold config file
[root@localhost ~]# cat > /etc/monitoring/metrics.conf << 'EOF'
# 监控指标配置文件

# CPU监控指标
CPU_USAGE_THRESHOLD=80
CPU_LOAD_THRESHOLD=5
CPU_CONTEXT_SWITCH_THRESHOLD=10000

# 内存监控指标
MEMORY_USAGE_THRESHOLD=80
SWAP_USAGE_THRESHOLD=50
CACHE_USAGE_THRESHOLD=2048000

# 磁盘I/O监控指标
DISK_UTIL_THRESHOLD=80
IO_WAIT_THRESHOLD=10
QUEUE_LENGTH_THRESHOLD=5

# 网络监控指标
NETWORK_UTIL_THRESHOLD=80
NETWORK_RX_THRESHOLD=10000
NETWORK_TX_THRESHOLD=10000

# 磁盘空间监控指标
DISK_USAGE_THRESHOLD=80
INODE_USAGE_THRESHOLD=80

# 进程监控指标
PROCESS_COUNT_THRESHOLD=500
ZOMBIE_PROCESS_THRESHOLD=10
EOF

# 3. Set the config file permissions
[root@localhost ~]# chmod 644 /etc/monitoring/metrics.conf

# 4. Verify the config file
[root@localhost ~]# cat /etc/monitoring/metrics.conf
# 监控指标配置文件

# CPU监控指标
CPU_USAGE_THRESHOLD=80
CPU_LOAD_THRESHOLD=5
CPU_CONTEXT_SWITCH_THRESHOLD=10000

# 内存监控指标
MEMORY_USAGE_THRESHOLD=80
SWAP_USAGE_THRESHOLD=50
CACHE_USAGE_THRESHOLD=2048000

# 磁盘I/O监控指标
DISK_UTIL_THRESHOLD=80
IO_WAIT_THRESHOLD=10
QUEUE_LENGTH_THRESHOLD=5

# 网络监控指标
NETWORK_UTIL_THRESHOLD=80
NETWORK_RX_THRESHOLD=10000
NETWORK_TX_THRESHOLD=10000

# 磁盘空间监控指标
DISK_USAGE_THRESHOLD=80
INODE_USAGE_THRESHOLD=80

# 进程监控指标
PROCESS_COUNT_THRESHOLD=500
ZOMBIE_PROCESS_THRESHOLD=10

3. 告警规则配置

配置告警规则和告警条件。

# 告警规则配置

# 1. Create the alert rule config file (one switch per metric; "true" enables the check)
[root@localhost ~]# cat > /etc/monitoring/alerts.conf << 'EOF'
# 告警规则配置文件

# CPU告警规则
ALERT_CPU_USAGE=true
ALERT_CPU_LOAD=true
ALERT_CPU_CONTEXT_SWITCH=true

# 内存告警规则
ALERT_MEMORY_USAGE=true
ALERT_SWAP_USAGE=true
ALERT_CACHE_USAGE=false

# 磁盘I/O告警规则
ALERT_DISK_UTIL=true
ALERT_IO_WAIT=true
ALERT_QUEUE_LENGTH=true

# 网络告警规则
ALERT_NETWORK_UTIL=true
ALERT_NETWORK_RX=true
ALERT_NETWORK_TX=true

# 磁盘空间告警规则
ALERT_DISK_USAGE=true
ALERT_INODE_USAGE=true

# 进程告警规则
ALERT_PROCESS_COUNT=true
ALERT_ZOMBIE_PROCESS=true
EOF

# 2. Set the config file permissions
[root@localhost ~]# chmod 644 /etc/monitoring/alerts.conf

# 3. Verify the config file (output must match what step 1 wrote)
[root@localhost ~]# cat /etc/monitoring/alerts.conf
# 告警规则配置文件

# CPU告警规则
ALERT_CPU_USAGE=true
ALERT_CPU_LOAD=true
ALERT_CPU_CONTEXT_SWITCH=true

# 内存告警规则
ALERT_MEMORY_USAGE=true
ALERT_SWAP_USAGE=true
ALERT_CACHE_USAGE=false

# 磁盘I/O告警规则
ALERT_DISK_UTIL=true
ALERT_IO_WAIT=true
ALERT_QUEUE_LENGTH=true

# 网络告警规则
ALERT_NETWORK_UTIL=true
ALERT_NETWORK_RX=true
ALERT_NETWORK_TX=true

# 磁盘空间告警规则
ALERT_DISK_USAGE=true
ALERT_INODE_USAGE=true

# 进程告警规则
ALERT_PROCESS_COUNT=true
ALERT_ZOMBIE_PROCESS=true

4. 告警通知配置

配置告警通知方式。

# 告警通知配置

# 1. Create the notification config file.
#    The file holds credentials, so create it empty with mode 600 first;
#    the redirect below then writes into the already-restricted file.
#    URL values are quoted because the monitoring script loads this file with `source`.
[root@localhost ~]# install -m 600 /dev/null /etc/monitoring/notifications.conf
[root@localhost ~]# cat > /etc/monitoring/notifications.conf << 'EOF'
# 告警通知配置文件

# 邮件通知配置
MAIL_ENABLED=true
MAIL_SMTP_SERVER=smtp.fgedu.net.cn
MAIL_SMTP_PORT=587
MAIL_SMTP_USER=monitoring@fgedu.net.cn
MAIL_SMTP_PASSWORD=yourpassword
MAIL_FROM=monitoring@fgedu.net.cn
MAIL_TO=admin@fgedu.net.cn

# 钉钉通知配置
DINGTALK_ENABLED=true
DINGTALK_WEBHOOK="https://oapi.dingtalk.com/robot/send?access_token=yourtoken"
DINGTALK_SECRET=yoursecret

# 企业微信通知配置
WECHAT_ENABLED=false
WECHAT_WEBHOOK="https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=yourkey"

# 短信通知配置
SMS_ENABLED=false
SMS_API_URL="https://sms.fgedu.net.cn/api/send"
SMS_API_KEY=yourapikey
SMS_PHONE=13800138000
EOF

# 2. Make sure only root can read the file
[root@localhost ~]# chmod 600 /etc/monitoring/notifications.conf

# 3. Verify the config file
[root@localhost ~]# cat /etc/monitoring/notifications.conf
# 告警通知配置文件

# 邮件通知配置
MAIL_ENABLED=true
MAIL_SMTP_SERVER=smtp.fgedu.net.cn
MAIL_SMTP_PORT=587
MAIL_SMTP_USER=monitoring@fgedu.net.cn
MAIL_SMTP_PASSWORD=yourpassword
MAIL_FROM=monitoring@fgedu.net.cn
MAIL_TO=admin@fgedu.net.cn

# 钉钉通知配置
DINGTALK_ENABLED=true
DINGTALK_WEBHOOK="https://oapi.dingtalk.com/robot/send?access_token=yourtoken"
DINGTALK_SECRET=yoursecret

# 企业微信通知配置
WECHAT_ENABLED=false
WECHAT_WEBHOOK="https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=yourkey"

# 短信通知配置
SMS_ENABLED=false
SMS_API_URL="https://sms.fgedu.net.cn/api/send"
SMS_API_KEY=yourapikey
SMS_PHONE=13800138000

5. 监控脚本实现

编写监控脚本收集性能数据并发送告警。

学习交流加群风哥微信: itpux-com

# 监控脚本实现

# 1. Create the monitoring script
[root@localhost ~]# cat > /usr/local/bin/monitor.sh << 'EOF'
#!/bin/bash
# script.sh
# from:www.itpux.com.qq113257174.wx:itpux-com
# web: http://www.fgedu.net.cn
#
# Monitoring script: loads the three config files under /etc/monitoring,
# runs each enabled check and sends an alert when a threshold is exceeded.

METRICS_CONF="/etc/monitoring/metrics.conf"
ALERTS_CONF="/etc/monitoring/alerts.conf"
NOTIFICATIONS_CONF="/etc/monitoring/notifications.conf"
LOG_FILE="/var/log/monitoring.log"
ALERT_LOG="/var/log/alerts.log"

# Append a timestamped line to the general monitoring log.
# Globals:   LOG_FILE (read)
# Arguments: $1 - message text
log() {
  echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" >> "$LOG_FILE"
}

# Append a timestamped line to the alert log.
# Globals:   ALERT_LOG (read)
# Arguments: $1 - alert message text
alert_log() {
  # ASCII quotes are required: with typographic quotes the date format is
  # split into two words and date(1) rejects it.
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$1" >> "$ALERT_LOG"
}

# Load the metrics, alert-rule and notification config files into the
# current shell, in that order.
# Globals:   METRICS_CONF, ALERTS_CONF, NOTIFICATIONS_CONF (read)
# Outputs:   logs an ERROR line through log() for the first missing file
# Exits:     with status 1 as soon as one of the files is missing
load_config() {
  if [ -f "$METRICS_CONF" ]; then
    source "$METRICS_CONF"
  else
    log "ERROR: Metrics config file not found: $METRICS_CONF"
    exit 1
  fi

  if [ -f "$ALERTS_CONF" ]; then
    source "$ALERTS_CONF"
  else
    log "ERROR: Alerts config file not found: $ALERTS_CONF"
    exit 1
  fi

  if [ -f "$NOTIFICATIONS_CONF" ]; then
    source "$NOTIFICATIONS_CONF"
  else
    log "ERROR: Notifications config file not found: $NOTIFICATIONS_CONF"
    exit 1
  fi
}

# Alert when the user-space CPU percentage reported by top exceeds the
# configured threshold.
# Globals:   ALERT_CPU_USAGE, CPU_USAGE_THRESHOLD (read)
# Outputs:   on breach, passes the warning to log(), alert_log() and send_alert();
#            logs an ERROR through log() when top's output cannot be parsed
# Returns:   1 when the CPU value cannot be parsed, 0 otherwise
check_cpu_usage() {
  local cpu_usage cpu_usage_int alert_msg

  [ "$ALERT_CPU_USAGE" = "true" ] || return 0

  # Drop everything up to the colon so the value is always field 1, even
  # when top prints it flush against the colon (e.g. "%Cpu(s):100.0 us,").
  # Trailing non-numeric text (e.g. "%us," in older layouts) is stripped.
  cpu_usage=$(LC_ALL=C top -bn1 | awk '
    !seen && /Cpu\(s\)/ {
      sub(/^[^:]*:/, "")
      value = $1
      sub(/[^0-9.].*$/, "", value)
      print value
      seen = 1
    }')
  cpu_usage_int=${cpu_usage%.*}

  # Refuse to compare anything that is not a plain non-negative integer.
  case "$cpu_usage_int" in
    '' | *[!0-9]*)
      log "ERROR: Unable to read CPU usage from top"
      return 1
      ;;
  esac

  if [ "$cpu_usage_int" -gt "$CPU_USAGE_THRESHOLD" ]; then
    alert_msg="WARNING: CPU usage is ${cpu_usage}% (threshold: ${CPU_USAGE_THRESHOLD}%)"
    log "$alert_msg"
    alert_log "$alert_msg"
    send_alert "$alert_msg"
  fi
}

# Alert when used/total memory (as reported on the "Mem:" line of free)
# exceeds the configured percentage.
# Globals:   ALERT_MEMORY_USAGE, MEMORY_USAGE_THRESHOLD (read)
# Outputs:   on breach, passes the warning to log(), alert_log() and send_alert();
#            logs an ERROR through log() when free's output cannot be parsed
# Returns:   1 when the memory value cannot be parsed, 0 otherwise
check_memory_usage() {
  local memory_usage alert_msg

  [ "$ALERT_MEMORY_USAGE" = "true" ] || return 0

  # LC_ALL=C keeps the "Mem:" label untranslated; the $2 > 0 guard avoids a
  # division by zero if the total column is ever reported as 0.
  memory_usage=$(LC_ALL=C free | awk '/^Mem:/ && $2 > 0 { printf "%.0f", $3 / $2 * 100 }')

  # Refuse to compare anything that is not a plain non-negative integer.
  case "$memory_usage" in
    '' | *[!0-9]*)
      log "ERROR: Unable to read memory usage from free"
      return 1
      ;;
  esac

  if [ "$memory_usage" -gt "$MEMORY_USAGE_THRESHOLD" ]; then
    alert_msg="WARNING: Memory usage is ${memory_usage}% (threshold: ${MEMORY_USAGE_THRESHOLD}%)"
    log "$alert_msg"
    alert_log "$alert_msg"
    send_alert "$alert_msg"
  fi
}

# Alert when the %util value of one block device exceeds the configured
# threshold.
# Globals:   ALERT_DISK_UTIL, DISK_UTIL_THRESHOLD (read)
#            MONITOR_DISK_DEVICE (read, optional) - device name as printed by
#            iostat; defaults to "sda"
# Outputs:   on breach, passes the warning to log(), alert_log() and send_alert();
#            logs an ERROR through log() when no value is found for the device
# Returns:   1 when the utilization cannot be parsed, 0 otherwise
check_disk_io() {
  local device=${MONITOR_DISK_DEVICE:-sda}
  local disk_util disk_util_int alert_msg

  [ "$ALERT_DISK_UTIL" = "true" ] || return 0

  # Two reports are requested; the first one covers the time since boot, so
  # the value from the LAST matching line is used. %util is read as the last
  # column instead of a fixed column number, because the number of columns
  # printed by `iostat -x` differs between sysstat releases.
  disk_util=$(LC_ALL=C iostat -x 1 2 \
    | awk -v dev="$device" '$1 == dev { util = $NF } END { print util }')
  disk_util_int=${disk_util%.*}

  # Refuse to compare anything that is not a plain non-negative integer.
  case "$disk_util_int" in
    '' | *[!0-9]*)
      log "ERROR: Unable to read disk utilization for device $device"
      return 1
      ;;
  esac

  if [ "$disk_util_int" -gt "$DISK_UTIL_THRESHOLD" ]; then
    alert_msg="WARNING: Disk utilization is ${disk_util}% (threshold: ${DISK_UTIL_THRESHOLD}%)"
    log "$alert_msg"
    alert_log "$alert_msg"
    send_alert "$alert_msg"
  fi
}

# Alert when the interface utilization reported by `sar -n DEV` exceeds the
# configured threshold.
# Globals:   ALERT_NETWORK_UTIL, NETWORK_UTIL_THRESHOLD (read)
#            MONITOR_NET_IFACE (read, optional) - interface name as printed by
#            sar; defaults to "eth0"
# Outputs:   on breach, passes the warning to log(), alert_log() and send_alert();
#            logs an ERROR through log() when no value is found for the interface
# Returns:   1 when the utilization cannot be parsed, 0 otherwise
check_network() {
  local iface=${MONITOR_NET_IFACE:-eth0}
  local net_util net_util_int alert_msg

  [ "$ALERT_NETWORK_UTIL" = "true" ] || return 0

  # The interface name is field 2 on "Average:" and 24-hour lines and field 3
  # when the timestamp carries AM/PM, so a fixed column number is unreliable.
  # The last matching line (the "Average:" summary) wins, and the utilization
  # is read as the last column.
  # NOTE(review): assumes a sysstat release whose `sar -n DEV` output ends
  # with the %ifutil column - confirm on the target distribution.
  net_util=$(LC_ALL=C sar -n DEV 1 2 \
    | awk -v iface="$iface" '$2 == iface || $3 == iface { util = $NF } END { print util }')
  net_util_int=${net_util%.*}

  # Refuse to compare anything that is not a plain non-negative integer.
  case "$net_util_int" in
    '' | *[!0-9]*)
      log "ERROR: Unable to read network utilization for interface $iface"
      return 1
      ;;
  esac

  if [ "$net_util_int" -gt "$NETWORK_UTIL_THRESHOLD" ]; then
    alert_msg="WARNING: Network utilization is ${net_util}% (threshold: ${NETWORK_UTIL_THRESHOLD}%)"
    log "$alert_msg"
    alert_log "$alert_msg"
    send_alert "$alert_msg"
  fi
}

# Alert when the used-space percentage of the root filesystem exceeds the
# configured threshold.
# Globals:   ALERT_DISK_USAGE, DISK_USAGE_THRESHOLD (read)
# Outputs:   on breach, passes the warning to log(), alert_log() and send_alert();
#            logs an ERROR through log() when df's output cannot be parsed
# Returns:   1 when the usage value cannot be parsed, 0 otherwise
check_disk_space() {
  local disk_usage alert_msg

  [ "$ALERT_DISK_USAGE" = "true" ] || return 0

  # -P selects the POSIX output format, which keeps every filesystem on a
  # single line so the capacity is always field 5 of line 2.
  disk_usage=$(LC_ALL=C df -P / | awk 'NR == 2 { sub(/%/, "", $5); print $5 }')

  # Refuse to compare anything that is not a plain non-negative integer.
  case "$disk_usage" in
    '' | *[!0-9]*)
      log "ERROR: Unable to read disk usage from df"
      return 1
      ;;
  esac

  if [ "$disk_usage" -gt "$DISK_USAGE_THRESHOLD" ]; then
    alert_msg="WARNING: Disk usage is ${disk_usage}% (threshold: ${DISK_USAGE_THRESHOLD}%)"
    log "$alert_msg"
    alert_log "$alert_msg"
    send_alert "$alert_msg"
  fi
}

# Deliver an alert through every enabled notification channel.
# Globals:   MAIL_ENABLED, MAIL_TO, DINGTALK_ENABLED, DINGTALK_WEBHOOK (read)
# Arguments: $1 - alert message text
# Outputs:   one log() line per channel stating whether delivery succeeded
send_alert() {
  local alert_msg=$1
  local escaped_msg payload

  # Email channel
  if [ "$MAIL_ENABLED" = "true" ]; then
    if printf '%s\n' "$alert_msg" | mail -s "System Alert" "$MAIL_TO"; then
      log "Email alert sent to $MAIL_TO"
    else
      log "ERROR: Failed to send email alert to $MAIL_TO"
    fi
  fi

  # DingTalk channel
  if [ "$DINGTALK_ENABLED" = "true" ]; then
    # Escape backslashes and double quotes so the message cannot break out
    # of the JSON string it is embedded in.
    escaped_msg=$(printf '%s' "$alert_msg" | sed -e 's/\\/\\\\/g' -e 's/"/\\"/g')
    printf -v payload '{"msgtype": "text", "text": {"content": "%s"}}' "$escaped_msg"

    # --max-time keeps an unreachable webhook from stalling the whole run;
    # the response body is discarded.
    if curl -sS --max-time 10 -X POST "$DINGTALK_WEBHOOK" \
      -H 'Content-Type: application/json' \
      -d "$payload" > /dev/null; then
      log "DingTalk alert sent"
    else
      log "ERROR: Failed to send DingTalk alert"
    fi
  fi
}

# Entry point: load the configuration, then run every check once.
# Outputs: a start and an end marker through log(); each check reports
#          its own alerts.
main() {
  log "Starting monitoring..."

  # Load thresholds, alert switches and notification settings
  load_config

  # Run the individual checks
  check_cpu_usage
  check_memory_usage
  check_disk_io
  check_network
  check_disk_space

  log "Monitoring completed."
}

# 执行主函数
main
EOF

# 2. Make the monitoring script executable
[root@localhost ~]# chmod +x /usr/local/bin/monitor.sh

# 3. Create both log files and give them the same permissions
[root@localhost ~]# touch /var/log/monitoring.log /var/log/alerts.log
[root@localhost ~]# chmod 644 /var/log/monitoring.log /var/log/alerts.log

# 4. Run the monitoring script once as a test
[root@localhost ~]# /usr/local/bin/monitor.sh

# 5. Follow the monitoring log
[root@localhost ~]# tail -f /var/log/monitoring.log
[2026-04-03 10:00:00] Starting monitoring...
[2026-04-03 10:00:01] Loading configuration files...
[2026-04-03 10:00:02] Checking CPU usage...
[2026-04-03 10:00:03] Checking memory usage...
[2026-04-03 10:00:04] Checking disk I/O...
[2026-04-03 10:00:05] Checking network...
[2026-04-03 10:00:06] Checking disk space...
[2026-04-03 10:00:07] Monitoring completed.

6. 实战案例

性能监控告警实战案例。

更多视频教程www.fgedu.net.cn

# 实战案例:性能监控告警

# 1. 创建监控服务
[root@localhost ~]# cat > /etc/systemd/system/monitoring.service << 'EOF' [Unit] Description=System Monitoring After=network.target [Service] Type=oneshot ExecStart=/usr/local/bin/monitor.sh RemainAfterExit=yes [Install] WantedBy=multi-user.target EOF # 2. 创建监控定时器 [root@localhost ~]# cat > /etc/systemd/system/monitoring.timer << 'EOF' [Unit] Description=Run monitoring every 5 minutes [Timer] OnCalendar=*:0/5 Persistent=true [Install] WantedBy=timers.target EOF # 3. 启用并启动监控定时器 [root@localhost ~]# systemctl enable --now monitoring.timer Created symlink /etc/systemd/system/timers.target.wants/monitoring.timer → /etc/systemd/system/monitoring.timer. Created symlink /etc/systemd/system/monitoring.timer → /etc/systemd/system/monitoring.timer. # 4. 查看定时器状态 [root@localhost ~]# systemctl status monitoring.timer ● monitoring.timer - Run monitoring every 5 minutes Loaded: loaded (/etc/systemd/system/monitoring.timer; enabled; preset: disabled) Active: active (waiting) since Fri 2026-04-03 10:00:00 CST; 5s ago Trigger: Fri 2026-04-03 10:05:00 CST; 4min 55s left Triggers: ● monitoring.service Apr 03 10:00:00 localhost systemd[1]: Started Run monitoring every 5 minutes. # 5. 查看服务状态 [root@localhost ~]# systemctl status monitoring.service ● monitoring.service - System Monitoring Loaded: loaded (/etc/systemd/system/monitoring.service; enabled; preset: disabled) Active: inactive (dead) since Fri 2026-04-03 10:00:00 CST; 5s ago Process: 12345 ExecStart=/usr/local/bin/monitor.sh (code=exited, status=0/SUCCESS) Main PID: 12345 (code=exited, status=0/SUCCESS) Apr 03 10:00:00 localhost systemd[1]: Starting System Monitoring... Apr 03 10:00:00 localhost monitor.sh[12345]: [2026-04-03 10:00:00] Starting monitoring... Apr 03 10:00:00 localhost monitor.sh[12345]: [2026-04-03 10:00:01] Loading configuration files... Apr 03 10:00:00 localhost monitor.sh[12345]: [2026-04-03 10:00:02] Checking CPU usage... 
Apr 03 10:00:00 localhost monitor.sh[12345]: [2026-04-03 10:00:03] Checking memory usage... Apr 03 10:00:00 localhost monitor.sh[12345]: [2026-04-03 10:00:04] Checking disk I/O... Apr 03 10:00:00 localhost monitor.sh[12345]: [2026-04-03 10:00:05] Checking network... Apr 03 10:00:00 localhost monitor.sh[12345]: [2026-04-03 10:00:06] Checking disk space... Apr 03 10:00:00 localhost monitor.sh[12345]: [2026-04-03 10:00:07] Monitoring completed. Apr 03 10:00:00 localhost systemd[1]: Finished System Monitoring. # 6. 查看定时器列表 [root@localhost ~]# systemctl list-timers NEXT LEFT LAST PASSED UNIT ACTIVATES Fri 2026-04-03 10:05:00 CST 4min 55s ago Fri 2026-04-03 10:00:00 CST 5s ago monitoring.timer monitoring.service Fri 2026-04-03 10:15:00 CST 14min 55s ago Fri 2026-04-03 10:00:00 CST 5s ago systemd-tmpfiles-clean.timer systemd-tmpfiles-clean.service 1 timers listed. # 7. 手动触发监控 [root@localhost ~]# systemctl start monitoring.service # 8. 查看告警日志 [root@localhost ~]# tail -f /var/log/alerts.log [2026-04-03 10:00:00] WARNING: CPU usage is 85% (threshold: 80%) [2026-04-03 10:00:00] WARNING: Memory usage is 85% (threshold: 80%) [2026-04-03 10:00:00] WARNING: Disk usage is 85% (threshold: 80%) # 9. 
创建告警统计脚本 [root@localhost ~]# cat > /usr/local/bin/alert-stats.sh << 'EOF' #!/bin/bash # script.sh # from:www.itpux.com.qq113257174.wx:itpux-com # web: http://www.fgedu.net.cn # 告警统计脚本 ALERT_LOG="/var/log/alerts.log" STATS_FILE="/var/log/alert-stats.txt" # 统计告警数量 ALERT_COUNT=$(wc -l < $ALERT_LOG) # 统计CPU告警 CPU_ALERTS=$(grep -c "CPU usage" $ALERT_LOG) # 统计内存告警 MEMORY_ALERTS=$(grep -c "Memory usage" $ALERT_LOG) # 统计磁盘I/O告警 DISK_IO_ALERTS=$(grep -c "Disk utilization" $ALERT_LOG) # 统计网络告警 NETWORK_ALERTS=$(grep -c "Network utilization" $ALERT_LOG) # 统计磁盘空间告警 DISK_SPACE_ALERTS=$(grep -c "Disk usage" $ALERT_LOG) # 生成统计报告 cat > $STATS_FILE << EOF === Alert Statistics Report === Generated: $(date '+%Y-%m-%d %H:%M:%S') Total Alerts: $ALERT_COUNT CPU Alerts: $CPU_ALERTS Memory Alerts: $MEMORY_ALERTS Disk I/O Alerts: $DISK_IO_ALERTS Network Alerts: $NETWORK_ALERTS Disk Space Alerts: $DISK_SPACE_ALERTS EOF echo "Alert statistics report saved to $STATS_FILE" cat $STATS_FILE EOF # 10. 设置脚本执行权限 [root@localhost ~]# chmod +x /usr/local/bin/alert-stats.sh # 11. 运行告警统计 [root@localhost ~]# /usr/local/bin/alert-stats.sh Alert statistics report saved to /var/log/alert-stats.txt === Alert Statistics Report === Generated: 2026-04-03 10:00:00 Total Alerts: 3 CPU Alerts: 1 Memory Alerts: 1 Disk I/O Alerts: 0 Network Alerts: 0 Disk Space Alerts: 1

联系我们

在线咨询:点击这里给我发消息

微信号:itpux-com

工作日:9:30-18:30,节假日休息