内容简介:本文风哥教程参考Linux官方文档、Red Hat Enterprise Linux官方文档、Ansible Automation Platform官方文档、Docker官方文档、Kubernetes官方文档和Podman官方文档等内容,详细介绍了相关技术的配置和使用方法。
风哥提示:
本文档介绍Linux系统负载过高时的应急处理方法和优化策略。
Part01-系统负载检查
1.1 查看系统负载
[root@fgedu-server ~]# uptime
10:00:00 up 10 days, 2:30, 2 users, load average: 12.50, 10.20, 8.70
# 查看CPU使用情况
[root@fgedu-server ~]# top
top - 10:00:00 up 10 days, 2:30, 2 users, load average: 12.50, 10.20, 8.70
Tasks: 280 total, 5 running, 275 sleeping, 0 stopped, 0 zombie
%Cpu(s): 95.2 us, 3.8 sy, 0.0 ni, 0.5 id, 0.0 wa, 0.0 hi, 0.5 si, 0.0 st
KiB Mem : 8192000 total, 512000 free, 6144000 used, 1536000 buff/cache
KiB Swap: 4096000 total, 2048000 free, 2048000 used.
# 查看内存使用情况
[root@fgedu-server ~]# free -h
total used free shared buff/cache available
Mem: 8.0G 6.0G 500M 100M 1.5G 1.2G
Swap: 4.0G 2.0G 2.0G
# 查看磁盘I/O情况
[root@fgedu-server ~]# iostat -x 1
Linux 5.14.0-284.11.1.el9_2.x86_64 (fgedu-server) 01/15/2026 _x86_64_ (8 CPU)
avg-cpu: %user %nice %system %iowait %steal %idle
95.20 0.00 3.80 0.50 0.00 0.50
Device tps kB_read/s kB_wrtn/s kB_dscd/s kB_read kB_wrtn kB_dscd avgrq-sz avgqu-sz await r_await w_await aqu-sz %util
sda 12.5 1250.0 850.0 0.0 125000 85000 0 346.75 0.12 9.60 8.20 11.50 0.15 11.50
# 查看进程情况
[root@fgedu-server ~]# ps aux --sort=-%cpu | head -10
USER PID %CPU %MEM VSZ RSS TTY STAT START TIME COMMAND
root 1234 85.0 12.0 2456000 987652 ? R 09:30 15:30 java -jar app.jar
root 5678 65.0 8.0 1890000 654320 ? R 09:45 10:20 python3 script.py
root 9012 45.0 5.0 1234000 412345 ? R 09:50 8:15 node server.js
Part02-识别高负载原因
2.1 分析CPU高负载
[root@fgedu-server ~]# nproc
8
# 查看进程CPU使用情况
[root@fgedu-server ~]# top -b -n 1 | grep "%Cpu"
%Cpu(s): 95.2 us, 3.8 sy, 0.0 ni, 0.5 id, 0.0 wa, 0.0 hi, 0.5 si, 0.0 st
# 查看每个CPU核心的使用情况
[root@fgedu-server ~]# mpstat -P ALL 1
Linux 5.14.0-284.11.1.el9_2.x86_64 (fgedu-server) 01/15/2026 _x86_64_ (8 CPU)
10:05:00 AM CPU %usr %nice %sys %iowait %irq %soft %steal %guest %gnice %idle
10:05:01 AM all 95.20 0.00 3.80 0.50 0.00 0.50 0.00 0.00 0.00 0.00
10:05:01 AM 0 98.00 0.00 2.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
10:05:01 AM 1 96.00 0.00 4.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
10:05:01 AM 2 94.00 0.00 5.00 1.00 0.00 0.00 0.00 0.00 0.00 0.00
10:05:01 AM 3 95.00 0.00 3.00 1.00 0.00 1.00 0.00 0.00 0.00 0.00
10:05:01 AM 4 97.00 0.00 2.00 0.00 0.00 1.00 0.00 0.00 0.00 0.00
10:05:01 AM 5 93.00 0.00 5.00 1.00 0.00 1.00 0.00 0.00 0.00 0.00
10:05:01 AM 6 96.00 0.00 3.00 0.00 0.00 1.00 0.00 0.00 0.00 0.00
10:05:01 AM 7 94.00 0.00 4.00 1.00 0.00 1.00 0.00 0.00 0.00 0.00
# 查看进程线程情况
[root@fgedu-server ~]# ps -eLf | grep java | wc -l
25
# 查看占用CPU的线程
[root@fgedu-server ~]# top -H -p 1234
2.2 分析内存高负载
[root@fgedu-server ~]# cat /proc/meminfo
MemTotal: 8388608 kB
MemFree: 524288 kB
MemAvailable: 1258291 kB
Buffers: 262144 kB
Cached: 1291264 kB
SwapCached: 131072 kB
Active: 6291456 kB
Inactive: 1048576 kB
SwapTotal: 4194304 kB
SwapFree: 2097152 kB
# 查看进程内存使用情况
[root@fgedu-server ~]# ps aux --sort=-%mem | head -10
USER PID %CPU %MEM VSZ RSS TTY STAT START TIME COMMAND
root 1234 85.0 12.0 2456000 987652 ? R 09:30 15:30 java -jar app.jar
root 5678 65.0 8.0 1890000 654320 ? R 09:45 10:20 python3 script.py
root 9012 45.0 5.0 1234000 412345 ? R 09:50 8:15 node server.js
# 查看内存映射
[root@fgedu-server ~]# pmap -x 1234 | head -20
1234: java -jar app.jar
Address Kbytes RSS Dirty Mode Mapping
0000000000400000 4 4 0 r-x-- java
0000000000600000 4 4 4 r---- java
0000000000601000 4 4 4 rw--- java
0000000001c00000 20480 20480 20480 rw--- [ anon ]
00007f0000000000 65536 65536 65536 rw--- [ anon ]
00007f0004000000 65536 65536 65536 rw--- [ anon ]
Part03-应急处理措施
3.1 终止高负载进程
[root@fgedu-server ~]# kill -9 1234
# 终止占用内存最高的进程
[root@fgedu-server ~]# kill -9 1234
# 批量终止相关进程
[root@fgedu-server ~]# pkill -9 java
[root@fgedu-server ~]# pkill -9 python3
# 终止僵尸进程
[root@fgedu-server ~]# ps aux | grep defunct | awk '{print $2}' | xargs kill -9
3.2 调整系统参数
[root@fgedu-server ~]# renice -n 10 1234
# 限制进程CPU使用
[root@fgedu-server ~]# cpulimit -p 1234 -l 50
# 调整内存使用
[root@fgedu-server ~]# echo 1 > /proc/sys/vm/drop_caches
[root@fgedu-server ~]# echo 2 > /proc/sys/vm/drop_caches
[root@fgedu-server ~]# echo 3 > /proc/sys/vm/drop_caches
# 调整交换分区使用
[root@fgedu-server ~]# sysctl -w vm.swappiness=10
[root@fgedu-server ~]# sysctl -w vm.vfs_cache_pressure=50
# 调整最大文件描述符
[root@fgedu-server ~]# ulimit -n 65535
[root@fgedu-server ~]# echo "* soft nofile 65535" >> /etc/security/limits.conf
[root@fgedu-server ~]# echo "* hard nofile 65535" >> /etc/security/limits.conf
Part04-服务重启与恢复
4.1 重启关键服务
[root@fgedu-server ~]# systemctl restart network
# 重启Web服务
[root@fgedu-server ~]# systemctl restart nginx
[root@fgedu-server ~]# systemctl restart httpd
# 重启数据库服务
[root@fgedu-server ~]# systemctl restart mysql
[root@fgedu-server ~]# systemctl restart postgresql
# 重启应用服务
[root@fgedu-server ~]# systemctl restart app.service
# 检查服务状态
[root@fgedu-server ~]# systemctl status nginx
[root@fgedu-server ~]# systemctl status mysql
Part05-系统优化
5.1 系统参数优化
[root@fgedu-server ~]# cat > /etc/sysctl.d/99-custom.conf << 'EOF'
# CPU优化
kernel.sched_autogroup_enabled=0
# 内存优化
vm.swappiness=10
vm.vfs_cache_pressure=50
vm.overcommit_memory=1
vm.overcommit_ratio=50
# 网络优化
net.core.somaxconn=65535
net.ipv4.tcp_max_syn_backlog=65535
net.ipv4.tcp_fin_timeout=30
net.ipv4.tcp_keepalive_time=1200
net.ipv4.tcp_max_tw_buckets=5000
net.ipv4.tcp_fastopen=3
# 文件系统优化
fs.file-max=655350
EOF
# 应用系统参数
[root@fgedu-server ~]# sysctl -p /etc/sysctl.d/99-custom.conf
# 优化进程调度
[root@fgedu-server ~]# echo "GOVERNOR=performance" > /etc/sysconfig/cpupower
[root@fgedu-server ~]# systemctl restart cpupower
Part06-监控与预防
6.1 负载监控脚本
[root@fgedu-server ~]# cat > /usr/local/bin/load-monitor.sh << 'EOF'
#!/bin/bash
# load-monitor.sh
# from:www.itpux.com.qq113257174.wx:itpux-com
# web: http://www.fgedu.net.cn
ALERT_EMAIL="admin@fgedu.net.cn"
CPU_THRESHOLD=80
MEM_THRESHOLD=80
LOAD_THRESHOLD=4
# 检查系统负载
LOAD=$(uptime | awk '{print $10}' | sed 's/,//')
CPU_USAGE=$(top -b -n 1 | grep "%Cpu" | awk '{print 100-$8}')
MEM_USAGE=$(free | awk '/Mem/{print $3/$2*100}')
if (( $(echo "$LOAD > $LOAD_THRESHOLD" | bc -l) )); then
echo "告警: 系统负载过高 ($LOAD)"
echo "系统负载达到 $LOAD,超过阈值 $LOAD_THRESHOLD" | mail -s "告警: 系统负载过高" $ALERT_EMAIL
fi
if (( $(echo "$CPU_USAGE > $CPU_THRESHOLD" | bc -l) )); then
echo "告警: CPU使用率过高 ($CPU_USAGE%)"
echo "CPU使用率达到 $CPU_USAGE%,超过阈值 $CPU_THRESHOLD%" | mail -s "告警: CPU使用率过高" $ALERT_EMAIL
fi
if (( $(echo "$MEM_USAGE > $MEM_THRESHOLD" | bc -l) )); then
echo "告警: 内存使用率过高 ($MEM_USAGE%)"
echo "内存使用率达到 $MEM_USAGE%,超过阈值 $MEM_THRESHOLD%" | mail -s "告警: 内存使用率过高" $ALERT_EMAIL
fi
echo "负载监控完成: $(date)"
EOF
[root@fgedu-server ~]# chmod +x /usr/local/bin/load-monitor.sh
# 配置定时监控
[root@fgedu-server ~]# cat > /etc/cron.d/load-monitor << 'EOF'
# 系统负载监控
* * * * * root /usr/local/bin/load-monitor.sh
EOF
# 配置自动清理任务
[root@fgedu-server ~]# cat > /usr/local/bin/auto-maintenance.sh << 'EOF'
#!/bin/bash
# auto-maintenance.sh
# from:www.itpux.com.qq113257174.wx:itpux-com
# web: http://www.fgedu.net.cn
# 清理系统缓存
echo "清理系统缓存..."
sync && echo 3 > /proc/sys/vm/drop_caches
# 清理临时文件
echo "清理临时文件..."
find /tmp -type f -mtime +7 -delete
find /var/tmp -type f -mtime +7 -delete
# 检查并重启异常服务
echo "检查服务状态..."
services=("nginx" "httpd" "mysql" "postgresql")
for service in "${services[@]}"; do
systemctl is-active $service > /dev/null 2>&1
if [ $? -ne 0 ]; then
echo "重启服务: $service"
systemctl restart $service
fi
done
echo "自动维护完成: $(date)"
EOF
[root@fgedu-server ~]# chmod +x /usr/local/bin/auto-maintenance.sh
# 配置定时维护
[root@fgedu-server ~]# cat > /etc/cron.d/auto-maintenance << 'EOF'
# 自动维护任务
0 3 * * * root /usr/local/bin/auto-maintenance.sh
EOF
- 定期监控系统负载和资源使用情况
- 优化应用程序代码和配置
- 合理配置系统参数和资源限制
- 实施负载均衡和水平扩展
- 定期清理系统垃圾和临时文件
- 建立完善的监控告警机制
- 制定应急响应预案
本文由风哥教程整理发布,仅用于学习测试使用,转载注明出处:http://www.fgedu.net.cn/10327.html
