内容简介:本文风哥教程参考Linux官方文档、Red Hat Enterprise Linux官方文档、Ansible Automation Platform官方文档、Docker官方文档、Kubernetes官方文档和Podman官方文档等内容,详细介绍了相关技术的配置和使用方法。
风哥提示:>
本文档介绍Linux性能监控实战案例。
Part01-Prometheus监控部署
1.1 安装Prometheus
[root@fgedu-mon ~]# useradd -r -s /bin/false prometheus
# 下载并安装Prometheus
[root@fgedu-mon ~]# wget https://github.com/prometheus/prometheus/releases/download/v2.48.0/prometheus-2.48.0.linux-amd64.tar.gz
[root@fgedu-mon ~]# tar xzf prometheus-2.48.0.linux-amd64.tar.gz
[root@fgedu-mon ~]# mv prometheus-2.48.0.linux-amd64 /usr/local/prometheus
[root@fgedu-mon ~]# chown -R prometheus:prometheus /usr/local/prometheus
# 配置Prometheus
[root@fgedu-mon ~]# cat > /usr/local/prometheus/prometheus.yml << 'EOF'
# Main Prometheus configuration. YAML nesting is significant: every key
# below must stay indented under its parent section.
global:
  scrape_interval: 15s
  evaluation_interval: 15s

# Alertmanager instance(s) that receive fired alerts.
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - localhost:9093

# Alerting rule files loaded at startup.
rule_files:
  - /usr/local/prometheus/rules/*.yml

scrape_configs:
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:9090']

  - job_name: 'node-exporter'
    static_configs:
      - targets:
          - '192.168.1.10:9100'
          - '192.168.1.11:9100'
          - '192.168.1.12:9100'
    # Strip the ":port" suffix so the instance label holds only the host.
    relabel_configs:
      - source_labels: [__address__]
        target_label: instance
        regex: '([^:]+):\d+'
        replacement: '${1}'

  - job_name: 'mysql-exporter'
    static_configs:
      - targets: ['192.168.1.20:9104']

  - job_name: 'nginx-exporter'
    static_configs:
      - targets: ['192.168.1.30:9113']
EOF
# 创建告警规则
[root@fgedu-mon ~]# mkdir -p /usr/local/prometheus/rules
[root@fgedu-mon ~]# cat > /usr/local/prometheus/rules/alerts.yml << 'EOF'
# Node alerting rules. The heredoc delimiter is quoted, so the
# {{ $labels.* }} templates are written literally, not expanded by the shell.
groups:
  - name: node_alerts
    rules:
      # Target has failed its scrape for one minute.
      - alert: NodeDown
        expr: up == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "节点不可达"
          description: "节点 {{ $labels.instance }} 已经超过1分钟不可达"

      # Per-instance CPU usage (100 minus idle percentage) above 80%.
      - alert: HighCPU
        expr: 100 - (avg by(instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "CPU使用率过高"
          description: "节点 {{ $labels.instance }} CPU使用率超过80%"

      # Memory usage derived from MemAvailable above 85%.
      - alert: HighMemory
        expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 85
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "内存使用率过高"
          description: "节点 {{ $labels.instance }} 内存使用率超过85%"

      # Filesystem usage above 85%, ignoring tmpfs mounts.
      - alert: DiskSpaceLow
        expr: (1 - (node_filesystem_avail_bytes{fstype!="tmpfs"} / node_filesystem_size_bytes{fstype!="tmpfs"})) * 100 > 85
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "磁盘空间不足"
          description: "节点 {{ $labels.instance }} 磁盘 {{ $labels.mountpoint }} 使用率超过85%"
EOF
# 创建Systemd服务
[root@fgedu-mon ~]# cat > /etc/systemd/system/prometheus.service << 'EOF'
[Unit]
Description=Prometheus
After=network.target
[Service]
User=prometheus
Group=prometheus
Type=simple
ExecStart=/usr/local/prometheus/prometheus \
--config.file=/usr/local/prometheus/prometheus.yml \
--storage.tsdb.path=/usr/local/prometheus/data \
--web.listen-address=:9090
[Install]
WantedBy=multi-user.target
EOF
[root@fgedu-mon ~]# systemctl daemon-reload
[root@fgedu-mon ~]# systemctl enable prometheus --now
Part02-Node Exporter部署
2.1 安装Node Exporter
[root@fgedu-node1 ~]# wget https://github.com/prometheus/node_exporter/releases/download/v1.7.0/node_exporter-1.7.0.linux-amd64.tar.gz
[root@fgedu-node1 ~]# tar xzf node_exporter-1.7.0.linux-amd64.tar.gz
[root@fgedu-node1 ~]# mv node_exporter-1.7.0.linux-amd64/node_exporter /usr/local/bin/
[root@fgedu-node1 ~]# chmod +x /usr/local/bin/node_exporter
# 创建Systemd服务
[root@fgedu-node1 ~]# cat > /etc/systemd/system/node_exporter.service << 'EOF'
[Unit]
Description=Node Exporter
After=network.target
[Service]
Type=simple
ExecStart=/usr/local/bin/node_exporter \
--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/) \
--collector.netclass.ignored-devices=^(veth.*)$$
[Install]
WantedBy=multi-user.target
EOF
[root@fgedu-node1 ~]# systemctl daemon-reload
[root@fgedu-node1 ~]# systemctl enable node_exporter --now
# 验证Node Exporter
[root@fgedu-node1 ~]# curl -s http://localhost:9100/metrics | head -20
# HELP go_gc_duration_seconds A summary of the pause duration of garbage collection cycles.
# TYPE go_gc_duration_seconds summary
go_gc_duration_seconds{quantile="0"} 0
go_gc_duration_seconds{quantile="0.25"} 0
go_gc_duration_seconds{quantile="0.5"} 0
go_gc_duration_seconds{quantile="0.75"} 0
go_gc_duration_seconds{quantile="1"} 0
go_gc_duration_seconds_sum 0
go_gc_duration_seconds_count 0
Part03-Grafana可视化
3.1 安装配置Grafana
[root@fgedu-mon ~]# cat > /etc/yum.repos.d/grafana.repo << 'EOF'
[grafana]
name=grafana
baseurl=https://rpm.grafana.com
repo_gpgcheck=1
enabled=1
gpgcheck=1
gpgkey=https://rpm.grafana.com/gpg.key
sslverify=1
sslcacert=/etc/pki/tls/certs/ca-bundle.crt
EOF
[root@fgedu-mon ~]# yum install -y grafana
[root@fgedu-mon ~]# systemctl enable grafana-server --now
# Configure the Prometheus data source through the Grafana HTTP API.
# NOTE(review): admin:admin is used as the login here - confirm it is still valid and change it after setup.
[root@fgedu-mon ~]# curl -X POST http://admin:admin@localhost:3000/api/datasources -H 'Content-Type: application/json' -d '{
  "name": "Prometheus",
  "type": "prometheus",
  "url": "http://localhost:9090",
  "access": "proxy",
  "isDefault": true
}'
{"id":1,"message":"Datasource added","name":"Prometheus"}
# Import a Node Exporter dashboard with CPU and memory usage panels.
[root@fgedu-mon ~]# curl -X POST http://admin:admin@localhost:3000/api/dashboards/import -H 'Content-Type: application/json' -d '{
  "dashboard": {
    "id": null,
    "title": "Node Exporter Full",
    "tags": ["linux", "node"],
    "timezone": "browser",
    "panels": [
      {
        "title": "CPU Usage",
        "type": "graph",
        "targets": [
          {
            "expr": "100 - (avg by(instance) (irate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100)",
            "refId": "A"
          }
        ]
      },
      {
        "title": "Memory Usage",
        "type": "graph",
        "targets": [
          {
            "expr": "(1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100",
            "refId": "A"
          }
        ]
      }
    ]
  },
  "overwrite": true
}'
Part04-性能监控脚本
4.1 监控脚本开发
[root@fgedu-mon ~]# cat > /usr/local/bin/perf-monitor.sh << 'EOF'
#!/bin/bash
# perf-monitor.sh
# from:www.itpux.com.qq113257174.wx:itpux-com
# web: http://www.fgedu.net.cn
#
# Prints a performance report by querying the Prometheus HTTP API:
# CPU, memory, disk usage, network receive rate, 1-minute load and
# firing alerts. Requires curl, jq and awk.
# Every query is best-effort: a failed query leaves its section empty
# instead of aborting the report, so no "set -e" is used on purpose.

PROMETHEUS_URL="http://localhost:9090"
ALERT_THRESHOLD_CPU=80
ALERT_THRESHOLD_MEM=85
ALERT_THRESHOLD_DISK=90
SEPARATOR="----------------------------------------"

# Run an instant PromQL query ($1) and print the raw JSON response.
# -G with --data-urlencode sends the expression URL-encoded, so curl does
# not treat the {...} and [...] parts of PromQL as URL glob patterns.
prom_query() {
  curl -sG --data-urlencode "query=$1" "${PROMETHEUS_URL}/api/v1/query"
}

# Succeed when $1 is non-empty and numerically greater than $2.
# An empty value (query failed or returned no series) never triggers a warning.
exceeds() {
  [[ -n "$1" ]] && awk -v value="$1" -v limit="$2" 'BEGIN { exit !(value + 0 > limit + 0) }'
}

echo "=== 系统性能监控报告 ==="
echo "监控时间: $(date)"
echo ""

# CPU usage
echo "1. CPU使用率"
echo "$SEPARATOR"
CPU_USAGE=$(prom_query '100-(avg(irate(node_cpu_seconds_total{mode="idle"}[5m]))*100)' \
  | jq -r '.data.result[0].value[1] // empty' 2>/dev/null \
  | awk '{printf "%.1f", $1}')
echo "当前CPU使用率: ${CPU_USAGE:-N/A}%"
if exceeds "$CPU_USAGE" "$ALERT_THRESHOLD_CPU"; then
  echo "警告: CPU使用率超过阈值!"
fi
echo ""

# Memory usage
echo "2. 内存使用率"
echo "$SEPARATOR"
MEM_USAGE=$(prom_query '(1-(node_memory_MemAvailable_bytes/node_memory_MemTotal_bytes))*100' \
  | jq -r '.data.result[0].value[1] // empty' 2>/dev/null \
  | awk '{printf "%.1f", $1}')
echo "当前内存使用率: ${MEM_USAGE:-N/A}%"
if exceeds "$MEM_USAGE" "$ALERT_THRESHOLD_MEM"; then
  echo "警告: 内存使用率超过阈值!"
fi
echo ""

# Disk usage, one line per instance and mount point
echo "3. 磁盘使用率"
echo "$SEPARATOR"
prom_query '(1-(node_filesystem_avail_bytes/node_filesystem_size_bytes))*100' \
  | jq -r '.data.result[] | "\(.metric.instance) \(.metric.mountpoint): \(.value[1])%"' 2>/dev/null \
  | while IFS= read -r line; do
      # The usage value is the last space-separated field, minus its "%".
      usage=${line##* }
      usage=${usage%\%}
      echo "$line"
      if exceeds "$usage" "$ALERT_THRESHOLD_DISK"; then
        echo "警告: 磁盘使用率超过阈值!"
      fi
    done
echo ""

# Network receive rate in bits per second
echo "4. 网络流量"
echo "$SEPARATOR"
prom_query 'rate(node_network_receive_bytes_total[5m])*8' \
  | jq -r '.data.result[] | "\(.metric.instance) \(.metric.device): 接收 \(.value[1]) bps"' 2>/dev/null
echo ""

# 1-minute load average
echo "5. 系统负载"
echo "$SEPARATOR"
prom_query 'node_load1' \
  | jq -r '.data.result[] | "\(.metric.instance): 1分钟负载 \(.value[1])"' 2>/dev/null
echo ""

# Firing alerts. jq exits 0 even when nothing matches, so the output is
# captured and tested for emptiness instead of relying on "||".
echo "6. 活跃告警"
echo "$SEPARATOR"
ACTIVE_ALERTS=$(curl -s "${PROMETHEUS_URL}/api/v1/alerts" \
  | jq -r '.data.alerts[] | select(.state=="firing") | "\(.labels.alertname): \(.annotations.summary)"' 2>/dev/null)
if [[ -n "$ACTIVE_ALERTS" ]]; then
  echo "$ACTIVE_ALERTS"
else
  echo "无活跃告警"
fi
echo ""
echo "=== 监控报告完成 ==="
EOF
[root@fgedu-mon ~]# chmod +x /usr/local/bin/perf-monitor.sh
# 执行监控
[root@fgedu-mon ~]# /usr/local/bin/perf-monitor.sh
=== 系统性能监控报告 ===
监控时间: Sat Apr 4 23:00:00 CST 2026
1. CPU使用率
----------------------------------------
当前CPU使用率: 25.5%
2. 内存使用率
----------------------------------------
当前内存使用率: 45.2%
3. 磁盘使用率
----------------------------------------
192.168.1.10 /: 35.5%
192.168.1.11 /: 42.3%
4. 网络流量
----------------------------------------
192.168.1.10 eth0: 接收 125000 bps
5. 系统负载
----------------------------------------
192.168.1.10: 1分钟负载 0.85
6. 活跃告警
----------------------------------------
无活跃告警
=== 监控报告完成 ===
- 建立完善的监控体系
- 配置合理的告警阈值
- 使用Grafana进行可视化
- 定期审查监控指标
- 建立性能基线
本文由风哥教程整理发布,仅用于学习测试使用,转载注明出处:http://www.fgedu.net.cn/10327.html
