1. 首页 > Linux教程 > 正文

Linux教程FG485-Linux性能监控实战

内容简介:本文风哥教程参考Linux官方文档、Red Hat Enterprise Linux官方文档、Ansible Automation Platform官方文档、Docker官方文档、Kubernetes官方文档和Podman官方文档等内容,详细介绍了相关技术的配置和使用方法。

风哥提示:

>

本文档介绍Linux性能监控实战案例。

Part01-Prometheus监控部署

1.1 安装Prometheus

# 创建Prometheus用户
[root@fgedu-mon ~]# useradd -r -s /bin/false prometheus

# 下载并安装Prometheus
[root@fgedu-mon ~]# wget https://github.com/prometheus/prometheus/releases/download/v2.48.0/prometheus-2.48.0.linux-amd64.tar.gz
[root@fgedu-mon ~]# tar xzf prometheus-2.48.0.linux-amd64.tar.gz
[root@fgedu-mon ~]# mv prometheus-2.48.0.linux-amd64 /usr/local/prometheus
[root@fgedu-mon ~]# chown -R prometheus:prometheus /usr/local/prometheus

# 配置Prometheus
[root@fgedu-mon ~]# cat > /usr/local/prometheus/prometheus.yml << 'EOF'
global:
  scrape_interval: 15s
  evaluation_interval: 15s

alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - localhost:9093

rule_files:
  - /usr/local/prometheus/rules/*.yml

scrape_configs:
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:9090']

  - job_name: 'node-exporter'
    static_configs:
      - targets:
          - '192.168.1.10:9100'
          - '192.168.1.11:9100'
          - '192.168.1.12:9100'
    relabel_configs:
      - source_labels: [__address__]
        target_label: instance
        regex: '([^:]+):\d+'
        replacement: '${1}'

  - job_name: 'mysql-exporter'
    static_configs:
      - targets: ['192.168.1.20:9104']

  - job_name: 'nginx-exporter'
    static_configs:
      - targets: ['192.168.1.30:9113']
EOF

# 创建告警规则
[root@fgedu-mon ~]# mkdir -p /usr/local/prometheus/rules
[root@fgedu-mon ~]# cat > /usr/local/prometheus/rules/alerts.yml << 'EOF'
groups:
  - name: node_alerts
    rules:
      - alert: NodeDown
        expr: up == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "节点不可达"
          description: "节点 {{ $labels.instance }} 已经超过1分钟不可达"

      - alert: HighCPU
        expr: 100 - (avg by(instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "CPU使用率过高"
          description: "节点 {{ $labels.instance }} CPU使用率超过80%"

      - alert: HighMemory
        expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 85
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "内存使用率过高"
          description: "节点 {{ $labels.instance }} 内存使用率超过85%"

      - alert: DiskSpaceLow
        expr: (1 - (node_filesystem_avail_bytes{fstype!="tmpfs"} / node_filesystem_size_bytes{fstype!="tmpfs"})) * 100 > 85
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "磁盘空间不足"
          description: "节点 {{ $labels.instance }} 磁盘 {{ $labels.mountpoint }} 使用率超过85%"
EOF

# 创建Systemd服务
[root@fgedu-mon ~]# cat > /etc/systemd/system/prometheus.service << 'EOF'
[Unit]
Description=Prometheus
After=network.target

[Service]
User=prometheus
Group=prometheus
Type=simple
ExecStart=/usr/local/prometheus/prometheus \
  --config.file=/usr/local/prometheus/prometheus.yml \
  --storage.tsdb.path=/usr/local/prometheus/data \
  --web.listen-address=:9090

[Install]
WantedBy=multi-user.target
EOF

[root@fgedu-mon ~]# systemctl daemon-reload
[root@fgedu-mon ~]# systemctl enable prometheus --now

Part02-Node Exporter部署

2.1 安装Node Exporter

# 在所有被监控节点安装Node Exporter
[root@fgedu-node1 ~]# wget https://github.com/prometheus/node_exporter/releases/download/v1.7.0/node_exporter-1.7.0.linux-amd64.tar.gz
[root@fgedu-node1 ~]# tar xzf node_exporter-1.7.0.linux-amd64.tar.gz
[root@fgedu-node1 ~]# mv node_exporter-1.7.0.linux-amd64/node_exporter /usr/local/bin/
[root@fgedu-node1 ~]# chmod +x /usr/local/bin/node_exporter

# 创建Systemd服务
[root@fgedu-node1 ~]# cat > /etc/systemd/system/node_exporter.service << 'EOF'
[Unit]
Description=Node Exporter
After=network.target

[Service]
Type=simple
ExecStart=/usr/local/bin/node_exporter \
  --collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/) \
  --collector.netclass.ignored-devices=^(veth.*)$$

[Install]
WantedBy=multi-user.target
EOF

[root@fgedu-node1 ~]# systemctl daemon-reload
[root@fgedu-node1 ~]# systemctl enable node_exporter --now

# 验证Node Exporter
[root@fgedu-node1 ~]# curl -s http://localhost:9100/metrics | head -20
# HELP go_gc_duration_seconds A summary of the pause duration of garbage collection cycles.
# TYPE go_gc_duration_seconds summary
go_gc_duration_seconds{quantile="0"} 0
go_gc_duration_seconds{quantile="0.25"} 0
go_gc_duration_seconds{quantile="0.5"} 0
go_gc_duration_seconds{quantile="0.75"} 0
go_gc_duration_seconds{quantile="1"} 0
go_gc_duration_seconds_sum 0
go_gc_duration_seconds_count 0

Part03-Grafana可视化

3.1 安装配置Grafana

# 安装Grafana
[root@fgedu-mon ~]# cat > /etc/yum.repos.d/grafana.repo << 'EOF'
[grafana]
name=grafana
baseurl=https://rpm.grafana.com
repo_gpgcheck=1
enabled=1
gpgcheck=1
gpgkey=https://rpm.grafana.com/gpg.key
sslverify=1
sslcacert=/etc/pki/tls/certs/ca-bundle.crt
EOF

[root@fgedu-mon ~]# yum install -y grafana
[root@fgedu-mon ~]# systemctl enable grafana-server --now

# 配置数据源
[root@fgedu-mon ~]# curl -X POST http://admin:admin@localhost:3000/api/datasources -H 'Content-Type: application/json' -d '{
  "name": "Prometheus",
  "type": "prometheus",
  "url": "http://localhost:9090",
  "access": "proxy",
  "isDefault": true
}'
{"id":1,"message":"Datasource added","name":"Prometheus"}

# 导入Node Exporter Dashboard
[root@fgedu-mon ~]# curl -X POST http://admin:admin@localhost:3000/api/dashboards/import -H 'Content-Type: application/json' -d '{
  "dashboard": {
    "id": null,
    "title": "Node Exporter Full",
    "tags": ["linux", "node"],
    "timezone": "browser",
    "panels": [
      {
        "title": "CPU Usage",
        "type": "graph",
        "targets": [
          {
            "expr": "100 - (avg by(instance) (irate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100)",
            "refId": "A"
          }
        ]
      },
      {
        "title": "Memory Usage",
        "type": "graph",
        "targets": [
          {
            "expr": "(1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100",
            "refId": "A"
          }
        ]
      }
    ]
  },
  "overwrite": true
}'

Part04-性能监控脚本

4.1 监控脚本开发

# 创建综合监控脚本
[root@fgedu-mon ~]# cat > /usr/local/bin/perf-monitor.sh << 'EOF'
#!/bin/bash
# perf-monitor.sh
# from:www.itpux.com.qq113257174.wx:itpux-com
# web: http://www.fgedu.net.cn

PROMETHEUS_URL="http://localhost:9090"
ALERT_THRESHOLD_CPU=80
ALERT_THRESHOLD_MEM=85
ALERT_THRESHOLD_DISK=90

echo "=== 系统性能监控报告 ==="
echo "监控时间: $(date)"
echo ""

# CPU使用率
echo "1. CPU使用率"
echo "----------------------------------------"
CPU_USAGE=$(curl -s "${PROMETHEUS_URL}/api/v1/query?query=100-(avg(irate(node_cpu_seconds_total{mode=\"idle\"}[5m]))*100)" | jq -r '.data.result[0].value[1]' 2>/dev/null | awk '{printf "%.1f", $1}')
echo "当前CPU使用率: ${CPU_USAGE}%"
if (( $(echo "$CPU_USAGE > $ALERT_THRESHOLD_CPU" | bc -l) )); then
    echo "警告: CPU使用率超过阈值!"
fi

echo ""
echo "2. 内存使用率"
echo "----------------------------------------"
MEM_USAGE=$(curl -s "${PROMETHEUS_URL}/api/v1/query?query=(1-(node_memory_MemAvailable_bytes/node_memory_MemTotal_bytes))*100" | jq -r '.data.result[0].value[1]' 2>/dev/null | awk '{printf "%.1f", $1}')
echo "当前内存使用率: ${MEM_USAGE}%"
if (( $(echo "$MEM_USAGE > $ALERT_THRESHOLD_MEM" | bc -l) )); then
    echo "警告: 内存使用率超过阈值!"
fi

echo ""
echo "3. 磁盘使用率"
echo "----------------------------------------"
curl -s "${PROMETHEUS_URL}/api/v1/query?query=(1-(node_filesystem_avail_bytes/node_filesystem_size_bytes))*100" | jq -r '.data.result[] | "\(.metric.instance) \(.metric.mountpoint): \(.value[1])%"' 2>/dev/null | while read line; do
    usage=$(echo $line | awk '{print $NF}' | tr -d '%')
    echo "$line"
    if (( $(echo "$usage > $ALERT_THRESHOLD_DISK" | bc -l) )); then
        echo "警告: 磁盘使用率超过阈值!"
    fi
done

echo ""
echo "4. 网络流量"
echo "----------------------------------------"
curl -s "${PROMETHEUS_URL}/api/v1/query?query=rate(node_network_receive_bytes_total[5m])*8" | jq -r '.data.result[] | "\(.metric.instance) \(.metric.device): 接收 \(.value[1]) bps"' 2>/dev/null

echo ""
echo "5. 系统负载"
echo "----------------------------------------"
curl -s "${PROMETHEUS_URL}/api/v1/query?query=node_load1" | jq -r '.data.result[] | "\(.metric.instance): 1分钟负载 \(.value[1])"' 2>/dev/null

echo ""
echo "6. 活跃告警"
echo "----------------------------------------"
curl -s "${PROMETHEUS_URL}/api/v1/alerts" | jq -r '.data.alerts[] | select(.state=="firing") | "\(.labels.alertname): \(.annotations.summary)"' 2>/dev/null || echo "无活跃告警"

echo ""
echo "=== 监控报告完成 ==="
EOF

[root@fgedu-mon ~]# chmod +x /usr/local/bin/perf-monitor.sh

# 执行监控
[root@fgedu-mon ~]# /usr/local/bin/perf-monitor.sh
=== 系统性能监控报告 ===
监控时间: Sat Apr 4 23:00:00 CST 2026

1. CPU使用率
----------------------------------------
当前CPU使用率: 25.5%

2. 内存使用率
----------------------------------------
当前内存使用率: 45.2%

3. 磁盘使用率
----------------------------------------
192.168.1.10 /: 35.5%
192.168.1.11 /: 42.3%

4. 网络流量
----------------------------------------
192.168.1.10 eth0: 接收 125000 bps

5. 系统负载
----------------------------------------
192.168.1.10: 1分钟负载 0.85

6. 活跃告警
----------------------------------------
无活跃告警

=== 监控报告完成 ===

风哥针对性能监控建议:

  • 建立完善的监控体系
  • 配置合理的告警阈值
  • 使用Grafana进行可视化
  • 定期审查监控指标
  • 建立性能基线

本文由风哥教程整理发布,仅用于学习测试使用,转载注明出处:http://www.fgedu.net.cn/10327.html

联系我们

在线咨询:点击这里给我发消息

微信号:itpux-com

工作日:9:30-18:30,节假日休息