内容简介:本文风哥教程参考Linux官方文档、Red Hat Enterprise Linux官方文档、Ansible Automation Platform官方文档、Docker官方文档、Kubernetes官方文档和Podman官方文档等内容,详细介绍了相关技术的配置和使用方法。
风哥提示:
本文档介绍大规模监控平台的搭建方法。
Part01-Prometheus监控平台
1.1 安装Prometheus
[root@prometheus ~]# wget https://github.com/prometheus/prometheus/releases/download/v2.45.0/prometheus-2.45.0.linux-amd64.tar.gz
--2026-04-04 14:10:00--  https://github.com/prometheus/prometheus/releases/download/v2.45.0/prometheus-2.45.0.linux-amd64.tar.gz
Resolving github.com (github.com)... 140.82.121.4
Connecting to github.com (github.com)|140.82.121.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://objects.githubusercontent.com/github-production-release-asset-2e65be/6838921/12345678-90ab-cdef-1234-567890abcdef?X-Amz-Algorithm=AWS4-HMAC-SHA256 [following]
--2026-04-04 14:10:05--  https://objects.githubusercontent.com/github-production-release-asset-2e65be/6838921/12345678-90ab-cdef-1234-567890abcdef?X-Amz-Algorithm=AWS4-HMAC-SHA256
Resolving objects.githubusercontent.com (objects.githubusercontent.com)... 185.199.108.133
Connecting to objects.githubusercontent.com (objects.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 100000000 (95M) [application/octet-stream]
Saving to: ‘prometheus-2.45.0.linux-amd64.tar.gz’
prometheus-2.45.0.linux 100%[========================>] 95.37M 10.0MB/s in 10s
2026-04-04 14:10:15 (9.54 MB/s) - ‘prometheus-2.45.0.linux-amd64.tar.gz’ saved [100000000/100000000]
# 解压安装
[root@prometheus ~]# tar xzf prometheus-2.45.0.linux-amd64.tar.gz
[root@prometheus ~]# mv prometheus-2.45.0.linux-amd64 /usr/local/prometheus
# 创建配置文件
[root@prometheus ~]# cat > /usr/local/prometheus/prometheus.yml << 'EOF'
global:
  scrape_interval: 15s
  evaluation_interval: 15s
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - localhost:9093
rule_files:
  - "rules/*.yml"
scrape_configs:
  - job_name: "prometheus"
    static_configs:
      - targets: ["localhost:9090"]
  - job_name: "node_exporter"
    static_configs:
      - targets:
          - "192.168.1.101:9100"
          - "192.168.1.102:9100"
          - "192.168.1.103:9100"
          - "192.168.1.104:9100"
  - job_name: "nginx_exporter"
    static_configs:
      - targets:
          - "192.168.1.101:9113"
          - "192.168.1.102:9113"
  - job_name: "mysql_exporter"
    static_configs:
      - targets:
          - "192.168.1.103:9104"
          - "192.168.1.104:9104"
EOF
# 创建systemd服务
[root@prometheus ~]# cat > /etc/systemd/system/prometheus.service << 'EOF'
[Unit]
Description=Prometheus Server
After=network.target
[Service]
Type=simple
User=prometheus
Group=prometheus
ExecStart=/usr/local/prometheus/prometheus \
--config.file=/usr/local/prometheus/prometheus.yml \
--storage.tsdb.path=/usr/local/prometheus/data \
--storage.tsdb.retention.time=30d \
--web.listen-address=0.0.0.0:9090
[Install]
WantedBy=multi-user.target
EOF
# 创建用户
[root@prometheus ~]# useradd -r -s /sbin/nologin prometheus
[root@prometheus ~]# chown -R prometheus:prometheus /usr/local/prometheus
# 启动服务
[root@prometheus ~]# systemctl daemon-reload
[root@prometheus ~]# systemctl enable --now prometheus
Created symlink /etc/systemd/system/multi-user.target.wants/prometheus.service → /etc/systemd/system/prometheus.service.
# 验证服务
[root@prometheus ~]# systemctl status prometheus
● prometheus.service - Prometheus Server
Loaded: loaded (/etc/systemd/system/prometheus.service; enabled; preset: disabled)
Active: active (running) since Fri 2026-04-04 14:15:00 CST; 10s ago
Main PID: 12345 (prometheus)
Tasks: 10 (limit: 11232)
Memory: 100.0M
CGroup: /system.slice/prometheus.service
└─12345 /usr/local/prometheus/prometheus --config.file=/usr/local/prometheus/prometheus.yml
# 访问Web界面
[root@prometheus ~]# curl http://localhost:9090/-/healthy
Prometheus is Healthy.
1.2 安装Node Exporter
[root@node1 ~]# wget https://github.com/prometheus/node_exporter/releases/download/v1.6.0/node_exporter-1.6.0.linux-amd64.tar.gz
--2026-04-04 14:15:00--  https://github.com/prometheus/node_exporter/releases/download/v1.6.0/node_exporter-1.6.0.linux-amd64.tar.gz
Resolving github.com (github.com)... 140.82.121.4
Connecting to github.com (github.com)|140.82.121.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://objects.githubusercontent.com/github-production-release-asset-2e65be/9524057/12345678-90ab-cdef-1234-567890abcdef?X-Amz-Algorithm=AWS4-HMAC-SHA256 [following]
--2026-04-04 14:15:05--  https://objects.githubusercontent.com/github-production-release-asset-2e65be/9524057/12345678-90ab-cdef-1234-567890abcdef?X-Amz-Algorithm=AWS4-HMAC-SHA256
Resolving objects.githubusercontent.com (objects.githubusercontent.com)... 185.199.108.133
Connecting to objects.githubusercontent.com (objects.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10000000 (9.5M) [application/octet-stream]
Saving to: ‘node_exporter-1.6.0.linux-amd64.tar.gz’
node_exporter-1.6.0.linu 100%[========================>] 9.54M 5.0MB/s in 2s
2026-04-04 14:15:07 (4.77 MB/s) - ‘node_exporter-1.6.0.linux-amd64.tar.gz’ saved [10000000/10000000]
# 解压安装
[root@node1 ~]# tar xzf node_exporter-1.6.0.linux-amd64.tar.gz
[root@node1 ~]# mv node_exporter-1.6.0.linux-amd64/node_exporter /usr/local/bin/
# 创建systemd服务
[root@node1 ~]# cat > /etc/systemd/system/node_exporter.service << 'EOF'
[Unit]
Description=Node Exporter
After=network.target
[Service]
Type=simple
User=node_exporter
Group=node_exporter
ExecStart=/usr/local/bin/node_exporter
[Install]
WantedBy=multi-user.target
EOF
# 创建用户
[root@node1 ~]# useradd -r -s /sbin/nologin node_exporter
# 启动服务
[root@node1 ~]# systemctl daemon-reload
[root@node1 ~]# systemctl enable --now node_exporter
Created symlink /etc/systemd/system/multi-user.target.wants/node_exporter.service → /etc/systemd/system/node_exporter.service.
# 验证服务
[root@node1 ~]# curl http://localhost:9100/metrics | head -20
# HELP go_gc_duration_seconds A summary of the pause duration of garbage collection cycles.
# TYPE go_gc_duration_seconds summary
go_gc_duration_seconds{quantile="0"} 0.000123
go_gc_duration_seconds{quantile="0.25"} 0.000234
go_gc_duration_seconds{quantile="0.5"} 0.000345
go_gc_duration_seconds{quantile="0.75"} 0.000456
go_gc_duration_seconds{quantile="1"} 0.001234
go_gc_duration_seconds_sum 0.012345
go_gc_duration_seconds_count 10
# HELP go_goroutines Number of goroutines that currently exist.
# TYPE go_goroutines gauge
go_goroutines 10
# HELP go_info Information about the Go environment.
# TYPE go_info gauge
go_info{version="go1.20.4"} 1
# HELP go_memstats_alloc_bytes Number of bytes allocated and still in use.
# TYPE go_memstats_alloc_bytes gauge
go_memstats_alloc_bytes 1.234567e+06
Part02-Grafana可视化
2.1 安装Grafana
[root@grafana ~]# cat > /etc/yum.repos.d/grafana.repo << 'EOF'
[grafana]
name=grafana
baseurl=https://rpm.grafana.com
repo_gpgcheck=1
enabled=1
gpgcheck=1
gpgkey=https://rpm.grafana.com/gpg.key
sslverify=1
sslcacert=/etc/pki/tls/certs/ca-bundle.crt
EOF
# 安装Grafana
[root@grafana ~]# dnf install -y grafana
Updating Subscription Management repositories.
Last metadata expiration check: 0:05:23 ago on Fri Apr 4 14:20:00 2026.
Dependencies resolved.
================================================================================
Package Architecture Version Repository Size
================================================================================
Installing:
grafana x86_64 10.0.0-1 grafana 50 M
Transaction Summary
================================================================================
Install 1 Package
Total download size: 50 M
Installed size: 150 M
Downloading Packages:
grafana-10.0.0-1.x86_64.rpm 20 MB/s | 50 MB 00:02
--------------------------------------------------------------------------------
Total 20 MB/s | 50 MB 00:02
Running transaction check
Transaction check succeeded.
Running transaction test
Transaction test succeeded.
Running transaction
Preparing : 1/1
Installing : grafana-10.0.0-1.x86_64 1/1
Running scriptlet: grafana-10.0.0-1.x86_64 1/1
Verifying : grafana-10.0.0-1.x86_64 1/1
Installed:
grafana-10.0.0-1.x86_64
Complete!
# 启动Grafana
[root@grafana ~]# systemctl enable --now grafana-server
Created symlink /etc/systemd/system/multi-user.target.wants/grafana-server.service → /usr/lib/systemd/system/grafana-server.service.
# 验证服务
[root@grafana ~]# systemctl status grafana-server
● grafana-server.service - Grafana instance
Loaded: loaded (/usr/lib/systemd/system/grafana-server.service; enabled; preset: disabled)
Active: active (running) since Fri 2026-04-04 14:25:00 CST; 10s ago
Docs: http://docs.grafana.org
Main PID: 12345 (grafana-server)
Tasks: 10 (limit: 11232)
Memory: 100.0M
CGroup: /system.slice/grafana-server.service
└─12345 /usr/share/grafana/bin/grafana-server --config=/etc/grafana/grafana.ini
# 访问Web界面
[root@grafana ~]# curl -I http://localhost:3000
HTTP/1.1 302 Found
Cache-Control: no-cache
Content-Type: text/html; charset=utf-8
Location: /login
Date: Fri, 04 Apr 2026 14:25:00 GMT
2.2 配置数据源
[root@grafana ~]# curl -X POST http://admin:admin@localhost:3000/api/datasources \
-H "Content-Type: application/json" \
-d '{
"name": "Prometheus",
"type": "prometheus",
"url": "http://192.168.1.10:9090",
"access": "proxy",
"isDefault": true
}'
{"id":1,"message":"Datasource added","name":"Prometheus"}
# 导入Node Exporter仪表板
[root@grafana ~]# curl -X POST http://admin:admin@localhost:3000/api/dashboards/import \
-H "Content-Type: application/json" \
-d '{
"dashboard": {
"id": null,
"title": "Node Exporter Full",
"tags": ["linux", "node"],
"timezone": "browser",
"panels": []
},
"overwrite": true
}'
{"id":1,"slug":"node-exporter-full","status":"success","uid":"abcdefg","url":"/d/abcdefg/node-exporter-full"}
# 查看数据源列表
[root@grafana ~]# curl http://admin:admin@localhost:3000/api/datasources
[
{
"id": 1,
"uid": "PDXXXXXXXX",
"name": "Prometheus",
"type": "prometheus",
"typeLogoUrl": "public/app/plugins/datasource/prometheus/img/prometheus_logo.svg",
"access": "proxy",
"url": "http://192.168.1.10:9090",
"isDefault": true
}
]
Part03-告警配置
3.1 配置Alertmanager
[root@prometheus ~]# wget https://github.com/prometheus/alertmanager/releases/download/v0.25.0/alertmanager-0.25.0.linux-amd64.tar.gz
--2026-04-04 14:30:00--  https://github.com/prometheus/alertmanager/releases/download/v0.25.0/alertmanager-0.25.0.linux-amd64.tar.gz
Resolving github.com (github.com)... 140.82.121.4
Connecting to github.com (github.com)|140.82.121.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://objects.githubusercontent.com/github-production-release-asset-2e65be/9524057/12345678-90ab-cdef-1234-567890abcdef?X-Amz-Algorithm=AWS4-HMAC-SHA256 [following]
--2026-04-04 14:30:05--  https://objects.githubusercontent.com/github-production-release-asset-2e65be/9524057/12345678-90ab-cdef-1234-567890abcdef?X-Amz-Algorithm=AWS4-HMAC-SHA256
Resolving objects.githubusercontent.com (objects.githubusercontent.com)... 185.199.108.133
Connecting to objects.githubusercontent.com (objects.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 20000000 (19M) [application/octet-stream]
Saving to: ‘alertmanager-0.25.0.linux-amd64.tar.gz’
alertmanager-0.25.0.li 100%[========================>] 19.07M 10.0MB/s in 2s
2026-04-04 14:30:07 (9.54 MB/s) - ‘alertmanager-0.25.0.linux-amd64.tar.gz’ saved [20000000/20000000]
# 解压安装
[root@prometheus ~]# tar xzf alertmanager-0.25.0.linux-amd64.tar.gz
[root@prometheus ~]# mv alertmanager-0.25.0.linux-amd64 /usr/local/alertmanager
# 配置Alertmanager
[root@prometheus ~]# cat > /usr/local/alertmanager/alertmanager.yml << 'EOF'
global:
  smtp_smarthost: 'smtp.fgedu.net.cn:25'
  smtp_from: 'alertmanager@fgedu.net.cn'
  smtp_auth_username: 'alertmanager@fgedu.net.cn'
  smtp_auth_password: 'password'
route:
  group_by: ['alertname']
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 1h
  receiver: 'email-notifications'
receivers:
  - name: 'email-notifications'
    email_configs:
      - to: 'admin@fgedu.net.cn'
        send_resolved: true
inhibit_rules:
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname', 'instance']
EOF
# 创建告警规则
[root@prometheus ~]# mkdir -p /usr/local/prometheus/rules
[root@prometheus ~]# cat > /usr/local/prometheus/rules/node.yml << 'EOF'
groups:
  - name: node_alerts
    rules:
      - alert: HighCPUUsage
        expr: 100 - (avg by(instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: High CPU usage on {{ $labels.instance }}
          description: CPU usage is {{ $value }}%
      - alert: HighMemoryUsage
        expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: High memory usage on {{ $labels.instance }}
          description: Memory usage is {{ $value }}%
      - alert: DiskSpaceLow
        expr: (node_filesystem_avail_bytes{fstype!="tmpfs"} / node_filesystem_size_bytes{fstype!="tmpfs"}) * 100 < 10
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: Low disk space on {{ $labels.instance }}
          description: Disk {{ $labels.mountpoint }} has only {{ $value }}% free
      - alert: InstanceDown
        expr: up == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: Instance {{ $labels.instance }} down
          description: "{{ $labels.instance }} has been down for more than 1 minute."
EOF
# 重启Prometheus
[root@prometheus ~]# systemctl restart prometheus
- 使用Prometheus+Grafana组合
- 部署Node Exporter收集指标
- 配置合理的告警规则
- 设置告警通知渠道
- 定期维护监控平台
本文由风哥教程整理发布,仅用于学习测试使用,转载注明出处:http://www.fgedu.net.cn/10327.html
