1. 首页 > Linux教程 > 正文

Linux教程FG334-大规模监控平台搭建

内容简介:本文风哥教程参考Linux官方文档、Red Hat Enterprise Linux官方文档、Ansible Automation Platform官方文档、Docker官方文档、Kubernetes官方文档和Podman官方文档等内容,详细介绍了Prometheus、Node Exporter、Grafana和Alertmanager的安装、配置和使用方法。

风哥提示:

本文档介绍大规模监控平台的搭建方法。

Part01-Prometheus监控平台

1.1 安装Prometheus

# 下载Prometheus
[root@prometheus ~]# wget https://github.com/prometheus/prometheus/releases/download/v2.45.0/prometheus-2.45.0.linux-amd64.tar.gz
–2026-04-04 14:10:00– https://github.com/prometheus/prometheus/releases/download/v2.45.0/prometheus-2.45.0.linux-amd64.tar.gz
Resolving github.com (github.com)… 140.82.121.4
Connecting to github.com (github.com)|140.82.121.4|:443… connected.
HTTP request sent, awaiting response… 302 Found
Location: https://objects.githubusercontent.com/github-production-release-asset-2e65be/6838921/12345678-90ab-cdef-1234-567890abcdef?X-Amz-Algorithm=AWS4-HMAC-SHA256 [following]
–2026-04-04 14:10:05– https://objects.githubusercontent.com/github-production-release-asset-2e65be/6838921/12345678-90ab-cdef-1234-567890abcdef?X-Amz-Algorithm=AWS4-HMAC-SHA256
Resolving objects.githubusercontent.com (objects.githubusercontent.com)… 185.199.108.133
Connecting to objects.githubusercontent.com (objects.githubusercontent.com)|185.199.108.133|:443… connected.
HTTP request sent, awaiting response… 200 OK
Length: 100000000 (95M) [application/octet-stream]
Saving to: ‘prometheus-2.45.0.linux-amd64.tar.gz’

prometheus-2.45.0.linux 100%[========================>] 95.37M 10.0MB/s in 10s

2026-04-04 14:10:15 (9.54 MB/s) – ‘prometheus-2.45.0.linux-amd64.tar.gz’ saved [100000000/100000000]

# 解压安装
[root@prometheus ~]# tar xzf prometheus-2.45.0.linux-amd64.tar.gz
[root@prometheus ~]# mv prometheus-2.45.0.linux-amd64 /usr/local/prometheus

# 创建配置文件
[root@prometheus ~]# cat > /usr/local/prometheus/prometheus.yml << 'EOF'
global:
  scrape_interval: 15s
  evaluation_interval: 15s

alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - localhost:9093

rule_files:
  - "rules/*.yml"

scrape_configs:
  - job_name: "prometheus"
    static_configs:
      - targets: ["localhost:9090"]

  - job_name: "node_exporter"
    static_configs:
      - targets:
          - "192.168.1.101:9100"
          - "192.168.1.102:9100"
          - "192.168.1.103:9100"
          - "192.168.1.104:9100"

  - job_name: "nginx_exporter"
    static_configs:
      - targets:
          - "192.168.1.101:9113"
          - "192.168.1.102:9113"

  - job_name: "mysql_exporter"
    static_configs:
      - targets:
          - "192.168.1.103:9104"
          - "192.168.1.104:9104"
EOF

# 创建systemd服务
[root@prometheus ~]# cat > /etc/systemd/system/prometheus.service << 'EOF'
[Unit]
Description=Prometheus Server
After=network.target

[Service]
Type=simple
User=prometheus
Group=prometheus
ExecStart=/usr/local/prometheus/prometheus \
  --config.file=/usr/local/prometheus/prometheus.yml \
  --storage.tsdb.path=/usr/local/prometheus/data \
  --storage.tsdb.retention.time=30d \
  --web.listen-address=0.0.0.0:9090

[Install]
WantedBy=multi-user.target
EOF

# 创建用户
[root@prometheus ~]# useradd -r -s /sbin/nologin prometheus
[root@prometheus ~]# chown -R prometheus:prometheus /usr/local/prometheus

# 启动服务
[root@prometheus ~]# systemctl daemon-reload
[root@prometheus ~]# systemctl enable --now prometheus
Created symlink /etc/systemd/system/multi-user.target.wants/prometheus.service → /etc/systemd/system/prometheus.service.

# 验证服务
[root@prometheus ~]# systemctl status prometheus
● prometheus.service - Prometheus Server
     Loaded: loaded (/etc/systemd/system/prometheus.service; enabled; preset: disabled)
     Active: active (running) since Fri 2026-04-04 14:15:00 CST; 10s ago
   Main PID: 12345 (prometheus)
      Tasks: 10 (limit: 11232)
     Memory: 100.0M
     CGroup: /system.slice/prometheus.service
             └─12345 /usr/local/prometheus/prometheus --config.file=/usr/local/prometheus/prometheus.yml

# 访问Web界面
[root@prometheus ~]# curl http://localhost:9090/-/healthy
Prometheus is Healthy.

1.2 安装Node Exporter

# 在所有节点安装Node Exporter
[root@node1 ~]# wget https://github.com/prometheus/node_exporter/releases/download/v1.6.0/node_exporter-1.6.0.linux-amd64.tar.gz
–2026-04-04 14:15:00– https://github.com/prometheus/node_exporter/releases/download/v1.6.0/node_exporter-1.6.0.linux-amd64.tar.gz
Resolving github.com (github.com)… 140.82.121.4
Connecting to github.com (github.com)|140.82.121.4|:443… connected.
HTTP request sent, awaiting response… 302 Found
Location: https://objects.githubusercontent.com/github-production-release-asset-2e65be/9524057/12345678-90ab-cdef-1234-567890abcdef?X-Amz-Algorithm=AWS4-HMAC-SHA256 [following]
–2026-04-04 14:15:05– https://objects.githubusercontent.com/github-production-release-asset-2e65be/9524057/12345678-90ab-cdef-1234-567890abcdef?X-Amz-Algorithm=AWS4-HMAC-SHA256
Resolving objects.githubusercontent.com (objects.githubusercontent.com)… 185.199.108.133
Connecting to objects.githubusercontent.com (objects.githubusercontent.com)|185.199.108.133|:443… connected.
HTTP request sent, awaiting response… 200 OK
Length: 10000000 (9.5M) [application/octet-stream]
Saving to: ‘node_exporter-1.6.0.linux-amd64.tar.gz’

node_exporter-1.6.0.linu 100%[========================>] 9.54M 5.0MB/s in 2s

2026-04-04 14:15:07 (4.77 MB/s) – ‘node_exporter-1.6.0.linux-amd64.tar.gz’ saved [10000000/10000000]

# 解压安装
[root@node1 ~]# tar xzf node_exporter-1.6.0.linux-amd64.tar.gz
[root@node1 ~]# mv node_exporter-1.6.0.linux-amd64/node_exporter /usr/local/bin/

# 创建systemd服务
[root@node1 ~]# cat > /etc/systemd/system/node_exporter.service << 'EOF'
[Unit]
Description=Node Exporter
After=network.target

[Service]
Type=simple
User=node_exporter
Group=node_exporter
ExecStart=/usr/local/bin/node_exporter

[Install]
WantedBy=multi-user.target
EOF

# 创建用户
[root@node1 ~]# useradd -r -s /sbin/nologin node_exporter

# 启动服务
[root@node1 ~]# systemctl daemon-reload
[root@node1 ~]# systemctl enable --now node_exporter
Created symlink /etc/systemd/system/multi-user.target.wants/node_exporter.service → /etc/systemd/system/node_exporter.service.

# 验证服务
[root@node1 ~]# curl http://localhost:9100/metrics | head -20
# HELP go_gc_duration_seconds A summary of the pause duration of garbage collection cycles.
# TYPE go_gc_duration_seconds summary
go_gc_duration_seconds{quantile="0"} 0.000123
go_gc_duration_seconds{quantile="0.25"} 0.000234
go_gc_duration_seconds{quantile="0.5"} 0.000345
go_gc_duration_seconds{quantile="0.75"} 0.000456
go_gc_duration_seconds{quantile="1"} 0.001234
go_gc_duration_seconds_sum 0.012345
go_gc_duration_seconds_count 10
# HELP go_goroutines Number of goroutines that currently exist.
# TYPE go_goroutines gauge
go_goroutines 10
# HELP go_info Information about the Go environment.
# TYPE go_info gauge
go_info{version="go1.20.4"} 1
# HELP go_memstats_alloc_bytes Number of bytes allocated and still in use.
# TYPE go_memstats_alloc_bytes gauge
go_memstats_alloc_bytes 1.234567e+06

Part02-Grafana可视化

2.1 安装Grafana

# 添加Grafana仓库
[root@grafana ~]# cat > /etc/yum.repos.d/grafana.repo << 'EOF'
[grafana]
name=grafana
baseurl=https://rpm.grafana.com
repo_gpgcheck=1
enabled=1
gpgcheck=1
gpgkey=https://rpm.grafana.com/gpg.key
sslverify=1
sslcacert=/etc/pki/tls/certs/ca-bundle.crt
EOF

# 安装Grafana
[root@grafana ~]# dnf install -y grafana
Updating Subscription Management repositories.
Last metadata expiration check: 0:05:23 ago on Fri Apr 4 14:20:00 2026.
Dependencies resolved.
================================================================================
 Package          Architecture    Version           Repository          Size
================================================================================
Installing:
 grafana          x86_64          10.0.0-1          grafana             50 M

Transaction Summary
================================================================================
Install  1 Package

Total download size: 50 M
Installed size: 150 M
Downloading Packages:
grafana-10.0.0-1.x86_64.rpm                      20 MB/s |  50 MB     00:02
--------------------------------------------------------------------------------
Total                                            20 MB/s |  50 MB     00:02
Running transaction check
Transaction check succeeded.
Running transaction test
Transaction test succeeded.
Running transaction
  Preparing        :                                                        1/1
  Installing       : grafana-10.0.0-1.x86_64                                1/1
  Running scriptlet: grafana-10.0.0-1.x86_64                                1/1
  Verifying        : grafana-10.0.0-1.x86_64                                1/1

Installed:
  grafana-10.0.0-1.x86_64

Complete!

# 启动Grafana
[root@grafana ~]# systemctl enable --now grafana-server
Created symlink /etc/systemd/system/multi-user.target.wants/grafana-server.service → /usr/lib/systemd/system/grafana-server.service.

# 验证服务
[root@grafana ~]# systemctl status grafana-server
● grafana-server.service - Grafana instance
     Loaded: loaded (/usr/lib/systemd/system/grafana-server.service; enabled; preset: disabled)
     Active: active (running) since Fri 2026-04-04 14:25:00 CST; 10s ago
       Docs: http://docs.grafana.org
   Main PID: 12345 (grafana-server)
      Tasks: 10 (limit: 11232)
     Memory: 100.0M
     CGroup: /system.slice/grafana-server.service
             └─12345 /usr/share/grafana/bin/grafana-server --config=/etc/grafana/grafana.ini

# 访问Web界面
[root@grafana ~]# curl -I http://localhost:3000
HTTP/1.1 302 Found
Cache-Control: no-cache
Content-Type: text/html; charset=utf-8
Location: /login
Date: Fri, 04 Apr 2026 14:25:00 GMT

2.2 配置数据源

# 添加Prometheus数据源
[root@grafana ~]# curl -X POST http://admin:admin@localhost:3000/api/datasources \
  -H "Content-Type: application/json" \
  -d '{
    "name": "Prometheus",
    "type": "prometheus",
    "url": "http://192.168.1.10:9090",
    "access": "proxy",
    "isDefault": true
  }'
{"id":1,"message":"Datasource added","name":"Prometheus"}

# 导入Node Exporter仪表板
[root@grafana ~]# curl -X POST http://admin:admin@localhost:3000/api/dashboards/import \
  -H "Content-Type: application/json" \
  -d '{
    "dashboard": {
      "id": null,
      "title": "Node Exporter Full",
      "tags": ["linux", "node"],
      "timezone": "browser",
      "panels": []
    },
    "overwrite": true
  }'
{"id":1,"slug":"node-exporter-full","status":"success","uid":"abcdefg","url":"/d/abcdefg/node-exporter-full"}

# 查看数据源列表
[root@grafana ~]# curl http://admin:admin@localhost:3000/api/datasources
[
  {
    "id": 1,
    "uid": "PDXXXXXXXX",
    "name": "Prometheus",
    "type": "prometheus",
    "typeLogoUrl": "public/app/plugins/datasource/prometheus/img/prometheus_logo.svg",
    "access": "proxy",
    "url": "http://192.168.1.10:9090",
    "isDefault": true
  }
]

Part03-告警配置

3.1 配置Alertmanager

# 下载Alertmanager
[root@prometheus ~]# wget https://github.com/prometheus/alertmanager/releases/download/v0.25.0/alertmanager-0.25.0.linux-amd64.tar.gz
–2026-04-04 14:30:00– https://github.com/prometheus/alertmanager/releases/download/v0.25.0/alertmanager-0.25.0.linux-amd64.tar.gz
Resolving github.com (github.com)… 140.82.121.4
Connecting to github.com (github.com)|140.82.121.4|:443… connected.
HTTP request sent, awaiting response… 302 Found
Location: https://objects.githubusercontent.com/github-production-release-asset-2e65be/9524057/12345678-90ab-cdef-1234-567890abcdef?X-Amz-Algorithm=AWS4-HMAC-SHA256 [following]
–2026-04-04 14:30:05– https://objects.githubusercontent.com/github-production-release-asset-2e65be/9524057/12345678-90ab-cdef-1234-567890abcdef?X-Amz-Algorithm=AWS4-HMAC-SHA256
Resolving objects.githubusercontent.com (objects.githubusercontent.com)… 185.199.108.133
Connecting to objects.githubusercontent.com (objects.githubusercontent.com)|185.199.108.133|:443… connected.
HTTP request sent, awaiting response… 200 OK
Length: 20000000 (19M) [application/octet-stream]
Saving to: ‘alertmanager-0.25.0.linux-amd64.tar.gz’

alertmanager-0.25.0.li 100%[========================>] 19.07M 10.0MB/s in 2s

2026-04-04 14:30:07 (9.54 MB/s) – ‘alertmanager-0.25.0.linux-amd64.tar.gz’ saved [20000000/20000000]

# 解压安装
[root@prometheus ~]# tar xzf alertmanager-0.25.0.linux-amd64.tar.gz
[root@prometheus ~]# mv alertmanager-0.25.0.linux-amd64 /usr/local/alertmanager

# 配置Alertmanager
[root@prometheus ~]# cat > /usr/local/alertmanager/alertmanager.yml << 'EOF'
global:
  smtp_smarthost: 'smtp.fgedu.net.cn:25'
  smtp_from: 'alertmanager@fgedu.net.cn'
  smtp_auth_username: 'alertmanager@fgedu.net.cn'
  smtp_auth_password: 'password'

route:
  group_by: ['alertname']
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 1h
  receiver: 'email-notifications'

receivers:
  - name: 'email-notifications'
    email_configs:
      - to: 'admin@fgedu.net.cn'
        send_resolved: true

inhibit_rules:
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname', 'instance']
EOF

# 创建告警规则
[root@prometheus ~]# mkdir -p /usr/local/prometheus/rules
[root@prometheus ~]# cat > /usr/local/prometheus/rules/node.yml << 'EOF'
groups:
  - name: node_alerts
    rules:
      - alert: HighCPUUsage
        expr: 100 - (avg by(instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High CPU usage on {{ $labels.instance }}"
          description: "CPU usage is {{ $value }}%"

      - alert: HighMemoryUsage
        expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High memory usage on {{ $labels.instance }}"
          description: "Memory usage is {{ $value }}%"

      - alert: DiskSpaceLow
        expr: (node_filesystem_avail_bytes{fstype!="tmpfs"} / node_filesystem_size_bytes{fstype!="tmpfs"}) * 100 < 10
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Low disk space on {{ $labels.instance }}"
          description: "Disk {{ $labels.mountpoint }} has only {{ $value }}% free"

      - alert: InstanceDown
        expr: up == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Instance {{ $labels.instance }} down"
          description: "{{ $labels.instance }} has been down for more than 1 minute."
EOF

# 重启Prometheus
[root@prometheus ~]# systemctl restart prometheus

风哥针对监控平台搭建建议:

  • 使用Prometheus+Grafana组合
  • 部署Node Exporter收集指标
  • 配置合理的告警规则
  • 设置告警通知渠道
  • 定期维护监控平台

本文由风哥教程整理发布,仅用于学习测试使用,转载注明出处:http://www.fgedu.net.cn/10327.html

联系我们

在线咨询:点击这里给我发消息

微信号:itpux-com

工作日:9:30-18:30,节假日休息