1. 首页 > Linux教程 > 正文

Linux教程FG523-Linux企业级监控系统部署

内容简介:本文风哥教程参考Linux官方文档、Red Hat Enterprise Linux官方文档、Ansible Automation Platform官方文档、Docker官方文档、Kubernetes官方文档和Podman官方文档等内容,详细介绍了相关技术的配置和使用方法。(学习交流加群:风哥QQ113257174)

风哥提示:

本文档介绍Linux企业级监控系统部署和配置实战。

Part01-Prometheus部署

1.1 Prometheus安装配置

# Install Prometheus (run as root on host fgedu-prometheus).
# Expects prometheus-2.45.0.linux-amd64.tar.gz in the current directory.

# Create the system account only when it is missing so this step can be re-run.
id -u prometheus >/dev/null 2>&1 || useradd -r -s /bin/false prometheus

mkdir -p /etc/prometheus /var/lib/prometheus
# The Prometheus unit in this guide runs as user "prometheus" and stores its
# TSDB under /var/lib/prometheus, so that directory must be writable by it.
chown -R prometheus:prometheus /var/lib/prometheus

tar xzf prometheus-2.45.0.linux-amd64.tar.gz -C /usr/local/
# -f/-n replace an existing link on a re-run instead of failing or creating
# a nested link inside the directory the old link points to.
ln -sfn /usr/local/prometheus-2.45.0.linux-amd64 /usr/local/prometheus

# Configure Prometheus (run as root on host fgedu-prometheus).
# Every heredoc below uses a quoted delimiter ('EOF') so the shell performs no
# expansion inside it; this keeps `{{ $labels.instance }}` / `{{ $value }}`
# and the backslash line continuations literal in the written files.

# Main configuration: global settings, rule files, Alertmanager and scrape jobs.
cat > /etc/prometheus/prometheus.yml << 'EOF'
global:
  scrape_interval: 15s
  evaluation_interval: 15s
  external_labels:
    monitor: 'fgedu-monitor'

# 告警规则文件
rule_files:
  - /etc/prometheus/rules/*.yml

# 告警管理器
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - 192.168.1.100:9093

# 抓取配置
scrape_configs:
  # Prometheus自身
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:9090']

  # Node Exporter
  - job_name: 'node-exporter'
    static_configs:
      - targets:
          - 192.168.1.10:9100
          - 192.168.1.11:9100
          - 192.168.1.12:9100
        labels:
          env: production

  # MySQL Exporter
  - job_name: 'mysql-exporter'
    static_configs:
      - targets: ['192.168.1.40:9104']

  # Redis Exporter
  - job_name: 'redis-exporter'
    static_configs:
      - targets: ['192.168.1.50:9121']

  # Nginx Exporter
  - job_name: 'nginx-exporter'
    static_configs:
      - targets: ['192.168.1.20:9113']

  # Blackbox Exporter
  - job_name: 'blackbox-http'
    metrics_path: /probe
    params:
      module: [http_2xx]
    static_configs:
      - targets:
          - https://www.fgedu.net.cn
          - https://api.fgedu.net.cn
    relabel_configs:
      - source_labels: [__address__]
        target_label: __param_target
      - source_labels: [__param_target]
        target_label: instance
      - target_label: __address__
        replacement: 192.168.1.100:9115

  # SNMP监控
  - job_name: 'snmp'
    static_configs:
      - targets:
          - 192.168.1.1
          - 192.168.1.2
    metrics_path: /snmp
    relabel_configs:
      - source_labels: [__address__]
        target_label: __param_target
      - source_labels: [__param_target]
        target_label: instance
      - target_label: __address__
        replacement: 192.168.1.100:9116
EOF

# Alerting rules. All quotes and list dashes are plain ASCII: typographic
# quotes or en-dashes are not valid YAML/PromQL syntax.
mkdir -p /etc/prometheus/rules
cat > /etc/prometheus/rules/alerts.yml << 'EOF'
groups:
  - name: node_alerts
    rules:
      - alert: NodeDown
        expr: up{job="node-exporter"} == 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "节点不可达: {{ $labels.instance }}"
          description: "节点已经离线超过5分钟"

      - alert: HighCPU
        expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "CPU使用率过高: {{ $labels.instance }}"
          description: "CPU使用率超过80%,当前值: {{ $value }}%"

      - alert: HighMemory
        expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 85
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "内存使用率过高: {{ $labels.instance }}"
          description: "内存使用率超过85%,当前值: {{ $value }}%"

      - alert: DiskSpaceLow
        expr: (node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"}) * 100 < 15
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "磁盘空间不足: {{ $labels.instance }}"
          description: "磁盘剩余空间低于15%,当前值: {{ $value }}%"
EOF

# systemd unit: runs Prometheus as the unprivileged "prometheus" user with
# 30 days of TSDB retention, listening on all interfaces on port 9090.
cat > /etc/systemd/system/prometheus.service << 'EOF'
[Unit]
Description=Prometheus Monitoring System
After=network.target

[Service]
User=prometheus
Group=prometheus
ExecStart=/usr/local/prometheus/prometheus \
  --config.file=/etc/prometheus/prometheus.yml \
  --storage.tsdb.path=/var/lib/prometheus \
  --storage.tsdb.retention.time=30d \
  --web.console.templates=/usr/local/prometheus/consoles \
  --web.console.libraries=/usr/local/prometheus/console_libraries \
  --web.listen-address=0.0.0.0:9090
Restart=always

[Install]
WantedBy=multi-user.target
EOF

# Validate the configuration (and the rule files it references) before the
# service is started, so a typo is reported here instead of as a failed unit.
/usr/local/prometheus/promtool check config /etc/prometheus/prometheus.yml

# Make systemd read the new unit file, then enable Prometheus at boot and start it.
systemctl daemon-reload
systemctl enable prometheus --now

Part02-Grafana可视化

2.1 Grafana安装配置

# Install Grafana (run as root on host fgedu-grafana).
yum install -y grafana

# Main Grafana configuration.
# NOTE(review): the passwords and secret_key below are tutorial placeholders;
# replace them with real secrets before using this outside a lab.
# NOTE(review): [database] points at an external MySQL server and [auth.ldap]
# at /etc/grafana/ldap.toml; both are assumed to be prepared separately —
# nothing in this guide creates them.
cat > /etc/grafana/grafana.ini << 'EOF'
[server]
http_addr = 0.0.0.0
http_port = 3000
domain = grafana.fgedu.net.cn
root_url = https://grafana.fgedu.net.cn/

[database]
type = mysql
host = 192.168.1.40:3306
name = grafana
user = grafana
password = Grafana@123

[security]
admin_user = admin
admin_password = Admin@123
secret_key = fgedu-secret-key-32-characters

[auth]
disable_login_form = false

[auth.ldap]
enabled = true
config_file = /etc/grafana/ldap.toml

[dashboards]
default_home_dashboard_path = /var/lib/grafana/dashboards/home.json

[paths]
data = /var/lib/grafana
logs = /var/log/grafana
plugins = /var/lib/grafana/plugins
provisioning = /etc/grafana/provisioning

[smtp]
enabled = true
host = smtp.fgedu.net.cn:465
user = grafana@fgedu.net.cn
password = Grafana@123
from_address = grafana@fgedu.net.cn
from_name = Grafana Alert
EOF

# Provision the Prometheus data source (read-only in the UI: editable: false).
mkdir -p /etc/grafana/provisioning/datasources
cat > /etc/grafana/provisioning/datasources/prometheus.yml << 'EOF'
apiVersion: 1

datasources:
  - name: Prometheus
    type: prometheus
    access: proxy
    url: http://192.168.1.100:9090
    isDefault: true
    editable: false
EOF

# Provision a file-based dashboard provider that rescans its path every 10s.
mkdir -p /etc/grafana/provisioning/dashboards
cat > /etc/grafana/provisioning/dashboards/dashboards.yml << 'EOF'
apiVersion: 1

providers:
  - name: 'default'
    orgId: 1
    folder: ''
    type: file
    disableDeletion: false
    updateIntervalSeconds: 10
    options:
      path: /var/lib/grafana/dashboards
EOF

# The provider above reads dashboards from this directory, so create it before
# Grafana starts. Dashboard JSON files (including the home.json referenced by
# default_home_dashboard_path) still have to be copied here separately.
mkdir -p /var/lib/grafana/dashboards

# Enable Grafana at boot and start it.
systemctl enable grafana-server --now

# Install panel plugins; Grafana loads plugins at startup, hence the restart.
grafana-cli plugins install grafana-clock-panel
grafana-cli plugins install grafana-piechart-panel
grafana-cli plugins install grafana-worldmap-panel
systemctl restart grafana-server

Part03-Alertmanager告警

3.1 告警配置

# Configure Alertmanager (run as root on host fgedu-prometheus).
# NOTE(review): the unit below starts /usr/local/prometheus/alertmanager, but
# no step in this guide installs that binary (it is distributed separately
# from the Prometheus tarball) — place it there before starting the service.
# NOTE(review): the SMTP password and PagerDuty service key are placeholders.
mkdir -p /etc/prometheus
cat > /etc/prometheus/alertmanager.yml << 'EOF'
global:
  resolve_timeout: 5m
  smtp_smarthost: 'smtp.fgedu.net.cn:465'
  smtp_from: 'alert@fgedu.net.cn'
  smtp_auth_username: 'alert@fgedu.net.cn'
  smtp_auth_password: 'Alert@123'
  smtp_require_tls: true

# 路由配置
route:
  group_by: ['alertname', 'severity']
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 4h
  receiver: 'default-receiver'
  routes:
    - match:
        severity: critical
      receiver: 'critical-receiver'
      continue: true
    - match:
        severity: warning
      receiver: 'warning-receiver'

# 接收器配置
receivers:
  - name: 'default-receiver'
    email_configs:
      - to: 'ops@fgedu.net.cn'
        send_resolved: true

  - name: 'critical-receiver'
    email_configs:
      - to: 'ops@fgedu.net.cn,manager@fgedu.net.cn'
        send_resolved: true
    webhook_configs:
      - url: 'http://192.168.1.200:8060/dingtalk/webhook1/send'
        send_resolved: true
    pagerduty_configs:
      - service_key: 'pagerduty-service-key'

  - name: 'warning-receiver'
    email_configs:
      - to: 'ops@fgedu.net.cn'
        send_resolved: true

# 抑制规则
inhibit_rules:
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname', 'instance']
EOF

# The service runs as user "prometheus" and keeps its state under
# --storage.path, so create that directory with matching ownership.
install -d -o prometheus -g prometheus /var/lib/prometheus/alertmanager

# systemd unit for Alertmanager, listening on all interfaces on port 9093.
# The quoted heredoc delimiter keeps the backslash continuations literal.
cat > /etc/systemd/system/alertmanager.service << 'EOF'
[Unit]
Description=Prometheus Alertmanager
After=network.target

[Service]
User=prometheus
Group=prometheus
ExecStart=/usr/local/prometheus/alertmanager \
  --config.file=/etc/prometheus/alertmanager.yml \
  --storage.path=/var/lib/prometheus/alertmanager \
  --web.listen-address=0.0.0.0:9093
Restart=always

[Install]
WantedBy=multi-user.target
EOF

# Make systemd read the new unit file, then enable Alertmanager at boot and start it.
systemctl daemon-reload
systemctl enable alertmanager --now

# Send a test alert. The v2 API is used because the v1 endpoint
# (/api/v1/alerts) has been removed from current Alertmanager releases;
# v2 expects a JSON array and an explicit JSON Content-Type.
curl -X POST http://localhost:9093/api/v2/alerts \
  -H 'Content-Type: application/json' \
  -d '[{
    "labels": {
      "alertname": "TestAlert",
      "severity": "warning"
    },
    "annotations": {
      "summary": "测试告警",
      "description": "这是一条测试告警消息"
    }
  }]'

Part04-监控实战

4.1 监控脚本配置

# Create the monitoring health-check script (run as root on host fgedu-prometheus).
# The quoted heredoc delimiter keeps $(date) literal so it is evaluated when
# the script runs, not when the file is written.
# The script needs curl and jq. `jq .` is spelled out (instead of a bare `jq`)
# so pretty-printing does not depend on jq's implicit default filter.
# NOTE(review): section 3 probes Grafana on localhost, while this guide installs
# Grafana on host fgedu-grafana — adjust the URL/unit check if they differ.
cat > /usr/local/bin/monitor-check.sh << 'EOF'
#!/bin/bash
# monitor-check.sh
# from:www.itpux.com.qq113257174.wx:itpux-com
# web: http://www.fgedu.net.cn

echo "=== 监控系统检查 ==="
echo "检查时间: $(date)"
echo ""

echo "1. Prometheus状态"
systemctl is-active prometheus
curl -s http://localhost:9090/-/healthy
echo ""
echo ""

echo "2. Alertmanager状态"
systemctl is-active alertmanager
curl -s http://localhost:9093/-/healthy
echo ""
echo ""

echo "3. Grafana状态"
systemctl is-active grafana-server
curl -s http://localhost:3000/api/health | jq .
echo ""

echo "4. 活跃告警"
curl -s http://localhost:9090/api/v1/alerts | jq '.data.alerts | length'
echo ""

echo "5. 目标状态"
curl -s http://localhost:9090/api/v1/targets | jq '.data.activeTargets | map(select(.health != "up")) | length'
echo ""

echo "6. 数据存储"
du -sh /var/lib/prometheus
echo ""

echo "=== 检查完成 ==="
EOF

chmod +x /usr/local/bin/monitor-check.sh

# Create a sample custom exporter (run as root on host fgedu-app).
# It serves three gauges on port 9101 and refreshes them with random demo
# values every 15 seconds; it requires the prometheus_client Python package.
cat > /usr/local/bin/custom-exporter.py << 'EOF'
#!/usr/bin/env python3
from prometheus_client import start_http_server, Gauge
import random
import time

# 创建自定义指标
REQUEST_COUNT = Gauge('fgedu_request_total', 'Total request count')
ACTIVE_USERS = Gauge('fgedu_active_users', 'Number of active users')
QUEUE_SIZE = Gauge('fgedu_queue_size', 'Current queue size')

def collect_metrics():
    while True:
        REQUEST_COUNT.set(random.randint(1000, 10000))
        ACTIVE_USERS.set(random.randint(50, 500))
        QUEUE_SIZE.set(random.randint(0, 100))
        time.sleep(15)

if __name__ == '__main__':
    start_http_server(9101)
    collect_metrics()
EOF

chmod +x /usr/local/bin/custom-exporter.py
风哥针对监控系统建议:

  • 配置多级告警策略
  • 设置合理的告警阈值
  • 建立仪表板体系
  • 定期检查监控覆盖
  • 优化数据存储策略

本文由风哥教程整理发布,仅用于学习测试使用,转载注明出处:http://www.fgedu.net.cn/10327.html

联系我们

在线咨询:点击这里给我发消息

微信号:itpux-com

工作日:9:30-18:30,节假日休息