内容简介:本文风哥教程参考Linux官方文档、Red Hat Enterprise Linux官方文档、Ansible Automation Platform官方文档、Docker官方文档、Kubernetes官方文档和Podman官方文档等内容,详细介绍了相关技术的配置和使用方法。
风哥提示:
本文档介绍企业级监控平台部署综合实战案例。
Part01-Prometheus监控平台
1.1 Prometheus高可用部署
[root@fgedu-prometheus ~]# mkdir -p /etc/prometheus /var/lib/prometheus
[root@fgedu-prometheus ~]# cat > /etc/prometheus/prometheus.yml << 'EOF'
global:
  scrape_interval: 15s
  evaluation_interval: 15s
  external_labels:
    monitor: 'fgedu-monitor'
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - localhost:9093
rule_files:
  - /etc/prometheus/rules/*.yml
scrape_configs:
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:9090']
  - job_name: 'node-exporter'
    static_configs:
      - targets:
          - '192.168.1.10:9100'
          - '192.168.1.11:9100'
          - '192.168.1.12:9100'
          - '192.168.1.20:9100'
          - '192.168.1.21:9100'
          - '192.168.1.22:9100'
        labels:
          env: 'production'
  - job_name: 'nginx-exporter'
    static_configs:
      - targets:
          - '192.168.1.30:9113'
          - '192.168.1.31:9113'
  - job_name: 'mysql-exporter'
    static_configs:
      - targets:
          - '192.168.1.40:9104'
          - '192.168.1.41:9104'
  - job_name: 'redis-exporter'
    static_configs:
      - targets:
          - '192.168.1.50:9121'
          - '192.168.1.51:9121'
  - job_name: 'kubernetes-apiservers'
    kubernetes_sd_configs:
      - role: endpoints
    scheme: https
    tls_config:
      ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
    bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
    relabel_configs:
      - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
        action: keep
        regex: default;kubernetes;https
EOF
# 部署Prometheus
[root@fgedu-prometheus ~]# docker run -d --name prometheus \
  -p 9090:9090 \
  -v /etc/prometheus:/etc/prometheus \
  -v /var/lib/prometheus:/var/lib/prometheus \
  prom/prometheus:v2.48.0 \
  --config.file=/etc/prometheus/prometheus.yml \
  --storage.tsdb.path=/var/lib/prometheus \
  --storage.tsdb.retention.time=30d \
  --web.enable-lifecycle
# 验证Prometheus
[root@fgedu-prometheus ~]# curl -s http://localhost:9090/api/v1/targets | jq '.data.activeTargets | length'
10
Part02-Grafana可视化
2.1 Grafana部署配置
[root@fgedu-grafana ~]# docker run -d --name grafana \
  -p 3000:3000 \
  -e "GF_SECURITY_ADMIN_USER=admin" \
  -e "GF_SECURITY_ADMIN_PASSWORD=Grafana@123" \
  -e "GF_INSTALL_PLUGINS=redis-datasource" \
  -v /var/lib/grafana:/var/lib/grafana \
  grafana/grafana:10.2.0
# 配置数据源
[root@fgedu-grafana ~]# cat > /etc/grafana/provisioning/datasources/prometheus.yml << 'EOF'
apiVersion: 1
datasources:
  - name: Prometheus
    type: prometheus
    access: proxy
    url: http://192.168.1.100:9090
    isDefault: true
    editable: false
EOF
# 创建Dashboard
[root@fgedu-grafana ~]# cat > /etc/grafana/provisioning/dashboards/dashboard.yml << 'EOF'
apiVersion: 1
providers:
  - name: 'default'
    orgId: 1
    folder: ''
    type: file
    disableDeletion: false
    updateIntervalSeconds: 10
    options:
      path: /etc/grafana/provisioning/dashboards/json
EOF
# 创建系统监控Dashboard
[root@fgedu-grafana ~]# cat > /etc/grafana/provisioning/dashboards/json/system.json << 'EOF'
{
"dashboard": {
"title": "FGEDU系统监控",
"panels": [
{
"title": "CPU使用率",
"type": "graph",
"targets": [
{
"expr": "100 - (avg by(instance) (irate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100)",
"legendFormat": "{{instance}}"
}
]
},
{
"title": "内存使用率",
"type": "graph",
"targets": [
{
"expr": "(1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100",
"legendFormat": "{{instance}}"
}
]
},
{
"title": "磁盘使用率",
"type": "graph",
"targets": [
{
"expr": "(1 - (node_filesystem_avail_bytes{fstype!=\"tmpfs\"} / node_filesystem_size_bytes{fstype!=\"tmpfs\"})) * 100",
"legendFormat": "{{instance}} - {{mountpoint}}"
}
]
}
]
}
}
EOF
# 重启Grafana应用配置
[root@fgedu-grafana ~]# docker restart grafana
Part03-AlertManager告警
3.1 告警配置
[root@fgedu-prometheus ~]# cat > /etc/prometheus/alertmanager.yml << 'EOF'
global:
  resolve_timeout: 5m
  smtp_smarthost: 'smtp.fgedu.net.cn:465'
  smtp_from: 'alert@fgedu.net.cn'
  smtp_auth_username: 'alert@fgedu.net.cn'
  smtp_auth_password: 'Alert@123'
  smtp_require_tls: false
route:
  group_by: ['alertname', 'severity']
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 4h
  receiver: 'default-receiver'
  routes:
    - match:
        severity: critical
      receiver: 'critical-receiver'
    - match:
        severity: warning
      receiver: 'warning-receiver'
receivers:
  - name: 'default-receiver'
    email_configs:
      - to: 'ops@fgedu.net.cn'
        send_resolved: true
  - name: 'critical-receiver'
    email_configs:
      - to: 'ops-critical@fgedu.net.cn'
        send_resolved: true
    webhook_configs:
      - url: 'http://webhook.fgedu.net.cn/alert'
        send_resolved: true
  - name: 'warning-receiver'
    email_configs:
      - to: 'ops-warning@fgedu.net.cn'
        send_resolved: true
inhibit_rules:
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname', 'instance']
EOF
# 部署AlertManager
[root@fgedu-prometheus ~]# docker run -d --name alertmanager \
  -p 9093:9093 \
  -v /etc/prometheus/alertmanager.yml:/etc/alertmanager/alertmanager.yml \
  prom/alertmanager:v0.26.0
# 创建告警规则
[root@fgedu-prometheus ~]# cat > /etc/prometheus/rules/alerts.yml << 'EOF'
groups:
  - name: node_alerts
    rules:
      - alert: NodeDown
        expr: up == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "节点宕机"
          description: "节点 {{ $labels.instance }} 已宕机超过1分钟"
      - alert: HighCPU
        expr: 100 - (avg by(instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "CPU使用率过高"
          description: "节点 {{ $labels.instance }} CPU使用率超过80%"
      - alert: HighMemory
        expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 85
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "内存使用率过高"
          description: "节点 {{ $labels.instance }} 内存使用率超过85%"
      - alert: DiskSpaceLow
        expr: (1 - (node_filesystem_avail_bytes{fstype!="tmpfs"} / node_filesystem_size_bytes{fstype!="tmpfs"})) * 100 > 85
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "磁盘空间不足"
          description: "节点 {{ $labels.instance }} 磁盘 {{ $labels.mountpoint }} 使用率超过85%"
EOF
Part04-日志收集分析
4.1 ELK Stack部署
[root@fgedu-elk ~]# docker run -d --name elasticsearch \
  -p 9200:9200 -p 9300:9300 \
  -e "discovery.type=single-node" \
  -e "ES_JAVA_OPTS=-Xms2g -Xmx2g" \
  -v /var/lib/elasticsearch:/usr/share/elasticsearch/data \
  elasticsearch:8.11.0
# 部署Logstash
[root@fgedu-elk ~]# cat > /etc/logstash/logstash.conf << 'EOF'
input {
  beats {
    port => 5044
  }
}
filter {
  grok {
    match => { "message" => "%{SYSLOGBASE} %{GREEDYDATA:syslog_message}" }
  }
  date {
    match => [ "timestamp", "MMM d HH:mm:ss", "MMM dd HH:mm:ss" ]
  }
}
output {
  elasticsearch {
    hosts => ["http://elasticsearch:9200"]
    index => "fgedu-logs-%{+YYYY.MM.dd}"
  }
}
EOF
[root@fgedu-elk ~]# docker run -d --name logstash \
  -p 5044:5044 \
  -v /etc/logstash:/usr/share/logstash/pipeline \
  logstash:8.11.0
# 部署Kibana
[root@fgedu-elk ~]# docker run -d --name kibana \
  -p 5601:5601 \
  -e "ELASTICSEARCH_HOSTS=http://elasticsearch:9200" \
  kibana:8.11.0
# 配置Filebeat
[root@fgedu-app ~]# cat > /etc/filebeat/filebeat.yml << 'EOF'
filebeat.inputs:
  - type: log
    enabled: true
    paths:
      - /var/log/nginx/*.log
    fields:
      type: nginx
    fields_under_root: true
  - type: log
    enabled: true
    paths:
      - /var/log/messages
    fields:
      type: system
    fields_under_root: true
output.logstash:
  hosts: ["192.168.1.100:5044"]
EOF
[root@fgedu-app ~]# systemctl enable filebeat --now
# 验证日志收集
[root@fgedu-elk ~]# curl -s http://localhost:9200/_cat/indices?v
health status index uuid pri rep docs.count docs.deleted store.size pri.store.size
green open fgedu-logs-2026.04.04 abc123def456ghi789... 1 0 12345 0 10.5mb 10.5mb
- 建立完整的监控体系
- 配置多维度告警规则
- 实施日志集中管理
- 定期优化监控策略
- 建立监控运维流程
本文由风哥教程整理发布,仅用于学习测试使用,转载注明出处:http://www.fgedu.net.cn/10327.html
