内容简介:本文风哥教程参考Linux官方文档、Red Hat Enterprise Linux官方文档、Ansible Automation Platform官方文档、Docker官方文档、Kubernetes官方文档和Podman官方文档等内容,详细介绍了相关技术的配置和使用方法。
本文档介绍性能监控平台的部署和应用。
风哥提示:
Part01-Prometheus高级配置
1.1 Prometheus联邦集群
[root@prometheus-master ~]# cat > /opt/prometheus/prometheus.yml << 'EOF'
global:
  scrape_interval: 15s
  evaluation_interval: 15s
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - 192.168.1.100:9093
rule_files:
  - "/opt/prometheus/rules/*.yml"
scrape_configs:
  - job_name: "prometheus"
    static_configs:
      - targets: ["localhost:9090"]
  - job_name: "federate"
    metrics_path: "/federate"
    honor_labels: true
    params:
      "match[]":
        - '{job="node_exporter"}'
        - '{job="nginx"}'
        - '{job="mysql"}'
    static_configs:
      - targets:
          - "192.168.1.11:9090"
          - "192.168.1.12:9090"
          - "192.168.1.13:9090"
        labels:
          datacenter: "dc1"
EOF
# 配置告警规则
[root@prometheus-master ~]# mkdir -p /opt/prometheus/rules
[root@prometheus-master ~]# cat > /opt/prometheus/rules/alerts.yml << 'EOF'
groups:
  - name: node_alerts
    rules:
      - alert: HighCPUUsage
        expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High CPU usage on {{ $labels.instance }}"
          description: "CPU usage is {{ $value }}%"
      - alert: HighMemoryUsage
        expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 85
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High memory usage on {{ $labels.instance }}"
          description: "Memory usage is {{ $value }}%"
      - alert: DiskSpaceLow
        expr: (1 - (node_filesystem_avail_bytes{fstype!="tmpfs"} / node_filesystem_size_bytes{fstype!="tmpfs"})) * 100 > 85
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Low disk space on {{ $labels.instance }}"
          description: "Disk usage is {{ $value }}%"
      - alert: InstanceDown
        expr: up == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Instance {{ $labels.instance }} down"
          description: "{{ $labels.instance }} has been down for more than 1 minute."
  - name: nginx_alerts
    rules:
      - alert: HighErrorRate
        expr: sum(rate(nginx_http_requests_total{status=~"5.."}[5m])) / sum(rate(nginx_http_requests_total[5m])) * 100 > 5
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High error rate"
          description: "Error rate is {{ $value }}%"
      - alert: HighLatency
        expr: histogram_quantile(0.95, sum(rate(nginx_http_request_duration_seconds_bucket[5m])) by (le)) > 1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High request latency"
          description: "95th percentile latency is {{ $value }}s"
EOF
# 重载配置(Prometheus需以 --web.enable-lifecycle 参数启动,否则 /-/reload 接口不可用)
[root@prometheus-master ~]# curl -X POST http://localhost:9090/-/reload
1.2 Alertmanager配置
[root@alertmanager ~]# cd /opt
[root@alertmanager opt]# wget https://github.com/prometheus/alertmanager/releases/download/v0.25.0/alertmanager-0.25.0.linux-amd64.tar.gz
[root@alertmanager opt]# tar xzf alertmanager-0.25.0.linux-amd64.tar.gz
[root@alertmanager opt]# ln -s alertmanager-0.25.0.linux-amd64 alertmanager
# 配置Alertmanager
[root@alertmanager ~]# cat > /opt/alertmanager/alertmanager.yml << 'EOF'
global:
  resolve_timeout: 5m
  smtp_smarthost: 'smtp.fgedu.net.cn:587'
  smtp_from: 'alertmanager@fgedu.net.cn'
  smtp_auth_username: 'alertmanager@fgedu.net.cn'
  smtp_auth_password: 'password'
route:
  group_by: ['alertname', 'severity']
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 4h
  receiver: 'default-receiver'
  routes:
    - match:
        severity: critical
      receiver: 'critical-receiver'
    - match:
        severity: warning
      receiver: 'warning-receiver'
receivers:
  - name: 'default-receiver'
    email_configs:
      - to: 'admin@fgedu.net.cn'
        send_resolved: true
  - name: 'critical-receiver'
    email_configs:
      - to: 'admin@fgedu.net.cn,oncall@fgedu.net.cn'
        send_resolved: true
    webhook_configs:
      - url: 'http://192.168.1.200:8060/dingtalk/webhook1/send'
        send_resolved: true
  - name: 'warning-receiver'
    email_configs:
      - to: 'admin@fgedu.net.cn'
        send_resolved: true
inhibit_rules:
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname', 'instance']
EOF
# 创建systemd服务
[root@alertmanager ~]# cat > /etc/systemd/system/alertmanager.service << 'EOF'
[Unit]
Description=Alertmanager
Wants=network-online.target
After=network-online.target
[Service]
Type=simple
ExecStart=/opt/alertmanager/alertmanager \
--config.file=/opt/alertmanager/alertmanager.yml \
--storage.path=/var/lib/alertmanager
[Install]
WantedBy=multi-user.target
EOF
# 启动Alertmanager
[root@alertmanager ~]# systemctl enable --now alertmanager
# 测试告警
[root@alertmanager ~]# curl -X POST http://localhost:9093/api/v1/alerts -d '[{
"labels": {
"alertname": "TestAlert",
"severity": "warning"
},
"annotations": {
"summary": "This is a test alert"
}
}]'
Part02-Grafana高级配置
2.1 自定义仪表盘
[root@grafana ~]# cat > /tmp/node_dashboard.json << 'EOF'
{
  "dashboard": {
    "title": "Node Exporter Full",
    "uid": "node-exporter",
    "panels": [
      {
        "title": "CPU Usage",
        "type": "graph",
        "gridPos": {"x": 0, "y": 0, "w": 12, "h": 8},
        "targets": [
          {
            "expr": "100 - (avg by(instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100)",
            "legendFormat": "{{ instance }}"
          }
        ]
      },
      {
        "title": "Memory Usage",
        "type": "graph",
        "gridPos": {"x": 12, "y": 0, "w": 12, "h": 8},
        "targets": [
          {
            "expr": "(1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100",
            "legendFormat": "{{ instance }}"
          }
        ]
      },
      {
        "title": "Disk I/O",
        "type": "graph",
        "gridPos": {"x": 0, "y": 8, "w": 12, "h": 8},
        "targets": [
          {
            "expr": "rate(node_disk_read_bytes_total[5m])",
            "legendFormat": "Read - {{ instance }}"
          },
          {
            "expr": "rate(node_disk_written_bytes_total[5m])",
            "legendFormat": "Write - {{ instance }}"
          }
        ]
      },
      {
        "title": "Network Traffic",
        "type": "graph",
        "gridPos": {"x": 12, "y": 8, "w": 12, "h": 8},
        "targets": [
          {
            "expr": "rate(node_network_receive_bytes_total{device!=\"lo\"}[5m])",
            "legendFormat": "Receive - {{ instance }}"
          },
          {
            "expr": "rate(node_network_transmit_bytes_total{device!=\"lo\"}[5m])",
            "legendFormat": "Transmit - {{ instance }}"
          }
        ]
      }
    ]
  },
  "overwrite": true
}
EOF
# 导入仪表盘
[root@grafana ~]# curl -X POST "http://admin:admin@localhost:3000/api/dashboards/db" \
  -H "Content-Type: application/json" \
  -d @/tmp/node_dashboard.json
# 配置告警通知渠道
[root@grafana ~]# curl -X POST "http://admin:admin@localhost:3000/api/alert-notifications" \
  -H "Content-Type: application/json" \
  -d '{
    "name": "Email Alert",
    "type": "email",
    "isDefault": true,
    "settings": {
      "addresses": "admin@fgedu.net.cn"
    }
  }'
- 配置合理的告警阈值
- 设置告警分级和路由
- 使用仪表盘模板
- 配置数据保留策略
- 监控监控系统本身
本文由风哥教程整理发布,仅用于学习测试使用,转载注明出处:http://www.fgedu.net.cn/10327.html
