add alert-rules-patch.yaml,recording-rules-patch.yaml
This commit is contained in:
parent
1a4276e10a
commit
d5b3391195
57
clusters/monitor/alert-rules-patch.yaml
Normal file
57
clusters/monitor/alert-rules-patch.yaml
Normal file
@ -0,0 +1,57 @@
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
annotations:
|
||||
meta.helm.sh/release-name: observability-server
|
||||
meta.helm.sh/release-namespace: monitoring
|
||||
labels:
|
||||
app.kubernetes.io/component: server
|
||||
app.kubernetes.io/instance: observability-server
|
||||
app.kubernetes.io/managed-by: Helm
|
||||
app.kubernetes.io/name: prometheus
|
||||
app.kubernetes.io/part-of: prometheus
|
||||
app.kubernetes.io/version: v2.48.1
|
||||
helm.sh/chart: prometheus-25.8.2
|
||||
name: observability-server-prometheus-server
|
||||
namespace: monitoring
|
||||
data:
|
||||
alerting_rules.yml: |
|
||||
groups:
|
||||
- name: Load-monitoring
|
||||
rules:
|
||||
- alert: HighLoad
|
||||
expr: node_load1 > 2.0 for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: High load on {{ $labels.instance }}
|
||||
description: "Load is {{ $value }} (threshold: 2.0)"
|
||||
- name: cpu-usage-monitoring
|
||||
rules:
|
||||
- alert: HighCpuUsage
|
||||
expr: 100 - (avg by (instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 90 for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: High CPU usage on {{ $labels.instance }}
|
||||
description: "CPU usage is {{ $value }}%"
|
||||
- name: memory-usage-monitoring
|
||||
rules:
|
||||
- alert: HighMemoryUsage
|
||||
expr: (node_memory_MemTotal_bytes - node_memory_MemFree_bytes - node_memory_Buffers_bytes - node_memory_Cached_bytes) / node_memory_MemTotal_bytes * 100 > 90
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: High memory usage on {{ $labels.instance }}
|
||||
description: "Memory usage is {{ $value }}%"
|
||||
- name: disk-usage-monitoring
|
||||
rules:
|
||||
- alert: HighDiskUsage
|
||||
expr: node_filesystem_avail_bytes{fstype="ext4"} / node_filesystem_size_bytes{fstype="ext4"} * 100 < 10
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: High disk usage on {{ $labels.instance }}
|
||||
description: "Disk usage is {{ $value }}%"
|
||||
@ -5,3 +5,7 @@ resources:
|
||||
- namespace.yaml
|
||||
- helmrepo.yaml
|
||||
- observability-agent.yaml
|
||||
- prometheus-server-configmap.yaml
|
||||
patchesStrategicMerge:
|
||||
- alert-rules-patch.yaml
|
||||
- recording-rules-patch.yaml
|
||||
|
||||
339
clusters/monitor/prometheus-server-configmap.yaml
Normal file
339
clusters/monitor/prometheus-server-configmap.yaml
Normal file
@ -0,0 +1,339 @@
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
annotations:
|
||||
meta.helm.sh/release-name: observability-server
|
||||
meta.helm.sh/release-namespace: monitoring
|
||||
labels:
|
||||
app.kubernetes.io/component: server
|
||||
app.kubernetes.io/instance: observability-server
|
||||
app.kubernetes.io/managed-by: Helm
|
||||
app.kubernetes.io/name: prometheus
|
||||
app.kubernetes.io/part-of: prometheus
|
||||
app.kubernetes.io/version: v2.48.1
|
||||
helm.sh/chart: prometheus-25.8.2
|
||||
name: observability-server-prometheus-server
|
||||
namespace: monitoring
|
||||
data:
|
||||
alerting_rules.yml: |
|
||||
{}
|
||||
alerts: |
|
||||
{}
|
||||
allow-snippet-annotations: "false"
|
||||
prometheus.yml: |
|
||||
global:
|
||||
evaluation_interval: 1m
|
||||
scrape_interval: 1m
|
||||
scrape_timeout: 10s
|
||||
rule_files:
|
||||
- /etc/config/recording_rules.yml
|
||||
- /etc/config/alerting_rules.yml
|
||||
scrape_configs:
|
||||
- job_name: prometheus
|
||||
static_configs:
|
||||
- targets:
|
||||
- localhost:9090
|
||||
- bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
|
||||
job_name: kubernetes-apiservers
|
||||
kubernetes_sd_configs:
|
||||
- role: endpoints
|
||||
relabel_configs:
|
||||
- action: keep
|
||||
regex: default;kubernetes;https
|
||||
source_labels:
|
||||
- __meta_kubernetes_namespace
|
||||
- __meta_kubernetes_service_name
|
||||
- __meta_kubernetes_endpoint_port_name
|
||||
scheme: https
|
||||
tls_config:
|
||||
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
|
||||
insecure_skip_verify: true
|
||||
- bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
|
||||
job_name: kubernetes-nodes
|
||||
kubernetes_sd_configs:
|
||||
- role: node
|
||||
relabel_configs:
|
||||
- action: labelmap
|
||||
regex: __meta_kubernetes_node_label_(.+)
|
||||
- replacement: kubernetes.default.svc:443
|
||||
target_label: __address__
|
||||
- regex: (.+)
|
||||
replacement: /api/v1/nodes/$1/proxy/metrics
|
||||
source_labels:
|
||||
- __meta_kubernetes_node_name
|
||||
target_label: __metrics_path__
|
||||
scheme: https
|
||||
tls_config:
|
||||
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
|
||||
insecure_skip_verify: true
|
||||
- bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
|
||||
job_name: kubernetes-nodes-cadvisor
|
||||
kubernetes_sd_configs:
|
||||
- role: node
|
||||
relabel_configs:
|
||||
- action: labelmap
|
||||
regex: __meta_kubernetes_node_label_(.+)
|
||||
- replacement: kubernetes.default.svc:443
|
||||
target_label: __address__
|
||||
- regex: (.+)
|
||||
replacement: /api/v1/nodes/$1/proxy/metrics/cadvisor
|
||||
source_labels:
|
||||
- __meta_kubernetes_node_name
|
||||
target_label: __metrics_path__
|
||||
scheme: https
|
||||
tls_config:
|
||||
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
|
||||
insecure_skip_verify: true
|
||||
- honor_labels: true
|
||||
job_name: kubernetes-service-endpoints
|
||||
kubernetes_sd_configs:
|
||||
- role: endpoints
|
||||
relabel_configs:
|
||||
- action: keep
|
||||
regex: true
|
||||
source_labels:
|
||||
- __meta_kubernetes_service_annotation_prometheus_io_scrape
|
||||
- action: drop
|
||||
regex: true
|
||||
source_labels:
|
||||
- __meta_kubernetes_service_annotation_prometheus_io_scrape_slow
|
||||
- action: replace
|
||||
regex: (https?)
|
||||
source_labels:
|
||||
- __meta_kubernetes_service_annotation_prometheus_io_scheme
|
||||
target_label: __scheme__
|
||||
- action: replace
|
||||
regex: (.+)
|
||||
source_labels:
|
||||
- __meta_kubernetes_service_annotation_prometheus_io_path
|
||||
target_label: __metrics_path__
|
||||
- action: replace
|
||||
regex: (.+?)(?::\d+)?;(\d+)
|
||||
replacement: $1:$2
|
||||
source_labels:
|
||||
- __address__
|
||||
- __meta_kubernetes_service_annotation_prometheus_io_port
|
||||
target_label: __address__
|
||||
- action: labelmap
|
||||
regex: __meta_kubernetes_service_annotation_prometheus_io_param_(.+)
|
||||
replacement: __param_$1
|
||||
- action: labelmap
|
||||
regex: __meta_kubernetes_service_label_(.+)
|
||||
- action: replace
|
||||
source_labels:
|
||||
- __meta_kubernetes_namespace
|
||||
target_label: namespace
|
||||
- action: replace
|
||||
source_labels:
|
||||
- __meta_kubernetes_service_name
|
||||
target_label: service
|
||||
- action: replace
|
||||
source_labels:
|
||||
- __meta_kubernetes_pod_node_name
|
||||
target_label: node
|
||||
- honor_labels: true
|
||||
job_name: kubernetes-service-endpoints-slow
|
||||
kubernetes_sd_configs:
|
||||
- role: endpoints
|
||||
relabel_configs:
|
||||
- action: keep
|
||||
regex: true
|
||||
source_labels:
|
||||
- __meta_kubernetes_service_annotation_prometheus_io_scrape_slow
|
||||
- action: replace
|
||||
regex: (https?)
|
||||
source_labels:
|
||||
- __meta_kubernetes_service_annotation_prometheus_io_scheme
|
||||
target_label: __scheme__
|
||||
- action: replace
|
||||
regex: (.+)
|
||||
source_labels:
|
||||
- __meta_kubernetes_service_annotation_prometheus_io_path
|
||||
target_label: __metrics_path__
|
||||
- action: replace
|
||||
regex: (.+?)(?::\d+)?;(\d+)
|
||||
replacement: $1:$2
|
||||
source_labels:
|
||||
- __address__
|
||||
- __meta_kubernetes_service_annotation_prometheus_io_port
|
||||
target_label: __address__
|
||||
- action: labelmap
|
||||
regex: __meta_kubernetes_service_annotation_prometheus_io_param_(.+)
|
||||
replacement: __param_$1
|
||||
- action: labelmap
|
||||
regex: __meta_kubernetes_service_label_(.+)
|
||||
- action: replace
|
||||
source_labels:
|
||||
- __meta_kubernetes_namespace
|
||||
target_label: namespace
|
||||
- action: replace
|
||||
source_labels:
|
||||
- __meta_kubernetes_service_name
|
||||
target_label: service
|
||||
- action: replace
|
||||
source_labels:
|
||||
- __meta_kubernetes_pod_node_name
|
||||
target_label: node
|
||||
scrape_interval: 5m
|
||||
scrape_timeout: 30s
|
||||
- honor_labels: true
|
||||
job_name: prometheus-pushgateway
|
||||
kubernetes_sd_configs:
|
||||
- role: service
|
||||
relabel_configs:
|
||||
- action: keep
|
||||
regex: pushgateway
|
||||
source_labels:
|
||||
- __meta_kubernetes_service_annotation_prometheus_io_probe
|
||||
- honor_labels: true
|
||||
job_name: kubernetes-services
|
||||
kubernetes_sd_configs:
|
||||
- role: service
|
||||
metrics_path: /probe
|
||||
params:
|
||||
module:
|
||||
- http_2xx
|
||||
relabel_configs:
|
||||
- action: keep
|
||||
regex: true
|
||||
source_labels:
|
||||
- __meta_kubernetes_service_annotation_prometheus_io_probe
|
||||
- source_labels:
|
||||
- __address__
|
||||
target_label: __param_target
|
||||
- replacement: blackbox
|
||||
target_label: __address__
|
||||
- source_labels:
|
||||
- __param_target
|
||||
target_label: instance
|
||||
- action: labelmap
|
||||
regex: __meta_kubernetes_service_label_(.+)
|
||||
- source_labels:
|
||||
- __meta_kubernetes_namespace
|
||||
target_label: namespace
|
||||
- source_labels:
|
||||
- __meta_kubernetes_service_name
|
||||
target_label: service
|
||||
- honor_labels: true
|
||||
job_name: kubernetes-pods
|
||||
kubernetes_sd_configs:
|
||||
- role: pod
|
||||
relabel_configs:
|
||||
- action: keep
|
||||
regex: true
|
||||
source_labels:
|
||||
- __meta_kubernetes_pod_annotation_prometheus_io_scrape
|
||||
- action: drop
|
||||
regex: true
|
||||
source_labels:
|
||||
- __meta_kubernetes_pod_annotation_prometheus_io_scrape_slow
|
||||
- action: replace
|
||||
regex: (https?)
|
||||
source_labels:
|
||||
- __meta_kubernetes_pod_annotation_prometheus_io_scheme
|
||||
target_label: __scheme__
|
||||
- action: replace
|
||||
regex: (.+)
|
||||
source_labels:
|
||||
- __meta_kubernetes_pod_annotation_prometheus_io_path
|
||||
target_label: __metrics_path__
|
||||
- action: replace
|
||||
regex: (\d+);(([A-Fa-f0-9]{1,4}::?){1,7}[A-Fa-f0-9]{1,4})
|
||||
replacement: '[$2]:$1'
|
||||
source_labels:
|
||||
- __meta_kubernetes_pod_annotation_prometheus_io_port
|
||||
- __meta_kubernetes_pod_ip
|
||||
target_label: __address__
|
||||
- action: replace
|
||||
regex: (\d+);((([0-9]+?)(\.|$)){4})
|
||||
replacement: $2:$1
|
||||
source_labels:
|
||||
- __meta_kubernetes_pod_annotation_prometheus_io_port
|
||||
- __meta_kubernetes_pod_ip
|
||||
target_label: __address__
|
||||
- action: labelmap
|
||||
regex: __meta_kubernetes_pod_annotation_prometheus_io_param_(.+)
|
||||
replacement: __param_$1
|
||||
- action: labelmap
|
||||
regex: __meta_kubernetes_pod_label_(.+)
|
||||
- action: replace
|
||||
source_labels:
|
||||
- __meta_kubernetes_namespace
|
||||
target_label: namespace
|
||||
- action: replace
|
||||
source_labels:
|
||||
- __meta_kubernetes_pod_name
|
||||
target_label: pod
|
||||
- action: drop
|
||||
regex: Pending|Succeeded|Failed|Completed
|
||||
source_labels:
|
||||
- __meta_kubernetes_pod_phase
|
||||
- action: replace
|
||||
source_labels:
|
||||
- __meta_kubernetes_pod_node_name
|
||||
target_label: node
|
||||
- honor_labels: true
|
||||
job_name: kubernetes-pods-slow
|
||||
kubernetes_sd_configs:
|
||||
- role: pod
|
||||
relabel_configs:
|
||||
- action: keep
|
||||
regex: true
|
||||
source_labels:
|
||||
- __meta_kubernetes_pod_annotation_prometheus_io_scrape_slow
|
||||
- action: replace
|
||||
regex: (https?)
|
||||
source_labels:
|
||||
- __meta_kubernetes_pod_annotation_prometheus_io_scheme
|
||||
target_label: __scheme__
|
||||
- action: replace
|
||||
regex: (.+)
|
||||
source_labels:
|
||||
- __meta_kubernetes_pod_annotation_prometheus_io_path
|
||||
target_label: __metrics_path__
|
||||
- action: replace
|
||||
regex: (\d+);(([A-Fa-f0-9]{1,4}::?){1,7}[A-Fa-f0-9]{1,4})
|
||||
replacement: '[$2]:$1'
|
||||
source_labels:
|
||||
- __meta_kubernetes_pod_annotation_prometheus_io_port
|
||||
- __meta_kubernetes_pod_ip
|
||||
target_label: __address__
|
||||
- action: replace
|
||||
regex: (\d+);((([0-9]+?)(\.|$)){4})
|
||||
replacement: $2:$1
|
||||
source_labels:
|
||||
- __meta_kubernetes_pod_annotation_prometheus_io_port
|
||||
- __meta_kubernetes_pod_ip
|
||||
target_label: __address__
|
||||
- action: labelmap
|
||||
regex: __meta_kubernetes_pod_annotation_prometheus_io_param_(.+)
|
||||
replacement: __param_$1
|
||||
- action: labelmap
|
||||
regex: __meta_kubernetes_pod_label_(.+)
|
||||
- action: replace
|
||||
source_labels:
|
||||
- __meta_kubernetes_namespace
|
||||
target_label: namespace
|
||||
- action: replace
|
||||
source_labels:
|
||||
- __meta_kubernetes_pod_name
|
||||
target_label: pod
|
||||
- action: drop
|
||||
regex: Pending|Succeeded|Failed|Completed
|
||||
source_labels:
|
||||
- __meta_kubernetes_pod_phase
|
||||
- action: replace
|
||||
source_labels:
|
||||
- __meta_kubernetes_pod_node_name
|
||||
target_label: node
|
||||
scrape_interval: 5m
|
||||
scrape_timeout: 30s
|
||||
alerting:
|
||||
alertmanagers:
|
||||
- static_configs:
|
||||
- targets:
|
||||
- alertmanager.svc.plus
|
||||
recording_rules.yml: |
|
||||
{}
|
||||
rules: |
|
||||
{}
|
||||
29
clusters/monitor/recording-rules-patch.yaml
Normal file
29
clusters/monitor/recording-rules-patch.yaml
Normal file
@ -0,0 +1,29 @@
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
annotations:
|
||||
meta.helm.sh/release-name: observability-server
|
||||
meta.helm.sh/release-namespace: monitoring
|
||||
labels:
|
||||
app.kubernetes.io/component: server
|
||||
app.kubernetes.io/instance: observability-server
|
||||
app.kubernetes.io/managed-by: Helm
|
||||
app.kubernetes.io/name: prometheus
|
||||
app.kubernetes.io/part-of: prometheus
|
||||
app.kubernetes.io/version: v2.48.1
|
||||
helm.sh/chart: prometheus-25.8.2
|
||||
name: observability-server-prometheus-server
|
||||
namespace: monitoring
|
||||
data:
|
||||
recording_rules.yml: |
|
||||
groups:
|
||||
- name: host-monitoring
|
||||
rules:
|
||||
- record: node_load1
|
||||
expr: node_load1
|
||||
- record: node_cpu_usage
|
||||
expr: 100 - (avg by (instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)
|
||||
- record: node_memory_usage
|
||||
expr: (node_memory_MemTotal_bytes - node_memory_MemFree_bytes - node_memory_Buffers_bytes - node_memory_Cached_bytes) / node_memory_MemTotal_bytes * 100
|
||||
- record: node_disk_usage
|
||||
expr: 100 - (avg by (instance) (node_filesystem_avail_bytes{fstype="ext4"} / node_filesystem_size_bytes{fstype="ext4"}) * 100)
|
||||
Loading…
Reference in New Issue
Block a user