diff --git a/clusters/monitor/alert-rules-patch.yaml b/clusters/monitor/alert-rules-patch.yaml
new file mode 100644
index 0000000..1afe66c
--- /dev/null
+++ b/clusters/monitor/alert-rules-patch.yaml
@@ -0,0 +1,64 @@
+# Strategic-merge patch that supplies data.alerting_rules.yml for the
+# observability-server-prometheus-server ConfigMap (listed in kustomization.yaml).
+# `expr` and `for` are separate keys of a rule and must sit on separate lines.
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  annotations:
+    meta.helm.sh/release-name: observability-server
+    meta.helm.sh/release-namespace: monitoring
+  labels:
+    app.kubernetes.io/component: server
+    app.kubernetes.io/instance: observability-server
+    app.kubernetes.io/managed-by: Helm
+    app.kubernetes.io/name: prometheus
+    app.kubernetes.io/part-of: prometheus
+    app.kubernetes.io/version: v2.48.1
+    helm.sh/chart: prometheus-25.8.2
+  name: observability-server-prometheus-server
+  namespace: monitoring
+data:
+  alerting_rules.yml: |
+    groups:
+      - name: Load-monitoring
+        rules:
+          - alert: HighLoad
+            expr: node_load1 > 2.0
+            for: 5m
+            labels:
+              severity: warning
+            annotations:
+              summary: High load on {{ $labels.instance }}
+              description: "Load is {{ $value }} (threshold: 2.0)"
+      - name: cpu-usage-monitoring
+        rules:
+          - alert: HighCpuUsage
+            expr: 100 - (avg by (instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 90
+            for: 5m
+            labels:
+              severity: critical
+            annotations:
+              summary: High CPU usage on {{ $labels.instance }}
+              description: "CPU usage is {{ $value }}%"
+      - name: memory-usage-monitoring
+        rules:
+          - alert: HighMemoryUsage
+            expr: (node_memory_MemTotal_bytes - node_memory_MemFree_bytes - node_memory_Buffers_bytes - node_memory_Cached_bytes) / node_memory_MemTotal_bytes * 100 > 90
+            for: 5m
+            labels:
+              severity: warning
+            annotations:
+              summary: High memory usage on {{ $labels.instance }}
+              description: "Memory usage is {{ $value }}%"
+      - name: disk-usage-monitoring
+        rules:
+          # Written as used-percent (100 - available-percent) so the reported value
+          # matches the "usage" wording below; fires when under 10% is available.
+          - alert: HighDiskUsage
+            expr: 100 - (node_filesystem_avail_bytes{fstype="ext4"} / node_filesystem_size_bytes{fstype="ext4"} * 100) > 90
+            for: 5m
+            labels:
+              severity: critical
+            annotations:
+              summary: High disk usage on {{ $labels.instance }}
+              description: "Disk usage is {{ $value }}%"
diff --git a/clusters/monitor/kustomization.yaml b/clusters/monitor/kustomization.yaml
index 433cd6d..45dc1a4 100644
--- a/clusters/monitor/kustomization.yaml
+++ b/clusters/monitor/kustomization.yaml
@@ -5,3 +5,7 @@ resources:
   - namespace.yaml
   - helmrepo.yaml
   - observability-agent.yaml
+  - prometheus-server-configmap.yaml
+patchesStrategicMerge:
+  - alert-rules-patch.yaml
+  - recording-rules-patch.yaml
diff --git a/clusters/monitor/prometheus-server-configmap.yaml b/clusters/monitor/prometheus-server-configmap.yaml
new file mode 100644
index 0000000..e85e015
--- /dev/null
+++ b/clusters/monitor/prometheus-server-configmap.yaml
@@ -0,0 +1,341 @@
+# Base ConfigMap for the Prometheus server. alerting_rules.yml and
+# recording_rules.yml are empty here; the patches in kustomization.yaml fill them.
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  annotations:
+    meta.helm.sh/release-name: observability-server
+    meta.helm.sh/release-namespace: monitoring
+  labels:
+    app.kubernetes.io/component: server
+    app.kubernetes.io/instance: observability-server
+    app.kubernetes.io/managed-by: Helm
+    app.kubernetes.io/name: prometheus
+    app.kubernetes.io/part-of: prometheus
+    app.kubernetes.io/version: v2.48.1
+    helm.sh/chart: prometheus-25.8.2
+  name: observability-server-prometheus-server
+  namespace: monitoring
+data:
+  alerting_rules.yml: |
+    {}
+  alerts: |
+    {}
+  allow-snippet-annotations: "false"
+  prometheus.yml: |
+    global:
+      evaluation_interval: 1m
+      scrape_interval: 1m
+      scrape_timeout: 10s
+    rule_files:
+    - /etc/config/recording_rules.yml
+    - /etc/config/alerting_rules.yml
+    scrape_configs:
+    - job_name: prometheus
+      static_configs:
+      - targets:
+        - localhost:9090
+    - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
+      job_name: kubernetes-apiservers
+      kubernetes_sd_configs:
+      - role: endpoints
+      relabel_configs:
+      - action: keep
+        regex: default;kubernetes;https
+        source_labels:
+        - __meta_kubernetes_namespace
+        - __meta_kubernetes_service_name
+        - __meta_kubernetes_endpoint_port_name
+      scheme: https
+      tls_config:
+        ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
+        insecure_skip_verify: true
+    - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
+      job_name: kubernetes-nodes
+      kubernetes_sd_configs:
+      - role: node
+      relabel_configs:
+      - action: labelmap
+        regex: __meta_kubernetes_node_label_(.+)
+      - replacement: kubernetes.default.svc:443
+        target_label: __address__
+      - regex: (.+)
+        replacement: /api/v1/nodes/$1/proxy/metrics
+        source_labels:
+        - __meta_kubernetes_node_name
+        target_label: __metrics_path__
+      scheme: https
+      tls_config:
+        ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
+        insecure_skip_verify: true
+    - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
+      job_name: kubernetes-nodes-cadvisor
+      kubernetes_sd_configs:
+      - role: node
+      relabel_configs:
+      - action: labelmap
+        regex: __meta_kubernetes_node_label_(.+)
+      - replacement: kubernetes.default.svc:443
+        target_label: __address__
+      - regex: (.+)
+        replacement: /api/v1/nodes/$1/proxy/metrics/cadvisor
+        source_labels:
+        - __meta_kubernetes_node_name
+        target_label: __metrics_path__
+      scheme: https
+      tls_config:
+        ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
+        insecure_skip_verify: true
+    - honor_labels: true
+      job_name: kubernetes-service-endpoints
+      kubernetes_sd_configs:
+      - role: endpoints
+      relabel_configs:
+      - action: keep
+        regex: true
+        source_labels:
+        - __meta_kubernetes_service_annotation_prometheus_io_scrape
+      - action: drop
+        regex: true
+        source_labels:
+        - __meta_kubernetes_service_annotation_prometheus_io_scrape_slow
+      - action: replace
+        regex: (https?)
+        source_labels:
+        - __meta_kubernetes_service_annotation_prometheus_io_scheme
+        target_label: __scheme__
+      - action: replace
+        regex: (.+)
+        source_labels:
+        - __meta_kubernetes_service_annotation_prometheus_io_path
+        target_label: __metrics_path__
+      - action: replace
+        regex: (.+?)(?::\d+)?;(\d+)
+        replacement: $1:$2
+        source_labels:
+        - __address__
+        - __meta_kubernetes_service_annotation_prometheus_io_port
+        target_label: __address__
+      - action: labelmap
+        regex: __meta_kubernetes_service_annotation_prometheus_io_param_(.+)
+        replacement: __param_$1
+      - action: labelmap
+        regex: __meta_kubernetes_service_label_(.+)
+      - action: replace
+        source_labels:
+        - __meta_kubernetes_namespace
+        target_label: namespace
+      - action: replace
+        source_labels:
+        - __meta_kubernetes_service_name
+        target_label: service
+      - action: replace
+        source_labels:
+        - __meta_kubernetes_pod_node_name
+        target_label: node
+    - honor_labels: true
+      job_name: kubernetes-service-endpoints-slow
+      kubernetes_sd_configs:
+      - role: endpoints
+      relabel_configs:
+      - action: keep
+        regex: true
+        source_labels:
+        - __meta_kubernetes_service_annotation_prometheus_io_scrape_slow
+      - action: replace
+        regex: (https?)
+        source_labels:
+        - __meta_kubernetes_service_annotation_prometheus_io_scheme
+        target_label: __scheme__
+      - action: replace
+        regex: (.+)
+        source_labels:
+        - __meta_kubernetes_service_annotation_prometheus_io_path
+        target_label: __metrics_path__
+      - action: replace
+        regex: (.+?)(?::\d+)?;(\d+)
+        replacement: $1:$2
+        source_labels:
+        - __address__
+        - __meta_kubernetes_service_annotation_prometheus_io_port
+        target_label: __address__
+      - action: labelmap
+        regex: __meta_kubernetes_service_annotation_prometheus_io_param_(.+)
+        replacement: __param_$1
+      - action: labelmap
+        regex: __meta_kubernetes_service_label_(.+)
+      - action: replace
+        source_labels:
+        - __meta_kubernetes_namespace
+        target_label: namespace
+      - action: replace
+        source_labels:
+        - __meta_kubernetes_service_name
+        target_label: service
+      - action: replace
+        source_labels:
+        - __meta_kubernetes_pod_node_name
+        target_label: node
+      scrape_interval: 5m
+      scrape_timeout: 30s
+    - honor_labels: true
+      job_name: prometheus-pushgateway
+      kubernetes_sd_configs:
+      - role: service
+      relabel_configs:
+      - action: keep
+        regex: pushgateway
+        source_labels:
+        - __meta_kubernetes_service_annotation_prometheus_io_probe
+    - honor_labels: true
+      job_name: kubernetes-services
+      kubernetes_sd_configs:
+      - role: service
+      metrics_path: /probe
+      params:
+        module:
+        - http_2xx
+      relabel_configs:
+      - action: keep
+        regex: true
+        source_labels:
+        - __meta_kubernetes_service_annotation_prometheus_io_probe
+      - source_labels:
+        - __address__
+        target_label: __param_target
+      - replacement: blackbox
+        target_label: __address__
+      - source_labels:
+        - __param_target
+        target_label: instance
+      - action: labelmap
+        regex: __meta_kubernetes_service_label_(.+)
+      - source_labels:
+        - __meta_kubernetes_namespace
+        target_label: namespace
+      - source_labels:
+        - __meta_kubernetes_service_name
+        target_label: service
+    - honor_labels: true
+      job_name: kubernetes-pods
+      kubernetes_sd_configs:
+      - role: pod
+      relabel_configs:
+      - action: keep
+        regex: true
+        source_labels:
+        - __meta_kubernetes_pod_annotation_prometheus_io_scrape
+      - action: drop
+        regex: true
+        source_labels:
+        - __meta_kubernetes_pod_annotation_prometheus_io_scrape_slow
+      - action: replace
+        regex: (https?)
+        source_labels:
+        - __meta_kubernetes_pod_annotation_prometheus_io_scheme
+        target_label: __scheme__
+      - action: replace
+        regex: (.+)
+        source_labels:
+        - __meta_kubernetes_pod_annotation_prometheus_io_path
+        target_label: __metrics_path__
+      - action: replace
+        regex: (\d+);(([A-Fa-f0-9]{1,4}::?){1,7}[A-Fa-f0-9]{1,4})
+        replacement: '[$2]:$1'
+        source_labels:
+        - __meta_kubernetes_pod_annotation_prometheus_io_port
+        - __meta_kubernetes_pod_ip
+        target_label: __address__
+      - action: replace
+        regex: (\d+);((([0-9]+?)(\.|$)){4})
+        replacement: $2:$1
+        source_labels:
+        - __meta_kubernetes_pod_annotation_prometheus_io_port
+        - __meta_kubernetes_pod_ip
+        target_label: __address__
+      - action: labelmap
+        regex: __meta_kubernetes_pod_annotation_prometheus_io_param_(.+)
+        replacement: __param_$1
+      - action: labelmap
+        regex: __meta_kubernetes_pod_label_(.+)
+      - action: replace
+        source_labels:
+        - __meta_kubernetes_namespace
+        target_label: namespace
+      - action: replace
+        source_labels:
+        - __meta_kubernetes_pod_name
+        target_label: pod
+      - action: drop
+        regex: Pending|Succeeded|Failed|Completed
+        source_labels:
+        - __meta_kubernetes_pod_phase
+      - action: replace
+        source_labels:
+        - __meta_kubernetes_pod_node_name
+        target_label: node
+    - honor_labels: true
+      job_name: kubernetes-pods-slow
+      kubernetes_sd_configs:
+      - role: pod
+      relabel_configs:
+      - action: keep
+        regex: true
+        source_labels:
+        - __meta_kubernetes_pod_annotation_prometheus_io_scrape_slow
+      - action: replace
+        regex: (https?)
+        source_labels:
+        - __meta_kubernetes_pod_annotation_prometheus_io_scheme
+        target_label: __scheme__
+      - action: replace
+        regex: (.+)
+        source_labels:
+        - __meta_kubernetes_pod_annotation_prometheus_io_path
+        target_label: __metrics_path__
+      - action: replace
+        regex: (\d+);(([A-Fa-f0-9]{1,4}::?){1,7}[A-Fa-f0-9]{1,4})
+        replacement: '[$2]:$1'
+        source_labels:
+        - __meta_kubernetes_pod_annotation_prometheus_io_port
+        - __meta_kubernetes_pod_ip
+        target_label: __address__
+      - action: replace
+        regex: (\d+);((([0-9]+?)(\.|$)){4})
+        replacement: $2:$1
+        source_labels:
+        - __meta_kubernetes_pod_annotation_prometheus_io_port
+        - __meta_kubernetes_pod_ip
+        target_label: __address__
+      - action: labelmap
+        regex: __meta_kubernetes_pod_annotation_prometheus_io_param_(.+)
+        replacement: __param_$1
+      - action: labelmap
+        regex: __meta_kubernetes_pod_label_(.+)
+      - action: replace
+        source_labels:
+        - __meta_kubernetes_namespace
+        target_label: namespace
+      - action: replace
+        source_labels:
+        - __meta_kubernetes_pod_name
+        target_label: pod
+      - action: drop
+        regex: Pending|Succeeded|Failed|Completed
+        source_labels:
+        - __meta_kubernetes_pod_phase
+      - action: replace
+        source_labels:
+        - __meta_kubernetes_pod_node_name
+        target_label: node
+      scrape_interval: 5m
+      scrape_timeout: 30s
+    alerting:
+      alertmanagers:
+      - static_configs:
+        - targets:
+          - alertmanager.svc.plus
+  recording_rules.yml: |
+    {}
+  rules: |
+    {}
diff --git a/clusters/monitor/recording-rules-patch.yaml b/clusters/monitor/recording-rules-patch.yaml
new file mode 100644
index 0000000..f1a1f6c
--- /dev/null
+++ b/clusters/monitor/recording-rules-patch.yaml
@@ -0,0 +1,33 @@
+# Strategic-merge patch that supplies data.recording_rules.yml for the
+# observability-server-prometheus-server ConfigMap (listed in kustomization.yaml).
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  annotations:
+    meta.helm.sh/release-name: observability-server
+    meta.helm.sh/release-namespace: monitoring
+  labels:
+    app.kubernetes.io/component: server
+    app.kubernetes.io/instance: observability-server
+    app.kubernetes.io/managed-by: Helm
+    app.kubernetes.io/name: prometheus
+    app.kubernetes.io/part-of: prometheus
+    app.kubernetes.io/version: v2.48.1
+    helm.sh/chart: prometheus-25.8.2
+  name: observability-server-prometheus-server
+  namespace: monitoring
+data:
+  recording_rules.yml: |
+    groups:
+      - name: host-monitoring
+        rules:
+          # Recorded under its own name: recording `node_load1` as `node_load1`
+          # would write the rule output back into the scraped series.
+          - record: instance:node_load1:avg
+            expr: avg by (instance) (node_load1)
+          - record: node_cpu_usage
+            expr: 100 - (avg by (instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)
+          - record: node_memory_usage
+            expr: (node_memory_MemTotal_bytes - node_memory_MemFree_bytes - node_memory_Buffers_bytes - node_memory_Cached_bytes) / node_memory_MemTotal_bytes * 100
+          - record: node_disk_usage
+            expr: 100 - (avg by (instance) (node_filesystem_avail_bytes{fstype="ext4"} / node_filesystem_size_bytes{fstype="ext4"}) * 100)