diff --git a/README.md b/README.md index c2cb86a..ef84c82 100644 --- a/README.md +++ b/README.md @@ -31,6 +31,72 @@ flowchart LR ## 3) Start +当前推荐按“混合部署到已有主机”的方式执行。 + +1. 先更新 DNS,把 `observability.svc.plus` 指到 `us-xhttp.svc.plus` +2. 在 `us-xhttp.svc.plus` 上执行下面的 Server side 示例,部署中心端 +3. 再到其他已有主机执行下面的 Client side 示例,把采集数据回传到 `observability.svc.plus` + +当前接入主机: + +- `us-xhttp.svc.plus`:继续承载现有服务,同时承载 `observability.svc.plus` +- `openclaw.svc.plus`:部署 agent,采集后上报到中心端 +- `jp-xhttp.svc.plus`:部署 agent,采集后上报到中心端 + +### Ansible (Recommended) + +#### Server side + +先导出 Cloudflare Token,然后在 `us-xhttp.svc.plus` 上执行服务端部署。`deploy_observability_service.yml` 会先把 Cloudflare 上的 `observability.svc.plus` 更新成指向 `us-xhttp.svc.plus` 的非代理记录,再等待公共 DNS 生效后继续部署,这样更容易保证 Caddy 首次自动签名成功。 + +```bash +export CLOUDFLARE_API_TOKEN=... +ansible-playbook -i deploy_observability_service.yml -l us-xhttp.svc.plus +``` + +如果希望给 `/ingest/*` 增加一层基础认证,可以在服务端部署时一起打开: + +```bash +export CLOUDFLARE_API_TOKEN=... +ansible-playbook -i deploy_observability_service.yml -l us-xhttp.svc.plus \ + -e observability_ingest_basic_auth_enabled=true \ + -e observability_ingest_basic_auth_user=ingest \ + -e observability_ingest_basic_auth_password='' +``` + +#### Client side (agent) + +再到采集端主机执行 `node.yml` 的 push mode: + +```bash +ansible-playbook -i node.yml \ + -l openclaw.svc.plus,jp-xhttp.svc.plus \ + -e node_monitor_mode=push \ + -e observability_endpoint=https://observability.svc.plus/ \ + -e haproxy_enabled=false +``` + +如果服务端已开启 ingest 基本认证,采集端也要带上同一组凭据: + +```bash +ansible-playbook -i node.yml \ + -l openclaw.svc.plus,jp-xhttp.svc.plus \ + -e node_monitor_mode=push \ + -e observability_endpoint=https://observability.svc.plus/ \ + -e observability_ingest_basic_auth_enabled=true \ + -e observability_ingest_basic_auth_user=ingest \ + -e observability_ingest_basic_auth_password='' \ + -e haproxy_enabled=false +``` + +> `node_monitor_mode=push` 会在远端主机上部署 `node_exporter + process_exporter + vector`,并把 metrics / logs 主动汇总到 `observability.svc.plus`。`vector` 固定归到采集端任务,服务端 `infra.yml` 不再默认部署它。 +> +> 如果采集端与 Victoria 服务端同机,playbook 会自动把 metrics / logs 改走本机 `127.0.0.1` ingest;跨主机时默认走 `https://observability.svc.plus/` 并自动补全 `/ingest/metrics/api/v1/write` 和 `/ingest/logs/insert`。 +> +> `observability_ingest_basic_auth_*` 只保护 `/ingest/*` 写入入口,不影响 Caddy 暴露的其他站点页面;服务端和采集端必须使用同一组认证信息。 + +### Script Installers + ### Server side ```bash @@ -91,10 +157,18 @@ vi pigsty.yml # adjust domain/password/ports Default inventory template: `conf/app/deepflow.yml` -### Remote client example (clawdbot.svc.plus) +### Remote client example (openclaw.svc.plus) ```bash -ssh root@clawdbot.svc.plus \ +ssh root@openclaw.svc.plus \ + 'curl -fsSL https://raw.githubusercontent.com/cloud-neutral-toolkit/observability.svc.plus/main/scripts/agent-install.sh \ + | bash -s -- --endpoint https://observability.svc.plus/ingest/otlp' +``` + +### Remote client example (jp-xhttp.svc.plus) + +```bash +ssh root@jp-xhttp.svc.plus \ 'curl -fsSL https://raw.githubusercontent.com/cloud-neutral-toolkit/observability.svc.plus/main/scripts/agent-install.sh \ | bash -s -- --endpoint https://observability.svc.plus/ingest/otlp' ``` @@ -102,11 +176,11 @@ ssh root@clawdbot.svc.plus \ ### Optional SSH manager env example ```bash -SSH_SERVER_CLAWBOT_HOST=clawdbot.svc.plus +SSH_SERVER_CLAWBOT_HOST=openclaw.svc.plus SSH_SERVER_CLAWBOT_USER=root SSH_SERVER_CLAWBOT_KEYPATH=~/.ssh/id_rsa SSH_SERVER_CLAWBOT_PORT=22 -SSH_SERVER_CLAWBOT_DESCRIPTION=clawdbot_server +SSH_SERVER_CLAWBOT_DESCRIPTION=openclaw_server ``` ## 4) Features diff --git a/conf/app/deepflow.yml b/conf/deepflow/deepflow.yml similarity index 100% rename from conf/app/deepflow.yml rename to conf/deepflow/deepflow.yml diff --git a/conf/app/dify.yml b/conf/deepflow/dify.yml similarity index 100% rename from conf/app/dify.yml rename to conf/deepflow/dify.yml diff --git a/conf/app/electric.yml b/conf/deepflow/electric.yml similarity index 100% rename from conf/app/electric.yml rename to conf/deepflow/electric.yml diff --git a/conf/app/maybe.yml b/conf/deepflow/maybe.yml similarity index 100% rename from conf/app/maybe.yml rename to conf/deepflow/maybe.yml diff --git a/conf/app/odoo.yml b/conf/deepflow/odoo.yml similarity index 100% rename from conf/app/odoo.yml rename to conf/deepflow/odoo.yml diff --git a/conf/app/registry.yml b/conf/deepflow/registry.yml similarity index 100% rename from conf/app/registry.yml rename to conf/deepflow/registry.yml diff --git a/conf/app/supa.yml b/conf/deepflow/supa.yml similarity index 100% rename from conf/app/supa.yml rename to conf/deepflow/supa.yml diff --git a/conf/app/teable.yml b/conf/deepflow/teable.yml similarity index 100% rename from conf/app/teable.yml rename to conf/deepflow/teable.yml diff --git a/deploy_observability_service.yml b/deploy_observability_service.yml new file mode 100644 index 0000000..e4132cd --- /dev/null +++ b/deploy_observability_service.yml @@ -0,0 +1,147 @@ +--- +- name: Update Cloudflare DNS for observability.svc.plus + hosts: localhost + connection: local + gather_facts: false + vars: + cloudflare_zone_name: svc.plus + cloudflare_api_base: https://api.cloudflare.com/client/v4 + observability_domain: observability.svc.plus + observability_dns_target: us-xhttp.svc.plus + observability_dns_type: CNAME + observability_dns_ttl: 1 + observability_dns_proxied: false + dns_wait_retries: 30 + dns_wait_delay: 10 + tasks: + - name: Validate Cloudflare token is present in environment + ansible.builtin.assert: + that: + - lookup('ansible.builtin.env', 'CLOUDFLARE_API_TOKEN') | length > 0 + fail_msg: "CLOUDFLARE_API_TOKEN must be exported before running this playbook." + + - name: Resolve Cloudflare zone id + ansible.builtin.uri: + url: "{{ cloudflare_api_base }}/zones?name={{ cloudflare_zone_name }}" + method: GET + headers: + Authorization: "Bearer {{ lookup('ansible.builtin.env', 'CLOUDFLARE_API_TOKEN') }}" + Content-Type: application/json + return_content: true + register: cloudflare_zone_lookup + + - name: Validate zone lookup result + ansible.builtin.assert: + that: + - cloudflare_zone_lookup.json.success + - cloudflare_zone_lookup.json.result | length > 0 + fail_msg: "Unable to resolve Cloudflare zone id for {{ cloudflare_zone_name }}." + + - name: Set Cloudflare zone id + ansible.builtin.set_fact: + cloudflare_zone_id: "{{ cloudflare_zone_lookup.json.result[0].id }}" + + - name: Query existing observability DNS records + ansible.builtin.uri: + url: "{{ cloudflare_api_base }}/zones/{{ cloudflare_zone_id }}/dns_records?name={{ observability_domain }}" + method: GET + headers: + Authorization: "Bearer {{ lookup('ansible.builtin.env', 'CLOUDFLARE_API_TOKEN') }}" + Content-Type: application/json + return_content: true + register: observability_dns_records + + - name: Remove conflicting observability DNS records with different type + ansible.builtin.uri: + url: "{{ cloudflare_api_base }}/zones/{{ cloudflare_zone_id }}/dns_records/{{ item.id }}" + method: DELETE + headers: + Authorization: "Bearer {{ lookup('ansible.builtin.env', 'CLOUDFLARE_API_TOKEN') }}" + Content-Type: application/json + loop: "{{ observability_dns_records.json.result | default([]) }}" + loop_control: + label: "{{ item.type }} {{ item.name }}" + when: item.type != observability_dns_type + + - name: Create observability DNS record when missing + ansible.builtin.uri: + url: "{{ cloudflare_api_base }}/zones/{{ cloudflare_zone_id }}/dns_records" + method: POST + headers: + Authorization: "Bearer {{ lookup('ansible.builtin.env', 'CLOUDFLARE_API_TOKEN') }}" + Content-Type: application/json + body_format: raw + body: >- + {{ + { + 'type': observability_dns_type, + 'name': observability_domain, + 'content': observability_dns_target, + 'ttl': (observability_dns_ttl | int), + 'proxied': (observability_dns_proxied | bool) + } | to_json + }} + when: (observability_dns_records.json.result | selectattr('type', 'equalto', observability_dns_type) | list | length) == 0 + + - name: Update observability DNS record when target changes + ansible.builtin.uri: + url: "{{ cloudflare_api_base }}/zones/{{ cloudflare_zone_id }}/dns_records/{{ (observability_dns_records.json.result | selectattr('type', 'equalto', observability_dns_type) | list | first).id }}" + method: PUT + headers: + Authorization: "Bearer {{ lookup('ansible.builtin.env', 'CLOUDFLARE_API_TOKEN') }}" + Content-Type: application/json + body_format: raw + body: >- + {{ + { + 'type': observability_dns_type, + 'name': observability_domain, + 'content': observability_dns_target, + 'ttl': (observability_dns_ttl | int), + 'proxied': (observability_dns_proxied | bool) + } | to_json + }} + when: + - (observability_dns_records.json.result | selectattr('type', 'equalto', observability_dns_type) | list | length) > 0 + - > + ((observability_dns_records.json.result | selectattr('type', 'equalto', observability_dns_type) | list | first).content != observability_dns_target) + or + (((observability_dns_records.json.result | selectattr('type', 'equalto', observability_dns_type) | list | first).proxied | default(false)) != observability_dns_proxied) + + - name: Wait for public DNS to expose observability CNAME + ansible.builtin.uri: + url: "https://cloudflare-dns.com/dns-query?name={{ observability_domain }}&type=CNAME" + method: GET + headers: + Accept: application/dns-json + return_content: true + register: observability_dns_public + until: + - observability_dns_public.status == 200 + - > + ( + observability_dns_public.json.Status + if (observability_dns_public.json is defined) + else ((observability_dns_public.content | from_json).Status | default(1)) + ) == 0 + - > + ( + observability_dns_public.json.Answer + if (observability_dns_public.json is defined) + else ((observability_dns_public.content | from_json).Answer | default([])) + ) | selectattr('data', 'equalto', observability_dns_target ~ '.') + | list | length > 0 + retries: "{{ dns_wait_retries }}" + delay: "{{ dns_wait_delay }}" + + - name: Show effective observability DNS target + ansible.builtin.debug: + msg: "{{ observability_domain }} -> {{ observability_dns_target }} proxied={{ observability_dns_proxied }}" + +- import_playbook: infra.yml + vars: + infra_domain: observability.svc.plus + infra_portal: + home: { domain: observability.svc.plus } + caddy_enabled: true + nginx_enabled: false diff --git a/infra.yml b/infra.yml index 18e2225..dd47c14 100755 --- a/infra.yml +++ b/infra.yml @@ -103,4 +103,13 @@ # - add_logs : register infra as vector logging source # - add_ds : register infra victoria stack as grafana datasource #--------------------------------------------------------------# -... \ No newline at end of file +# Mixed Existing-Host Deployment +#--------------------------------------------------------------# +# Center service example: +# ./infra.yml -l us-xhttp.svc.plus \ +# -e infra_domain=observability.svc.plus \ +# -e 'infra_portal={\"home\":{\"domain\":\"observability.svc.plus\"}}' \ +# -e caddy_enabled=true \ +# -e nginx_enabled=false +#--------------------------------------------------------------# +... diff --git a/node.yml b/node.yml index 47c79d4..67ec72c 100755 --- a/node.yml +++ b/node.yml @@ -32,6 +32,12 @@ # node.yml -l # add groups # node.yml -l # add single node # +# Observability push-agent mode: +# ./node.yml -l openclaw.svc.plus,jp-xhttp.svc.plus \ +# -e node_monitor_mode=push \ +# -e observability_endpoint=https://observability.svc.plus/ \ +# -e haproxy_enabled=false +# # Bootstrap with another admin user: (Create admin with another admin) # node.yml -t node_admin # create admin user for nodes # node.yml -t node_admin -k -K -e ansible_user= @@ -112,4 +118,4 @@ # - vector_config # - vector_launch #--------------------------------------------------------------- -... \ No newline at end of file +... diff --git a/roles/infra/defaults/main.yml b/roles/infra/defaults/main.yml index 7401882..2dae1b8 100644 --- a/roles/infra/defaults/main.yml +++ b/roles/infra/defaults/main.yml @@ -15,6 +15,10 @@ proxy_env: { no_proxy: "localhost,127.0.0.1,10.0.0.0/8,192.168.0.0/16,*.aliyun.c infra_portal: # infra services exposed via portal home : { domain: i.observability } # default home server definition infra_domain: observability.svc.plus +observability_ingest_basic_auth_enabled: false +observability_ingest_basic_auth_user: ingest +observability_ingest_basic_auth_password: '' +observability_ingest_basic_auth_password_hash: '' infra_data: /data/infra # default data path for infrastructure data infra_services: # home page navigation entries - { name: Metrics ,url: '/vmetrics/vmui/' ,desc: 'VictoriaMetrics Query UI' ,icon: 'metrics' ,name_cn: '指标查询' ,desc_cn: 'VictoriaMetrics 指标查询界面' } @@ -60,7 +64,7 @@ certbot_options: '' # certbot extra options #----------------------------------------------------------------- # DNS #----------------------------------------------------------------- -dns_enabled: true # setup dnsmasq on this infra node? +dns_enabled: false # setup dnsmasq on this infra node? dns_port: 53 # dns server listen port, 53 by default dns_records: # dynamic dns records resolved by dnsmasq - "${admin_ip} i.pigsty" diff --git a/roles/infra/tasks/caddy.yml b/roles/infra/tasks/caddy.yml index de44aeb..28a04e3 100644 --- a/roles/infra/tasks/caddy.yml +++ b/roles/infra/tasks/caddy.yml @@ -21,10 +21,41 @@ tags: caddy_config template: src: caddy/Caddyfile + dest: /etc/caddy/conf.d/observability.caddy + owner: root + group: root + mode: '0644' + notify: reload caddy + +- name: check existing caddy main config + tags: caddy_config + stat: + path: /etc/caddy/Caddyfile + register: caddy_main_config + +- name: bootstrap caddy main config when missing + tags: caddy_config + copy: dest: /etc/caddy/Caddyfile owner: root group: root mode: '0644' + content: | + { + } + + import /etc/caddy/conf.d/*.caddy + when: not caddy_main_config.stat.exists + notify: reload caddy + +- name: ensure caddy main config imports conf.d snippets + tags: caddy_config + lineinfile: + path: /etc/caddy/Caddyfile + line: "import /etc/caddy/conf.d/*.caddy" + insertafter: EOF + state: present + when: caddy_main_config.stat.exists notify: reload caddy #--------------------------------------------------------------# diff --git a/roles/infra/tasks/main.yml b/roles/infra/tasks/main.yml index 616cbd0..b2dff64 100644 --- a/roles/infra/tasks/main.yml +++ b/roles/infra/tasks/main.yml @@ -1,4 +1,27 @@ --- +#--------------------------------------------------------------# +# 0. Validate optional ingest auth [auth] +#--------------------------------------------------------------# +- name: validate observability ingest basic auth inputs + tags: auth + when: observability_ingest_basic_auth_enabled | default(false) | bool + assert: + that: + - observability_ingest_basic_auth_user | default('', true) | length > 0 + - observability_ingest_basic_auth_password | default('', true) | length > 0 or observability_ingest_basic_auth_password_hash | default('', true) | length > 0 + fail_msg: "When observability_ingest_basic_auth_enabled=true, set observability_ingest_basic_auth_user and either observability_ingest_basic_auth_password or observability_ingest_basic_auth_password_hash." + +- name: build effective observability ingest password hash + tags: auth + when: observability_ingest_basic_auth_enabled | default(false) | bool + set_fact: + observability_ingest_basic_auth_password_hash_effective: >- + {{ + observability_ingest_basic_auth_password_hash + if (observability_ingest_basic_auth_password_hash | default('', true) | length > 0) + else (observability_ingest_basic_auth_password | password_hash('bcrypt')) + }} + #--------------------------------------------------------------# # 1. Infra User [infra_user] #--------------------------------------------------------------# @@ -40,6 +63,7 @@ #--------------------------------------------------------------# # dns_config, dns_record, dns_launch - import_tasks: dns.yml + when: dns_enabled|bool tags: dns #--------------------------------------------------------------# @@ -97,4 +121,4 @@ - import_tasks: register.yml tags: infra_register -... \ No newline at end of file +... diff --git a/roles/infra/tasks/nginx.yml b/roles/infra/tasks/nginx.yml index 7af6356..e37939b 100644 --- a/roles/infra/tasks/nginx.yml +++ b/roles/infra/tasks/nginx.yml @@ -268,7 +268,7 @@ - name: setup nginx exporter ignore_errors: true tags: nginx_exporter - when: nginx_exporter_enabled|bool + when: nginx_enabled|bool and nginx_exporter_enabled|bool block: - name: copy nginx_exporter systemd service template: src=nginx/nginx_exporter.svc dest={{ systemd_dir }}/nginx_exporter.service owner=root group=root mode='0644' diff --git a/roles/infra/templates/caddy/Caddyfile b/roles/infra/templates/caddy/Caddyfile index d75334b..4c7d537 100644 --- a/roles/infra/templates/caddy/Caddyfile +++ b/roles/infra/templates/caddy/Caddyfile @@ -1,10 +1,13 @@ -{ - # debug -} - {{ infra_domain | default('observability.svc.plus') }} { encode gzip zstd +{% if observability_ingest_basic_auth_enabled | default(false) %} + @observability_ingest path /ingest/* + basic_auth @observability_ingest { + {{ observability_ingest_basic_auth_user }} {{ observability_ingest_basic_auth_password_hash_effective | default(observability_ingest_basic_auth_password_hash) }} + } +{% endif %} + # ---- Alloy unified ingest endpoints ---- # Prometheus remote_write diff --git a/roles/node/defaults/main.yml b/roles/node/defaults/main.yml index 01cbdb1..aaa5f76 100644 --- a/roles/node/defaults/main.yml +++ b/roles/node/defaults/main.yml @@ -107,7 +107,7 @@ proxy_env: # global proxy env when downloading packages #----------------------------------------------------------------- # NGINX (Reference) #----------------------------------------------------------------- -nginx_enabled: true # enable nginx on this infra node? +nginx_enabled: false # enable nginx on this infra node? nginx_home: /www # nginx content dir, `/www` by default (soft link to nginx_data) nginx_data: /data/nginx # nginx actual data dir, /data/nginx by default #----------------------------------------------------------------- @@ -119,4 +119,4 @@ repo_name: pigsty # repo name, pigsty by default # CA (Reference) #----------------------------------------------------------------- cert_validity: 7300d # cert validity, 20 years by default -... \ No newline at end of file +... diff --git a/roles/node_monitor/defaults/main.yml b/roles/node_monitor/defaults/main.yml index dae0ce6..9245e06 100644 --- a/roles/node_monitor/defaults/main.yml +++ b/roles/node_monitor/defaults/main.yml @@ -8,13 +8,30 @@ node_exporter_options: '--no-collector.softnet --no-collector.nvme --collector.t #--------------------------------------------------------------# # VECTOR #--------------------------------------------------------------# -vector_enabled: true # enable vector log collector? +node_monitor_mode: pull # pull: central scrape/register, push: remote agent pushes to observability endpoint +observability_endpoint: '' # base endpoint, e.g. https://observability.svc.plus/ingest/otlp +observability_metrics_endpoint: '' # optional override for remote_write endpoint +observability_logs_endpoint: '' # optional override for logs endpoint +observability_ingest_basic_auth_enabled: false +observability_ingest_basic_auth_user: ingest +observability_ingest_basic_auth_password: '' +vector_enabled: false # enable vector log collector? push mode enables it automatically vector_clean: false # purge vector data dir during init? vector_data: /data/vector # vector data dir, /data/vector by default vector_port: 9598 # vector metrics port, 9598 by default vector_read_from: beginning # vector read from beginning or end vector_log_endpoint: [ infra ] # if defined, sending vector log to this endpoint. +#--------------------------------------------------------------# +# PROCESS EXPORTER +#--------------------------------------------------------------# +process_exporter_enabled: false # enable process_exporter, automatically recommended for push mode +process_exporter_version: 0.7.10 +process_exporter_port: 9256 +process_exporter_binary: /usr/local/bin/process-exporter +process_exporter_config_dir: /etc/process-exporter +process_exporter_config_file: /etc/process-exporter/process-exporter.yml + #----------------------------------------------------------------- # NODE_VIP (Reference) #----------------------------------------------------------------- @@ -45,4 +62,4 @@ pg_log_dir: /pg/log/postgres # postgres log dir, `/pg/log/postgres` by defa patroni_log_dir: /pg/log/patroni # patroni log dir, `/pg/log/patroni` by default pgbouncer_log_dir: /pg/log/pgbouncer # pgbouncer log dir, `/pg/log/pgbouncer` by default pgbackrest_log_dir: /pg/log/pgbackrest # pgbackrest log dir, `/pg/log/pgbackrest` by default -... \ No newline at end of file +... diff --git a/roles/node_monitor/tasks/main.yml b/roles/node_monitor/tasks/main.yml index b90bc7c..53e838e 100644 --- a/roles/node_monitor/tasks/main.yml +++ b/roles/node_monitor/tasks/main.yml @@ -45,6 +45,109 @@ loop: '{{ groups["infra"]|default([]) }}' systemd: name=nginx state=reloaded enabled=yes daemon_reload=yes +- name: validate observability push mode inputs + tags: [monitor, vector, process_exporter] + when: node_monitor_mode | default('pull') == 'push' + assert: + that: + - observability_endpoint | default('', true) | length > 0 or (observability_metrics_endpoint | default('', true) | length > 0 and observability_logs_endpoint | default('', true) | length > 0) + fail_msg: "Set observability_endpoint, or set both observability_metrics_endpoint and observability_logs_endpoint, when node_monitor_mode=push." + +- name: validate observability push mode basic auth inputs + tags: [monitor, vector, process_exporter, auth] + when: + - node_monitor_mode | default('pull') == 'push' + - observability_ingest_basic_auth_enabled | default(false) | bool + assert: + that: + - observability_ingest_basic_auth_user | default('', true) | length > 0 + - observability_ingest_basic_auth_password | default('', true) | length > 0 + fail_msg: "When observability_ingest_basic_auth_enabled=true in push mode, set observability_ingest_basic_auth_user and observability_ingest_basic_auth_password." + +- name: derive observability collector host for push mode + tags: [monitor, vector, process_exporter] + when: node_monitor_mode | default('pull') == 'push' + set_fact: + observability_collector_host: >- + {{ + ( + observability_endpoint + if (observability_endpoint | default('', true) | length > 0) + else ( + observability_metrics_endpoint + if (observability_metrics_endpoint | default('', true) | length > 0) + else observability_logs_endpoint + ) + ) + | regex_replace('^[A-Za-z][A-Za-z0-9+.-]*://', '') + | regex_replace('/.*$', '') + | regex_replace(':.*$', '') + }} + +- name: detect whether observability collector is local + tags: [monitor, vector, process_exporter] + when: node_monitor_mode | default('pull') == 'push' + shell: | + set -eu + collector_host="{{ observability_collector_host }}" + + if [ -z "${collector_host}" ]; then + exit 1 + fi + + matches_local_name() { + local candidate="$1" + [ -n "${candidate}" ] && [ "${collector_host}" = "${candidate}" ] + } + + if matches_local_name "{{ inventory_hostname }}"; then + exit 0 + fi + + if matches_local_name "{{ nodename | default('', true) }}"; then + exit 0 + fi + + if matches_local_name "$(hostname -f 2>/dev/null || hostname 2>/dev/null || true)"; then + exit 0 + fi + + if matches_local_name "$(hostname -s 2>/dev/null || true)"; then + exit 0 + fi + + local_ips="$( + { + hostname -I 2>/dev/null || true + ip -o -4 addr show scope global 2>/dev/null | awk '{print $4}' | cut -d/ -f1 + } | tr ' ' '\n' | sed '/^$/d' | sort -u + )" + + resolved_ips="$( + { + getent ahostsv4 "${collector_host}" 2>/dev/null | awk '{print $1}' || true + host "${collector_host}" 2>/dev/null | awk '/has address/ {print $4}' || true + } | sed '/^$/d' | sort -u + )" + + [ -n "${local_ips}" ] || exit 1 + [ -n "${resolved_ips}" ] || exit 1 + + if comm -12 <(printf '%s\n' "${local_ips}") <(printf '%s\n' "${resolved_ips}") | grep -q .; then + exit 0 + fi + + exit 1 + args: { executable: /bin/bash } + register: observability_collector_local_check + changed_when: false + failed_when: false + +- name: expose observability collector locality + tags: [monitor, vector, process_exporter] + when: node_monitor_mode | default('pull') == 'push' + set_fact: + observability_collector_is_local: "{{ observability_collector_local_check.rc == 0 }}" #--------------------------------------------------------------# # Register Instance DNS Name [vip_dns] @@ -140,6 +243,7 @@ # /infra/targets/node/{{ ip }}.yml - name: register node as victoria target tags: [ node_vip, node_register, register, add_metrics ] + when: node_monitor_mode | default('pull') != 'push' ignore_errors: true delegate_to: '{{ item }}' loop: '{{ groups["infra"]|default([]) }}' @@ -168,6 +272,7 @@ - name: register node as ping target tags: [ node_register, register, add_metrics ] + when: node_monitor_mode | default('pull') != 'push' ignore_errors: true delegate_to: '{{ item }}' loop: '{{ groups["infra"]|default([]) }}' @@ -183,7 +288,7 @@ - name: register node vip as ping target tags: [ node_vip, node_register, register, add_metrics ] - when: vip_enabled|bool and vip_address is defined and vip_address != '' + when: node_monitor_mode | default('pull') != 'push' and vip_enabled|bool and vip_address is defined and vip_address != '' ignore_errors: true delegate_to: '{{ item }}' loop: '{{ groups["infra"]|default([]) }}' @@ -197,11 +302,14 @@ - labels: { ip: {{ inventory_hostname }} , ins: {{ nodename }} , cls: {{ node_cluster|default('nodes') }}, vip: {{ vip_address }} , job: node-vip } targets: [ {{ vip_address }} ] +- import_tasks: process_exporter.yml + tags: process_exporter + when: process_exporter_enabled | default(false) | bool or node_monitor_mode | default('pull') == 'push' #--------------------------------------------------------------# # Vector [vector] #--------------------------------------------------------------# - import_tasks: vector.yml tags: vector - when: vector_enabled|bool + when: vector_enabled|bool or node_monitor_mode | default('pull') == 'push' ... diff --git a/roles/node_monitor/tasks/process_exporter.yml b/roles/node_monitor/tasks/process_exporter.yml new file mode 100644 index 0000000..6657418 --- /dev/null +++ b/roles/node_monitor/tasks/process_exporter.yml @@ -0,0 +1,90 @@ +--- +#--------------------------------------------------------------# +# Install process_exporter [process_exporter_install] +#--------------------------------------------------------------# +- name: detect process_exporter architecture + tags: [process_exporter, process_exporter_install] + command: uname -m + register: process_exporter_uname + changed_when: false + +- name: map process_exporter architecture + tags: [process_exporter, process_exporter_install] + set_fact: + process_exporter_arch: >- + {% if process_exporter_uname.stdout == 'x86_64' %}amd64{% elif process_exporter_uname.stdout in ['aarch64', 'arm64'] %}arm64{% else %}{% endif %} + +- name: validate process_exporter architecture + tags: [process_exporter, process_exporter_install] + assert: + that: + - process_exporter_arch | length > 0 + fail_msg: "Unsupported process_exporter architecture: {{ process_exporter_uname.stdout }}" + +- name: ensure process_exporter config directory exists + tags: [process_exporter, process_exporter_config] + file: + path: "{{ process_exporter_config_dir }}" + state: directory + owner: root + group: root + mode: '0755' + +- name: download process_exporter release archive + tags: [process_exporter, process_exporter_install] + get_url: + url: "https://github.com/ncabatoff/process-exporter/releases/download/v{{ process_exporter_version }}/process-exporter-{{ process_exporter_version }}.linux-{{ process_exporter_arch }}.tar.gz" + dest: "/tmp/process-exporter-{{ process_exporter_version }}.linux-{{ process_exporter_arch }}.tar.gz" + mode: '0644' + +- name: extract process_exporter release archive + tags: [process_exporter, process_exporter_install] + unarchive: + src: "/tmp/process-exporter-{{ process_exporter_version }}.linux-{{ process_exporter_arch }}.tar.gz" + dest: /tmp + remote_src: true + creates: "/tmp/process-exporter-{{ process_exporter_version }}.linux-{{ process_exporter_arch }}/process-exporter" + +- name: install process_exporter binary + tags: [process_exporter, process_exporter_install] + copy: + src: "/tmp/process-exporter-{{ process_exporter_version }}.linux-{{ process_exporter_arch }}/process-exporter" + dest: "{{ process_exporter_binary }}" + owner: root + group: root + mode: '0755' + remote_src: true + +- name: render process_exporter config + tags: [process_exporter, process_exporter_config] + template: + src: process_exporter.yml + dest: "{{ process_exporter_config_file }}" + owner: root + group: root + mode: '0644' + +- name: render process_exporter systemd unit + tags: [process_exporter, process_exporter_config] + template: + src: process_exporter.svc + dest: "{{ systemd_dir }}/process_exporter.service" + owner: root + group: root + mode: '0644' + +- name: launch process_exporter + tags: [process_exporter, process_exporter_launch] + systemd: + name: process_exporter + state: restarted + enabled: true + daemon_reload: true + +- name: wait for process_exporter service online + tags: [process_exporter, process_exporter_launch] + wait_for: + host: 127.0.0.1 + port: "{{ process_exporter_port }}" + state: started + timeout: 15 diff --git a/roles/node_monitor/tasks/vector.yml b/roles/node_monitor/tasks/vector.yml index 8509e74..be99915 100644 --- a/roles/node_monitor/tasks/vector.yml +++ b/roles/node_monitor/tasks/vector.yml @@ -46,10 +46,11 @@ with_items: - { src: vector.svc ,dest: "{{ systemd_dir }}/vector.service" } - { src: vector.env ,dest: /etc/default/vector } - - { src: vector.yaml ,dest: /etc/vector/vector.yaml } + - { src: "{% if node_monitor_mode | default('pull') == 'push' %}vector-push.yaml{% else %}vector.yaml{% endif %}" ,dest: /etc/vector/vector.yaml } - name: register node syslog to vector tags: [ node_register, register ,add_logs ] + when: node_monitor_mode | default('pull') != 'push' template: src=node.yaml dest=/etc/vector/node.yaml mode=0600 diff --git a/roles/node_monitor/templates/process_exporter.svc b/roles/node_monitor/templates/process_exporter.svc new file mode 100644 index 0000000..20df87c --- /dev/null +++ b/roles/node_monitor/templates/process_exporter.svc @@ -0,0 +1,13 @@ +[Unit] +Description=Process Exporter +Documentation=https://github.com/ncabatoff/process-exporter +After=network.target + +[Service] +User=root +ExecStart={{ process_exporter_binary }} --web.listen-address=:{{ process_exporter_port }} -config.path {{ process_exporter_config_file }} +Restart=on-failure +RestartSec=5 + +[Install] +WantedBy=multi-user.target diff --git a/roles/node_monitor/templates/process_exporter.yml b/roles/node_monitor/templates/process_exporter.yml new file mode 100644 index 0000000..4cb2d72 --- /dev/null +++ b/roles/node_monitor/templates/process_exporter.yml @@ -0,0 +1,4 @@ +process_names: + - name: "{{ '{{.Comm}}' }}" + cmdline: + - '.+' diff --git a/roles/node_monitor/templates/vector-push.yaml b/roles/node_monitor/templates/vector-push.yaml new file mode 100644 index 0000000..dc5a0cb --- /dev/null +++ b/roles/node_monitor/templates/vector-push.yaml @@ -0,0 +1,102 @@ +--- +{% set base_endpoint = (observability_endpoint | default('', true) | regex_replace('/+$', '') | regex_replace('/ingest/otlp.*$', '')) %} +{% set collector_is_local = observability_collector_is_local | default(false) | bool %} +{% set default_metrics_endpoint = 'http://127.0.0.1:' ~ (vmetrics_port | default(8428) | string) ~ '/api/v1/write' if collector_is_local else base_endpoint ~ '/ingest/metrics/api/v1/write' %} +{% set default_logs_endpoint = 'http://127.0.0.1:' ~ (vlogs_port | default(9428) | string) ~ '/insert' if collector_is_local else base_endpoint ~ '/ingest/logs/insert' %} +{% set metrics_endpoint = observability_metrics_endpoint | default(default_metrics_endpoint, true) %} +{% set logs_endpoint = observability_logs_endpoint | default(default_logs_endpoint, true) %} +data_dir: {{ vector_data }} + +api: + enabled: true + +sources: + internal_metrics: + type: internal_metrics + scrape_interval_secs: 15 + + node_exporter: + type: prometheus_scrape + endpoints: + - http://127.0.0.1:{{ node_exporter_port | default(9100) }}{{ exporter_metrics_path | default('/metrics') }} + scrape_interval_secs: 15 + + process_exporter: + type: prometheus_scrape + endpoints: + - http://127.0.0.1:{{ process_exporter_port | default(9256) }}/metrics + scrape_interval_secs: 15 + + journald: + type: journald + current_boot_only: true + + syslog_files: + type: file + include: + - /var/log/syslog + - /var/log/messages + - /var/log/auth.log + read_from: end + +transforms: + agent_metrics: + type: remap + inputs: ["node_exporter", "process_exporter"] + source: | + .tags.host = "{{ ansible_hostname | default(nodename | default(inventory_hostname)) }}" + .tags.ip = "{{ inventory_hostname }}" + .tags.ins = "{{ nodename | default(inventory_hostname) }}" + .tags.cls = "{{ node_cluster | default('nodes') }}" + .tags.job = "node" + .tags.origin = "vector-agent" + + agent_logs: + type: remap + inputs: ["journald", "syslog_files"] + source: | + .host = "{{ ansible_hostname | default(nodename | default(inventory_hostname)) }}" + .ip = "{{ inventory_hostname }}" + .ins = "{{ nodename | default(inventory_hostname) }}" + .cls = "{{ node_cluster | default('nodes') }}" + .job = "node" + .origin = "vector-agent" + .timestamp = now() + +sinks: + vector_metrics: + type: prometheus_exporter + inputs: ["internal_metrics"] + address: 0.0.0.0:{{ vector_port }} + default_namespace: vector + + observability_metrics: + type: prometheus_remote_write + inputs: ["agent_metrics"] + endpoint: "{{ metrics_endpoint }}" +{% if observability_ingest_basic_auth_enabled | default(false) %} + auth: + strategy: basic + user: "{{ observability_ingest_basic_auth_user }}" + password: "{{ observability_ingest_basic_auth_password }}" +{% endif %} + compression: snappy + healthcheck: false + + observability_logs: + type: loki + inputs: ["agent_logs"] + endpoint: "{{ logs_endpoint }}" +{% if observability_ingest_basic_auth_enabled | default(false) %} + auth: + strategy: basic + user: "{{ observability_ingest_basic_auth_user }}" + password: "{{ observability_ingest_basic_auth_password }}" +{% endif %} + compression: gzip + encoding: + codec: json + labels: + host: "{{ '{{ host }}' }}" + job: "{{ '{{ job }}' }}" + origin: "{{ '{{ origin }}' }}" diff --git a/scripts/agent-install.sh b/scripts/agent-install.sh index ead4f70..42f32ec 100755 --- a/scripts/agent-install.sh +++ b/scripts/agent-install.sh @@ -36,6 +36,102 @@ log_error() { echo -e "${RED}[ERROR]${NC} $1"; } log_fail() { echo -e "${RED}[FAIL]${NC} $1"; } log_warn() { echo -e "${YELLOW}[WARN]${NC} $1"; } +append_unique() { + local value="$1" + local -n target_ref="$2" + [[ -z "${value}" ]] && return 0 + local existing + for existing in "${target_ref[@]:-}"; do + if [[ "${existing}" == "${value}" ]]; then + return 0 + fi + done + target_ref+=("${value}") +} + +collect_local_ipv4s() { + local ips=() + local ip + + if command -v hostname >/dev/null 2>&1; then + for ip in $(hostname -I 2>/dev/null || true); do + append_unique "${ip}" ips + done + fi + + if command -v ip >/dev/null 2>&1; then + while read -r ip; do + append_unique "${ip}" ips + done < <(ip -o -4 addr show scope global 2>/dev/null | awk '{print $4}' | cut -d/ -f1) + fi + + printf '%s\n' "${ips[@]}" +} + +resolve_ipv4s() { + local host="$1" + local ips=() + local ip + + if command -v getent >/dev/null 2>&1; then + while read -r ip _; do + append_unique "${ip}" ips + done < <(getent ahostsv4 "${host}" 2>/dev/null || true) + fi + + if [[ ${#ips[@]} -eq 0 ]] && command -v host >/dev/null 2>&1; then + while read -r ip; do + append_unique "${ip}" ips + done < <(host "${host}" 2>/dev/null | awk '/has address/ {print $4}') + fi + + printf '%s\n' "${ips[@]}" +} + +extract_host_from_url() { + local url="$1" + url="${url#*://}" + url="${url%%/*}" + url="${url%%:*}" + printf '%s\n' "${url}" +} + +endpoint_targets_local_host() { + local host="$1" + local local_host + local local_short + local local_ip + local resolved_ip + local local_ips=() + local resolved_ips=() + + local_host="$(hostname -f 2>/dev/null || hostname)" + local_short="${local_host%%.*}" + if [[ "${host}" == "${local_host}" || "${host}" == "${local_short}" ]]; then + return 0 + fi + + while read -r local_ip; do + append_unique "${local_ip}" local_ips + done < <(collect_local_ipv4s) + + while read -r resolved_ip; do + append_unique "${resolved_ip}" resolved_ips + done < <(resolve_ipv4s "${host}") + + [[ ${#local_ips[@]} -eq 0 || ${#resolved_ips[@]} -eq 0 ]] && return 1 + + for resolved_ip in "${resolved_ips[@]}"; do + for local_ip in "${local_ips[@]}"; do + if [[ "${resolved_ip}" == "${local_ip}" ]]; then + return 0 + fi + done + done + + return 1 +} + usage() { cat </dev/null 2>&1; then + for ip in $(hostname -I 2>/dev/null || true); do + append_unique "${ip}" ips + done + fi + + if command -v ip >/dev/null 2>&1; then + while read -r ip; do + append_unique "${ip}" ips + done < <(ip -o -4 addr show scope global 2>/dev/null | awk '{print $4}' | cut -d/ -f1) + fi + + printf '%s\n' "${ips[@]}" +} + +resolve_ipv4s() { + local host="$1" + local ips=() + local ip + + if command -v getent >/dev/null 2>&1; then + while read -r ip _; do + append_unique "${ip}" ips + done < <(getent ahostsv4 "${host}" 2>/dev/null || true) + fi + + if [[ ${#ips[@]} -eq 0 ]] && command -v host >/dev/null 2>&1; then + while read -r ip; do + append_unique "${ip}" ips + done < <(host "${host}" 2>/dev/null | awk '/has address/ {print $4}') + fi + + printf '%s\n' "${ips[@]}" +} + +domain_points_to_local_host() { + local host="$1" + local local_ip + local resolved_ip + local local_ips=() + local resolved_ips=() + + while read -r local_ip; do + append_unique "${local_ip}" local_ips + done < <(collect_local_ipv4s) + + while read -r resolved_ip; do + append_unique "${resolved_ip}" resolved_ips + done < <(resolve_ipv4s "${host}") + + [[ ${#local_ips[@]} -eq 0 || ${#resolved_ips[@]} -eq 0 ]] && return 1 + + for resolved_ip in "${resolved_ips[@]}"; do + for local_ip in "${local_ips[@]}"; do + if [[ "${resolved_ip}" == "${local_ip}" ]]; then + return 0 + fi + done + done + + return 1 +} + usage() { cat <