Add mixed-host observability deploy and ingest auth
This commit is contained in:
parent
f937afe1fd
commit
c3fe0324ea
72
README.md
72
README.md
@ -31,6 +31,70 @@ flowchart LR
|
||||
|
||||
## 3) Start
|
||||
|
||||
当前推荐按“混合部署到已有主机”的方式执行。
|
||||
|
||||
1. 先更新 DNS,把 `observability.svc.plus` 指到 `us-xhttp.svc.plus`
|
||||
2. 在 `us-xhttp.svc.plus` 上执行下面的 Server side 示例,部署中心端
|
||||
3. 再到其他已有主机执行下面的 Client side 示例,把采集数据回传到 `observability.svc.plus`
|
||||
|
||||
当前接入主机:
|
||||
|
||||
- `us-xhttp.svc.plus`:继续承载现有服务,同时承载 `observability.svc.plus`
|
||||
- `clawdbot.svc.plus`:部署 agent,采集后上报到中心端
|
||||
- `jp-xhttp.svc.plus`:部署 agent,采集后上报到中心端
|
||||
|
||||
### Ansible (Recommended)
|
||||
|
||||
#### Server side
|
||||
|
||||
先导出 Cloudflare Token,然后在 `us-xhttp.svc.plus` 上执行服务端部署。`deploy_observability_service.yml` 会先把 Cloudflare 上的 `observability.svc.plus` 更新成指向 `us-xhttp.svc.plus` 的非代理记录,再等待公共 DNS 生效后继续部署,这样更容易保证 Caddy 首次自动签发 TLS 证书成功。
|
||||
|
||||
```bash
|
||||
export CLOUDFLARE_API_TOKEN=...
|
||||
# --limit 需要同时包含 localhost,否则 hosts: localhost 的 Cloudflare DNS play 会被跳过
ansible-playbook -i <your-inventory> deploy_observability_service.yml -l 'localhost,us-xhttp.svc.plus'
|
||||
```
|
||||
|
||||
如果希望给 `/ingest/*` 增加一层基础认证,可以在服务端部署时一起打开:
|
||||
|
||||
```bash
|
||||
export CLOUDFLARE_API_TOKEN=...
|
||||
ansible-playbook -i <your-inventory> deploy_observability_service.yml -l 'localhost,us-xhttp.svc.plus' \
|
||||
-e observability_ingest_basic_auth_enabled=true \
|
||||
-e observability_ingest_basic_auth_user=ingest \
|
||||
-e observability_ingest_basic_auth_password='<strong-password>'
|
||||
```
|
||||
|
||||
#### Client side (agent)
|
||||
|
||||
再到采集端主机执行 `node.yml` 的 push mode:
|
||||
|
||||
```bash
|
||||
ansible-playbook -i <your-inventory> node.yml \
|
||||
-l clawdbot.svc.plus,jp-xhttp.svc.plus \
|
||||
-e node_monitor_mode=push \
|
||||
-e observability_endpoint=https://observability.svc.plus/ingest/otlp \
|
||||
-e haproxy_enabled=false
|
||||
```
|
||||
|
||||
如果服务端已开启 ingest 基本认证,采集端也要带上同一组凭据:
|
||||
|
||||
```bash
|
||||
ansible-playbook -i <your-inventory> node.yml \
|
||||
-l clawdbot.svc.plus,jp-xhttp.svc.plus \
|
||||
-e node_monitor_mode=push \
|
||||
-e observability_endpoint=https://observability.svc.plus/ingest/otlp \
|
||||
-e observability_ingest_basic_auth_enabled=true \
|
||||
-e observability_ingest_basic_auth_user=ingest \
|
||||
-e observability_ingest_basic_auth_password='<strong-password>' \
|
||||
-e haproxy_enabled=false
|
||||
```
|
||||
|
||||
> `node_monitor_mode=push` 会在远端主机上部署 `node_exporter + process_exporter + vector`,并把 metrics / logs 主动汇总到 `observability.svc.plus`。
|
||||
>
|
||||
> `observability_ingest_basic_auth_*` 只保护 `/ingest/*` 写入入口,不影响 Caddy 暴露的其他站点页面;服务端和采集端必须使用同一组认证信息。
|
||||
|
||||
### Script Installers
|
||||
|
||||
### Server side
|
||||
|
||||
```bash
|
||||
@ -99,6 +163,14 @@ ssh root@clawdbot.svc.plus \
|
||||
| bash -s -- --endpoint https://observability.svc.plus/ingest/otlp'
|
||||
```
|
||||
|
||||
### Remote client example (jp-xhttp.svc.plus)
|
||||
|
||||
```bash
|
||||
ssh root@jp-xhttp.svc.plus \
|
||||
'curl -fsSL https://raw.githubusercontent.com/cloud-neutral-toolkit/observability.svc.plus/main/scripts/agent-install.sh \
|
||||
| bash -s -- --endpoint https://observability.svc.plus/ingest/otlp'
|
||||
```
|
||||
|
||||
### Optional SSH manager env example
|
||||
|
||||
```bash
|
||||
|
||||
139
deploy_observability_service.yml
Normal file
139
deploy_observability_service.yml
Normal file
@ -0,0 +1,139 @@
|
||||
---
|
||||
# Play 1: manage the observability DNS record through the Cloudflare API.
# Every task in this play runs on the control node (hosts: localhost, local connection).
# NOTE(review): because the play targets localhost, a --limit pattern that omits
# localhost presumably skips it entirely — confirm against the documented invocation.
- name: Update Cloudflare DNS for observability.svc.plus
  hosts: localhost
  gather_facts: false
  connection: local
  vars:
    # Cloudflare API access
    cloudflare_api_base: https://api.cloudflare.com/client/v4
    cloudflare_zone_name: svc.plus
    # Desired record for the public ingress name
    observability_domain: observability.svc.plus
    observability_dns_type: CNAME
    observability_dns_target: us-xhttp.svc.plus
    observability_dns_ttl: 1
    observability_dns_proxied: false
    # Public DNS polling: number of attempts and seconds between attempts
    dns_wait_retries: 30
    dns_wait_delay: 10
|
||||
tasks:
|
||||
# Fail fast before any API call when the token was not exported on the control node.
- name: Validate Cloudflare token is present in environment
  ansible.builtin.assert:
    fail_msg: "CLOUDFLARE_API_TOKEN must be exported before running this playbook."
    that:
      - (lookup('ansible.builtin.env', 'CLOUDFLARE_API_TOKEN') | length) > 0
|
||||
|
||||
# Look the zone up by name; the response is consumed by the following assert/set_fact tasks.
- name: Resolve Cloudflare zone id
  register: cloudflare_zone_lookup
  ansible.builtin.uri:
    method: GET
    url: "{{ cloudflare_api_base }}/zones?name={{ cloudflare_zone_name }}"
    return_content: true
    headers:
      Content-Type: application/json
      Authorization: "Bearer {{ lookup('ansible.builtin.env', 'CLOUDFLARE_API_TOKEN') }}"
|
||||
|
||||
# The lookup must report success and return at least one zone before result[0] is read.
- name: Validate zone lookup result
  ansible.builtin.assert:
    fail_msg: "Unable to resolve Cloudflare zone id for {{ cloudflare_zone_name }}."
    that:
      - cloudflare_zone_lookup.json.success
      - (cloudflare_zone_lookup.json.result | length) > 0
|
||||
|
||||
# Keep the id of the first matching zone for the record-level API calls below.
- name: Set Cloudflare zone id
  ansible.builtin.set_fact:
    cloudflare_zone_id: "{{ (cloudflare_zone_lookup.json.result | first).id }}"
|
||||
|
||||
# Fetch every record that currently exists for the observability name (any type).
- name: Query existing observability DNS records
  register: observability_dns_records
  ansible.builtin.uri:
    method: GET
    url: "{{ cloudflare_api_base }}/zones/{{ cloudflare_zone_id }}/dns_records?name={{ observability_domain }}"
    return_content: true
    headers:
      Content-Type: application/json
      Authorization: "Bearer {{ lookup('ansible.builtin.env', 'CLOUDFLARE_API_TOKEN') }}"
|
||||
|
||||
# Delete every record on the name whose type differs from the desired one,
# leaving only records of observability_dns_type for the create/update tasks.
- name: Remove conflicting observability DNS records with different type
  when: item.type != observability_dns_type
  loop: "{{ observability_dns_records.json.result | default([]) }}"
  loop_control:
    label: "{{ item.type }} {{ item.name }}"
  ansible.builtin.uri:
    method: DELETE
    url: "{{ cloudflare_api_base }}/zones/{{ cloudflare_zone_id }}/dns_records/{{ item.id }}"
    headers:
      Content-Type: application/json
      Authorization: "Bearer {{ lookup('ansible.builtin.env', 'CLOUDFLARE_API_TOKEN') }}"
|
||||
|
||||
# Create the record only when no record of the desired type exists yet.
# body_format is "json" rather than "raw": the module then accepts either an
# already-serialized JSON string or a mapping, so the request stays valid JSON
# even if templating hands the module a parsed data structure instead of text.
# The payload is built inside one Jinja expression so ttl/proxied keep their
# integer/boolean types.
- name: Create observability DNS record when missing
  ansible.builtin.uri:
    url: "{{ cloudflare_api_base }}/zones/{{ cloudflare_zone_id }}/dns_records"
    method: POST
    headers:
      Authorization: "Bearer {{ lookup('ansible.builtin.env', 'CLOUDFLARE_API_TOKEN') }}"
      Content-Type: application/json
    body_format: json
    body: >-
      {{
        {
          'type': observability_dns_type,
          'name': observability_domain,
          'content': observability_dns_target,
          'ttl': (observability_dns_ttl | int),
          'proxied': (observability_dns_proxied | bool)
        } | to_json
      }}
  # default([]) mirrors the delete task above, so a response without "result" means "nothing exists".
  when: >-
    (observability_dns_records.json.result | default([])
     | selectattr('type', 'equalto', observability_dns_type) | list | length) == 0
|
||||
|
||||
# Overwrite the first existing record of the desired type when its content or
# proxied flag differs from the desired state.
# - body_format is "json" rather than "raw" so the body is valid JSON whether the
#   module receives a serialized string or a parsed mapping.
# - Both sides of the proxied comparison are coerced with `| bool`; an override
#   such as `-e observability_dns_proxied=false` arrives as a string and would
#   otherwise never equal the boolean returned by the API.
- name: Update observability DNS record when target changes
  ansible.builtin.uri:
    url: "{{ cloudflare_api_base }}/zones/{{ cloudflare_zone_id }}/dns_records/{{ (observability_dns_records.json.result | selectattr('type', 'equalto', observability_dns_type) | list | first).id }}"
    method: PUT
    headers:
      Authorization: "Bearer {{ lookup('ansible.builtin.env', 'CLOUDFLARE_API_TOKEN') }}"
      Content-Type: application/json
    body_format: json
    body: >-
      {{
        {
          'type': observability_dns_type,
          'name': observability_domain,
          'content': observability_dns_target,
          'ttl': (observability_dns_ttl | int),
          'proxied': (observability_dns_proxied | bool)
        } | to_json
      }}
  when:
    # A record of the desired type must exist; otherwise the create task handles it.
    - (observability_dns_records.json.result | default([]) | selectattr('type', 'equalto', observability_dns_type) | list | length) > 0
    - >-
      ((observability_dns_records.json.result | selectattr('type', 'equalto', observability_dns_type) | list | first).content != observability_dns_target)
      or
      (((observability_dns_records.json.result | selectattr('type', 'equalto', observability_dns_type) | list | first).proxied | default(false) | bool) != (observability_dns_proxied | bool))
|
||||
|
||||
# Poll Cloudflare's DNS-over-HTTPS JSON endpoint until the public answer for
# observability_domain contains the desired target.
# - The reply is parsed explicitly from `content`: the request asks for
#   application/dns-json, and uri is not guaranteed to populate `.json` for a
#   content type other than plain JSON.
# - The queried record type follows observability_dns_type instead of a literal.
# - A trailing root dot on the answer data is stripped before comparing, so the
#   match works whether or not the resolver returns fully-qualified names.
- name: Wait for public DNS to expose observability CNAME
  ansible.builtin.uri:
    url: "https://cloudflare-dns.com/dns-query?name={{ observability_domain }}&type={{ observability_dns_type }}"
    method: GET
    headers:
      Accept: application/dns-json
    return_content: true
  register: observability_dns_public
  until:
    - observability_dns_public.status == 200
    - ((observability_dns_public.content | default('{}', true) | from_json).Status | default(-1)) == 0
    - >-
      ((observability_dns_public.content | default('{}', true) | from_json).Answer | default([])
       | map(attribute='data')
       | map('regex_replace', '[.]$', '')
       | select('equalto', observability_dns_target)
       | list | length) > 0
  retries: "{{ dns_wait_retries }}"
  delay: "{{ dns_wait_delay }}"
|
||||
|
||||
# Print the record that the remaining deployment will rely on.
- name: Show effective observability DNS target
  ansible.builtin.debug:
    msg: >-
      {{ observability_domain }} -> {{ observability_dns_target }} proxied={{ observability_dns_proxied }}
|
||||
|
||||
# Play 2: run the regular infra playbook with the public domain and Caddy as the front end.
- import_playbook: infra.yml
  vars:
    caddy_enabled: true
    nginx_enabled: false
    infra_domain: observability.svc.plus
    infra_portal:
      home:
        domain: observability.svc.plus
|
||||
@ -103,4 +103,13 @@
|
||||
# - add_logs : register infra as vector logging source
|
||||
# - add_ds : register infra victoria stack as grafana datasource
|
||||
#--------------------------------------------------------------#
|
||||
# Mixed Existing-Host Deployment
|
||||
#--------------------------------------------------------------#
|
||||
# Center service example:
|
||||
# ./infra.yml -l us-xhttp.svc.plus \
|
||||
# -e infra_domain=observability.svc.plus \
|
||||
# -e '{"infra_portal":{"home":{"domain":"observability.svc.plus"}}}' \
|
||||
# -e caddy_enabled=true \
|
||||
# -e nginx_enabled=false
|
||||
#--------------------------------------------------------------#
|
||||
...
|
||||
6
node.yml
6
node.yml
@ -32,6 +32,12 @@
|
||||
# node.yml -l <cls> # add groups
|
||||
# node.yml -l <ip> # add single node
|
||||
#
|
||||
# Observability push-agent mode:
|
||||
# ./node.yml -l clawdbot.svc.plus,jp-xhttp.svc.plus \
|
||||
# -e node_monitor_mode=push \
|
||||
# -e observability_endpoint=https://observability.svc.plus/ingest/otlp \
|
||||
# -e haproxy_enabled=false
|
||||
#
|
||||
# Bootstrap with another admin user: (Create admin with another admin)
|
||||
# node.yml -t node_admin # create admin user for nodes
|
||||
# node.yml -t node_admin -k -K -e ansible_user=<another admin>
|
||||
|
||||
@ -15,6 +15,10 @@ proxy_env: { no_proxy: "localhost,127.0.0.1,10.0.0.0/8,192.168.0.0/16,*.aliyun.c
|
||||
infra_portal: # infra services exposed via portal
|
||||
home : { domain: i.observability } # default home server definition
|
||||
infra_domain: observability.svc.plus
|
||||
observability_ingest_basic_auth_enabled: false
|
||||
observability_ingest_basic_auth_user: ingest
|
||||
observability_ingest_basic_auth_password: ''
|
||||
observability_ingest_basic_auth_password_hash: ''
|
||||
infra_data: /data/infra # default data path for infrastructure data
|
||||
infra_services: # home page navigation entries
|
||||
- { name: Metrics ,url: '/vmetrics/vmui/' ,desc: 'VictoriaMetrics Query UI' ,icon: 'metrics' ,name_cn: '指标查询' ,desc_cn: 'VictoriaMetrics 指标查询界面' }
|
||||
|
||||
@ -1,4 +1,27 @@
|
||||
---
|
||||
#--------------------------------------------------------------#
|
||||
# 0. Validate optional ingest auth [auth]
|
||||
#--------------------------------------------------------------#
|
||||
# Only checked when ingest basic auth is switched on: a user name is required,
# plus either a plaintext password or a pre-computed password hash.
- name: validate observability ingest basic auth inputs
  tags: auth
  when: observability_ingest_basic_auth_enabled | default(false) | bool
  assert:
    fail_msg: "When observability_ingest_basic_auth_enabled=true, set observability_ingest_basic_auth_user and either observability_ingest_basic_auth_password or observability_ingest_basic_auth_password_hash."
    that:
      - observability_ingest_basic_auth_user | default('', true) | length > 0
      - >-
        observability_ingest_basic_auth_password | default('', true) | length > 0
        or observability_ingest_basic_auth_password_hash | default('', true) | length > 0
|
||||
|
||||
# Prefer the caller-supplied hash; otherwise derive a bcrypt hash from the plaintext password.
# NOTE(review): no salt is passed to password_hash, so the derived hash presumably
# changes on every run — confirm whether that is acceptable for the rendered config.
- name: build effective observability ingest password hash
  tags: auth
  when: observability_ingest_basic_auth_enabled | default(false) | bool
  set_fact:
    observability_ingest_basic_auth_password_hash_effective: >-
      {% if observability_ingest_basic_auth_password_hash | default('', true) | length > 0 %}{{ observability_ingest_basic_auth_password_hash }}{% else %}{{ observability_ingest_basic_auth_password | password_hash('bcrypt') }}{% endif %}
|
||||
|
||||
#--------------------------------------------------------------#
|
||||
# 1. Infra User [infra_user]
|
||||
#--------------------------------------------------------------#
|
||||
|
||||
@ -5,6 +5,13 @@
|
||||
{{ infra_domain | default('observability.svc.plus') }} {
|
||||
encode gzip zstd
|
||||
|
||||
{% if observability_ingest_basic_auth_enabled | default(false) | bool %}
    # Optional basic auth for the ingest write paths only.
    # The flag is coerced with `| bool` because a command-line override such as
    # `-e observability_ingest_basic_auth_enabled=false` arrives as a non-empty
    # string, which a bare truth test would treat as enabled.
    @observability_ingest path /ingest/*
    basic_auth @observability_ingest {
        {{ observability_ingest_basic_auth_user }} {{ observability_ingest_basic_auth_password_hash_effective | default(observability_ingest_basic_auth_password_hash) }}
    }
{% endif %}
|
||||
|
||||
# ---- Alloy unified ingest endpoints ----
|
||||
|
||||
# Prometheus remote_write
|
||||
|
||||
@ -8,6 +8,13 @@ node_exporter_options: '--no-collector.softnet --no-collector.nvme --collector.t
|
||||
#--------------------------------------------------------------#
|
||||
# VECTOR
|
||||
#--------------------------------------------------------------#
|
||||
node_monitor_mode: pull # pull: central scrape/register, push: remote agent pushes to observability endpoint
|
||||
observability_endpoint: '' # base endpoint, e.g. https://observability.svc.plus/ingest/otlp
|
||||
observability_metrics_endpoint: '' # optional override for remote_write endpoint
|
||||
observability_logs_endpoint: '' # optional override for logs endpoint
|
||||
observability_ingest_basic_auth_enabled: false
|
||||
observability_ingest_basic_auth_user: ingest
|
||||
observability_ingest_basic_auth_password: ''
|
||||
vector_enabled: true # enable vector log collector?
|
||||
vector_clean: false # purge vector data dir during init?
|
||||
vector_data: /data/vector # vector data dir, /data/vector by default
|
||||
@ -15,6 +22,16 @@ vector_port: 9598 # vector metrics port, 9598 by default
|
||||
vector_read_from: beginning # vector read from beginning or end
|
||||
vector_log_endpoint: [ infra ] # if defined, sending vector log to this endpoint.
|
||||
|
||||
#--------------------------------------------------------------#
|
||||
# PROCESS EXPORTER
|
||||
#--------------------------------------------------------------#
|
||||
process_exporter_enabled: false # enable process_exporter, automatically recommended for push mode
|
||||
process_exporter_version: 0.7.10
|
||||
process_exporter_port: 9256
|
||||
process_exporter_binary: /usr/local/bin/process-exporter
|
||||
process_exporter_config_dir: /etc/process-exporter
|
||||
process_exporter_config_file: /etc/process-exporter/process-exporter.yml
|
||||
|
||||
#-----------------------------------------------------------------
|
||||
# NODE_VIP (Reference)
|
||||
#-----------------------------------------------------------------
|
||||
|
||||
@ -45,6 +45,24 @@
|
||||
loop: '{{ groups["infra"]|default([]) }}'
|
||||
systemd: name=nginx state=reloaded enabled=yes daemon_reload=yes
|
||||
|
||||
# Push mode needs a destination: either the base endpoint, or both explicit
# metrics and logs endpoints.
- name: validate observability push mode inputs
  tags: [monitor, vector, process_exporter]
  when: node_monitor_mode | default('pull') == 'push'
  assert:
    fail_msg: "Set observability_endpoint, or set both observability_metrics_endpoint and observability_logs_endpoint, when node_monitor_mode=push."
    that:
      - >-
        observability_endpoint | default('', true) | length > 0
        or (observability_metrics_endpoint | default('', true) | length > 0
        and observability_logs_endpoint | default('', true) | length > 0)
|
||||
|
||||
# With ingest basic auth enabled, the agent needs the plaintext credentials
# (user and password) to authenticate its pushes.
- name: validate observability push mode basic auth inputs
  tags: [monitor, vector, process_exporter, auth]
  when:
    - node_monitor_mode | default('pull') == 'push'
    - observability_ingest_basic_auth_enabled | default(false) | bool
  assert:
    fail_msg: "When observability_ingest_basic_auth_enabled=true in push mode, set observability_ingest_basic_auth_user and observability_ingest_basic_auth_password."
    that:
      - (observability_ingest_basic_auth_user | default('', true) | length) > 0
      - (observability_ingest_basic_auth_password | default('', true) | length) > 0
|
||||
|
||||
#--------------------------------------------------------------#
|
||||
# Register Instance DNS Name [vip_dns]
|
||||
@ -140,6 +158,7 @@
|
||||
# /infra/targets/node/{{ ip }}.yml
|
||||
- name: register node as victoria target
|
||||
tags: [ node_vip, node_register, register, add_metrics ]
|
||||
when: node_monitor_mode | default('pull') != 'push'
|
||||
ignore_errors: true
|
||||
delegate_to: '{{ item }}'
|
||||
loop: '{{ groups["infra"]|default([]) }}'
|
||||
@ -168,6 +187,7 @@
|
||||
|
||||
- name: register node as ping target
|
||||
tags: [ node_register, register, add_metrics ]
|
||||
when: node_monitor_mode | default('pull') != 'push'
|
||||
ignore_errors: true
|
||||
delegate_to: '{{ item }}'
|
||||
loop: '{{ groups["infra"]|default([]) }}'
|
||||
@ -183,7 +203,7 @@
|
||||
|
||||
- name: register node vip as ping target
|
||||
tags: [ node_vip, node_register, register, add_metrics ]
|
||||
when: vip_enabled|bool and vip_address is defined and vip_address != ''
|
||||
when: node_monitor_mode | default('pull') != 'push' and vip_enabled|bool and vip_address is defined and vip_address != ''
|
||||
ignore_errors: true
|
||||
delegate_to: '{{ item }}'
|
||||
loop: '{{ groups["infra"]|default([]) }}'
|
||||
@ -197,6 +217,9 @@
|
||||
- labels: { ip: {{ inventory_hostname }} , ins: {{ nodename }} , cls: {{ node_cluster|default('nodes') }}, vip: {{ vip_address }} , job: node-vip }
|
||||
targets: [ {{ vip_address }} ]
|
||||
|
||||
- import_tasks: process_exporter.yml
|
||||
tags: process_exporter
|
||||
when: process_exporter_enabled | default(false) | bool or node_monitor_mode | default('pull') == 'push'
|
||||
|
||||
#--------------------------------------------------------------#
|
||||
# Vector [vector]
|
||||
|
||||
90
roles/node_monitor/tasks/process_exporter.yml
Normal file
90
roles/node_monitor/tasks/process_exporter.yml
Normal file
@ -0,0 +1,90 @@
|
||||
---
|
||||
#--------------------------------------------------------------#
|
||||
# Install process_exporter [process_exporter_install]
|
||||
#--------------------------------------------------------------#
|
||||
# Ask the target for its machine name; read-only, so never reported as changed.
- name: detect process_exporter architecture
  tags: [process_exporter, process_exporter_install]
  command:
    cmd: uname -m
  changed_when: false
  register: process_exporter_uname

# Translate the machine name into the suffix used by the release archives;
# unknown machines map to an empty string and are rejected by the next task.
- name: map process_exporter architecture
  tags: [process_exporter, process_exporter_install]
  set_fact:
    process_exporter_arch: >-
      {{ {'x86_64': 'amd64', 'aarch64': 'arm64', 'arm64': 'arm64'}.get(process_exporter_uname.stdout, '') }}

- name: validate process_exporter architecture
  tags: [process_exporter, process_exporter_install]
  assert:
    fail_msg: "Unsupported process_exporter architecture: {{ process_exporter_uname.stdout }}"
    that:
      - (process_exporter_arch | length) > 0

- name: ensure process_exporter config directory exists
  tags: [process_exporter, process_exporter_config]
  file:
    state: directory
    path: "{{ process_exporter_config_dir }}"
    mode: '0755'
    owner: root
    group: root

# Fetch the upstream release tarball for the detected architecture into /tmp.
- name: download process_exporter release archive
  tags: [process_exporter, process_exporter_install]
  get_url:
    dest: "/tmp/process-exporter-{{ process_exporter_version }}.linux-{{ process_exporter_arch }}.tar.gz"
    url: "https://github.com/ncabatoff/process-exporter/releases/download/v{{ process_exporter_version }}/process-exporter-{{ process_exporter_version }}.linux-{{ process_exporter_arch }}.tar.gz"
    mode: '0644'

# `creates` skips the extraction when the unpacked binary is already present.
- name: extract process_exporter release archive
  tags: [process_exporter, process_exporter_install]
  unarchive:
    remote_src: true
    src: "/tmp/process-exporter-{{ process_exporter_version }}.linux-{{ process_exporter_arch }}.tar.gz"
    dest: /tmp
    creates: "/tmp/process-exporter-{{ process_exporter_version }}.linux-{{ process_exporter_arch }}/process-exporter"

- name: install process_exporter binary
  tags: [process_exporter, process_exporter_install]
  copy:
    remote_src: true
    src: "/tmp/process-exporter-{{ process_exporter_version }}.linux-{{ process_exporter_arch }}/process-exporter"
    dest: "{{ process_exporter_binary }}"
    mode: '0755'
    owner: root
    group: root

- name: render process_exporter config
  tags: [process_exporter, process_exporter_config]
  template:
    src: process_exporter.yml
    dest: "{{ process_exporter_config_file }}"
    mode: '0644'
    owner: root
    group: root

- name: render process_exporter systemd unit
  tags: [process_exporter, process_exporter_config]
  template:
    src: process_exporter.svc
    dest: "{{ systemd_dir }}/process_exporter.service"
    mode: '0644'
    owner: root
    group: root

# Reload unit files, enable the service and restart it on every run.
- name: launch process_exporter
  tags: [process_exporter, process_exporter_launch]
  systemd:
    name: process_exporter
    daemon_reload: true
    enabled: true
    state: restarted

# Give the exporter up to 15 seconds to open its local listen port.
- name: wait for process_exporter service online
  tags: [process_exporter, process_exporter_launch]
  wait_for:
    state: started
    host: 127.0.0.1
    port: "{{ process_exporter_port }}"
    timeout: 15
|
||||
@ -46,10 +46,11 @@
|
||||
with_items:
|
||||
- { src: vector.svc ,dest: "{{ systemd_dir }}/vector.service" }
|
||||
- { src: vector.env ,dest: /etc/default/vector }
|
||||
- { src: vector.yaml ,dest: /etc/vector/vector.yaml }
|
||||
- { src: "{% if node_monitor_mode | default('pull') == 'push' %}vector-push.yaml{% else %}vector.yaml{% endif %}" ,dest: /etc/vector/vector.yaml }
|
||||
|
||||
- name: register node syslog to vector
|
||||
tags: [ node_register, register ,add_logs ]
|
||||
when: node_monitor_mode | default('pull') != 'push'
|
||||
template: src=node.yaml dest=/etc/vector/node.yaml mode=0600
|
||||
|
||||
|
||||
|
||||
13
roles/node_monitor/templates/process_exporter.svc
Normal file
13
roles/node_monitor/templates/process_exporter.svc
Normal file
@ -0,0 +1,13 @@
|
||||
# systemd unit for process-exporter; rendered by the node_monitor role.
[Unit]
Description=Process Exporter
Documentation=https://github.com/ncabatoff/process-exporter
After=network.target

[Service]
User=root
# Restart after a failure, waiting 5 seconds between attempts.
Restart=on-failure
RestartSec=5
# Binary path, listen port and config file all come from role variables.
ExecStart={{ process_exporter_binary }} --web.listen-address=:{{ process_exporter_port }} -config.path {{ process_exporter_config_file }}

[Install]
WantedBy=multi-user.target
|
||||
4
roles/node_monitor/templates/process_exporter.yml
Normal file
4
roles/node_monitor/templates/process_exporter.yml
Normal file
@ -0,0 +1,4 @@
|
||||
# One group per matching process; the name is a process-exporter template, so
# it is wrapped in a raw block to pass through Ansible templating untouched.
process_names:
  - name: "{% raw %}{{.Comm}}{% endraw %}"
    cmdline:
      - '.+'
|
||||
99
roles/node_monitor/templates/vector-push.yaml
Normal file
99
roles/node_monitor/templates/vector-push.yaml
Normal file
@ -0,0 +1,99 @@
|
||||
---
|
||||
{% set base_endpoint = (observability_endpoint | default('', true) | regex_replace('/+$', '') | regex_replace('/ingest/otlp.*$', '')) %}
|
||||
{% set metrics_endpoint = observability_metrics_endpoint | default(base_endpoint ~ '/ingest/metrics/api/v1/write', true) %}
|
||||
{% set logs_endpoint = observability_logs_endpoint | default(base_endpoint ~ '/ingest/logs/insert', true) %}
|
||||
data_dir: {{ vector_data }}
|
||||
|
||||
api:
|
||||
enabled: true
|
||||
|
||||
sources:
  # Vector's own telemetry, exposed locally by the vector_metrics sink.
  internal_metrics:
    type: internal_metrics
    scrape_interval_secs: 15

  # Local exporters scraped over loopback every 15 seconds.
  node_exporter:
    type: prometheus_scrape
    scrape_interval_secs: 15
    endpoints:
      - http://127.0.0.1:{{ node_exporter_port | default(9100) }}{{ exporter_metrics_path | default('/metrics') }}

  process_exporter:
    type: prometheus_scrape
    scrape_interval_secs: 15
    endpoints:
      - http://127.0.0.1:{{ process_exporter_port | default(9256) }}/metrics

  # Log inputs: the systemd journal (current boot only) plus classic syslog files.
  journald:
    type: journald
    current_boot_only: true

  syslog_files:
    type: file
    read_from: end
    include:
      - /var/log/syslog
      - /var/log/messages
      - /var/log/auth.log
|
||||
|
||||
transforms:
  # Stamp every scraped metric with identity tags filled in at template-render time.
  agent_metrics:
    type: remap
    inputs:
      - "node_exporter"
      - "process_exporter"
    source: |
      .tags.host = "{{ ansible_hostname | default(nodename | default(inventory_hostname)) }}"
      .tags.ip = "{{ inventory_hostname }}"
      .tags.ins = "{{ nodename | default(inventory_hostname) }}"
      .tags.cls = "{{ node_cluster | default('nodes') }}"
      .tags.job = "node"
      .tags.origin = "vector-agent"

  # Same identity fields for log events; the timestamp is set to processing time.
  agent_logs:
    type: remap
    inputs:
      - "journald"
      - "syslog_files"
    source: |
      .host = "{{ ansible_hostname | default(nodename | default(inventory_hostname)) }}"
      .ip = "{{ inventory_hostname }}"
      .ins = "{{ nodename | default(inventory_hostname) }}"
      .cls = "{{ node_cluster | default('nodes') }}"
      .job = "node"
      .origin = "vector-agent"
      .timestamp = now()
|
||||
|
||||
sinks:
  # Expose Vector's internal metrics on the node for local scraping.
  vector_metrics:
    type: prometheus_exporter
    inputs: ["internal_metrics"]
    address: 0.0.0.0:{{ vector_port }}
    default_namespace: vector

  # Push tagged exporter metrics to the central remote_write endpoint.
  # The auth flag is coerced with `| bool` so a string override such as
  # `-e observability_ingest_basic_auth_enabled=false` really disables it.
  # Credentials are emitted through `to_json`, which yields a correctly escaped
  # double-quoted scalar even when the value contains quotes or backslashes.
  observability_metrics:
    type: prometheus_remote_write
    inputs: ["agent_metrics"]
    endpoint: "{{ metrics_endpoint }}"
{% if observability_ingest_basic_auth_enabled | default(false) | bool %}
    auth:
      strategy: basic
      user: {{ observability_ingest_basic_auth_user | to_json }}
      password: {{ observability_ingest_basic_auth_password | to_json }}
{% endif %}
    compression: snappy
    healthcheck: false

  # Push tagged log events to the central logs endpoint (same auth handling as above).
  observability_logs:
    type: loki
    inputs: ["agent_logs"]
    endpoint: "{{ logs_endpoint }}"
{% if observability_ingest_basic_auth_enabled | default(false) | bool %}
    auth:
      strategy: basic
      user: {{ observability_ingest_basic_auth_user | to_json }}
      password: {{ observability_ingest_basic_auth_password | to_json }}
{% endif %}
    compression: gzip
    encoding:
      codec: json
    # Label values are Vector field templates, escaped so Ansible leaves them untouched.
    labels:
      host: "{{ '{{ host }}' }}"
      job: "{{ '{{ job }}' }}"
      origin: "{{ '{{ origin }}' }}"
|
||||
@ -36,6 +36,102 @@ log_error() { echo -e "${RED}[ERROR]${NC} $1"; }
|
||||
log_fail() { echo -e "${RED}[FAIL]${NC} $1"; }
|
||||
log_warn() { echo -e "${YELLOW}[WARN]${NC} $1"; }
|
||||
|
||||
# append_unique VALUE ARRAY_NAME
# Append VALUE to the array named ARRAY_NAME unless it is empty or already present.
append_unique() {
    local item="$1"
    local -n array_ref="$2"
    local current

    # Empty values are ignored.
    if [[ -z "${item}" ]]; then
        return 0
    fi

    for current in "${array_ref[@]:-}"; do
        # Already stored: nothing to do.
        [[ "${current}" != "${item}" ]] || return 0
    done

    array_ref+=("${item}")
}
|
||||
|
||||
# collect_local_ipv4s
# Print the IPv4 addresses configured on this machine, one per line, without
# duplicates. Prints nothing when no address is found.
collect_local_ipv4s() {
    local ips=()
    local ip

    if command -v hostname >/dev/null 2>&1; then
        for ip in $(hostname -I 2>/dev/null || true); do
            # hostname -I also lists IPv6 addresses; keep IPv4 only.
            if [[ "${ip}" == *:* ]]; then
                continue
            fi
            append_unique "${ip}" ips
        done
    fi

    if command -v ip >/dev/null 2>&1; then
        while read -r ip; do
            append_unique "${ip}" ips
        done < <(ip -o -4 addr show scope global 2>/dev/null | awk '{print $4}' | cut -d/ -f1)
    fi

    # Guard the expansion: an empty array would otherwise print a blank line
    # (and counts as unbound under `set -u` on older bash releases).
    if [[ ${#ips[@]} -gt 0 ]]; then
        printf '%s\n' "${ips[@]}"
    fi
}
|
||||
|
||||
# resolve_ipv4s HOST
# Print the IPv4 addresses HOST resolves to, one per line, without duplicates.
resolve_ipv4s() {
    local target="$1"
    local addr
    local found=()

    # First choice: getent, reading the address column of each line.
    if command -v getent >/dev/null 2>&1; then
        while read -r addr _; do
            append_unique "${addr}" found
        done < <(getent ahostsv4 "${target}" 2>/dev/null || true)
    fi

    # Fallback: the `host` utility, used only when getent produced nothing.
    if (( ${#found[@]} == 0 )) && command -v host >/dev/null 2>&1; then
        while read -r addr; do
            append_unique "${addr}" found
        done < <(host "${target}" 2>/dev/null | awk '/has address/ {print $4}')
    fi

    printf '%s\n' "${found[@]}"
}
|
||||
|
||||
# extract_host_from_url URL
# Print the host part of URL: scheme, userinfo, port, path, query and fragment
# are removed. A bracketed IPv6 literal is returned without its brackets.
extract_host_from_url() {
    local url="$1"

    url="${url#*://}"      # drop the scheme
    url="${url%%/*}"       # drop the path
    url="${url%%\?*}"      # drop a query that follows the authority directly
    url="${url%%#*}"       # drop a fragment that follows the authority directly
    url="${url##*@}"       # drop userinfo (user:password@)

    if [[ "${url}" == \[* ]]; then
        # Bracketed IPv6 literal: keep what is between the brackets; the colons
        # inside are part of the address, not a port separator.
        url="${url#\[}"
        url="${url%%\]*}"
    else
        url="${url%%:*}"   # drop the port
    fi

    printf '%s\n' "${url}"
}
|
||||
|
||||
# endpoint_targets_local_host HOST
# Succeed (return 0) when HOST names this machine: either it equals the local
# hostname (full or short form), or one of the IPv4 addresses it resolves to is
# configured locally. Return 1 otherwise, including when either address list is empty.
endpoint_targets_local_host() {
    local host="$1"
    local fqdn
    local short
    local addr
    local -a mine=()
    local -a theirs=()
    local -A is_mine=()

    # Cheap check first: a direct hostname match.
    fqdn="$(hostname -f 2>/dev/null || hostname)"
    short="${fqdn%%.*}"
    case "${host}" in
        "${fqdn}"|"${short}") return 0 ;;
    esac

    while read -r addr; do
        append_unique "${addr}" mine
    done < <(collect_local_ipv4s)

    while read -r addr; do
        append_unique "${addr}" theirs
    done < <(resolve_ipv4s "${host}")

    if (( ${#mine[@]} == 0 || ${#theirs[@]} == 0 )); then
        return 1
    fi

    # Index the local addresses, then look for any resolved address among them.
    for addr in "${mine[@]}"; do
        is_mine["${addr}"]=1
    done
    for addr in "${theirs[@]}"; do
        if [[ -n "${is_mine[${addr}]:-}" ]]; then
            return 0
        fi
    done

    return 1
}
|
||||
|
||||
usage() {
|
||||
cat <<EOF
|
||||
Usage:
|
||||
@ -177,10 +273,9 @@ if [[ "${DEEPFLOW_AGENT_ENABLED}" == "true" && -z "${DEEPFLOW_GRPC_ENDPOINT}" ]]
|
||||
DEEPFLOW_GRPC_ENDPOINT="deepflow-agent.${base_endpoint#*://}:443"
|
||||
fi
|
||||
|
||||
# observability server should bypass external HTTPS ingress for local self-monitoring
|
||||
local_host="$(hostname -f 2>/dev/null || hostname)"
|
||||
local_short="${local_host%%.*}"
|
||||
if [[ "${local_host}" == "observability.svc.plus" || "${local_short}" == "observability" ]]; then
|
||||
collector_host="$(extract_host_from_url "${base_endpoint}")"
|
||||
if endpoint_targets_local_host "${collector_host}"; then
|
||||
log_info "Collector endpoint resolves to this host; using local ingest ports for self-monitoring."
|
||||
if [[ "${METRICS_ENDPOINT_SET}" == "false" ]]; then
|
||||
METRICS_ENDPOINT="http://127.0.0.1:8428/api/v1/write"
|
||||
fi
|
||||
|
||||
@ -30,6 +30,86 @@ log_warn() { echo -e "${YELLOW}[WARN]${NC} $1"; }
|
||||
log_error() { echo -e "${RED}[ERROR]${NC} $1"; }
|
||||
log_ok() { echo -e "${GREEN}[OK]${NC} $1"; }
|
||||
|
||||
# append_unique VALUE ARRAY_NAME
# Append VALUE to the array named ARRAY_NAME unless it is empty or already present.
append_unique() {
    local item="$1"
    local -n array_ref="$2"
    local current

    # Empty values are ignored.
    if [[ -z "${item}" ]]; then
        return 0
    fi

    for current in "${array_ref[@]:-}"; do
        # Already stored: nothing to do.
        [[ "${current}" != "${item}" ]] || return 0
    done

    array_ref+=("${item}")
}
|
||||
|
||||
# collect_local_ipv4s
# Print the IPv4 addresses configured on this machine, one per line, without
# duplicates. Prints nothing when no address is found.
collect_local_ipv4s() {
    local ips=()
    local ip

    if command -v hostname >/dev/null 2>&1; then
        for ip in $(hostname -I 2>/dev/null || true); do
            # hostname -I also lists IPv6 addresses; keep IPv4 only.
            if [[ "${ip}" == *:* ]]; then
                continue
            fi
            append_unique "${ip}" ips
        done
    fi

    if command -v ip >/dev/null 2>&1; then
        while read -r ip; do
            append_unique "${ip}" ips
        done < <(ip -o -4 addr show scope global 2>/dev/null | awk '{print $4}' | cut -d/ -f1)
    fi

    # Guard the expansion: an empty array would otherwise print a blank line
    # (and counts as unbound under `set -u` on older bash releases).
    if [[ ${#ips[@]} -gt 0 ]]; then
        printf '%s\n' "${ips[@]}"
    fi
}
|
||||
|
||||
# resolve_ipv4s HOST
# Print the IPv4 addresses HOST resolves to, one per line, without duplicates.
resolve_ipv4s() {
    local target="$1"
    local addr
    local found=()

    # First choice: getent, reading the address column of each line.
    if command -v getent >/dev/null 2>&1; then
        while read -r addr _; do
            append_unique "${addr}" found
        done < <(getent ahostsv4 "${target}" 2>/dev/null || true)
    fi

    # Fallback: the `host` utility, used only when getent produced nothing.
    if (( ${#found[@]} == 0 )) && command -v host >/dev/null 2>&1; then
        while read -r addr; do
            append_unique "${addr}" found
        done < <(host "${target}" 2>/dev/null | awk '/has address/ {print $4}')
    fi

    printf '%s\n' "${found[@]}"
}
|
||||
|
||||
# domain_points_to_local_host HOST
# Succeed (return 0) when at least one IPv4 address that HOST resolves to is
# configured on this machine. Return 1 otherwise, including when either the
# local or the resolved address list is empty.
domain_points_to_local_host() {
    local host="$1"
    local addr
    local -a mine=()
    local -a theirs=()
    local -A is_mine=()

    while read -r addr; do
        append_unique "${addr}" mine
    done < <(collect_local_ipv4s)

    while read -r addr; do
        append_unique "${addr}" theirs
    done < <(resolve_ipv4s "${host}")

    if (( ${#mine[@]} == 0 || ${#theirs[@]} == 0 )); then
        return 1
    fi

    # Index the local addresses, then look for any resolved address among them.
    for addr in "${mine[@]}"; do
        is_mine["${addr}"]=1
    done
    for addr in "${theirs[@]}"; do
        if [[ -n "${is_mine[${addr}]:-}" ]]; then
            return 0
        fi
    done

    return 1
}
|
||||
|
||||
usage() {
|
||||
cat <<EOF
|
||||
Usage:
|
||||
@ -52,6 +132,10 @@ Examples:
|
||||
curl -fsSL ".../server-install.sh" | bash -s -- observability.svc.plus
|
||||
curl -fsSL ".../server-install.sh" | bash -s -- --action upgrade observability.svc.plus
|
||||
curl -fsSL ".../server-install.sh" | bash -s -- --action reset -y observability.svc.plus
|
||||
|
||||
Notes:
|
||||
DOMAIN is the public ingress domain. The current machine may still be named
|
||||
us-xhttp.svc.plus while serving traffic for observability.svc.plus.
|
||||
EOF
|
||||
}
|
||||
|
||||
@ -151,7 +235,27 @@ run_configure() {
|
||||
else
|
||||
sed -i '/vars:/a \ caddy_enabled: true' pigsty.yml
|
||||
fi
|
||||
|
||||
if grep -q "infra_domain:" pigsty.yml; then
|
||||
sed -i -E "s#^([[:space:]]*infra_domain:).*#\\1 ${DOMAIN}#" pigsty.yml
|
||||
else
|
||||
sed -i "/caddy_enabled:/a\\ infra_domain: ${DOMAIN}" pigsty.yml
|
||||
fi
|
||||
|
||||
if grep -qE '^([[:space:]]*)home[[:space:]]*:[[:space:]]*\{[[:space:]]*domain:' pigsty.yml; then
|
||||
sed -i -E "s#^([[:space:]]*home[[:space:]]*:[[:space:]]*\\{[[:space:]]*domain:[[:space:]]*)[^,}]+(.*)#\\1${DOMAIN}\\2#" pigsty.yml
|
||||
fi
|
||||
fi
|
||||
}
|
||||
|
||||
# check_dns_preflight
# Advisory check only: report whether DOMAIN already resolves to this machine.
# A mismatch produces warnings but does not fail the installation.
check_dns_preflight() {
    if ! domain_points_to_local_host "${DOMAIN}"; then
        log_warn "DNS preflight: ${DOMAIN} does not currently resolve to this host."
        log_warn "Recommended order: update DNS first, then deploy the server on this machine."
        return
    fi

    log_ok "DNS preflight passed: ${DOMAIN} resolves to this host."
    return 0
}
|
||||
|
||||
run_deploy() {
|
||||
@ -184,7 +288,7 @@ location = /ingest/metrics/api/v1/write {
|
||||
proxy_set_header X-Forwarded-Proto $scheme;
|
||||
}
|
||||
|
||||
location = /ingest/logs/loki/api/v1/push {
|
||||
location = /ingest/logs/insert/loki/api/v1/push {
|
||||
proxy_pass http://127.0.0.1:9428/insert/loki/api/v1/push;
|
||||
proxy_set_header Host $http_host;
|
||||
proxy_set_header X-Real-IP $remote_addr;
|
||||
@ -261,6 +365,7 @@ deploy_or_upgrade() {
|
||||
|
||||
ensure_repo
|
||||
ensure_root_ssh_access
|
||||
check_dns_preflight
|
||||
run_bootstrap
|
||||
run_configure
|
||||
run_deploy
|
||||
|
||||
Loading…
Reference in New Issue
Block a user