#!/bin/bash
#
# Deploy, upgrade, reset or uninstall the host observability agent:
#   - Node Exporter and Process Exporter expose host/process metrics locally.
#   - Vector scrapes both exporters, tails journald and syslog files, and
#     ships metrics (Prometheus remote write) and logs (Loki) to the
#     configured endpoints.
# Binaries, config and data live under INSTALL_DIR; each component runs as a
# systemd service.
#
# NOTE(review): in the reviewed copy of this file, the text between the start
# of usage() and the tail of version_from_bin() was lost (help text, argument
# parsing, confirm(), ARCH_* detection, endpoint derivation). Those pieces are
# reconstructed below and individually marked "NOTE(review)". Confirm each of
# them against the original before merging.

set -euo pipefail

readonly DEFAULT_ENDPOINT="https://observability.svc.plus/ingest/otlp"
readonly INSTALL_DIR="/opt/observability"
readonly BIN_DIR="${INSTALL_DIR}/bin"
readonly CONFIG_DIR="${INSTALL_DIR}/config"
readonly DATA_DIR="${INSTALL_DIR}/data"

readonly NODE_EXPORTER_VERSION="1.7.0"
readonly PROCESS_EXPORTER_VERSION="0.7.10"
readonly VECTOR_VERSION="0.36.0"

ACTION="deploy"
ENDPOINT="${DEFAULT_ENDPOINT}"
METRICS_ENDPOINT=""
LOGS_ENDPOINT=""
METRICS_ENDPOINT_SET=false
LOGS_ENDPOINT_SET=false
AUTO_YES=false

# Set to true by uninstall_agent when the operator declines the prompt, so
# that "reset" does not redeploy over an installation the user chose to keep.
UNINSTALL_CANCELLED=false

# Architecture tokens used in the release archive names; filled by detect_arch.
ARCH_NODE=""
ARCH_PROCESS=""
ARCH_VECTOR=""

readonly GREEN='\033[0;32m'
readonly BLUE='\033[0;34m'
readonly RED='\033[0;31m'
readonly YELLOW='\033[0;33m'
readonly NC='\033[0m'

# Logging helpers. '%b' expands the colour escapes (and any escapes in the
# message) the same way 'echo -e' does. Problems go to stderr so that stdout
# stays usable for pipelines.
log_info() { printf '%b\n' "${BLUE}[INFO]${NC} $1"; }
log_success() { printf '%b\n' "${GREEN}[SUCCESS]${NC} $1"; }
log_error() { printf '%b\n' "${RED}[ERROR]${NC} $1" >&2; }
log_fail() { printf '%b\n' "${RED}[FAIL]${NC} $1" >&2; }
log_warn() { printf '%b\n' "${YELLOW}[WARN]${NC} $1" >&2; }

#######################################
# Print command-line help to stdout.
# NOTE(review): help text reconstructed; the original wording and option
# names were not recoverable. The action names come from the dispatch at the
# bottom of the script.
#######################################
usage() {
  cat <<EOF
Usage: ${0##*/} [ACTION] [OPTIONS]

Actions:
  deploy | upgrade   Install or upgrade the agent components (default: deploy)
  reset              Uninstall, then deploy again
  uninstall          Remove all agent components

Options:
  --action ACTION            Same as the positional ACTION
  --endpoint URL             Base endpoint (default: ${DEFAULT_ENDPOINT})
  --metrics-endpoint URL     Prometheus remote-write endpoint for metrics
  --logs-endpoint URL        Loki endpoint for logs
  -y, --yes                  Do not ask for confirmation
  -h, --help                 Show this help
EOF
}

#######################################
# Abort with a usage error unless the option in $1 is followed by a value.
# Arguments: the remaining command-line words ($1 = option, $2 = its value)
#######################################
require_option_value() {
  if (( $# < 2 )) || [[ -z "$2" ]]; then
    log_error "Option $1 requires a value"
    usage >&2
    exit 1
  fi
}

#######################################
# Parse the command line into ACTION, ENDPOINT, METRICS_ENDPOINT,
# LOGS_ENDPOINT (plus their *_SET flags) and AUTO_YES.
# NOTE(review): option spellings are reconstructed, not recovered; verify
# them against existing callers of this script.
# Arguments: the script's "$@"
#######################################
parse_args() {
  while (( $# > 0 )); do
    case "$1" in
      -h|--help)
        usage
        exit 0
        ;;
      -y|--yes)
        AUTO_YES=true
        ;;
      --action)
        require_option_value "$@"
        ACTION="$2"
        shift
        ;;
      --endpoint)
        require_option_value "$@"
        ENDPOINT="$2"
        shift
        ;;
      --metrics-endpoint)
        require_option_value "$@"
        METRICS_ENDPOINT="$2"
        METRICS_ENDPOINT_SET=true
        shift
        ;;
      --logs-endpoint)
        require_option_value "$@"
        LOGS_ENDPOINT="$2"
        LOGS_ENDPOINT_SET=true
        shift
        ;;
      -*)
        log_error "Unknown option: $1"
        usage >&2
        exit 1
        ;;
      *)
        # Any bare word is taken as the action; unsupported values are
        # rejected by the dispatch in main.
        ACTION="$1"
        ;;
    esac
    shift
  done
}

#######################################
# Fill METRICS_ENDPOINT / LOGS_ENDPOINT when they were not given explicitly.
# NOTE(review): the original derivation from ENDPOINT was lost. Falling back
# to the base endpoint unchanged is a placeholder -- TODO confirm the real
# metrics and logs paths.
#######################################
resolve_endpoints() {
  if [[ "${METRICS_ENDPOINT_SET}" != true ]]; then
    METRICS_ENDPOINT="${ENDPOINT}"
  fi
  if [[ "${LOGS_ENDPOINT_SET}" != true ]]; then
    LOGS_ENDPOINT="${ENDPOINT}"
  fi
}

#######################################
# Ask the operator a yes/no question; AUTO_YES=true answers yes.
# NOTE(review): reconstructed; only the call site in uninstall_agent is
# visible in the reviewed copy.
# Arguments: $1 - prompt text
# Returns:   0 when confirmed, 1 otherwise (including EOF on stdin)
#######################################
confirm() {
  local prompt="$1"
  local reply=""
  if [[ "${AUTO_YES}" == true ]]; then
    return 0
  fi
  read -r -p "${prompt} [y/N] " reply || return 1
  [[ "${reply}" =~ ^[Yy]([Ee][Ss])?$ ]]
}

#######################################
# Map 'uname -m' to the architecture tokens used in each project's release
# archive names (as they appear in the download URLs below).
# NOTE(review): reconstructed; the original mapping was lost.
# Globals: ARCH_NODE, ARCH_PROCESS, ARCH_VECTOR (written)
#######################################
detect_arch() {
  local machine
  machine="$(uname -m)"
  case "${machine}" in
    x86_64|amd64)
      ARCH_NODE="amd64"
      ARCH_PROCESS="amd64"
      ARCH_VECTOR="x86_64"
      ;;
    aarch64|arm64)
      ARCH_NODE="arm64"
      ARCH_PROCESS="arm64"
      ARCH_VECTOR="aarch64"
      ;;
    *)
      log_error "Unsupported architecture: ${machine}"
      exit 1
      ;;
  esac
}

#######################################
# Print the first match of a version regex in '<binary> --version' output.
# Prints nothing (and still succeeds) when the binary is missing or prints
# no match, so callers can compare against the desired version directly.
# NOTE(review): only the pipeline tail survived in the reviewed copy; the
# executable check and the '--version' flag are reconstructed.
# Arguments: $1 - path to the binary, $2 - extended regex for the version
#######################################
version_from_bin() {
  local bin="$1"
  local regex="$2"
  [[ -x "${bin}" ]] || return 0
  "${bin}" --version 2>/dev/null | grep -Eo "${regex}" | head -n1 || true
}

#######################################
# Install a systemd unit file only when its content differs from what is on
# disk, then reload systemd. The temp file is removed on every path.
# Arguments: $1 - unit name (without ".service"), $2 - full unit content
# Returns:   non-zero if installing the unit or reloading systemd failed
#######################################
write_unit_if_changed() {
  local unit_name="$1"
  local content="$2"
  local unit_path="/etc/systemd/system/${unit_name}.service"
  local tmp_file
  local status=0

  tmp_file="$(mktemp)"
  printf '%s\n' "${content}" > "${tmp_file}"

  if [[ ! -f "${unit_path}" ]] || ! cmp -s "${tmp_file}" "${unit_path}"; then
    # Chain with '&&' and capture the status so the temp file below is
    # cleaned up even when one of the steps fails under 'set -e'.
    {
      install -m 0644 "${tmp_file}" "${unit_path}" \
        && systemctl daemon-reload
    } || status=$?
  fi

  rm -f -- "${tmp_file}"
  return "${status}"
}

#######################################
# Download a .tar.gz release, extract it and install one binary from it.
# The scratch directory is removed whether or not the steps succeed.
# Arguments: $1 - archive URL
#            $2 - file name to save the archive as
#            $3 - path of the binary inside the archive
#            $4 - destination path for the binary
# Returns:   non-zero if the download, extraction or install failed
#######################################
download_tar_binary() {
  local url="$1"
  local archive_name="$2"
  local source_binary_relpath="$3"
  local target_binary="$4"
  local tmp_dir
  local status=0

  tmp_dir="$(mktemp -d)"
  {
    curl -fL --progress-bar "${url}" -o "${tmp_dir}/${archive_name}" \
      && tar -xzf "${tmp_dir}/${archive_name}" -C "${tmp_dir}" \
      && install -m 0755 "${tmp_dir}/${source_binary_relpath}" "${target_binary}"
  } || status=$?
  rm -rf -- "${tmp_dir:?}"

  if (( status != 0 )); then
    log_error "Failed to download or install ${url}"
  fi
  return "${status}"
}

#######################################
# Enable a unit at boot and (re)start it so a new binary or config is picked
# up. 'restart' also starts a stopped unit, so no separate start is needed.
# Arguments: $1 - unit name
#######################################
activate_service() {
  local unit="$1"
  systemctl enable "${unit}"
  systemctl restart "${unit}"
}

#######################################
# Install Node Exporter at NODE_EXPORTER_VERSION (skipping the download when
# the installed binary already reports that version) and run it as a service.
#######################################
install_node_exporter() {
  local current_version
  current_version="$(version_from_bin "${BIN_DIR}/node_exporter" '[0-9]+\.[0-9]+\.[0-9]+')"

  if [[ "${current_version}" != "${NODE_EXPORTER_VERSION}" ]]; then
    log_info "Installing Node Exporter v${NODE_EXPORTER_VERSION} (current: ${current_version:-none})"
    download_tar_binary \
      "https://github.com/prometheus/node_exporter/releases/download/v${NODE_EXPORTER_VERSION}/node_exporter-${NODE_EXPORTER_VERSION}.linux-${ARCH_NODE}.tar.gz" \
      "node_exporter.tar.gz" \
      "node_exporter-${NODE_EXPORTER_VERSION}.linux-${ARCH_NODE}/node_exporter" \
      "${BIN_DIR}/node_exporter"
  else
    log_info "Node Exporter already at desired version ${NODE_EXPORTER_VERSION}"
  fi

  write_unit_if_changed "node_exporter" "[Unit]
Description=Node Exporter
After=network.target

[Service]
User=root
ExecStart=${BIN_DIR}/node_exporter
Restart=always

[Install]
WantedBy=multi-user.target"

  activate_service node_exporter
}

#######################################
# Install Process Exporter at PROCESS_EXPORTER_VERSION, write its config
# (one group per process command name) and run it as a service.
#######################################
install_process_exporter() {
  local current_version
  current_version="$(version_from_bin "${BIN_DIR}/process-exporter" '[0-9]+\.[0-9]+\.[0-9]+')"

  if [[ "${current_version}" != "${PROCESS_EXPORTER_VERSION}" ]]; then
    log_info "Installing Process Exporter v${PROCESS_EXPORTER_VERSION} (current: ${current_version:-none})"
    download_tar_binary \
      "https://github.com/ncabatoff/process-exporter/releases/download/v${PROCESS_EXPORTER_VERSION}/process-exporter-${PROCESS_EXPORTER_VERSION}.linux-${ARCH_PROCESS}.tar.gz" \
      "process_exporter.tar.gz" \
      "process-exporter-${PROCESS_EXPORTER_VERSION}.linux-${ARCH_PROCESS}/process-exporter" \
      "${BIN_DIR}/process-exporter"
  else
    log_info "Process Exporter already at desired version ${PROCESS_EXPORTER_VERSION}"
  fi

  # Quoted delimiter: the template braces must reach the file literally.
  cat > "${CONFIG_DIR}/process-config.yaml" <<'EOF'
process_names:
  - name: "{{.Comm}}"
    cmdline:
      - '.+'
EOF

  write_unit_if_changed "process_exporter" "[Unit]
Description=Process Exporter
After=network.target

[Service]
User=root
ExecStart=${BIN_DIR}/process-exporter -config.path ${CONFIG_DIR}/process-config.yaml
Restart=always

[Install]
WantedBy=multi-user.target"

  activate_service process_exporter
}

#######################################
# Render the Vector configuration. The heredoc delimiter is unquoted on
# purpose: DATA_DIR, METRICS_ENDPOINT and LOGS_ENDPOINT are expanded here,
# while the "{{ ... }}" label templates pass through untouched.
#######################################
write_vector_config() {
  cat > "${CONFIG_DIR}/vector.yaml" <<EOF
data_dir: "${DATA_DIR}/vector"

sources:
  node_exporter:
    type: prometheus_scrape
    endpoints:
      - http://127.0.0.1:9100/metrics
    scrape_interval_secs: 15
  process_exporter:
    type: prometheus_scrape
    endpoints:
      - http://127.0.0.1:9256/metrics
    scrape_interval_secs: 15
  journald:
    type: journald
    current_boot_only: true
  syslog_files:
    type: file
    include:
      - /var/log/syslog
      - /var/log/messages
      - /var/log/auth.log
    read_from: end

transforms:
  add_metric_labels:
    type: remap
    inputs: ["node_exporter", "process_exporter"]
    source: |
      .tags.host = get_hostname!()
      .tags.job = "node"
      .tags.origin = "vector-agent"
  add_log_labels:
    type: remap
    inputs: ["journald", "syslog_files"]
    source: |
      .host = get_hostname!()
      .job = "node"
      .origin = "vector-agent"
      .timestamp = now()

sinks:
  metrics_out:
    type: prometheus_remote_write
    inputs: ["add_metric_labels"]
    endpoint: "${METRICS_ENDPOINT}"
    compression: snappy
    healthcheck: false
  logs_out:
    type: loki
    inputs: ["add_log_labels"]
    endpoint: "${LOGS_ENDPOINT}"
    compression: gzip
    encoding:
      codec: json
    labels:
      host: "{{ host }}"
      job: "{{ job }}"
      origin: "{{ origin }}"
EOF
}

#######################################
# Install Vector at VECTOR_VERSION, render and validate its configuration,
# and run it as a service. Exits 1 when the rendered config is invalid.
#######################################
install_vector() {
  local current_version
  local validation_output
  current_version="$(version_from_bin "${BIN_DIR}/vector" '[0-9]+\.[0-9]+\.[0-9]+')"

  if [[ "${current_version}" != "${VECTOR_VERSION}" ]]; then
    log_info "Installing Vector v${VECTOR_VERSION} (current: ${current_version:-none})"
    download_tar_binary \
      "https://packages.timber.io/vector/${VECTOR_VERSION}/vector-${VECTOR_VERSION}-${ARCH_VECTOR}-unknown-linux-gnu.tar.gz" \
      "vector.tar.gz" \
      "vector-${ARCH_VECTOR}-unknown-linux-gnu/bin/vector" \
      "${BIN_DIR}/vector"
  else
    log_info "Vector already at desired version ${VECTOR_VERSION}"
  fi

  write_vector_config

  # Validate once and keep the output, instead of re-running the validator
  # just to display the error.
  if ! validation_output="$("${BIN_DIR}/vector" validate --no-environment \
    --config-yaml "${CONFIG_DIR}/vector.yaml" 2>&1)"; then
    log_error "Vector config validation failed."
    printf '%s\n' "${validation_output}" >&2
    exit 1
  fi

  write_unit_if_changed "vector" "[Unit]
Description=Vector
Documentation=https://vector.dev
After=network-online.target
Requires=network-online.target

[Service]
User=root
ExecStart=${BIN_DIR}/vector --config ${CONFIG_DIR}/vector.yaml
Restart=always
RestartSec=5
AmbientCapabilities=CAP_NET_BIND_SERVICE
Environment=VECTOR_LOG=info

[Install]
WantedBy=multi-user.target"

  activate_service vector
}

#######################################
# Stop and remove all agent services and delete INSTALL_DIR, after asking
# for confirmation.
# Globals: UNINSTALL_CANCELLED (written: true when the operator declines)
# Returns: 0 both on success and on cancel
#######################################
uninstall_agent() {
  local svc

  if ! confirm "This will uninstall observability agent components. Continue?"; then
    UNINSTALL_CANCELLED=true
    log_info "Cancelled."
    return 0
  fi

  for svc in vector process_exporter node_exporter; do
    # Best effort: the unit may never have been installed.
    systemctl disable --now "${svc}" >/dev/null 2>&1 || true
    rm -f -- "/etc/systemd/system/${svc}.service"
  done
  systemctl daemon-reload

  # ':?' aborts instead of expanding to an empty path.
  rm -rf -- "${INSTALL_DIR:?}"
  log_success "Agent components uninstalled."
}

#######################################
# Report whether the three services are active and whether the exporter
# ports are listening.
# Returns: 0 when every check passed, 1 otherwise
#######################################
verify_installation() {
  local service item port name sockets
  local failures=0

  # Give the freshly (re)started services a moment to come up.
  sleep 2

  log_info "Verifying services..."
  for service in node_exporter process_exporter vector; do
    if systemctl is-active --quiet "${service}"; then
      log_success "Service '${service}' is running"
    else
      log_fail "Service '${service}' is NOT running"
      systemctl status "${service}" --no-pager | head -n 20 || true
      failures=$((failures + 1))
    fi
  done

  log_info "Checking ports..."
  # Capture the socket list first: piping 'ss' straight into 'grep -q' can
  # fail under pipefail when grep exits early and ss receives SIGPIPE.
  sockets="$(ss -tuln 2>/dev/null)" || sockets=""
  for item in "9100 Node Exporter" "9256 Process Exporter"; do
    port="${item%% *}"
    name="${item#* }"
    if grep -q ":${port} " <<<"${sockets}"; then
      log_success "Port ${port} (${name}) is listening"
    else
      log_fail "Port ${port} (${name}) is NOT listening"
      failures=$((failures + 1))
    fi
  done

  (( failures == 0 ))
}

#######################################
# Install or upgrade all components and verify the result.
#######################################
deploy_agent() {
  log_info "Action=${ACTION}"
  log_info "Base endpoint=${ENDPOINT}"
  log_info "Metrics endpoint=${METRICS_ENDPOINT}"
  log_info "Logs endpoint=${LOGS_ENDPOINT}"

  detect_arch
  # Created here, not once at start-up, because "reset" deletes INSTALL_DIR
  # right before deploying again.
  mkdir -p -- "${BIN_DIR}" "${CONFIG_DIR}" "${DATA_DIR}/vector"

  install_node_exporter
  install_process_exporter
  install_vector

  if verify_installation; then
    log_success "Agent deploy/upgrade complete."
  else
    # The exit status is deliberately left unchanged; only the misleading
    # success message is replaced.
    log_warn "Agent deploy/upgrade finished, but verification reported problems (see above)."
  fi
}

#######################################
# Entry point: parse arguments and dispatch on ACTION.
# Arguments: the script's "$@"
#######################################
main() {
  parse_args "$@"
  resolve_endpoints

  case "${ACTION}" in
    deploy|upgrade)
      deploy_agent
      ;;
    reset)
      uninstall_agent
      if [[ "${UNINSTALL_CANCELLED}" == true ]]; then
        # The operator declined the uninstall; do not redeploy on top of
        # the installation they chose to keep.
        log_info "Reset aborted; existing installation left in place."
        return 0
      fi
      deploy_agent
      ;;
    uninstall)
      uninstall_agent
      ;;
    *)
      log_error "Unsupported action: ${ACTION}"
      usage
      exit 1
      ;;
  esac
}

main "$@"