observability.svc.plus/scripts/agent-install.sh

451 lines
12 KiB
Bash
Executable File

#!/bin/bash
set -euo pipefail
DEFAULT_ENDPOINT="https://observability.svc.plus/ingest/otlp"
INSTALL_DIR="/opt/observability"
BIN_DIR="${INSTALL_DIR}/bin"
CONFIG_DIR="${INSTALL_DIR}/config"
DATA_DIR="${INSTALL_DIR}/data"
NODE_EXPORTER_VERSION="1.7.0"
PROCESS_EXPORTER_VERSION="0.7.10"
VECTOR_VERSION="0.36.0"
ACTION="deploy"
ENDPOINT="${DEFAULT_ENDPOINT}"
METRICS_ENDPOINT=""
LOGS_ENDPOINT=""
METRICS_ENDPOINT_SET=false
LOGS_ENDPOINT_SET=false
AUTO_YES=false
GREEN='\033[0;32m'
BLUE='\033[0;34m'
RED='\033[0;31m'
YELLOW='\033[0;33m'
NC='\033[0m'
log_info() { echo -e "${BLUE}[INFO]${NC} $1"; }
log_success() { echo -e "${GREEN}[SUCCESS]${NC} $1"; }
log_error() { echo -e "${RED}[ERROR]${NC} $1"; }
log_fail() { echo -e "${RED}[FAIL]${NC} $1"; }
log_warn() { echo -e "${YELLOW}[WARN]${NC} $1"; }
usage() {
cat <<EOF
Usage:
bash agent-install.sh [options]
Actions (default: deploy):
--action deploy Deploy or upgrade components
--action upgrade Alias of deploy
--action reset Uninstall then reinstall components
--action uninstall Remove agent components
Options:
--endpoint URL Base ingest endpoint (default: ${DEFAULT_ENDPOINT})
--metrics-endpoint URL Prometheus remote_write endpoint (optional override)
--logs-endpoint URL Loki push endpoint (optional override)
-y, --yes Non-interactive mode
-h, --help Show help
Example:
curl -fsSL .../agent-install.sh | bash -s -- --endpoint https://observability.svc.plus/ingest/otlp
EOF
}
confirm() {
local prompt="$1"
if [[ "${AUTO_YES}" == "true" ]]; then
return 0
fi
read -r -p "${prompt} [y/N] " reply
[[ "${reply}" =~ ^[Yy]$ ]]
}
while [[ $# -gt 0 ]]; do
case "$1" in
--action)
ACTION="$2"
shift 2
;;
--action=*)
ACTION="${1#*=}"
shift
;;
--endpoint)
ENDPOINT="$2"
shift 2
;;
--endpoint=*)
ENDPOINT="${1#*=}"
shift
;;
--metrics-endpoint)
METRICS_ENDPOINT="$2"
METRICS_ENDPOINT_SET=true
shift 2
;;
--metrics-endpoint=*)
METRICS_ENDPOINT="${1#*=}"
METRICS_ENDPOINT_SET=true
shift
;;
--logs-endpoint)
LOGS_ENDPOINT="$2"
LOGS_ENDPOINT_SET=true
shift 2
;;
--logs-endpoint=*)
LOGS_ENDPOINT="${1#*=}"
LOGS_ENDPOINT_SET=true
shift
;;
-y|--yes)
AUTO_YES=true
shift
;;
-h|--help)
usage
exit 0
;;
*)
log_error "Unknown parameter: $1"
usage
exit 1
;;
esac
done
base_endpoint="${ENDPOINT%/}"
if [[ "${base_endpoint}" == */ingest/otlp* ]]; then
base_endpoint="${base_endpoint%%/ingest/otlp*}"
fi
if [[ -z "${METRICS_ENDPOINT}" ]]; then
METRICS_ENDPOINT="${base_endpoint}/ingest/metrics/api/v1/write"
fi
if [[ -z "${LOGS_ENDPOINT}" ]]; then
LOGS_ENDPOINT="${base_endpoint}/ingest/logs/insert"
fi
# observability server should bypass external HTTPS ingress for local self-monitoring
local_host="$(hostname -f 2>/dev/null || hostname)"
local_short="${local_host%%.*}"
if [[ "${local_host}" == "observability.svc.plus" || "${local_short}" == "observability" ]]; then
if [[ "${METRICS_ENDPOINT_SET}" == "false" ]]; then
METRICS_ENDPOINT="http://127.0.0.1:8428/api/v1/write"
fi
if [[ "${LOGS_ENDPOINT_SET}" == "false" ]]; then
LOGS_ENDPOINT="http://127.0.0.1:9428/insert"
fi
fi
if [[ $EUID -ne 0 ]]; then
log_error "This script must be run as root"
exit 1
fi
ARCH="$(uname -m)"
case "${ARCH}" in
x86_64)
ARCH_NODE="amd64"
ARCH_PROCESS="amd64"
ARCH_VECTOR="x86_64"
;;
aarch64|arm64)
ARCH_NODE="arm64"
ARCH_PROCESS="arm64"
ARCH_VECTOR="aarch64"
;;
*)
log_error "Unsupported architecture: ${ARCH}"
exit 1
;;
esac
mkdir -p "${BIN_DIR}" "${CONFIG_DIR}" "${DATA_DIR}" "${DATA_DIR}/vector"
version_from_bin() {
local bin="$1"
local regex="$2"
if [[ ! -x "${bin}" ]]; then
return 1
fi
"${bin}" --version 2>/dev/null | grep -Eo "${regex}" | head -n1 || true
}
write_unit_if_changed() {
local unit_name="$1"
local content="$2"
local unit_path="/etc/systemd/system/${unit_name}.service"
local tmp_file
tmp_file="$(mktemp)"
printf "%s\n" "${content}" > "${tmp_file}"
if [[ ! -f "${unit_path}" ]] || ! cmp -s "${tmp_file}" "${unit_path}"; then
install -m 0644 "${tmp_file}" "${unit_path}"
systemctl daemon-reload
fi
rm -f "${tmp_file}"
}
download_tar_binary() {
local url="$1"
local archive_name="$2"
local source_binary_relpath="$3"
local target_binary="$4"
local tmp_dir
tmp_dir="$(mktemp -d)"
curl -fL --progress-bar "${url}" -o "${tmp_dir}/${archive_name}"
tar -xzf "${tmp_dir}/${archive_name}" -C "${tmp_dir}"
install -m 0755 "${tmp_dir}/${source_binary_relpath}" "${target_binary}"
rm -rf "${tmp_dir}"
}
install_node_exporter() {
local current_version
current_version="$(version_from_bin "${BIN_DIR}/node_exporter" '[0-9]+\.[0-9]+\.[0-9]+')"
if [[ "${current_version}" != "${NODE_EXPORTER_VERSION}" ]]; then
log_info "Installing Node Exporter v${NODE_EXPORTER_VERSION} (current: ${current_version:-none})"
download_tar_binary \
"https://github.com/prometheus/node_exporter/releases/download/v${NODE_EXPORTER_VERSION}/node_exporter-${NODE_EXPORTER_VERSION}.linux-${ARCH_NODE}.tar.gz" \
"node_exporter.tar.gz" \
"node_exporter-${NODE_EXPORTER_VERSION}.linux-${ARCH_NODE}/node_exporter" \
"${BIN_DIR}/node_exporter"
else
log_info "Node Exporter already at desired version ${NODE_EXPORTER_VERSION}"
fi
write_unit_if_changed "node_exporter" "[Unit]
Description=Node Exporter
After=network.target
[Service]
User=root
ExecStart=${BIN_DIR}/node_exporter
Restart=always
[Install]
WantedBy=multi-user.target"
systemctl enable --now node_exporter
systemctl restart node_exporter
}
install_process_exporter() {
local current_version
current_version="$(version_from_bin "${BIN_DIR}/process-exporter" '[0-9]+\.[0-9]+\.[0-9]+')"
if [[ "${current_version}" != "${PROCESS_EXPORTER_VERSION}" ]]; then
log_info "Installing Process Exporter v${PROCESS_EXPORTER_VERSION} (current: ${current_version:-none})"
download_tar_binary \
"https://github.com/ncabatoff/process-exporter/releases/download/v${PROCESS_EXPORTER_VERSION}/process-exporter-${PROCESS_EXPORTER_VERSION}.linux-${ARCH_PROCESS}.tar.gz" \
"process_exporter.tar.gz" \
"process-exporter-${PROCESS_EXPORTER_VERSION}.linux-${ARCH_PROCESS}/process-exporter" \
"${BIN_DIR}/process-exporter"
else
log_info "Process Exporter already at desired version ${PROCESS_EXPORTER_VERSION}"
fi
cat <<EOF > "${CONFIG_DIR}/process-config.yaml"
process_names:
- name: "{{.Comm}}"
cmdline:
- '.+'
EOF
write_unit_if_changed "process_exporter" "[Unit]
Description=Process Exporter
After=network.target
[Service]
User=root
ExecStart=${BIN_DIR}/process-exporter -config.path ${CONFIG_DIR}/process-config.yaml
Restart=always
[Install]
WantedBy=multi-user.target"
systemctl enable --now process_exporter
systemctl restart process_exporter
}
write_vector_config() {
cat <<EOF > "${CONFIG_DIR}/vector.yaml"
data_dir: "${DATA_DIR}/vector"
sources:
node_exporter:
type: prometheus_scrape
endpoints:
- http://127.0.0.1:9100/metrics
scrape_interval_secs: 15
process_exporter:
type: prometheus_scrape
endpoints:
- http://127.0.0.1:9256/metrics
scrape_interval_secs: 15
journald:
type: journald
current_boot_only: true
syslog_files:
type: file
include:
- /var/log/syslog
- /var/log/messages
- /var/log/auth.log
read_from: end
transforms:
add_metric_labels:
type: remap
inputs: ["node_exporter", "process_exporter"]
source: |
.tags.host = get_hostname!()
.tags.job = "node"
.tags.origin = "vector-agent"
add_log_labels:
type: remap
inputs: ["journald", "syslog_files"]
source: |
.host = get_hostname!()
.job = "node"
.origin = "vector-agent"
.timestamp = now()
sinks:
metrics_out:
type: prometheus_remote_write
inputs: ["add_metric_labels"]
endpoint: "${METRICS_ENDPOINT}"
compression: snappy
healthcheck: false
logs_out:
type: loki
inputs: ["add_log_labels"]
endpoint: "${LOGS_ENDPOINT}"
compression: gzip
encoding:
codec: json
labels:
host: "{{ host }}"
job: "{{ job }}"
origin: "{{ origin }}"
EOF
}
install_vector() {
local current_version
current_version="$(version_from_bin "${BIN_DIR}/vector" '[0-9]+\.[0-9]+\.[0-9]+')"
if [[ "${current_version}" != "${VECTOR_VERSION}" ]]; then
log_info "Installing Vector v${VECTOR_VERSION} (current: ${current_version:-none})"
download_tar_binary \
"https://packages.timber.io/vector/${VECTOR_VERSION}/vector-${VECTOR_VERSION}-${ARCH_VECTOR}-unknown-linux-gnu.tar.gz" \
"vector.tar.gz" \
"vector-${ARCH_VECTOR}-unknown-linux-gnu/bin/vector" \
"${BIN_DIR}/vector"
else
log_info "Vector already at desired version ${VECTOR_VERSION}"
fi
write_vector_config
if ! "${BIN_DIR}/vector" validate --no-environment --config-yaml "${CONFIG_DIR}/vector.yaml" >/dev/null 2>&1; then
log_error "Vector config validation failed."
"${BIN_DIR}/vector" validate --no-environment --config-yaml "${CONFIG_DIR}/vector.yaml" || true
exit 1
fi
write_unit_if_changed "vector" "[Unit]
Description=Vector
Documentation=https://vector.dev
After=network-online.target
Requires=network-online.target
[Service]
User=root
ExecStart=${BIN_DIR}/vector --config ${CONFIG_DIR}/vector.yaml
Restart=always
RestartSec=5
AmbientCapabilities=CAP_NET_BIND_SERVICE
Environment=VECTOR_LOG=info
[Install]
WantedBy=multi-user.target"
systemctl enable --now vector
systemctl restart vector
}
uninstall_agent() {
confirm "This will uninstall observability agent components. Continue?" || {
log_info "Cancelled."
return 0
}
for svc in vector process_exporter node_exporter; do
systemctl disable --now "${svc}" >/dev/null 2>&1 || true
rm -f "/etc/systemd/system/${svc}.service"
done
systemctl daemon-reload
rm -rf "${INSTALL_DIR}"
log_success "Agent components uninstalled."
}
verify_installation() {
sleep 2
log_info "Verifying services..."
for service in node_exporter process_exporter vector; do
if systemctl is-active --quiet "${service}"; then
log_success "Service '${service}' is running"
else
log_fail "Service '${service}' is NOT running"
systemctl status "${service}" --no-pager | head -n 20 || true
fi
done
log_info "Checking ports..."
for item in "9100 Node Exporter" "9256 Process Exporter"; do
local port name
port="${item%% *}"
name="${item#* }"
if ss -tuln | grep -q ":${port} "; then
log_success "Port ${port} (${name}) is listening"
else
log_fail "Port ${port} (${name}) is NOT listening"
fi
done
}
deploy_agent() {
log_info "Action=${ACTION}"
log_info "Base endpoint=${ENDPOINT}"
log_info "Metrics endpoint=${METRICS_ENDPOINT}"
log_info "Logs endpoint=${LOGS_ENDPOINT}"
install_node_exporter
install_process_exporter
install_vector
verify_installation
log_success "Agent deploy/upgrade complete."
}
case "${ACTION}" in
deploy|upgrade)
deploy_agent
;;
reset)
uninstall_agent
deploy_agent
;;
uninstall)
uninstall_agent
;;
*)
log_error "Unsupported action: ${ACTION}"
usage
exit 1
;;
esac