From fbfa32ca2a8bba932b365b640bb09ea22e0770a2 Mon Sep 17 00:00:00 2001
From: Haitao Pan
Date: Thu, 25 Jun 2026 20:48:20 +0800
Subject: [PATCH] fix(ci): poll on-host bootstrap logs across ssh reconnects

Move the on-host bootstrap step into scripts/run-on-host-bootstrap.sh. The
script starts the installer detached on the target host and polls its log
and exit code over separate ssh calls instead of holding one long session.

---
 .../workflows/deploy-ai-workspace-iac.yaml    |  27 +---
 scripts/run-on-host-bootstrap.sh              | 172 ++++++++++++++++++
 2 files changed, 177 insertions(+), 22 deletions(-)
 create mode 100644 scripts/run-on-host-bootstrap.sh

diff --git a/.github/workflows/deploy-ai-workspace-iac.yaml b/.github/workflows/deploy-ai-workspace-iac.yaml
index 3311867..aea83f1 100644
--- a/.github/workflows/deploy-ai-workspace-iac.yaml
+++ b/.github/workflows/deploy-ai-workspace-iac.yaml
@@ -342,31 +342,14 @@ jobs:
 
       - name: Run on-host bootstrap (curl | bash, local-mode install)
         env:
+          MATRIX_HOST: ${{ matrix.host }}
+          CMDB_PATH: cmdb/cmdb.json
+          SSH_KEY_PATH: ~/.ssh/id_deploy
+          XWORKMATE_BRIDGE_DOMAIN: ${{ github.event.inputs.bridge_domain }}
           DEEPSEEK_API_KEY: ${{ steps.vault.outputs.DEEPSEEK_API_KEY }}
           NVIDIA_API_KEY: ${{ steps.vault.outputs.NVIDIA_API_KEY }}
           OLLAMA_API_KEY: ${{ steps.vault.outputs.OLLAMA_API_KEY }}
-        run: |
-          set -euo pipefail
-          ip="$(jq -r '.["${{ matrix.host }}"].ip' cmdb/cmdb.json)"
-          user="$(jq -r '.["${{ matrix.host }}"].ansible_user // "root"' cmdb/cmdb.json)"
-          # bridge 域名 = operator 覆盖(input) 否则各主机 CMDB service_domains 的首个,
-          # 用作 /etc/hostname 与 xworkmate-bridge.caddy;on-host 模型拿不到 inventory,
-          # 故由流水线作为 XWORKMATE_BRIDGE_DOMAIN env 注入。
-          domain='${{ github.event.inputs.bridge_domain }}'
-          if [ -z "$domain" ]; then
-            domain="$(jq -r '.["${{ matrix.host }}"].host_vars.service_domains // ""' cmdb/cmdb.json | cut -d, -f1 | tr -d ' ')"
-          fi
-          echo "Bootstrapping ${{ matrix.host }} (${user}@${ip}) on-host, domain=${domain:-} ..."
-          ssh -i ~/.ssh/id_deploy \
-            -o StrictHostKeyChecking=accept-new \
-            -o ServerAliveInterval=20 -o ServerAliveCountMax=15 \
-            -o ConnectTimeout=20 \
-            "${user}@${ip}" \
-            "XWORKMATE_BRIDGE_DOMAIN='${domain}' \
-            DEEPSEEK_API_KEY='${DEEPSEEK_API_KEY}' \
-            NVIDIA_API_KEY='${NVIDIA_API_KEY}' \
-            OLLAMA_API_KEY='${OLLAMA_API_KEY}' \
-            bash -lc 'curl -sfL https://install.svc.plus/ai-workspace | bash -'"
+        run: bash scripts/run-on-host-bootstrap.sh
 
   # ---------------------------------------------------------------------------
   dns:
diff --git a/scripts/run-on-host-bootstrap.sh b/scripts/run-on-host-bootstrap.sh
new file mode 100644
index 0000000..bb046a0
--- /dev/null
+++ b/scripts/run-on-host-bootstrap.sh
@@ -0,0 +1,172 @@
+#!/usr/bin/env bash
+#
+# Run the ai-workspace on-host bootstrap on one CMDB host and stream its log.
+#
+# The installer is started detached on the remote host. This script then polls
+# the remote log and exit-code files over separate ssh calls, so a dropped
+# connection does not end the bootstrap or, within limits, this job.
+#
+# Environment:
+#   MATRIX_HOST   (required) key of the target host in the CMDB JSON
+#   CMDB_PATH     CMDB JSON file (default: cmdb/cmdb.json)
+#   SSH_KEY_PATH  ssh identity file (default: $HOME/.ssh/id_deploy)
+#   XWORKMATE_BRIDGE_DOMAIN
+#                 bridge domain; when empty, the first comma-separated entry
+#                 of the host's host_vars.service_domains is used
+#   DEEPSEEK_API_KEY, NVIDIA_API_KEY, OLLAMA_API_KEY
+#                 exported to the installer (default: empty)
+#   GITHUB_RUN_ID, GITHUB_RUN_ATTEMPT
+#                 name the remote work directory (default: manual, 1)
+#   BOOTSTRAP_MAX_POLL_FAILURES
+#                 consecutive failed ssh calls tolerated while polling
+#                 (default: 30)
+set -euo pipefail
+
+cmdb_path=${CMDB_PATH:-cmdb/cmdb.json}
+host=${MATRIX_HOST:?MATRIX_HOST is required}
+ssh_key=${SSH_KEY_PATH:-"$HOME/.ssh/id_deploy"}
+# GITHUB_RUN_ID does not change when a run is re-run, so the attempt number is
+# part of the id. Without it a re-run would find the previous attempt's
+# bootstrap.rc, skip the installer and report that stale result.
+run_id="${GITHUB_RUN_ID:-manual}-${GITHUB_RUN_ATTEMPT:-1}"
+max_poll_failures=${BOOTSTRAP_MAX_POLL_FAILURES:-30}
+case "$max_poll_failures" in
+  *[!0-9]*)
+    echo "::error::BOOTSTRAP_MAX_POLL_FAILURES must be a whole number" >&2
+    exit 1
+    ;;
+esac
+
+ip="$(jq -r --arg host "$host" '.[$host].ip' "$cmdb_path")"
+user="$(jq -r --arg host "$host" '.[$host].ansible_user // "root"' "$cmdb_path")"
+domain="${XWORKMATE_BRIDGE_DOMAIN:-}"
+if [ -z "$domain" ]; then
+  domain="$(jq -r --arg host "$host" '.[$host].host_vars.service_domains // ""' "$cmdb_path" | cut -d, -f1 | tr -d ' ')"
+fi
+
+if [ -z "$ip" ] || [ "$ip" = "null" ]; then
+  echo "::error::No IP found in ${cmdb_path} for ${host}" >&2
+  exit 1
+fi
+
+ssh_opts=(
+  -i "$ssh_key"
+  -o StrictHostKeyChecking=accept-new
+  -o ServerAliveInterval=30
+  -o ServerAliveCountMax=60
+  -o ConnectTimeout=20
+  -o BatchMode=yes
+)
+
+remote_dir="/tmp/xworkspace-bootstrap-${run_id}-${host//[^A-Za-z0-9_.-]/_}"
+remote_env="${remote_dir}/env"
+remote_log="${remote_dir}/bootstrap.log"
+remote_rc="${remote_dir}/bootstrap.rc"
+remote_runner="${remote_dir}/run.sh"
+
+echo "Bootstrapping ${host} (${user}@${ip}) on-host, domain=${domain:-} ..."
+
+remote_payload="$(mktemp)"
+trap 'rm -f "$remote_payload"' EXIT
+
+# %q shell-quotes each value so the remote runner can `source` the file.
+{
+  printf 'XWORKMATE_BRIDGE_DOMAIN=%q\n' "$domain"
+  printf 'DEEPSEEK_API_KEY=%q\n' "${DEEPSEEK_API_KEY:-}"
+  printf 'NVIDIA_API_KEY=%q\n' "${NVIDIA_API_KEY:-}"
+  printf 'OLLAMA_API_KEY=%q\n' "${OLLAMA_API_KEY:-}"
+} > "$remote_payload"
+
+ssh "${ssh_opts[@]}" "${user}@${ip}" "mkdir -p '$remote_dir' && chmod 700 '$remote_dir'"
+scp "${ssh_opts[@]}" "$remote_payload" "${user}@${ip}:${remote_env}" >/dev/null
+ssh "${ssh_opts[@]}" "${user}@${ip}" "chmod 600 '$remote_env'"
+
+# Install the runner on the host. It launches the installer in a background
+# subshell that sends all output to the log file and finally writes the exit code to the rc file.
+ssh "${ssh_opts[@]}" "${user}@${ip}" "cat > '$remote_runner' && chmod 700 '$remote_runner'" <<'REMOTE_SCRIPT'
+#!/usr/bin/env bash
+set -euo pipefail
+remote_env=$1
+remote_log=$2
+remote_rc=$3
+if [ -f "$remote_rc" ]; then
+  exit 0
+fi
+(
+  set +e
+  source "$remote_env"
+  export XWORKMATE_BRIDGE_DOMAIN DEEPSEEK_API_KEY NVIDIA_API_KEY OLLAMA_API_KEY
+  bash -lc 'curl -sfL https://install.svc.plus/ai-workspace | bash -'
+  rc=$?
+  printf '%s\n' "$rc" > "$remote_rc"
+  exit "$rc"
+) > "$remote_log" 2>&1 &
+REMOTE_SCRIPT
+
+ssh "${ssh_opts[@]}" "${user}@${ip}" "nohup '$remote_runner' '$remote_env' '$remote_log' '$remote_rc' >/dev/null 2>&1 &"
+
+# Remote poll command: prints the log line count, then the exit code or RUNNING.
+# The rc file is read before the log is counted, so once an exit code shows up
+# the count already covers everything the installer wrote.
+poll_cmd="rc=RUNNING; if [ -s '$remote_rc' ]; then rc=\$(cat '$remote_rc'); fi"
+poll_cmd+="; n=0; if [ -f '$remote_log' ]; then n=\$(wc -l < '$remote_log'); fi"
+poll_cmd+="; echo \"\$n\"; echo \"\$rc\""
+
+last_lines=0
+poll_failures=0
+
+# Count one failed ssh call; abort once too many have failed in a row.
+note_poll_failure() {
+  poll_failures=$((poll_failures + 1))
+  if [ "$poll_failures" -ge "$max_poll_failures" ]; then
+    echo "::error::Lost contact with ${host}: ${poll_failures} consecutive ssh calls failed." >&2
+    exit 1
+  fi
+  echo "[WARN] ssh to ${host} failed (${poll_failures}/${max_poll_failures}); retrying." >&2
+  sleep 20
+}
+
+while true; do
+  if ! poll_output="$(ssh "${ssh_opts[@]}" "${user}@${ip}" "$poll_cmd" 2>/dev/null)"; then
+    note_poll_failure
+    continue
+  fi
+  line_count="$(printf '%s\n' "$poll_output" | sed -n '1p')"
+  rc_value="$(printf '%s\n' "$poll_output" | sed -n '2p')"
+  case "$line_count" in
+    ''|*[!0-9]*) line_count=0 ;;
+  esac
+
+  if [ "$line_count" -gt "$last_lines" ]; then
+    start=$((last_lines + 1))
+    # Move the cursor only after the lines were fetched, so a dropped
+    # connection re-fetches them on the next round instead of skipping them.
+    if ! new_lines="$(ssh "${ssh_opts[@]}" "${user}@${ip}" "sed -n '${start},${line_count}p' '$remote_log'")"; then
+      note_poll_failure
+      continue
+    fi
+    printf '%s\n' "$new_lines"
+    last_lines="$line_count"
+  elif [ "$rc_value" = "RUNNING" ]; then
+    echo "[INFO] Bootstrap still running on ${host}; no new log lines."
+  fi
+  poll_failures=0
+
+  case "$rc_value" in
+    RUNNING | '') ;;
+    0)
+      echo "[SUCCESS] Bootstrap completed on ${host}."
+      exit 0
+      ;;
+    *[!0-9]*)
+      echo "::error::Bootstrap on ${host} reported an invalid exit code '${rc_value}'." >&2
+      exit 1
+      ;;
+    *)
+      echo "::error::Bootstrap failed on ${host} with exit code ${rc_value}."
+      exit "$rc_value"
+      ;;
+  esac
+
+  sleep 20
+done