fix(ci): poll on-host bootstrap logs across ssh reconnects

This commit is contained in:
Haitao Pan 2026-06-25 20:48:20 +08:00
parent cd630c45d5
commit fbfa32ca2a
2 changed files with 106 additions and 22 deletions

View File

@@ -342,31 +342,14 @@ jobs:
# Bootstrap step for one matrix host. It exports the target host, CMDB path,
# SSH key path, optional bridge-domain override and three API keys taken from
# the outputs of the step with id `vault`.
- name: Run on-host bootstrap (curl | bash, local-mode install)
env:
# Key of the target host inside the CMDB JSON (one job per matrix entry).
MATRIX_HOST: ${{ matrix.host }}
# Location of the CMDB JSON file, relative to the workspace.
CMDB_PATH: cmdb/cmdb.json
# NOTE(review): the value is a literal string, so "~" is not expanded here;
# expansion is presumably left to whatever consumes the path — confirm.
SSH_KEY_PATH: ~/.ssh/id_deploy
# Operator-supplied override from the workflow input; may be empty.
XWORKMATE_BRIDGE_DOMAIN: ${{ github.event.inputs.bridge_domain }}
DEEPSEEK_API_KEY: ${{ steps.vault.outputs.DEEPSEEK_API_KEY }}
NVIDIA_API_KEY: ${{ steps.vault.outputs.NVIDIA_API_KEY }}
OLLAMA_API_KEY: ${{ steps.vault.outputs.OLLAMA_API_KEY }}
# NOTE(review): this is a rendered diff whose +/- markers were lost. The
# inline `run: |` script below appears to be the REMOVED version (the commit
# summary reports 22 deletions) and the final `run: scripts/...` line its
# replacement. A real step can only carry one `run:` key — confirm against
# the repository before copying from this view.
run: |
set -euo pipefail
ip="$(jq -r '.["${{ matrix.host }}"].ip' cmdb/cmdb.json)"
user="$(jq -r '.["${{ matrix.host }}"].ansible_user // "root"' cmdb/cmdb.json)"
# Bridge domain = operator override (workflow input), otherwise the first
# entry of the host's CMDB service_domains. It is used for /etc/hostname and
# xworkmate-bridge.caddy; the on-host model cannot read the inventory, so the
# pipeline injects it as the XWORKMATE_BRIDGE_DOMAIN env var.
domain='${{ github.event.inputs.bridge_domain }}'
if [ -z "$domain" ]; then
domain="$(jq -r '.["${{ matrix.host }}"].host_vars.service_domains // ""' cmdb/cmdb.json | cut -d, -f1 | tr -d ' ')"
fi
echo "Bootstrapping ${{ matrix.host }} (${user}@${ip}) on-host, domain=${domain:-<none>} ..."
ssh -i ~/.ssh/id_deploy \
-o StrictHostKeyChecking=accept-new \
-o ServerAliveInterval=20 -o ServerAliveCountMax=15 \
-o ConnectTimeout=20 \
"${user}@${ip}" \
"XWORKMATE_BRIDGE_DOMAIN='${domain}' \
DEEPSEEK_API_KEY='${DEEPSEEK_API_KEY}' \
NVIDIA_API_KEY='${NVIDIA_API_KEY}' \
OLLAMA_API_KEY='${OLLAMA_API_KEY}' \
bash -lc 'curl -sfL https://install.svc.plus/ai-workspace | bash -'"
# Replacement: the same work is delegated to a checked-in script, which reads
# its inputs from the env block above.
run: scripts/run-on-host-bootstrap.sh
# ---------------------------------------------------------------------------
dns:

View File

@@ -0,0 +1,101 @@
#!/usr/bin/env bash
# Run the on-host bootstrap installer on a single CMDB host over SSH.
#
# The installer is started in the background on the host and its log is polled
# over short-lived ssh connections, so a dropped connection does not abort it.
#
# Environment:
#   MATRIX_HOST              (required) key of the target host in the CMDB JSON
#   CMDB_PATH                CMDB JSON file (default: cmdb/cmdb.json)
#   SSH_KEY_PATH             private key for ssh/scp (default: $HOME/.ssh/id_deploy)
#   GITHUB_RUN_ID            used to name the remote working directory
#                            (default: "manual")
#   GITHUB_RUN_ATTEMPT       appended to the run id when set (see below)
#   XWORKMATE_BRIDGE_DOMAIN  optional override; otherwise the first entry of the
#                            host's host_vars.service_domains is used
#   DEEPSEEK_API_KEY, NVIDIA_API_KEY, OLLAMA_API_KEY
#                            forwarded to the installer; empty when unset
set -euo pipefail

cmdb_path=${CMDB_PATH:-cmdb/cmdb.json}
host=${MATRIX_HOST:?MATRIX_HOST is required}
ssh_key=${SSH_KEY_PATH:-"$HOME/.ssh/id_deploy"}

# GITHUB_RUN_ID stays the same when a workflow run is re-run, so a directory
# keyed on it alone would still contain the previous attempt's bootstrap.rc and
# the re-run would report that stale result instead of bootstrapping again.
# Including the attempt number gives every attempt its own directory.
run_id=${GITHUB_RUN_ID:-manual}
if [ -n "${GITHUB_RUN_ATTEMPT:-}" ]; then
  run_id="${run_id}-${GITHUB_RUN_ATTEMPT}"
fi

# Resolve the connection details first and fail fast when the host is unknown.
ip="$(jq -r --arg host "$host" '.[$host].ip' "$cmdb_path")"
if [ -z "$ip" ] || [ "$ip" = "null" ]; then
  echo "::error::No IP found in ${cmdb_path} for ${host}" >&2
  exit 1
fi
user="$(jq -r --arg host "$host" '.[$host].ansible_user // "root"' "$cmdb_path")"

# Bridge domain: explicit override wins, otherwise the first comma-separated
# entry of service_domains with spaces removed (may end up empty).
domain="${XWORKMATE_BRIDGE_DOMAIN:-}"
if [ -z "$domain" ]; then
  domain="$(jq -r --arg host "$host" '.[$host].host_vars.service_domains // ""' "$cmdb_path" | cut -d, -f1 | tr -d ' ')"
fi

# Shared by every ssh/scp call. BatchMode makes authentication problems fail
# immediately instead of waiting for a password prompt nobody can answer.
ssh_opts=(
  -i "$ssh_key"
  -o StrictHostKeyChecking=accept-new
  -o ServerAliveInterval=30
  -o ServerAliveCountMax=60
  -o ConnectTimeout=20
  -o BatchMode=yes
)

# Per-run working directory on the host; the host name is reduced to
# [A-Za-z0-9_.-] so it is safe inside a single-quoted remote path.
remote_dir="/tmp/xworkspace-bootstrap-${run_id}-${host//[^A-Za-z0-9_.-]/_}"
remote_env="${remote_dir}/env"
remote_log="${remote_dir}/bootstrap.log"
remote_rc="${remote_dir}/bootstrap.rc"
remote_runner="${remote_dir}/run.sh"

echo "Bootstrapping ${host} (${user}@${ip}) on-host, domain=${domain:-<none>} ..."

# Stage the installer's environment in a local temp file that is removed on
# exit. %q shell-quotes each value so the remote `source` restores it verbatim.
remote_payload="$(mktemp)"
trap 'rm -f "$remote_payload"' EXIT
{
  printf 'XWORKMATE_BRIDGE_DOMAIN=%q\n' "$domain"
  printf 'DEEPSEEK_API_KEY=%q\n' "${DEEPSEEK_API_KEY:-}"
  printf 'NVIDIA_API_KEY=%q\n' "${NVIDIA_API_KEY:-}"
  printf 'OLLAMA_API_KEY=%q\n' "${OLLAMA_API_KEY:-}"
} > "$remote_payload"

# Create the private working directory, upload the env file, then restrict it.
ssh "${ssh_opts[@]}" "${user}@${ip}" "mkdir -p '$remote_dir' && chmod 700 '$remote_dir'"
scp "${ssh_opts[@]}" "$remote_payload" "${user}@${ip}:${remote_env}" >/dev/null
ssh "${ssh_opts[@]}" "${user}@${ip}" "chmod 600 '$remote_env'"
# Install the runner script on the host. It takes three arguments (env file,
# log file, rc file), starts the installer in a background subshell whose
# output goes to the log file, and records the installer's exit status in the
# rc file so that later ssh sessions can tell when it finished.
ssh "${ssh_opts[@]}" "${user}@${ip}" "cat > '$remote_runner' && chmod 700 '$remote_runner'" <<'REMOTE_SCRIPT'
#!/usr/bin/env bash
set -euo pipefail
remote_env=$1
remote_log=$2
remote_rc=$3
# An rc file means this directory already holds a finished bootstrap: do not
# start a second one, but still remove the freshly uploaded secrets file.
if [ -f "$remote_rc" ]; then
  rm -f "$remote_env"
  exit 0
fi
(
  # errexit off: the installer's status must be captured, not abort the subshell.
  set +e
  source "$remote_env"
  export XWORKMATE_BRIDGE_DOMAIN DEEPSEEK_API_KEY NVIDIA_API_KEY OLLAMA_API_KEY
  # The values now live in this process's environment; do not leave the API
  # keys lying around on disk after the run.
  rm -f "$remote_env"
  # pipefail: without it a failed download (curl -f) hands empty input to
  # `bash -`, which exits 0 and would be recorded as a successful bootstrap.
  bash -lc 'set -o pipefail; curl -sfL https://install.svc.plus/ai-workspace | bash -'
  rc=$?
  # Write-then-rename so a concurrent reader never sees an empty rc file.
  printf '%s\n' "$rc" > "${remote_rc}.tmp"
  mv -f "${remote_rc}.tmp" "$remote_rc"
  exit "$rc"
) > "$remote_log" 2>&1 &
REMOTE_SCRIPT

# Launch the runner detached from this ssh session. All three standard streams
# are redirected so nothing ties the background job to the connection.
ssh "${ssh_opts[@]}" "${user}@${ip}" "nohup '$remote_runner' '$remote_env' '$remote_log' '$remote_rc' >/dev/null 2>&1 </dev/null &"
# Poll the remote log and rc file until the bootstrap reports an exit status.
# Every poll uses a fresh ssh connection, so connection drops are tolerated.
#
# BOOTSTRAP_MAX_POLL_FAILURES (default 60, 0 = unlimited) bounds the number of
# CONSECUTIVE failed polls, so an unreachable host ends the job with an error
# instead of looping until the CI job itself times out.
last_lines=0
poll_failures=0
max_poll_failures="${BOOTSTRAP_MAX_POLL_FAILURES:-60}"
case "$max_poll_failures" in
  ''|*[!0-9]*) max_poll_failures=60 ;;
esac

# One remote command reports both values with a prefix per line. The rc file is
# read BEFORE the log is counted: once an rc is seen, the line count taken
# afterwards covers the complete log, so no trailing lines are missed.
remote_poll="if [ -f '$remote_rc' ]; then printf 'rc=%s\n' \"\$(cat '$remote_rc')\"; else echo 'rc=RUNNING'; fi; if [ -f '$remote_log' ]; then printf 'lines=%s\n' \"\$(wc -l < '$remote_log')\"; else echo 'lines=0'; fi"

while true; do
  rc_value=""
  line_count=""
  finished=false
  poll_ok=false

  if poll_output="$(ssh "${ssh_opts[@]}" "${user}@${ip}" "$remote_poll" 2>/dev/null)"; then
    poll_ok=true
    while IFS= read -r poll_line; do
      case "$poll_line" in
        rc=*) rc_value="${poll_line#rc=}" ;;
        lines=*) line_count="${poll_line#lines=}" ;;
      esac
    done <<<"$poll_output"
    # Some wc implementations pad the count with blanks.
    line_count="${line_count//[[:space:]]/}"
    case "$line_count" in
      ''|*[!0-9]*) line_count=0 ;;
    esac
    if [ -n "$rc_value" ] && [ "$rc_value" != "RUNNING" ]; then
      finished=true
    fi
  fi

  if [ "$poll_ok" = true ]; then
    if [ "$finished" = true ] || [ "$line_count" -gt "$last_lines" ]; then
      start=$((last_lines + 1))
      # On completion print through end-of-file so a last line without a
      # trailing newline (not counted by wc -l) is still shown.
      if [ "$finished" = true ]; then
        range_end='$'
      else
        range_end="$line_count"
      fi
      # Advance the cursor only when the fetch succeeded; otherwise the same
      # range is requested again on the next poll instead of being lost.
      if ssh "${ssh_opts[@]}" "${user}@${ip}" "sed -n '${start},${range_end}p' '$remote_log'"; then
        last_lines="$line_count"
      else
        poll_ok=false
      fi
    else
      echo "[INFO] Bootstrap still running on ${host}; no new log lines."
    fi
  fi

  if [ "$poll_ok" = true ]; then
    poll_failures=0
    if [ "$finished" = true ]; then
      if [ "$rc_value" = "0" ]; then
        echo "[SUCCESS] Bootstrap completed on ${host}."
        exit 0
      fi
      echo "::error::Bootstrap failed on ${host} with exit code ${rc_value}."
      # `exit` needs a number; fall back to 1 for anything unexpected.
      case "$rc_value" in
        *[!0-9]*) exit 1 ;;
        *) exit "$rc_value" ;;
      esac
    fi
  else
    poll_failures=$((poll_failures + 1))
    echo "[WARN] Could not poll ${host} over ssh (${poll_failures} consecutive failure(s)); retrying." >&2
    if [ "$max_poll_failures" -gt 0 ] && [ "$poll_failures" -ge "$max_poll_failures" ]; then
      echo "::error::Lost contact with ${host} after ${poll_failures} consecutive failed polls." >&2
      exit 1
    fi
  fi

  sleep 20
done