ci: harden deploy validation retries

This commit is contained in:
Haitao Pan 2026-04-14 10:56:53 +08:00
parent 425a38f1e8
commit e3bf2063a2
2 changed files with 373 additions and 38 deletions

View File

@ -0,0 +1,226 @@
#!/usr/bin/env bash
# Regression tests for scripts/github-actions/validate-deploy.sh.
# Each test runs the real script against stub `curl`/`sleep` executables and
# inspects the captured output and exit status.
set -euo pipefail

# Repository root: two directory levels above this test file.
ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
readonly ROOT_DIR
# Script under test and the image reference handed to it as its argument.
readonly SCRIPT_PATH="${ROOT_DIR}/scripts/github-actions/validate-deploy.sh"
readonly IMAGE_REF="ghcr.io/x-evor/xworkmate-bridge:425a38f1e8076899400d4a858d4678dffd876afb"

# Per-run state, filled in by run_validate_capture and reset by cleanup_run.
RUN_OUTPUT=""
RUN_STATUS=0
RUN_TMP_DIR=""
RUN_STATE_DIR=""

# fail() exits the script straight away, which skips the cleanup_run call at
# the end of each test. Remove the current scratch directory on every exit
# path so a failing assertion does not leave a temp directory behind.
remove_run_tmp_dir_on_exit() {
  if [[ -n "${RUN_TMP_DIR:-}" && -d "${RUN_TMP_DIR}" ]]; then
    rm -rf -- "${RUN_TMP_DIR}"
  fi
}
trap remove_run_tmp_dir_on_exit EXIT
# Report a test failure and abort the whole script.
# Arguments: $* - words that make up the failure message
# Outputs:   "FAIL: <message>" on stderr
# Exits:     always, with status 1
fail() {
  local message="$*"
  printf 'FAIL: %s\n' "${message}" 1>&2
  exit 1
}
# Assert that a string contains a literal substring.
# Arguments: $1 - text to search
#            $2 - substring that must be present (matched literally)
# Returns:   0 when the substring is found; otherwise hands off to fail
assert_contains() {
  local text="$1"
  local expected="$2"
  case "${text}" in
    *"${expected}"*) return 0 ;;
  esac
  fail "expected output to contain: ${expected}"
}
# Assert that a string does NOT contain a literal substring.
# Arguments: $1 - text to search
#            $2 - substring that must be absent (matched literally)
# Returns:   0 when the substring is absent; otherwise hands off to fail
assert_not_contains() {
  local text="$1"
  local forbidden="$2"
  case "${text}" in
    *"${forbidden}"*) fail "expected output to not contain: ${forbidden}" ;;
  esac
}
# Discard the scratch directory of the previous run and reset run state.
# Globals: RUN_TMP_DIR (read, then cleared), RUN_STATE_DIR, RUN_STATUS and
#          RUN_OUTPUT (reset to their initial values)
cleanup_run() {
  # Only remove when a directory was recorded and it still exists.
  [[ -z "${RUN_TMP_DIR}" || ! -d "${RUN_TMP_DIR}" ]] || rm -rf "${RUN_TMP_DIR}"
  RUN_STATE_DIR=""
  RUN_TMP_DIR=""
  RUN_STATUS=0
  RUN_OUTPUT=""
}
# Generate stub `curl` and `sleep` executables so validate-deploy.sh can run
# without touching the network or actually pausing between retries.
# Arguments:
#   $1 - scenario name. NOTE(review): copied into a local that is never read
#        in this function; the generated curl takes its scenario from the
#        FAKE_CURL_SCENARIO environment variable instead.
#   $2 - directory that receives bin/ (the stubs) and state/ (left empty here)
create_fake_tools() {
local scenario="$1"
local tmp_dir="$2"
mkdir -p "${tmp_dir}/bin" "${tmp_dir}/state"
# Fake curl. The quoted 'EOF' delimiter writes the body verbatim, so every
# expansion below is evaluated when the stub runs, not while generating it.
# Stub behavior:
#   - requires FAKE_CURL_STATE_DIR and FAKE_CURL_SCENARIO in its environment
#   - treats its last argument as the request URL
#   - any call with a non-empty --write-out value prints 200 and exits 0
#   - bridge-timeout: ping, root and acp-server capabilities answer, while the
#     bridge /acp/rpc URL prints a curl (28) timeout to stderr and exits 1
#   - retry-success: /api/ping fails the same way until its third call
#     (counted in <state dir>/ping.count); every other known URL answers
#   - unknown URLs or scenarios print a message to stderr and exit 1
cat >"${tmp_dir}/bin/curl" <<'EOF'
#!/usr/bin/env bash
set -euo pipefail
state_dir="${FAKE_CURL_STATE_DIR:?}"
scenario="${FAKE_CURL_SCENARIO:?}"
url="${@: -1}"
data=""
write_out=""
for ((i = 1; i <= $#; i += 1)); do
arg="${!i}"
case "${arg}" in
--data)
next_index=$((i + 1))
data="${!next_index}"
;;
--write-out)
next_index=$((i + 1))
write_out="${!next_index}"
;;
esac
done
counter_file() {
printf '%s/%s.count\n' "${state_dir}" "$1"
}
read_count() {
local file
file="$(counter_file "$1")"
if [[ -f "${file}" ]]; then
cat "${file}"
return
fi
printf '0\n'
}
bump_count() {
local name="$1"
local file value
file="$(counter_file "${name}")"
value=$(( $(read_count "${name}") + 1 ))
printf '%s\n' "${value}" >"${file}"
printf '%s\n' "${value}"
}
if [[ -n "${write_out}" ]]; then
printf '200'
exit 0
fi
case "${scenario}" in
bridge-timeout)
case "${url}" in
https://xworkmate-bridge.svc.plus/api/ping)
printf '{"status":"ok","image":"ghcr.io/x-evor/xworkmate-bridge:425a38f1e8076899400d4a858d4678dffd876afb","tag":"425a38f1e8076899400d4a858d4678dffd876afb","commit":"425a38f1e8076899400d4a858d4678dffd876afb","version":"425a38f1e8076899400d4a858d4678dffd876afb"}\n'
;;
https://xworkmate-bridge.svc.plus/)
printf 'xworkmate-bridge is running\n'
;;
https://acp-server.svc.plus/*/acp/rpc)
printf '{"jsonrpc":"2.0","result":{"providers":["ok"]}}\n'
;;
https://xworkmate-bridge.svc.plus/acp/rpc)
printf 'curl: (28) Operation timed out after 20001 milliseconds with 0 bytes received\n' >&2
exit 1
;;
*)
printf 'unexpected url in bridge-timeout scenario: %s\n' "${url}" >&2
exit 1
;;
esac
;;
retry-success)
case "${url}" in
https://xworkmate-bridge.svc.plus/api/ping)
ping_attempt="$(bump_count ping)"
if (( ping_attempt < 3 )); then
printf 'curl: (28) Operation timed out after 20001 milliseconds with 0 bytes received\n' >&2
exit 1
fi
printf '{"status":"ok","image":"ghcr.io/x-evor/xworkmate-bridge:425a38f1e8076899400d4a858d4678dffd876afb","tag":"425a38f1e8076899400d4a858d4678dffd876afb","commit":"425a38f1e8076899400d4a858d4678dffd876afb","version":"425a38f1e8076899400d4a858d4678dffd876afb"}\n'
;;
https://xworkmate-bridge.svc.plus/)
printf 'xworkmate-bridge is running\n'
;;
https://acp-server.svc.plus/*/acp/rpc)
printf '{"jsonrpc":"2.0","result":{"providers":["ok"]}}\n'
;;
https://xworkmate-bridge.svc.plus/acp/rpc)
printf '{"jsonrpc":"2.0","result":{"success":true,"output":"pong"}}\n'
;;
*)
printf 'unexpected url in retry-success scenario: %s\n' "${url}" >&2
exit 1
;;
esac
;;
*)
printf 'unsupported fake curl scenario: %s\n' "${scenario}" >&2
exit 1
;;
esac
EOF
# Fake sleep: exits 0 immediately and ignores its arguments.
cat >"${tmp_dir}/bin/sleep" <<'EOF'
#!/usr/bin/env bash
set -euo pipefail
exit 0
EOF
# Both stubs must be executable to be found through PATH.
chmod +x "${tmp_dir}/bin/curl" "${tmp_dir}/bin/sleep"
}
# Run validate-deploy.sh against the stub tools for one scenario and capture
# what happened.
# Arguments: $1 - fake curl scenario name
# Globals:   SCRIPT_PATH, IMAGE_REF (read)
#            RUN_TMP_DIR, RUN_STATE_DIR (set to a fresh scratch directory)
#            RUN_OUTPUT (combined stdout+stderr of the script)
#            RUN_STATUS (exit status of the script)
# Returns:   0 even when the script under test fails; inspect RUN_STATUS
run_validate_capture() {
  local scenario="$1"

  cleanup_run
  RUN_TMP_DIR="$(mktemp -d)"
  RUN_STATE_DIR="${RUN_TMP_DIR}/state"
  create_fake_tools "${scenario}" "${RUN_TMP_DIR}"

  # Record the status with `||` rather than toggling `set +e` / `set -e`:
  # the toggle would switch errexit on for the caller even if it was off.
  RUN_STATUS=0
  RUN_OUTPUT="$(
    PATH="${RUN_TMP_DIR}/bin:${PATH}" \
    FAKE_CURL_SCENARIO="${scenario}" \
    FAKE_CURL_STATE_DIR="${RUN_STATE_DIR}" \
    BRIDGE_SERVER_URL="https://xworkmate-bridge.svc.plus" \
    OPENCLAW_URL="wss://openclaw.svc.plus" \
    CODEX_RPC_URL="https://acp-server.svc.plus/codex/acp/rpc" \
    OPENCODE_RPC_URL="https://acp-server.svc.plus/opencode/acp/rpc" \
    GEMINI_RPC_URL="https://acp-server.svc.plus/gemini/acp/rpc" \
    INTERNAL_SERVICE_TOKEN="test-token" \
    bash "${SCRIPT_PATH}" "${IMAGE_REF}" 2>&1
  )" || RUN_STATUS=$?
}
# Scenario: the bridge /acp/rpc call times out.
# The script must exit non-zero, report the failed bridge rpc request, and
# must not print a JSONDecodeError.
test_bridge_timeout_stops_without_json_decode_noise() {
  local expected_error="bridge rpc https://xworkmate-bridge.svc.plus/acp/rpc request failed"

  run_validate_capture "bridge-timeout"
  [[ "${RUN_STATUS}" -ne 0 ]] || fail "expected bridge-timeout scenario to fail"
  assert_contains "${RUN_OUTPUT}" "${expected_error}"
  assert_not_contains "${RUN_OUTPUT}" "JSONDecodeError"
  cleanup_run
}
# Scenario: /api/ping fails twice, then succeeds.
# The script must exit 0 and the fake curl's ping counter must read exactly 3,
# showing the ping was retried until the third attempt succeeded.
# Globals: RUN_STATUS, RUN_OUTPUT, RUN_STATE_DIR (read after the run)
test_ping_retry_reaches_successful_release_validation() {
  # Keep helper variables local so they do not leak into the script's globals.
  local counter_file ping_attempts

  run_validate_capture "retry-success"
  if [[ "${RUN_STATUS}" -ne 0 ]]; then
    # Show the captured output so the reason for the failure is visible.
    printf '%s\n' "${RUN_OUTPUT}" >&2
    fail "expected retry-success scenario to pass"
  fi

  counter_file="${RUN_STATE_DIR}/ping.count"
  if [[ ! -f "${counter_file}" ]]; then
    fail "expected ping retry counter to be recorded"
  fi

  ping_attempts="$(tr -d '\n' <"${counter_file}")"
  if [[ "${ping_attempts}" != "3" ]]; then
    fail "expected ping to succeed on third attempt, got ${ping_attempts}"
  fi
  cleanup_run
}
# Entry point: run every regression test in order. A failing test exits the
# script through fail, so the summary line is printed only when all pass.
main() {
  test_bridge_timeout_stops_without_json_decode_noise
  test_ping_retry_reaches_successful_release_validation
  printf 'validate-deploy regression tests passed\n'
}
main

View File

@ -2,6 +2,10 @@
set -euo pipefail
IMAGE_REF="${1:?image_ref is required}"
# Exit status the probe helpers return when the curl call itself fails or
# comes back with an empty body; every run_with_retry call lists it as
# retryable.
RETRYABLE_TRANSPORT=10
# Exit status wait_for_release_ping_once returns when /api/ping answered but
# its payload did not pass the release checks; only the ping step retries on it.
RETRYABLE_NOT_READY=11
# curl --max-time (seconds) for the ping, root, capabilities and plain HTTP
# probes.
FAST_HTTP_TIMEOUT_SECONDS=20
# curl --max-time (seconds) for bridge /acp/rpc calls.
BRIDGE_RPC_TIMEOUT_SECONDS=130
normalize_url() {
local value="$1"
@ -55,12 +59,20 @@ OPENCODE_RPC_URL="$(normalize_url "${OPENCODE_RPC_URL:-${5:-https://acp-server.s
GEMINI_RPC_URL="$(normalize_url "${GEMINI_RPC_URL:-${6:-https://acp-server.svc.plus/gemini/acp/rpc}}")"
AUTH_TOKEN="${BRIDGE_AUTH_TOKEN:-${INTERNAL_SERVICE_TOKEN:-${7:-}}}"
curl_common=(
fast_http_curl_common=(
--silent
--show-error
--fail
--location
--max-time 20
--max-time "${FAST_HTTP_TIMEOUT_SECONDS}"
)
bridge_rpc_curl_common=(
--silent
--show-error
--fail
--location
--max-time "${BRIDGE_RPC_TIMEOUT_SECONDS}"
)
auth_headers=()
@ -68,6 +80,8 @@ if [[ -n "${AUTH_TOKEN}" ]]; then
auth_headers+=(-H "Authorization: Bearer ${AUTH_TOKEN}")
fi
# Use explicit assignment guards so transport failures are not swallowed inside
# nested command substitutions when bash runs without inherit_errexit.
capture_http_response() {
local label="$1"
shift
@ -75,18 +89,63 @@ capture_http_response() {
local response
if ! response="$(curl "$@" 2>&1)"; then
printf '%s request failed: %s\n' "${label}" "${response}" >&2
return 1
return "${RETRYABLE_TRANSPORT}"
fi
if [[ -z "${response}" ]]; then
printf '%s request returned an empty response\n' "${label}" >&2
return 1
return "${RETRYABLE_TRANSPORT}"
fi
printf '%s\n' "${response}"
}
probe_jsonrpc_capabilities() {
# Decide whether an exit status is in a comma-separated allow-list.
# Arguments: $1 - exit status to look up
#            $2 - comma-separated list of retryable statuses, e.g. "10,11"
# Returns:   0 when $1 equals one of the listed entries, 1 otherwise
should_retry_exit_code() {
  local exit_code="$1"
  local allowed="$2"
  local candidate
  # Declared local so the parsed list does not leak into the caller's scope.
  local -a candidates=()

  IFS=',' read -r -a candidates <<<"${allowed}"
  # The ${arr[@]+...} form keeps an empty list from tripping `set -u` on
  # bash releases older than 4.4.
  for candidate in ${candidates[@]+"${candidates[@]}"}; do
    if [[ "${exit_code}" == "${candidate}" ]]; then
      return 0
    fi
  done
  return 1
}
# Run a command until it succeeds, retrying only on selected exit statuses.
# Arguments: $1 - label used in the retry log line
#            $2 - maximum number of attempts
#            $3 - seconds to sleep between attempts
#            $4 - comma-separated exit statuses that may be retried
#            $5.. - command (and its arguments) to run
# Outputs:   one "attempt N/M failed" line on stderr before each retry
# Returns:   0 on the first success; otherwise the status of the attempt that
#            ended the loop (non-retryable status or last attempt); 1 when no
#            attempt was made at all
run_with_retry() {
  local label="$1" attempts="$2" sleep_seconds="$3" retryable_codes="$4"
  shift 4

  local attempt=1
  local exit_code
  while (( attempt <= attempts )); do
    exit_code=0
    "$@" || exit_code=$?
    if (( exit_code == 0 )); then
      return 0
    fi

    # Give up on the final attempt or when the status is not retryable.
    if (( attempt == attempts )); then
      return "${exit_code}"
    fi
    should_retry_exit_code "${exit_code}" "${retryable_codes}" || return "${exit_code}"

    printf '%s attempt %d/%d failed; retrying in %ss\n' \
      "${label}" "${attempt}" "${attempts}" "${sleep_seconds}" >&2
    sleep "${sleep_seconds}"
    attempt=$((attempt + 1))
  done
  return 1
}
probe_jsonrpc_capabilities_once() {
local endpoint="$1"
local response
local headers=(
@ -96,19 +155,28 @@ probe_jsonrpc_capabilities() {
headers+=("${auth_headers[@]}")
response="$(
if response="$(
capture_http_response "capabilities ${endpoint}" \
"${curl_common[@]}" \
"${fast_http_curl_common[@]}" \
"${headers[@]}" \
--data '{"jsonrpc":"2.0","id":"cap-1","method":"acp.capabilities"}' \
"${endpoint}"
)"
)"; then
:
else
local exit_code=$?
return "${exit_code}"
fi
RESPONSE_JSON="${response}" python3 - <<'PY'
import json
import os
payload = json.loads(os.environ["RESPONSE_JSON"])
try:
payload = json.loads(os.environ["RESPONSE_JSON"])
except json.JSONDecodeError as exc:
raise SystemExit(f"capabilities response returned invalid JSON: {exc}") from None
if payload.get("jsonrpc") != "2.0":
raise SystemExit("capabilities response missing jsonrpc envelope")
@ -131,18 +199,23 @@ jsonrpc_bridge_call() {
headers+=("${auth_headers[@]}")
response="$(
if response="$(
capture_http_response "bridge rpc ${BASE_URL}/acp/rpc" \
"${curl_common[@]}" \
"${bridge_rpc_curl_common[@]}" \
"${headers[@]}" \
--data "${payload}" \
"${BASE_URL}/acp/rpc"
)"
)"; then
:
else
local exit_code=$?
return "${exit_code}"
fi
printf '%s\n' "${response}"
}
probe_bridge_single_agent_smoke() {
probe_bridge_single_agent_smoke_once() {
local provider_id="$1"
local request_id="smoke-${provider_id}-$(date +%s)"
local session_id="validate-${provider_id}-$(date +%s)"
@ -154,14 +227,22 @@ probe_bridge_single_agent_smoke() {
JSON
)"
response="$(jsonrpc_bridge_call "${payload}")"
if response="$(jsonrpc_bridge_call "${payload}")"; then
:
else
local exit_code=$?
return "${exit_code}"
fi
PROVIDER_ID="${provider_id}" RESPONSE_JSON="${response}" python3 - <<'PY'
import json
import os
provider = os.environ["PROVIDER_ID"]
payload = json.loads(os.environ["RESPONSE_JSON"])
try:
payload = json.loads(os.environ["RESPONSE_JSON"])
except json.JSONDecodeError as exc:
raise SystemExit(f"{provider}: bridge rpc returned invalid JSON: {exc}") from None
if payload.get("jsonrpc") != "2.0":
raise SystemExit(f"{provider}: missing jsonrpc envelope")
@ -205,12 +286,12 @@ probe_safe_http_endpoint() {
--output /dev/null \
--write-out '%{http_code}' \
--location \
--max-time 20 \
--max-time "${FAST_HTTP_TIMEOUT_SECONDS}" \
"${auth_headers[@]}" \
"${endpoint}" 2>&1
)"; then
printf 'HTTP probe failed for %s: %s\n' "${endpoint}" "${status}" >&2
return 1
return "${RETRYABLE_TRANSPORT}"
fi
case "${status}" in
@ -224,20 +305,30 @@ probe_safe_http_endpoint() {
esac
}
ping_json="$(
capture_http_response "bridge ping ${BASE_URL}/api/ping" \
"${curl_common[@]}" \
"${auth_headers[@]}" \
"${BASE_URL}/api/ping"
)"
wait_for_release_ping_once() {
local ping_json
PING_JSON="${ping_json}" python3 - "${image_ref}" "${tag}" "${commit}" "${version}" <<'PY'
if ping_json="$(
capture_http_response "bridge ping ${BASE_URL}/api/ping" \
"${fast_http_curl_common[@]}" \
"${BASE_URL}/api/ping"
)"; then
:
else
local exit_code=$?
return "${exit_code}"
fi
if PING_JSON="${ping_json}" python3 - "${image_ref}" "${tag}" "${commit}" "${version}" <<'PY'
import json
import os
import sys
image_ref, tag, commit, version = sys.argv[1:5]
payload = json.loads(os.environ["PING_JSON"])
try:
payload = json.loads(os.environ["PING_JSON"])
except json.JSONDecodeError as exc:
raise SystemExit(f"bridge ping returned invalid JSON: {exc}") from None
if payload.get("status") != "ok":
raise SystemExit("ping status not ok")
@ -254,19 +345,37 @@ if commit and payload.get("commit") != commit:
if version and payload.get("version") != version:
raise SystemExit(f"expected version {version!r}, got {payload.get('version')!r}")
PY
then
return 0
fi
bridge_root="$(
capture_http_response "bridge root ${BASE_URL}/" \
"${curl_common[@]}" \
"${auth_headers[@]}" \
"${BASE_URL}/"
)"
grep -qi 'xworkmate-bridge' <<<"${bridge_root}"
return "${RETRYABLE_NOT_READY}"
}
# Fetch the bridge root page and check that it identifies the service.
# Globals: BASE_URL, fast_http_curl_common (read)
# Returns: the status of capture_http_response when the fetch fails;
#          otherwise grep's status for a case-insensitive search for
#          'xworkmate-bridge' in the response body
probe_bridge_root() {
  local root_page

  # Guard the assignment explicitly so a failed fetch returns its own status
  # instead of being lost inside the command substitution.
  root_page="$(
    capture_http_response "bridge root ${BASE_URL}/" \
      "${fast_http_curl_common[@]}" \
      "${BASE_URL}/"
  )" || return "$?"

  grep -qi 'xworkmate-bridge' <<<"${root_page}"
}
run_with_retry "bridge ping ${BASE_URL}/api/ping" 6 5 "${RETRYABLE_TRANSPORT},${RETRYABLE_NOT_READY}" wait_for_release_ping_once
probe_bridge_root
probe_safe_http_endpoint "${OPENCLAW_HTTP_PROBE_URL}"
probe_jsonrpc_capabilities "${CODEX_RPC_URL}"
probe_jsonrpc_capabilities "${OPENCODE_RPC_URL}"
probe_jsonrpc_capabilities "${GEMINI_RPC_URL}"
probe_bridge_single_agent_smoke "codex"
probe_bridge_single_agent_smoke "opencode"
probe_bridge_single_agent_smoke "gemini"
run_with_retry "capabilities ${CODEX_RPC_URL}" 3 5 "${RETRYABLE_TRANSPORT}" probe_jsonrpc_capabilities_once "${CODEX_RPC_URL}"
run_with_retry "capabilities ${OPENCODE_RPC_URL}" 3 5 "${RETRYABLE_TRANSPORT}" probe_jsonrpc_capabilities_once "${OPENCODE_RPC_URL}"
run_with_retry "capabilities ${GEMINI_RPC_URL}" 3 5 "${RETRYABLE_TRANSPORT}" probe_jsonrpc_capabilities_once "${GEMINI_RPC_URL}"
run_with_retry "bridge single-agent smoke codex" 3 10 "${RETRYABLE_TRANSPORT}" probe_bridge_single_agent_smoke_once "codex"
run_with_retry "bridge single-agent smoke opencode" 3 10 "${RETRYABLE_TRANSPORT}" probe_bridge_single_agent_smoke_once "opencode"
run_with_retry "bridge single-agent smoke gemini" 3 10 "${RETRYABLE_TRANSPORT}" probe_bridge_single_agent_smoke_once "gemini"