name: Deploy AI Workspace (IaC + Ansible + Cloudflare)

# =============================================================================
# Final deployment pipeline that links IaC with an Ansible dynamic inventory
# (matrix mode).
#
# ── TL;DR prerequisites (all must be in place before the first run) ─────────
#
# 1. Vault JWT auth (one-time, already exists)
#    - auth/jwt mount, oidc_discovery_url = https://token.actions.githubusercontent.com
#
# 2. Vault role + policy (already created)
#    - policy: github-actions-xworkspace-console (reads kv/CICD + kv/openclaw)
#    - role:   github-actions-xworkspace-console (JWT, bound to this repo's OIDC)
#      -> create/verify commands: docs/operations/vault-github-actions.md §2
#
# 3. Required Vault KV keys
#    (vault kv patch kv/CICD ... / vault kv patch kv/openclaw ...)
#    kv/CICD:
#      VULTR_API_KEY              -> Vultr account API key (provision creates hosts)
#      SSH_PRIVATE_DEPLOY_KEY_B64 -> deploy SSH private key, base64 (preferred)
#      SSH_PRIVATE_DEPLOY_KEY     -> same key, raw multi-line form (fallback;
#                                    at least one of the two is required)
#      CLOUDFLARE_DNS_API_TOKEN   -> CF Zone DNS Edit token (required by the dns job)
#      CLOUDFLARE_API_TOKEN       -> legacy name; loaded by the provision job only.
#                                    The dns job reads CLOUDFLARE_DNS_API_TOKEN and
#                                    has no fallback to this key.
#    kv/openclaw:
#      DEEPSEEK_API_KEY           -> LLM provider key (passed to the host by deploy)
#      NVIDIA_API_KEY             -> same
#      OLLAMA_API_KEY             -> same
#
# 4. Required Vault KV keys for the remote S3-compatible state backend.
#    The backend is mandatory: a missing key fails fast and there is no
#    fallback to local state, so that destroy never loses state.
#    kv/CICD:
#      TF_STATE_ENDPOINT -> S3-compatible object storage API URL
#                           (e.g. https://<account-id>.r2.cloudflarestorage.com)
#      TF_STATE_BUCKET   -> bucket name (e.g. ai-workspace-tfstate)
#      TF_STATE_ACCESS_KEY / TF_STATE_SECRET_KEY -> object storage credentials
#      TF_STATE_REGION   -> region (Cloudflare R2 must use "auto"; Vultr uses us-east-1)
#      -> object storage setup guide: docs/operations/iac-prerequisites.md §3
#
# 5. ai-workspace-infra repositories (iac_modules + playbooks)
#    - kv/CICD.CODEX_GITHUB_PERSONAL_ACCESS_TOKEN is documented as an optional
#      checkout token for these repositories.
#    - NOTE(review): this workflow does not load that key and passes no `token:`
#      to actions/checkout, so the checkouts rely on the default token and the
#      repositories must be publicly readable.
#
# 6. SSH public key in the infra hosts.yaml
#    - The public key matching SSH_PRIVATE_DEPLOY_KEY must be listed under
#      ssh_keys[].public in
#      ai-workspace-infra/vultr-vps/config/resources/ai-workspace-hosts.yaml,
#      otherwise the runner cannot SSH into hosts created by Terraform.
#
# 7. AI_WORKSPACE_AUTH_TOKEN (LiteLLM auth token, stored in Vault)
#    - Used as LITELLM_MASTER_KEY for the OpenCode ACP adapter.
#    - Storage: vault kv patch kv/CICD AI_WORKSPACE_AUTH_TOKEN=<token>
#    - Generate: python3 -c 'import uuid; print(uuid.uuid4())'
#    - The deploy job reads it from Vault (unless the workflow input overrides
#      it) and passes it to the bootstrap script as AI_WORKSPACE_AUTH_TOKEN;
#      per the original notes it ends up as the ansible role variable
#      acp_opencode_auth_token.
#
# ── Pipeline structure ──────────────────────────────────────────────────────
#
# provision : batch host creation (switches: terraform_action=apply / run_deploy).
#             Creates hosts from vultr-vps/envs/ai-workspace (Python + Jinja2
#             render explicit HCL, no for_each), exports cmdb.json +
#             inventory.ini and builds the downstream deploy matrix from them.
# deploy    : one matrix job per host, in parallel. SSHes into the host and
#             runs the official bootstrap locally (curl | bash -> on-host
#             ansible -c local, with automatic offline-bundle acceleration).
#             Same path as user self-hosting; all-in-one is not run remotely
#             from the runner (it collides with agent_skills delegate_to
#             localhost).
# dns       : after deploy, syncs Cloudflare DNS from the inventory's
#             service_domains / IP.
#
# Data contract: cmdb.json is produced by ai-workspace-infra's generate.py and
# flows through all three jobs.
#
# Secrets: GitHub Actions Secrets are not used. Everything is read from
# HashiCorp Vault (https://vault.svc.plus) KV, authenticated through GitHub
# OIDC (JWT, no static token).
#   - Vault role: github-actions-xworkspace-console (jwt auth, audience=vault)
#   - KV paths:   kv/data/CICD (shared CICD secrets) + kv/data/openclaw (LLM keys)
#   - Details:    docs/operations/vault-github-actions.md
#                 docs/operations/iac-prerequisites.md
# =============================================================================

on:
  workflow_dispatch:
    inputs:
      infra_ref:
        description: "ai-workspace-infra git ref (iac_modules + playbooks)"
        required: false
        default: "main"
        type: string
      bridge_domain:
        description: "XWORKMATE_BRIDGE_DOMAIN 覆盖(留空则取各主机 CMDB service_domains)"
        required: false
        default: ""
        type: string
      offline_mode:
        description: "on-host 离线包模式: off=在线拉最新 main(默认,离线包落后时用); auto=离线加速; force=强制离线"
        required: false
        default: "off"
        type: choice
        # Quoted on purpose: a bare `off` is a boolean for YAML 1.1 parsers.
        options: ["off", "auto", "force"]
      terraform_action:
        description: "apply 创建/更新,destroy 销毁"
        required: false
        default: "apply"
        type: choice
        options: [apply, destroy]
      run_deploy:
        description: "provision 后是否执行 on-host 引导部署"
        required: false
        default: true
        type: boolean
      run_dns:
        description: "部署后是否同步 Cloudflare DNS"
        required: false
        default: true
        type: boolean
      use_deepseek:
        description: "是否接入 DeepSeek API key"
        required: false
        default: true
        type: boolean
      use_nvidia:
        description: "是否接入 NVIDIA API key"
        required: false
        default: true
        type: boolean
      use_ollama:
        description: "是否接入 Ollama API key"
        required: false
        default: true
        type: boolean
      ai_workspace_auth_token:
        description: "AI Workspace auth token 覆盖(留空则取 Vault kv/CICD/AI_WORKSPACE_AUTH_TOKEN;生成: python3 -c 'import uuid; print(uuid.uuid4())')"
        required: false
        default: ""
        type: string

# id-token: write is needed for Vault's GitHub OIDC (JWT) auth;
# contents: read is needed to check out code.
permissions:
  contents: read
  id-token: write

# One run at a time; a newer run queues instead of cancelling the running one.
concurrency:
  group: deploy-ai-workspace-iac
  cancel-in-progress: false

env:
  VAULT_ADDR: https://vault.svc.plus
  VAULT_ROLE: github-actions-xworkspace-console
  # Shared CICD secrets path (KV v2 read paths contain data/).
  # Key names: docs/operations/vault-github-actions.md
  VAULT_KV: kv/data/CICD
  # LLM provider keys live under the openclaw path.
  VAULT_KV_OPENCLAW: kv/data/openclaw
  # VPS_ROOT is the vultr-vps root (shared scripts/ templates/ config/);
  # ENV_DIR is the directory terraform runs in.
  VPS_ROOT: infra/iac_modules/terraform-hcl-standard/vultr-vps
  ENV_DIR: infra/iac_modules/terraform-hcl-standard/vultr-vps/envs/ai-workspace
  PLAYBOOKS_DIR: infra/playbooks

jobs:
  # ---------------------------------------------------------------------------
  provision:
    name: Provision (terraform + render CMDB)
    runs-on: ubuntu-latest
    # Both outputs come from the `matrix` step, which only runs for `apply`.
    # On `destroy` they are empty strings; downstream jobs must handle that.
    outputs:
      hosts: ${{ steps.matrix.outputs.hosts }}
      count: ${{ steps.matrix.outputs.count }}
    steps:
      - name: Load Vault secrets (OIDC)
        id: vault
        uses: hashicorp/vault-action@v4
        with:
          url: ${{ env.VAULT_ADDR }}
          method: jwt
          role: ${{ env.VAULT_ROLE }}
          jwtGithubAudience: vault
          # Missing keys yield empty outputs; the next step decides which
          # ones are fatal.
          ignoreNotFound: true
          secrets: |
            ${{ env.VAULT_KV }} VULTR_API_KEY | VULTR_API_KEY ;
            ${{ env.VAULT_KV }} TF_STATE_ENDPOINT | TF_STATE_ENDPOINT ;
            ${{ env.VAULT_KV }} TF_STATE_BUCKET | TF_STATE_BUCKET ;
            ${{ env.VAULT_KV }} TF_STATE_ACCESS_KEY | TF_STATE_ACCESS_KEY ;
            ${{ env.VAULT_KV }} TF_STATE_SECRET_KEY | TF_STATE_SECRET_KEY ;
            ${{ env.VAULT_KV }} TF_STATE_REGION | TF_STATE_REGION ;
            ${{ env.VAULT_KV }} CLOUDFLARE_DNS_API_TOKEN | CLOUDFLARE_DNS_API_TOKEN ;
            ${{ env.VAULT_KV }} CLOUDFLARE_API_TOKEN | CLOUDFLARE_API_TOKEN

      - name: Validate required secrets
        env:
          VULTR_API_KEY: ${{ steps.vault.outputs.VULTR_API_KEY }}
          TF_STATE_ENDPOINT: ${{ steps.vault.outputs.TF_STATE_ENDPOINT }}
          TF_STATE_BUCKET: ${{ steps.vault.outputs.TF_STATE_BUCKET }}
          TF_STATE_ACCESS_KEY: ${{ steps.vault.outputs.TF_STATE_ACCESS_KEY }}
          TF_STATE_SECRET_KEY: ${{ steps.vault.outputs.TF_STATE_SECRET_KEY }}
          TF_STATE_REGION: ${{ steps.vault.outputs.TF_STATE_REGION }}
          CLOUDFLARE_DNS_API_TOKEN: ${{ steps.vault.outputs.CLOUDFLARE_DNS_API_TOKEN }}
          CLOUDFLARE_API_TOKEN: ${{ steps.vault.outputs.CLOUDFLARE_API_TOKEN }}
        run: |
          set -euo pipefail
          # Check that REQUIRED secrets are non-empty (values are never
          # printed). The remote S3-compatible state backend is mandatory;
          # there is no fallback to local state. The Cloudflare tokens are
          # not validated here.
          missing=0
          if [ -z "${VULTR_API_KEY:-}" ]; then
            echo "::error::缺少必需机密 VULTR_API_KEY (Vault: ${VAULT_KV}/VULTR_API_KEY)"
            missing=1
          fi
          for k in TF_STATE_ENDPOINT TF_STATE_BUCKET TF_STATE_ACCESS_KEY TF_STATE_SECRET_KEY TF_STATE_REGION; do
            # Bash indirect expansion reads the variable named by $k without
            # eval and without routing the secret through `echo`.
            if [ -z "${!k:-}" ]; then
              echo "::error::缺少必需机密 $k (Vault: ${VAULT_KV}/$k) —— 远端 S3 state 后端为强制要求"
              missing=1
            fi
          done
          [ "$missing" -eq 0 ] || { echo "::error::必需机密缺失,终止 provision"; exit 1; }

      - name: Checkout iac_modules
        uses: actions/checkout@v7
        with:
          repository: ai-workspace-infra/iac_modules
          ref: ${{ github.event.inputs.infra_ref || 'main' }}
          path: infra/iac_modules

      - name: Checkout playbooks
        uses: actions/checkout@v7
        with:
          repository: ai-workspace-infra/playbooks
          ref: ${{ github.event.inputs.infra_ref || 'main' }}
          path: infra/playbooks

      - uses: hashicorp/setup-terraform@v3
        with:
          terraform_version: "1.9.8"

      - uses: actions/setup-python@v6
        with:
          python-version: "3.12"

      - name: Install render deps
        run: pip install --quiet pyyaml jinja2

      - name: Configure remote backend (S3-compatible, required)
        working-directory: ${{ env.ENV_DIR }}
        env:
          TF_STATE_ENDPOINT: ${{ steps.vault.outputs.TF_STATE_ENDPOINT }}
          TF_STATE_REGION: ${{ steps.vault.outputs.TF_STATE_REGION }}
        # Path is quoted so a workspace path containing spaces cannot split
        # the argument.
        run: python3 "$GITHUB_WORKSPACE/${{ env.VPS_ROOT }}/scripts/render_backend_tf.py" backend.tf

      - name: generate.py render (YAML -> 显式 HCL + tfvars)
        working-directory: ${{ env.VPS_ROOT }}
        run: python3 scripts/generate.py render

      - name: Terraform init
        working-directory: ${{ env.ENV_DIR }}
        env:
          AWS_ACCESS_KEY_ID: ${{ steps.vault.outputs.TF_STATE_ACCESS_KEY }}
          AWS_SECRET_ACCESS_KEY: ${{ steps.vault.outputs.TF_STATE_SECRET_KEY }}
          TF_STATE_ENDPOINT: ${{ steps.vault.outputs.TF_STATE_ENDPOINT }}
          TF_STATE_BUCKET: ${{ steps.vault.outputs.TF_STATE_BUCKET }}
          TF_STATE_REGION: ${{ steps.vault.outputs.TF_STATE_REGION }}
        run: |
          set -euo pipefail
          # The remote S3-compatible backend is mandatory (backend.tf was
          # rendered by an earlier step). A missing bucket is fatal; there is
          # no fallback to local state.
          if [ -z "${TF_STATE_BUCKET}" ]; then
            echo "::error::TF_STATE_BUCKET 为空 —— 远端 state 后端为强制要求,终止"
            exit 1
          fi
          terraform init -input=false \
            -backend-config="bucket=${TF_STATE_BUCKET}" \
            -backend-config="key=ai-workspace/terraform.tfstate" \
            -backend-config="region=${TF_STATE_REGION}"

      - name: Terraform ${{ github.event.inputs.terraform_action || 'apply' }}
        working-directory: ${{ env.ENV_DIR }}
        env:
          AWS_ACCESS_KEY_ID: ${{ steps.vault.outputs.TF_STATE_ACCESS_KEY }}
          AWS_SECRET_ACCESS_KEY: ${{ steps.vault.outputs.TF_STATE_SECRET_KEY }}
          TF_VAR_vultr_api_key: ${{ steps.vault.outputs.VULTR_API_KEY }}
          # Passed through the environment instead of being spliced into the
          # script text, so the input can never be parsed as shell syntax.
          TF_ACTION: ${{ github.event.inputs.terraform_action || 'apply' }}
        run: |
          set -euo pipefail
          case "${TF_ACTION}" in
            apply|destroy) ;;
            *) echo "::error::Unsupported terraform_action: ${TF_ACTION}"; exit 1 ;;
          esac
          terraform "${TF_ACTION}" -auto-approve -input=false

      - name: generate.py inventory (terraform output + YAML -> cmdb.json + inventory.ini)
        if: ${{ (github.event.inputs.terraform_action || 'apply') == 'apply' }}
        working-directory: ${{ env.VPS_ROOT }}
        env:
          AWS_ACCESS_KEY_ID: ${{ steps.vault.outputs.TF_STATE_ACCESS_KEY }}
          AWS_SECRET_ACCESS_KEY: ${{ steps.vault.outputs.TF_STATE_SECRET_KEY }}
        run: python3 scripts/generate.py inventory

      - name: Build deploy matrix from cmdb.json
        id: matrix
        if: ${{ (github.event.inputs.terraform_action || 'apply') == 'apply' }}
        working-directory: ${{ env.ENV_DIR }}
        run: |
          set -euo pipefail
          # hosts = JSON array of the top-level keys of cmdb.json;
          # count = number of top-level entries.
          hosts="$(jq -c 'keys' cmdb.json)"
          echo "hosts=${hosts}" >> "$GITHUB_OUTPUT"
          echo "count=$(jq 'length' cmdb.json)" >> "$GITHUB_OUTPUT"
          echo "matrix hosts: ${hosts}"

      - name: Upload CMDB + inventory artifact
        if: ${{ (github.event.inputs.terraform_action || 'apply') == 'apply' }}
        uses: actions/upload-artifact@v7
        with:
          name: ai-workspace-cmdb
          path: |
            ${{ env.ENV_DIR }}/cmdb.json
            ${{ env.ENV_DIR }}/inventory.ini
          if-no-files-found: error

  # ---------------------------------------------------------------------------
  deploy:
    name: Deploy ${{ matrix.host }} (on-host bootstrap)
    needs: provision
    # `count` is an empty string when provision ran `destroy` (the matrix step
    # is skipped). Without the `!= ''` guard this job would be scheduled and
    # then fail evaluating fromJSON('') for the matrix below.
    if: >-
      ${{
        needs.provision.outputs.count != '' &&
        needs.provision.outputs.count != '0' &&
        (github.event.inputs.run_deploy == 'true' || github.event.inputs.run_deploy == null)
      }}
    runs-on: ubuntu-latest
    strategy:
      # One failing host must not cancel the other hosts' deployments.
      fail-fast: false
      matrix:
        host: ${{ fromJSON(needs.provision.outputs.hosts) }}
    steps:
      # all-in-one is designed to run locally on the target host (on-host
      # ansible-playbook -c local, with automatic offline-bundle acceleration).
      # Running it remotely from the runner collides with roles/agent_skills'
      # delegate_to: localhost (it would write to the runner's /root), so
      # deploy SSHes into the host and runs the official bootstrap script
      # there: the same curl | bash path users take when self-hosting.
      - name: Load Vault secrets (OIDC)
        id: vault
        uses: hashicorp/vault-action@v4
        with:
          url: ${{ env.VAULT_ADDR }}
          method: jwt
          role: ${{ env.VAULT_ROLE }}
          jwtGithubAudience: vault
          ignoreNotFound: true
          # AI_WORKSPACE_AUTH_TOKEN is optional: with ignoreNotFound a missing
          # key just produces an empty output.
          secrets: |
            ${{ env.VAULT_KV }} SSH_PRIVATE_DEPLOY_KEY | ANSIBLE_SSH_KEY ;
            ${{ env.VAULT_KV }} SSH_PRIVATE_DEPLOY_KEY_B64 | ANSIBLE_SSH_KEY_B64 ;
            ${{ env.VAULT_KV }} AI_WORKSPACE_AUTH_TOKEN | AI_WORKSPACE_AUTH_TOKEN ;
            ${{ env.VAULT_KV_OPENCLAW }} DEEPSEEK_API_KEY | DEEPSEEK_API_KEY ;
            ${{ env.VAULT_KV_OPENCLAW }} NVIDIA_API_KEY | NVIDIA_API_KEY ;
            ${{ env.VAULT_KV_OPENCLAW }} OLLAMA_API_KEY | OLLAMA_API_KEY

      - name: Report provider key wiring
        run: |
          set -euo pipefail
          echo "DeepSeek: ${{ github.event.inputs.use_deepseek == 'false' && 'skipped' || 'enabled' }}"
          echo "NVIDIA: ${{ github.event.inputs.use_nvidia == 'false' && 'skipped' || 'enabled' }}"
          echo "Ollama: ${{ github.event.inputs.use_ollama == 'false' && 'skipped' || 'enabled' }}"

      - name: Validate required secrets
        env:
          ANSIBLE_SSH_KEY: ${{ steps.vault.outputs.ANSIBLE_SSH_KEY }}
          ANSIBLE_SSH_KEY_B64: ${{ steps.vault.outputs.ANSIBLE_SSH_KEY_B64 }}
          # A provider switched off by its use_* input is blanked here.
          DEEPSEEK_API_KEY: ${{ github.event.inputs.use_deepseek == 'false' && '' || steps.vault.outputs.DEEPSEEK_API_KEY }}
          NVIDIA_API_KEY: ${{ github.event.inputs.use_nvidia == 'false' && '' || steps.vault.outputs.NVIDIA_API_KEY }}
          OLLAMA_API_KEY: ${{ github.event.inputs.use_ollama == 'false' && '' || steps.vault.outputs.OLLAMA_API_KEY }}
        run: |
          set -euo pipefail
          # Only checks that REQUIRED secrets are non-empty (values are never
          # printed).
          missing=0
          # SSH private key: at least one of the base64 and raw forms must be set.
          if [ -z "${ANSIBLE_SSH_KEY_B64:-}" ] && [ -z "${ANSIBLE_SSH_KEY:-}" ]; then
            echo "::error::缺少必需机密 SSH 私钥 (Vault: ${VAULT_KV}/SSH_PRIVATE_DEPLOY_KEY_B64 或 ${VAULT_KV}/SSH_PRIVATE_DEPLOY_KEY,至少一个)"
            missing=1
          fi
          # A provider key is required only while its use_* input is enabled.
          if [ "${{ github.event.inputs.use_deepseek || 'true' }}" = "true" ] && [ -z "${DEEPSEEK_API_KEY:-}" ]; then
            echo "::error::缺少必需机密 DEEPSEEK_API_KEY (Vault: ${VAULT_KV_OPENCLAW}/DEEPSEEK_API_KEY)"
            missing=1
          fi
          if [ "${{ github.event.inputs.use_nvidia || 'true' }}" = "true" ] && [ -z "${NVIDIA_API_KEY:-}" ]; then
            echo "::error::缺少必需机密 NVIDIA_API_KEY (Vault: ${VAULT_KV_OPENCLAW}/NVIDIA_API_KEY)"
            missing=1
          fi
          if [ "${{ github.event.inputs.use_ollama || 'true' }}" = "true" ] && [ -z "${OLLAMA_API_KEY:-}" ]; then
            echo "::error::缺少必需机密 OLLAMA_API_KEY (Vault: ${VAULT_KV_OPENCLAW}/OLLAMA_API_KEY)"
            missing=1
          fi
          [ "$missing" -eq 0 ] || { echo "::error::必需机密缺失,终止 deploy"; exit 1; }

      - name: Checkout xworkspace-console helpers
        uses: actions/checkout@v7

      - name: Download CMDB (host IP source)
        uses: actions/download-artifact@v8
        with:
          name: ai-workspace-cmdb
          path: cmdb

      - name: Configure SSH (prefer base64 key, fall back to raw)
        env:
          ANSIBLE_SSH_KEY: ${{ steps.vault.outputs.ANSIBLE_SSH_KEY }}
          ANSIBLE_SSH_KEY_B64: ${{ steps.vault.outputs.ANSIBLE_SSH_KEY_B64 }}
        run: |
          set -euo pipefail
          mkdir -p ~/.ssh
          # Long-standing convention: decode the single-line *_B64 form first
          # and only then fall back to the raw multi-line key. This avoids the
          # "Load key ... error in libcrypto" failure seen when GitHub Actions
          # handles multi-line private keys.
          if [ -n "${ANSIBLE_SSH_KEY_B64:-}" ]; then
            printf '%s' "${ANSIBLE_SSH_KEY_B64}" | base64 -d > ~/.ssh/id_deploy
          elif [ -n "${ANSIBLE_SSH_KEY:-}" ]; then
            printf '%s\n' "${ANSIBLE_SSH_KEY}" > ~/.ssh/id_deploy
          else
            echo "::error::Vault 未提供 ANSIBLE_SSH_KEY[_B64]"; exit 1
          fi
          chmod 600 ~/.ssh/id_deploy
          # Fails the step if the file is not a parseable private key.
          ssh-keygen -y -f ~/.ssh/id_deploy >/dev/null

      - name: Wait for host SSH
        env:
          # Passed through the environment and jq --arg rather than spliced
          # into the script, so an unusual host key cannot break the shell or
          # the jq program.
          MATRIX_HOST: ${{ matrix.host }}
        run: |
          set -euo pipefail
          ip="$(jq -r --arg host "${MATRIX_HOST}" '.[$host].ip // empty' cmdb/cmdb.json)"
          # Fail fast instead of polling "null:22" for ten minutes.
          if [ -z "${ip}" ]; then
            echo "::error::No IP found for ${MATRIX_HOST} in cmdb/cmdb.json"; exit 1
          fi
          echo "Waiting for ${MATRIX_HOST} (${ip}:22) ..."
          # Up to 60 attempts, 10 s apart, each with a 5 s connect timeout.
          for _ in $(seq 1 60); do
            if nc -z -w 5 "$ip" 22; then echo "SSH up"; exit 0; fi
            sleep 10
          done
          echo "::error::Timed out waiting for ${ip}:22"; exit 1

      - name: Run on-host bootstrap (curl | bash, local-mode install)
        env:
          MATRIX_HOST: ${{ matrix.host }}
          CMDB_PATH: cmdb/cmdb.json
          # NOTE(review): the tilde is passed literally in this value;
          # presumably the script (or ssh -i) expands it. Confirm.
          SSH_KEY_PATH: ~/.ssh/id_deploy
          # Use online mode to pull the latest playbooks while the offline
          # bundle lags behind main (see run-on-host-bootstrap.sh). Once the
          # bundle is republished, `auto` restores offline acceleration.
          AI_WORKSPACE_OFFLINE_MODE: ${{ github.event.inputs.offline_mode || 'off' }}
          XWORKMATE_BRIDGE_DOMAIN: ${{ github.event.inputs.bridge_domain }}
          # The workflow input wins; a blank input falls back to the value
          # read from Vault (kv/CICD AI_WORKSPACE_AUTH_TOKEN), as the input
          # description promises.
          AI_WORKSPACE_AUTH_TOKEN: ${{ github.event.inputs.ai_workspace_auth_token || steps.vault.outputs.AI_WORKSPACE_AUTH_TOKEN }}
          DEEPSEEK_API_KEY: ${{ github.event.inputs.use_deepseek == 'false' && '' || steps.vault.outputs.DEEPSEEK_API_KEY }}
          NVIDIA_API_KEY: ${{ github.event.inputs.use_nvidia == 'false' && '' || steps.vault.outputs.NVIDIA_API_KEY }}
          OLLAMA_API_KEY: ${{ github.event.inputs.use_ollama == 'false' && '' || steps.vault.outputs.OLLAMA_API_KEY }}
        run: |
          set -euo pipefail
          # A token supplied through the workflow input is not a registered
          # secret, so mask it explicitly before anything can log it.
          if [ -n "${AI_WORKSPACE_AUTH_TOKEN:-}" ]; then
            echo "::add-mask::${AI_WORKSPACE_AUTH_TOKEN}"
          fi
          bash scripts/run-on-host-bootstrap.sh

  # ---------------------------------------------------------------------------
  dns:
    name: Sync Cloudflare DNS
    needs: [provision, deploy]
    # The condition has no status-check function, so the implicit success()
    # applies: this job is skipped whenever deploy fails or is skipped
    # (for example run_deploy=false). The `!= ''` guard covers `destroy` runs,
    # where provision publishes no count.
    if: >-
      ${{
        needs.provision.outputs.count != '' &&
        needs.provision.outputs.count != '0' &&
        (github.event.inputs.run_dns == 'true' || github.event.inputs.run_dns == null)
      }}
    runs-on: ubuntu-latest
    steps:
      - name: Load Vault secrets (OIDC)
        id: vault
        uses: hashicorp/vault-action@v4
        with:
          url: ${{ env.VAULT_ADDR }}
          method: jwt
          role: ${{ env.VAULT_ROLE }}
          jwtGithubAudience: vault
          ignoreNotFound: true
          secrets: |
            ${{ env.VAULT_KV }} CLOUDFLARE_DNS_API_TOKEN | CLOUDFLARE_DNS_API_TOKEN

      - name: Validate required secrets
        env:
          CLOUDFLARE_DNS_API_TOKEN: ${{ steps.vault.outputs.CLOUDFLARE_DNS_API_TOKEN }}
        run: |
          set -euo pipefail
          # Only checks that the REQUIRED secret is non-empty (the value is
          # never printed). CLOUDFLARE_DNS_API_TOKEN is the only secret this
          # job loads.
          missing=0
          if [ -z "${CLOUDFLARE_DNS_API_TOKEN:-}" ]; then
            echo "::error::缺少必需机密 CLOUDFLARE_DNS_API_TOKEN (Vault: ${VAULT_KV}/CLOUDFLARE_DNS_API_TOKEN)"
            missing=1
          fi
          [ "$missing" -eq 0 ] || { echo "::error::必需机密缺失,终止 dns"; exit 1; }

      - name: Checkout playbooks
        uses: actions/checkout@v7
        with:
          repository: ai-workspace-infra/playbooks
          ref: ${{ github.event.inputs.infra_ref || 'main' }}
          path: infra/playbooks

      - name: Download CMDB + inventory
        uses: actions/download-artifact@v8
        with:
          name: ai-workspace-cmdb
          path: cmdb

      - uses: actions/setup-python@v6
        with:
          python-version: "3.12"

      - name: Install Ansible
        run: pip install --quiet ansible

      - name: Reconcile Cloudflare DNS from inventory
        working-directory: ${{ env.PLAYBOOKS_DIR }}
        env:
          CLOUDFLARE_DNS_API_TOKEN: ${{ steps.vault.outputs.CLOUDFLARE_DNS_API_TOKEN }}
        run: |
          set -euo pipefail
          # Intent: sync A records only for hosts in the ai_workspace group
          # created by this run (name from each host's service_domains
          # hostvar, content from its public IP) and leave other static
          # records alone. The static record list is passed empty.
          ansible-playbook \
            -i "${GITHUB_WORKSPACE}/cmdb/inventory.ini" \
            update_cloudflare_dns.yml \
            -e '{"cloudflare_dns_source_hosts":["ai_workspace"],"cloudflare_dns_static_records":[]}'