name: Deploy AI Workspace (IaC + Ansible + Cloudflare)

# =============================================================================
# Final deployment pipeline that links IaC with the Ansible dynamic inventory
# (matrix mode).
#
# -- TL;DR prerequisites (all must be in place before the first run) ----------
#
# 1. Vault JWT auth (one-time, already exists)
#    - auth/jwt mount, oidc_discovery_url = https://token.actions.githubusercontent.com
#
# 2. Vault role + policy (already created)
#    - policy: github-actions-xworkspace-console (reads kv/CICD + kv/openclaw)
#    - role:   github-actions-xworkspace-console (JWT, bound to this repo's OIDC)
#    -> create/verify commands: docs/operations/vault-github-actions.md section 2
#
# 3. Required Vault KV keys
#    (vault kv patch kv/CICD ... / vault kv patch kv/openclaw ...)
#    kv/CICD:
#      VULTR_API_KEY              -> Vultr account API key (provision creates hosts)
#      SSH_PRIVATE_DEPLOY_KEY_B64 -> deploy SSH private key, base64 (deploy logs in
#                                    to the host; preferred)
#      SSH_PRIVATE_DEPLOY_KEY     -> same key in raw multi-line form (fallback; one
#                                    of the two is required)
#      CLOUDFLARE_DNS_API_TOKEN   -> Cloudflare Zone DNS Edit token (dns sync)
#    kv/openclaw:
#      DEEPSEEK_API_KEY           -> LLM provider key (injected into the host by deploy)
#      NVIDIA_API_KEY             -> same as above
#      OLLAMA_API_KEY             -> same as above
#
# 4. Optional Vault KV keys (local state when absent; recommended in production
#    so that destroy does not lose the state)
#    kv/CICD:
#      TF_STATE_ENDPOINT -> S3-compatible object storage API URL
#                           (e.g. https://ewr1.vultrobjects.com)
#      TF_STATE_BUCKET   -> bucket name (e.g. ai-workspace-tfstate)
#      TF_STATE_ACCESS_KEY / TF_STATE_SECRET_KEY -> object storage credentials
#      TF_STATE_REGION   -> region (us-east-1 for Vultr; Cloudflare R2 must use auto)
#    -> object storage setup guide: docs/operations/iac-prerequisites.md section 3
#
# 5. ai-workspace-infra private repositories (optional)
#    - kv/CICD.CODEX_GITHUB_PERSONAL_ACCESS_TOKEN -> checkout iac_modules + playbooks
#    - when absent, actions/checkout uses public access (repos must be public)
#    NOTE(review): no step in this workflow loads that key or passes a `token`
#    to actions/checkout, so the checkouts always use the default token.
#    Confirm whether the token wiring is still intended.
#
# 6. SSH public key injected into the infra hosts.yaml
#    - The public key matching SSH_PRIVATE_DEPLOY_KEY must be written to
#      ssh_keys[].public in
#      ai-workspace-infra/vultr-vps/config/resources/ai-workspace-hosts.yaml,
#      otherwise the runner cannot SSH in after Terraform creates the host.
#
# -- Pipeline structure --------------------------------------------------------
#
# provision : bulk host creation (switches: terraform_action=apply / run_deploy).
#             Creates hosts from vultr-vps/envs/ai-workspace (explicit HCL
#             rendered with Python + Jinja2, no for_each), exports cmdb.json +
#             inventory.ini and derives the downstream deploy matrix from them.
# deploy    : one matrix job per host, in parallel; ssh to the host and run the
#             official bootstrap locally (curl | bash -> ansible -c local on
#             the host, with automatic offline-bundle acceleration). Same path
#             as a user self-host install; all-in-one is not run remotely from
#             the runner (it would hit agent_skills delegate_to localhost).
# dns       : after deployment, sync Cloudflare DNS from the inventory's
#             service_domains / IP.
#
# The cmdb.json data contract is produced by generate.py in ai-workspace-infra
# and is shared by all three jobs.
#
# Secret management: GitHub Actions Secrets are not used. Everything is read
# from HashiCorp Vault (https://vault.svc.plus) KV, authenticated through
# GitHub OIDC (JWT, no static token).
#   - Vault role: github-actions-xworkspace-console (jwt auth, audience=vault)
#   - KV paths:   kv/data/CICD (shared CICD secrets) + kv/data/openclaw (LLM keys)
#   - Details:    docs/operations/vault-github-actions.md
#                 docs/operations/iac-prerequisites.md
# =============================================================================

on:
  workflow_dispatch:
    inputs:
      infra_ref:
        description: "ai-workspace-infra git ref (iac_modules + playbooks)"
        required: false
        default: "main"
        type: string
      bridge_domain:
        description: "XWORKMATE_BRIDGE_DOMAIN 覆盖(留空则取各主机 CMDB service_domains)"
        required: false
        default: ""
        type: string
      terraform_action:
        description: "apply 创建/更新,destroy 销毁"
        required: false
        default: "apply"
        type: choice
        options: [apply, destroy]
      run_deploy:
        description: "provision 后是否执行 on-host 引导部署"
        required: false
        default: true
        type: boolean
      run_dns:
        description: "部署后是否同步 Cloudflare DNS"
        required: false
        default: true
        type: boolean

# id-token: write is needed for Vault's GitHub OIDC (JWT) auth;
# contents: read is needed to fetch code.
permissions:
  contents: read
  id-token: write

concurrency:
  group: deploy-ai-workspace-iac
  cancel-in-progress: false

env:
  VAULT_ADDR: https://vault.svc.plus
  VAULT_ROLE: github-actions-xworkspace-console
  # Shared CICD secrets path (KV v2 read paths contain data/).
  # Key names: docs/operations/vault-github-actions.md
  VAULT_KV: kv/data/CICD
  # LLM provider keys live under the openclaw path.
  VAULT_KV_OPENCLAW: kv/data/openclaw
  # VPS_ROOT is the vultr-vps root (shared scripts/ templates/ config/);
  # ENV_DIR is the directory terraform runs in (workdir).
  VPS_ROOT: infra/iac_modules/terraform-hcl-standard/vultr-vps
  ENV_DIR: infra/iac_modules/terraform-hcl-standard/vultr-vps/envs/ai-workspace
  PLAYBOOKS_DIR: infra/playbooks

jobs:
  # ---------------------------------------------------------------------------
  provision:
    name: Provision (terraform + render CMDB)
    runs-on: ubuntu-latest
    # Both outputs are only set when terraform_action is apply; on destroy the
    # matrix step is skipped and they evaluate to empty strings.
    outputs:
      hosts: ${{ steps.matrix.outputs.hosts }}
      count: ${{ steps.matrix.outputs.count }}
    steps:
      - name: Load Vault secrets (OIDC)
        id: vault
        uses: hashicorp/vault-action@v2
        with:
          url: ${{ env.VAULT_ADDR }}
          method: jwt
          role: ${{ env.VAULT_ROLE }}
          jwtGithubAudience: vault
          ignoreNotFound: true
          secrets: |
            ${{ env.VAULT_KV }} VULTR_API_KEY | VULTR_API_KEY ;
            ${{ env.VAULT_KV }} TF_STATE_ENDPOINT | TF_STATE_ENDPOINT ;
            ${{ env.VAULT_KV }} TF_STATE_BUCKET | TF_STATE_BUCKET ;
            ${{ env.VAULT_KV }} TF_STATE_ACCESS_KEY | TF_STATE_ACCESS_KEY ;
            ${{ env.VAULT_KV }} TF_STATE_SECRET_KEY | TF_STATE_SECRET_KEY ;
            ${{ env.VAULT_KV }} TF_STATE_REGION | TF_STATE_REGION

      - name: Validate required secrets
        env:
          VULTR_API_KEY: ${{ steps.vault.outputs.VULTR_API_KEY }}
        run: |
          set -euo pipefail
          # Only check that REQUIRED secrets are non-empty (values are never
          # printed); optional keys (INFRA_REPO_TOKEN / TF_STATE_*) are not
          # checked here.
          missing=0
          if [ -z "${VULTR_API_KEY:-}" ]; then
            echo "::error::缺少必需机密 VULTR_API_KEY (Vault: ${VAULT_KV}/VULTR_API_KEY)"
            missing=1
          fi
          [ "$missing" -eq 0 ] || { echo "::error::必需机密缺失,终止 provision"; exit 1; }

      - name: Checkout iac_modules
        uses: actions/checkout@v4
        with:
          repository: ai-workspace-infra/iac_modules
          ref: ${{ github.event.inputs.infra_ref || 'main' }}
          path: infra/iac_modules

      - name: Checkout playbooks
        uses: actions/checkout@v4
        with:
          repository: ai-workspace-infra/playbooks
          ref: ${{ github.event.inputs.infra_ref || 'main' }}
          path: infra/playbooks

      - uses: hashicorp/setup-terraform@v3
        with:
          terraform_version: "1.9.8"

      - uses: actions/setup-python@v5
        with:
          python-version: "3.12"

      - name: Install render deps
        run: pip install --quiet pyyaml jinja2

      - name: Configure remote backend (optional)
        if: ${{ steps.vault.outputs.TF_STATE_BUCKET != '' }}
        working-directory: ${{ env.ENV_DIR }}
        env:
          TF_STATE_ENDPOINT: ${{ steps.vault.outputs.TF_STATE_ENDPOINT }}
        run: python3 "${GITHUB_WORKSPACE}/${VPS_ROOT}/scripts/render_backend_tf.py" backend.tf

      - name: generate.py render (YAML -> 显式 HCL + tfvars)
        working-directory: ${{ env.VPS_ROOT }}
        run: python3 scripts/generate.py render

      - name: Terraform init
        working-directory: ${{ env.ENV_DIR }}
        env:
          AWS_ACCESS_KEY_ID: ${{ steps.vault.outputs.TF_STATE_ACCESS_KEY }}
          AWS_SECRET_ACCESS_KEY: ${{ steps.vault.outputs.TF_STATE_SECRET_KEY }}
          TF_STATE_ENDPOINT: ${{ steps.vault.outputs.TF_STATE_ENDPOINT }}
          TF_STATE_BUCKET: ${{ steps.vault.outputs.TF_STATE_BUCKET }}
          TF_STATE_REGION: ${{ steps.vault.outputs.TF_STATE_REGION }}
        run: |
          set -euo pipefail
          if [ -n "${TF_STATE_BUCKET}" ]; then
            terraform init -input=false \
              -backend-config="bucket=${TF_STATE_BUCKET}" \
              -backend-config="key=ai-workspace/terraform.tfstate" \
              -backend-config="region=${TF_STATE_REGION:-us-east-1}"
          else
            echo "::warning::未配置远端 state(Vault 无 TF_STATE_BUCKET),使用本地 state(仅适合一次性演示,destroy 需同一次运行)"
            terraform init -input=false
          fi

      - name: Terraform ${{ github.event.inputs.terraform_action || 'apply' }}
        working-directory: ${{ env.ENV_DIR }}
        env:
          # The action is passed through the environment instead of being
          # interpolated into the script text, then checked against an
          # allow-list before use.
          TF_ACTION: ${{ github.event.inputs.terraform_action || 'apply' }}
          TF_VAR_vultr_api_key: ${{ steps.vault.outputs.VULTR_API_KEY }}
          # The S3 backend reads credentials from the environment on every
          # state operation, not only during init. Empty (and unused) when the
          # local backend is in effect.
          AWS_ACCESS_KEY_ID: ${{ steps.vault.outputs.TF_STATE_ACCESS_KEY }}
          AWS_SECRET_ACCESS_KEY: ${{ steps.vault.outputs.TF_STATE_SECRET_KEY }}
        run: |
          set -euo pipefail
          case "${TF_ACTION}" in
            apply|destroy) ;;
            *) echo "::error::unsupported terraform_action: ${TF_ACTION}"; exit 1 ;;
          esac
          terraform "${TF_ACTION}" -auto-approve -input=false

      - name: generate.py inventory (terraform output + YAML -> cmdb.json + inventory.ini)
        if: ${{ (github.event.inputs.terraform_action || 'apply') == 'apply' }}
        working-directory: ${{ env.VPS_ROOT }}
        env:
          # Per the step name this reads `terraform output`, which needs the
          # remote-state credentials as well.
          AWS_ACCESS_KEY_ID: ${{ steps.vault.outputs.TF_STATE_ACCESS_KEY }}
          AWS_SECRET_ACCESS_KEY: ${{ steps.vault.outputs.TF_STATE_SECRET_KEY }}
        run: python3 scripts/generate.py inventory

      - name: Build deploy matrix from cmdb.json
        id: matrix
        if: ${{ (github.event.inputs.terraform_action || 'apply') == 'apply' }}
        working-directory: ${{ env.ENV_DIR }}
        run: |
          set -euo pipefail
          # The top-level keys of cmdb.json are the host names used as the
          # deploy matrix.
          hosts="$(jq -c 'keys' cmdb.json)"
          echo "hosts=${hosts}" >> "$GITHUB_OUTPUT"
          echo "count=$(jq 'length' cmdb.json)" >> "$GITHUB_OUTPUT"
          echo "matrix hosts: ${hosts}"

      - name: Upload CMDB + inventory artifact
        if: ${{ (github.event.inputs.terraform_action || 'apply') == 'apply' }}
        uses: actions/upload-artifact@v4
        with:
          name: ai-workspace-cmdb
          path: |
            ${{ env.ENV_DIR }}/cmdb.json
            ${{ env.ENV_DIR }}/inventory.ini
          if-no-files-found: error

  # ---------------------------------------------------------------------------
  deploy:
    name: Deploy ${{ matrix.host }} (on-host bootstrap)
    needs: provision
    # count is '' when provision did not build a matrix (terraform_action =
    # destroy). Without the explicit '' guard the job would start and fail
    # while evaluating fromJSON('') for the matrix.
    if: >-
      ${{ needs.provision.outputs.count != ''
      && needs.provision.outputs.count != '0'
      && (github.event.inputs.run_deploy == 'true' || github.event.inputs.run_deploy == null) }}
    runs-on: ubuntu-latest
    strategy:
      fail-fast: false
      matrix:
        host: ${{ fromJSON(needs.provision.outputs.hosts) }}
    steps:
      # all-in-one is designed to run locally on the target host (ansible-playbook
      # -c local inside the host, with automatic offline-bundle acceleration).
      # Running it remotely from the runner hits delegate_to: localhost in
      # roles/agent_skills (which writes to the runner's local /root), so deploy
      # instead sshes to the host and runs the official bootstrap script there,
      # the same curl | bash path a self-hosting user takes.
      - name: Load Vault secrets (OIDC)
        id: vault
        uses: hashicorp/vault-action@v2
        with:
          url: ${{ env.VAULT_ADDR }}
          method: jwt
          role: ${{ env.VAULT_ROLE }}
          jwtGithubAudience: vault
          ignoreNotFound: true
          secrets: |
            ${{ env.VAULT_KV }} SSH_PRIVATE_DEPLOY_KEY | ANSIBLE_SSH_KEY ;
            ${{ env.VAULT_KV }} SSH_PRIVATE_DEPLOY_KEY_B64 | ANSIBLE_SSH_KEY_B64 ;
            ${{ env.VAULT_KV_OPENCLAW }} DEEPSEEK_API_KEY | DEEPSEEK_API_KEY ;
            ${{ env.VAULT_KV_OPENCLAW }} NVIDIA_API_KEY | NVIDIA_API_KEY ;
            ${{ env.VAULT_KV_OPENCLAW }} OLLAMA_API_KEY | OLLAMA_API_KEY

      - name: Validate required secrets
        env:
          ANSIBLE_SSH_KEY: ${{ steps.vault.outputs.ANSIBLE_SSH_KEY }}
          ANSIBLE_SSH_KEY_B64: ${{ steps.vault.outputs.ANSIBLE_SSH_KEY_B64 }}
          DEEPSEEK_API_KEY: ${{ steps.vault.outputs.DEEPSEEK_API_KEY }}
          NVIDIA_API_KEY: ${{ steps.vault.outputs.NVIDIA_API_KEY }}
          OLLAMA_API_KEY: ${{ steps.vault.outputs.OLLAMA_API_KEY }}
        run: |
          set -euo pipefail
          # Only check that REQUIRED secrets are non-empty (values are never printed).
          missing=0
          # SSH private key: at least one of the base64 and raw forms must be set.
          if [ -z "${ANSIBLE_SSH_KEY_B64:-}" ] && [ -z "${ANSIBLE_SSH_KEY:-}" ]; then
            echo "::error::缺少必需机密 SSH 私钥 (Vault: ${VAULT_KV}/SSH_PRIVATE_DEPLOY_KEY_B64 或 ${VAULT_KV}/SSH_PRIVATE_DEPLOY_KEY,至少一个)"
            missing=1
          fi
          if [ -z "${DEEPSEEK_API_KEY:-}" ]; then
            echo "::error::缺少必需机密 DEEPSEEK_API_KEY (Vault: ${VAULT_KV_OPENCLAW}/DEEPSEEK_API_KEY)"
            missing=1
          fi
          if [ -z "${NVIDIA_API_KEY:-}" ]; then
            echo "::error::缺少必需机密 NVIDIA_API_KEY (Vault: ${VAULT_KV_OPENCLAW}/NVIDIA_API_KEY)"
            missing=1
          fi
          if [ -z "${OLLAMA_API_KEY:-}" ]; then
            echo "::error::缺少必需机密 OLLAMA_API_KEY (Vault: ${VAULT_KV_OPENCLAW}/OLLAMA_API_KEY)"
            missing=1
          fi
          [ "$missing" -eq 0 ] || { echo "::error::必需机密缺失,终止 deploy"; exit 1; }

      - name: Download CMDB (host IP source)
        uses: actions/download-artifact@v4
        with:
          name: ai-workspace-cmdb
          path: cmdb

      - name: Configure SSH (prefer base64 key, fall back to raw)
        env:
          ANSIBLE_SSH_KEY: ${{ steps.vault.outputs.ANSIBLE_SSH_KEY }}
          ANSIBLE_SSH_KEY_B64: ${{ steps.vault.outputs.ANSIBLE_SSH_KEY_B64 }}
        run: |
          set -euo pipefail
          mkdir -p ~/.ssh
          # Historical convention: decode the single-line *_B64 value first and
          # fall back to the raw multi-line key, to avoid the
          # "Load key ... error in libcrypto" failure seen when GitHub Actions
          # handles multi-line private keys.
          if [ -n "${ANSIBLE_SSH_KEY_B64:-}" ]; then
            printf '%s' "${ANSIBLE_SSH_KEY_B64}" | base64 -d > ~/.ssh/id_deploy
          elif [ -n "${ANSIBLE_SSH_KEY:-}" ]; then
            printf '%s\n' "${ANSIBLE_SSH_KEY}" > ~/.ssh/id_deploy
          else
            echo "::error::Vault 未提供 ANSIBLE_SSH_KEY[_B64]"; exit 1
          fi
          chmod 600 ~/.ssh/id_deploy
          # Fails the step early if the key file cannot be parsed.
          ssh-keygen -y -f ~/.ssh/id_deploy >/dev/null

      - name: Wait for host SSH
        env:
          # Passed through the environment so the host name is data for jq
          # (--arg) rather than part of the script / jq program text.
          TARGET_HOST: ${{ matrix.host }}
        run: |
          set -euo pipefail
          # -e together with `// empty` makes jq exit non-zero when the host
          # has no ip, instead of yielding the literal string "null".
          ip="$(jq -er --arg host "${TARGET_HOST}" '.[$host].ip // empty' cmdb/cmdb.json)" || {
            echo "::error::cmdb.json has no ip for host ${TARGET_HOST}"; exit 1;
          }
          echo "Waiting for ${TARGET_HOST} (${ip}:22) ..."
          # Up to 60 probes, 10 s apart, each with a 5 s connect timeout.
          for _ in $(seq 1 60); do
            if nc -z -w 5 "$ip" 22; then echo "SSH up"; exit 0; fi
            sleep 10
          done
          echo "::error::Timed out waiting for ${ip}:22"; exit 1

      - name: Run on-host bootstrap (curl | bash, local-mode install)
        env:
          TARGET_HOST: ${{ matrix.host }}
          # Operator-supplied free text: never interpolate it into the script.
          BRIDGE_DOMAIN_INPUT: ${{ github.event.inputs.bridge_domain }}
          DEEPSEEK_API_KEY: ${{ steps.vault.outputs.DEEPSEEK_API_KEY }}
          NVIDIA_API_KEY: ${{ steps.vault.outputs.NVIDIA_API_KEY }}
          OLLAMA_API_KEY: ${{ steps.vault.outputs.OLLAMA_API_KEY }}
        run: |
          set -euo pipefail
          ip="$(jq -er --arg host "${TARGET_HOST}" '.[$host].ip // empty' cmdb/cmdb.json)" || {
            echo "::error::cmdb.json has no ip for host ${TARGET_HOST}"; exit 1;
          }
          user="$(jq -r --arg host "${TARGET_HOST}" '.[$host].ansible_user // "root"' cmdb/cmdb.json)"
          # Bridge domain = operator override (input), otherwise the first entry
          # of the host's CMDB service_domains. It is used for /etc/hostname and
          # xworkmate-bridge.caddy; the on-host model has no access to the
          # inventory, so the pipeline injects it as XWORKMATE_BRIDGE_DOMAIN.
          domain="${BRIDGE_DOMAIN_INPUT:-}"
          if [ -z "$domain" ]; then
            domain="$(jq -r --arg host "${TARGET_HOST}" '.[$host].host_vars.service_domains // ""' cmdb/cmdb.json | cut -d, -f1 | tr -d ' ')"
          fi
          echo "Bootstrapping ${TARGET_HOST} (${user}@${ip}) on-host, domain=${domain:-} ..."
          # The remote script is streamed over stdin to a login bash:
          #   - %q shell-quotes every value, so quotes or metacharacters in the
          #     domain or in a key cannot break out of the assignment;
          #   - the secrets never appear in the remote command line;
          #   - pipefail makes a failed installer download fail the step
          #     instead of feeding an empty script to bash and exiting 0.
          # curl's stdin is redirected so it cannot consume the script stream.
          {
            printf 'export XWORKMATE_BRIDGE_DOMAIN=%q\n' "${domain}"
            printf 'export DEEPSEEK_API_KEY=%q\n' "${DEEPSEEK_API_KEY}"
            printf 'export NVIDIA_API_KEY=%q\n' "${NVIDIA_API_KEY}"
            printf 'export OLLAMA_API_KEY=%q\n' "${OLLAMA_API_KEY}"
            printf '%s\n' 'set -o pipefail'
            printf '%s\n' 'curl -sfL https://install.svc.plus/ai-workspace </dev/null | bash -'
          } | ssh -i ~/.ssh/id_deploy \
            -o StrictHostKeyChecking=accept-new \
            -o ServerAliveInterval=20 -o ServerAliveCountMax=15 \
            -o ConnectTimeout=20 \
            "${user}@${ip}" \
            'bash -l -s'

  # ---------------------------------------------------------------------------
  dns:
    name: Sync Cloudflare DNS
    needs: [provision, deploy]
    # No status-check function is used, so the default success() check still
    # applies: this job is skipped whenever deploy is skipped or has failed.
    # The '' guard mirrors the deploy job (count is empty on destroy).
    if: >-
      ${{ needs.provision.outputs.count != ''
      && needs.provision.outputs.count != '0'
      && (github.event.inputs.run_dns == 'true' || github.event.inputs.run_dns == null) }}
    runs-on: ubuntu-latest
    steps:
      - name: Load Vault secrets (OIDC)
        id: vault
        uses: hashicorp/vault-action@v2
        with:
          url: ${{ env.VAULT_ADDR }}
          method: jwt
          role: ${{ env.VAULT_ROLE }}
          jwtGithubAudience: vault
          ignoreNotFound: true
          secrets: |
            ${{ env.VAULT_KV }} CLOUDFLARE_DNS_API_TOKEN | CLOUDFLARE_DNS_API_TOKEN

      - name: Validate required secrets
        env:
          CLOUDFLARE_DNS_API_TOKEN: ${{ steps.vault.outputs.CLOUDFLARE_DNS_API_TOKEN }}
        run: |
          set -euo pipefail
          # Only check that REQUIRED secrets are non-empty (values are never
          # printed); INFRA_REPO_TOKEN is optional and not checked.
          missing=0
          if [ -z "${CLOUDFLARE_DNS_API_TOKEN:-}" ]; then
            echo "::error::缺少必需机密 CLOUDFLARE_DNS_API_TOKEN (Vault: ${VAULT_KV}/CLOUDFLARE_DNS_API_TOKEN)"
            missing=1
          fi
          [ "$missing" -eq 0 ] || { echo "::error::必需机密缺失,终止 dns"; exit 1; }

      - name: Checkout playbooks
        uses: actions/checkout@v4
        with:
          repository: ai-workspace-infra/playbooks
          ref: ${{ github.event.inputs.infra_ref || 'main' }}
          path: infra/playbooks

      - name: Download CMDB + inventory
        uses: actions/download-artifact@v4
        with:
          name: ai-workspace-cmdb
          path: cmdb

      - uses: actions/setup-python@v5
        with:
          python-version: "3.12"

      - name: Install Ansible
        run: pip install --quiet ansible

      - name: Reconcile Cloudflare DNS from inventory
        working-directory: ${{ env.PLAYBOOKS_DIR }}
        env:
          CLOUDFLARE_DNS_API_TOKEN: ${{ steps.vault.outputs.CLOUDFLARE_DNS_API_TOKEN }}
        run: |
          set -euo pipefail
          # Only sync A records for the hosts of the ai_workspace group created
          # by this run (name from each host's service_domains hostvar, content
          # from its public IP); other static records are left untouched.
          ansible-playbook \
            -i "${GITHUB_WORKSPACE}/cmdb/inventory.ini" \
            update_cloudflare_dns.yml \
            -e '{"cloudflare_dns_source_hosts":["ai_workspace"],"cloudflare_dns_static_records":[]}'