xworkspace-console/.github/workflows/deploy-ai-workspace-iac.yaml

445 lines
20 KiB
YAML
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# Workflow display name.
name: Deploy AI Workspace (IaC + Ansible + Cloudflare)
# =============================================================================
# IaC ↔ Ansible 动态 inventory 联动的最终部署流水线(矩阵模式)
#
# ── TLDR 前置条件(首次运行必须全部就绪)────────────────────────────────────
#
# 1. Vault JWT auth(一次性,已存在)
#    - auth/jwt 挂载,oidc_discovery_url = https://token.actions.githubusercontent.com
#
# 2. Vault role + policy(已创建)
#    - policy: github-actions-xworkspace-console(读 kv/CICD + kv/openclaw)
#    - role:   github-actions-xworkspace-console(JWT,bound 本仓库 OIDC)
#    → 创建/校验命令见 docs/operations/vault-github-actions.md §2
#
# 3. Vault KV 必填键(vault kv patch kv/CICD ... / vault kv patch kv/openclaw ...)
#    kv/CICD:
#      VULTR_API_KEY              → Vultr 账号 API key(provision 创主机)
#      SSH_PRIVATE_DEPLOY_KEY_B64 → 部署 SSH 私钥 base64(deploy 登录主机,优先)
#      SSH_PRIVATE_DEPLOY_KEY     → 同上,原始多行格式(回退,二选一必填)
#      CLOUDFLARE_DNS_API_TOKEN   → CF Zone DNS Edit token(dns 同步)
#    kv/openclaw:
#      DEEPSEEK_API_KEY → LLM provider key(deploy 注入主机)
#      NVIDIA_API_KEY   → 同上
#      OLLAMA_API_KEY   → 同上
#
# 4. Vault KV 必填键(远端 S3 兼容 state 后端强制启用,缺失即 fail-fast;
#    不再回退本地 state,确保 destroy 不丢 state)
#    kv/CICD:
#      TF_STATE_ENDPOINT → S3 兼容对象存储 API URL(如 https://<acct>.r2.cloudflarestorage.com)
#      TF_STATE_BUCKET   → bucket 名(如 ai-workspace-tfstate)
#      TF_STATE_ACCESS_KEY / TF_STATE_SECRET_KEY → 对象存储凭据
#      TF_STATE_REGION   → 地域(Cloudflare R2 必须填 auto;Vultr 填 us-east-1)
#    → 对象存储搭建指南见 docs/operations/iac-prerequisites.md §3
#
# 5. ai-workspace-infra 私有仓库(可选加速)
#    - kv/CICD.CODEX_GITHUB_PERSONAL_ACCESS_TOKEN → checkout iac_modules + playbooks
#    - 不填则 actions/checkout 走公开访问(仓库须为 public)
#    - 注意:本文件当前的 checkout 步骤并未读取或传入该 token,实际仅支持 public 仓库
#
# 6. SSH 公钥注入 infra hosts.yaml
#    - SSH_PRIVATE_DEPLOY_KEY 对应的公钥须写入
#      ai-workspace-infra/vultr-vps/config/resources/ai-workspace-hosts.yaml
#      的 ssh_keys[].public,否则 Terraform 创机后 runner 无法 SSH 登录。
#
# ── 流水线结构 ───────────────────────────────────────────────────────────────
#
# provision : 批量起机(模式开关:terraform_action=apply / run_deploy)。
#             用 vultr-vps/envs/ai-workspace 创建主机(Python+Jinja2 渲染显式
#             HCL,无 for_each),导出 cmdb.json + inventory.ini,并据此动态
#             生成下游部署矩阵。
# deploy    : 矩阵按主机并行,ssh 到主机本地跑官方引导(curl|bash → host 内部
#             ansible -c local,自动离线包加速)。与用户 self-host 同一路径;
#             不在 runner 远程跑 all-in-one(会撞 agent_skills delegate_to localhost)。
# dns       : 部署完成后,依据 inventory 的 service_domains/IP 同步 Cloudflare DNS。
#
# 数据契约 cmdb.json 由 ai-workspace-infra 的 generate.py 产出,贯穿三个 job。
#
# 密钥管理:不使用 GitHub Actions Secrets,统一从 HashiCorp Vault
# (https://vault.svc.plus) KV 安全获取,认证走 GitHub OIDC(JWT),无静态 token:
#   - Vault 角色: github-actions-xworkspace-console (jwt auth, audience=vault)
#   - KV 路径: kv/data/CICD(共享 CICD 机密) + kv/data/openclaw(LLM keys)
#   - 详细说明: docs/operations/vault-github-actions.md
#               docs/operations/iac-prerequisites.md
# =============================================================================
# Manual trigger only; every knob of the pipeline is a workflow_dispatch input.
on:
  workflow_dispatch:
    inputs:
      # Git ref used when checking out the infra repositories.
      infra_ref:
        type: string
        required: false
        default: "main"
        description: "ai-workspace-infra git ref (iac_modules + playbooks)"
      # Forwarded to the bootstrap script as XWORKMATE_BRIDGE_DOMAIN.
      bridge_domain:
        type: string
        required: false
        default: ""
        description: "XWORKMATE_BRIDGE_DOMAIN 覆盖(留空则取各主机 CMDB service_domains)"
      # Forwarded to the bootstrap script as AI_WORKSPACE_OFFLINE_MODE.
      # The values stay quoted so that "off" is a string, not a YAML 1.1 boolean.
      offline_mode:
        type: choice
        required: false
        default: "off"
        options:
          - "off"
          - "auto"
          - "force"
        description: "on-host 离线包模式: off=在线拉最新 main(默认,离线包落后时用); auto=离线加速; force=强制离线"
      # Terraform sub-command executed by the provision job.
      terraform_action:
        type: choice
        required: false
        default: "apply"
        options:
          - apply
          - destroy
        description: "apply 创建/更新destroy 销毁"
      # Gates the deploy job.
      run_deploy:
        type: boolean
        required: false
        default: true
        description: "provision 后是否执行 on-host 引导部署"
      # Gates the dns job.
      run_dns:
        type: boolean
        required: false
        default: true
        description: "部署后是否同步 Cloudflare DNS"
# Token scopes for every job:
#   id-token: write -> lets a job request the GitHub OIDC JWT used to log in to Vault
#   contents: read  -> enough to check out code
permissions:
  id-token: write
  contents: read
# All runs share one concurrency group, and a newer run never cancels the one
# that is already in progress.
concurrency:
  cancel-in-progress: false
  group: deploy-ai-workspace-iac
# Workflow-wide environment shared by all jobs.
env:
  # Vault endpoint and the JWT-auth role the jobs log in with.
  VAULT_ADDR: 'https://vault.svc.plus'
  VAULT_ROLE: 'github-actions-xworkspace-console'
  # Shared CICD secrets (KV v2 read paths include "data/").
  # Key names are listed in docs/operations/vault-github-actions.md.
  VAULT_KV: 'kv/data/CICD'
  # LLM provider keys live under the openclaw path.
  VAULT_KV_OPENCLAW: 'kv/data/openclaw'
  # vultr-vps root (holds the shared scripts/ templates/ config/ directories).
  VPS_ROOT: 'infra/iac_modules/terraform-hcl-standard/vultr-vps'
  # Directory terraform is run from.
  ENV_DIR: 'infra/iac_modules/terraform-hcl-standard/vultr-vps/envs/ai-workspace'
  # Checkout location of the Ansible playbooks.
  PLAYBOOKS_DIR: 'infra/playbooks'
jobs:
  # ---------------------------------------------------------------------------
  # provision: load secrets from Vault, render the Terraform inputs, run
  # terraform apply/destroy against the remote S3-compatible state backend,
  # then (apply only) export cmdb.json + inventory.ini and publish the host
  # list that drives the downstream deploy matrix.
  #
  # Outputs:
  #   hosts - JSON array of the cmdb.json top-level keys ("[]" on destroy)
  #   count - number of cmdb.json entries ("0" on destroy)
  provision:
    name: Provision (terraform + render CMDB)
    runs-on: ubuntu-latest
    outputs:
      hosts: ${{ steps.matrix.outputs.hosts }}
      count: ${{ steps.matrix.outputs.count }}
    steps:
      - name: Load Vault secrets (OIDC)
        id: vault
        uses: hashicorp/vault-action@v2
        with:
          url: ${{ env.VAULT_ADDR }}
          method: jwt
          role: ${{ env.VAULT_ROLE }}
          jwtGithubAudience: vault
          # Missing secrets do not fail here; the next step reports them all at once.
          ignoreNotFound: true
          secrets: |
            ${{ env.VAULT_KV }} VULTR_API_KEY | VULTR_API_KEY ;
            ${{ env.VAULT_KV }} TF_STATE_ENDPOINT | TF_STATE_ENDPOINT ;
            ${{ env.VAULT_KV }} TF_STATE_BUCKET | TF_STATE_BUCKET ;
            ${{ env.VAULT_KV }} TF_STATE_ACCESS_KEY | TF_STATE_ACCESS_KEY ;
            ${{ env.VAULT_KV }} TF_STATE_SECRET_KEY | TF_STATE_SECRET_KEY ;
            ${{ env.VAULT_KV }} TF_STATE_REGION | TF_STATE_REGION

      - name: Validate required secrets
        env:
          VULTR_API_KEY: ${{ steps.vault.outputs.VULTR_API_KEY }}
          TF_STATE_ENDPOINT: ${{ steps.vault.outputs.TF_STATE_ENDPOINT }}
          TF_STATE_BUCKET: ${{ steps.vault.outputs.TF_STATE_BUCKET }}
          TF_STATE_ACCESS_KEY: ${{ steps.vault.outputs.TF_STATE_ACCESS_KEY }}
          TF_STATE_SECRET_KEY: ${{ steps.vault.outputs.TF_STATE_SECRET_KEY }}
          TF_STATE_REGION: ${{ steps.vault.outputs.TF_STATE_REGION }}
        run: |
          set -euo pipefail
          # Check that every REQUIRED secret is non-empty (values are never printed).
          # The remote S3-compatible state backend is mandatory; there is no
          # fallback to local state.
          missing=0
          if [ -z "${VULTR_API_KEY:-}" ]; then
            echo "::error::缺少必需机密 VULTR_API_KEY (Vault: ${VAULT_KV}/VULTR_API_KEY)"
            missing=1
          fi
          for k in TF_STATE_ENDPOINT TF_STATE_BUCKET TF_STATE_ACCESS_KEY TF_STATE_SECRET_KEY TF_STATE_REGION; do
            # Bash indirect expansion reads the variable named by $k without
            # passing secret values through eval.
            if [ -z "${!k:-}" ]; then
              echo "::error::缺少必需机密 $k (Vault: ${VAULT_KV}/$k) —— 远端 S3 state 后端为强制要求"
              missing=1
            fi
          done
          [ "$missing" -eq 0 ] || { echo "::error::必需机密缺失,终止 provision"; exit 1; }

      # NOTE(review): neither checkout passes a `token`, so actions/checkout uses
      # its default token. The file header describes an optional PAT for private
      # access that is not wired in here; confirm these repositories are public.
      - name: Checkout iac_modules
        uses: actions/checkout@v4
        with:
          repository: ai-workspace-infra/iac_modules
          ref: ${{ github.event.inputs.infra_ref || 'main' }}
          path: infra/iac_modules

      - name: Checkout playbooks
        uses: actions/checkout@v4
        with:
          repository: ai-workspace-infra/playbooks
          ref: ${{ github.event.inputs.infra_ref || 'main' }}
          path: infra/playbooks

      - uses: hashicorp/setup-terraform@v3
        with:
          terraform_version: "1.9.8"

      - uses: actions/setup-python@v5
        with:
          python-version: "3.12"

      - name: Install render deps
        run: pip install --quiet pyyaml jinja2

      # Runs render_backend_tf.py with "backend.tf" as its argument from ENV_DIR.
      # Presumably it writes backend.tf from the TF_STATE_* variables; verify
      # against the script.
      - name: Configure remote backend (S3-compatible, required)
        working-directory: ${{ env.ENV_DIR }}
        env:
          TF_STATE_ENDPOINT: ${{ steps.vault.outputs.TF_STATE_ENDPOINT }}
          TF_STATE_REGION: ${{ steps.vault.outputs.TF_STATE_REGION }}
        run: python3 "${GITHUB_WORKSPACE}/${VPS_ROOT}/scripts/render_backend_tf.py" backend.tf

      - name: generate.py render (YAML -> 显式 HCL + tfvars)
        working-directory: ${{ env.VPS_ROOT }}
        run: python3 scripts/generate.py render

      - name: Terraform init
        working-directory: ${{ env.ENV_DIR }}
        env:
          AWS_ACCESS_KEY_ID: ${{ steps.vault.outputs.TF_STATE_ACCESS_KEY }}
          AWS_SECRET_ACCESS_KEY: ${{ steps.vault.outputs.TF_STATE_SECRET_KEY }}
          TF_STATE_ENDPOINT: ${{ steps.vault.outputs.TF_STATE_ENDPOINT }}
          TF_STATE_BUCKET: ${{ steps.vault.outputs.TF_STATE_BUCKET }}
          TF_STATE_REGION: ${{ steps.vault.outputs.TF_STATE_REGION }}
        run: |
          set -euo pipefail
          # The remote state backend is mandatory (backend.tf was rendered by an
          # earlier step); a missing bucket fails instead of using local state.
          if [ -z "${TF_STATE_BUCKET}" ]; then
            echo "::error::TF_STATE_BUCKET 为空 —— 远端 state 后端为强制要求,终止"
            exit 1
          fi
          terraform init -input=false \
            -backend-config="bucket=${TF_STATE_BUCKET}" \
            -backend-config="key=ai-workspace/terraform.tfstate" \
            -backend-config="region=${TF_STATE_REGION}"

      - name: Terraform ${{ github.event.inputs.terraform_action || 'apply' }}
        working-directory: ${{ env.ENV_DIR }}
        env:
          AWS_ACCESS_KEY_ID: ${{ steps.vault.outputs.TF_STATE_ACCESS_KEY }}
          AWS_SECRET_ACCESS_KEY: ${{ steps.vault.outputs.TF_STATE_SECRET_KEY }}
          TF_VAR_vultr_api_key: ${{ steps.vault.outputs.VULTR_API_KEY }}
          # Passed through the environment, not interpolated into the script
          # text, so the input value can never be parsed as shell syntax.
          TF_ACTION: ${{ github.event.inputs.terraform_action || 'apply' }}
        run: |
          set -euo pipefail
          case "${TF_ACTION}" in
            apply|destroy) ;;
            *)
              echo "::error::Unsupported terraform_action '${TF_ACTION}' (expected apply or destroy)"
              exit 1
              ;;
          esac
          terraform "${TF_ACTION}" -auto-approve -input=false

      - name: generate.py inventory (terraform output + YAML -> cmdb.json + inventory.ini)
        if: ${{ (github.event.inputs.terraform_action || 'apply') == 'apply' }}
        working-directory: ${{ env.VPS_ROOT }}
        run: python3 scripts/generate.py inventory

      # Always runs so the job outputs are well-defined. On destroy there is no
      # cmdb.json, so an empty matrix ("[]" / "0") is published; otherwise
      # downstream `count != '0'` checks would see an empty string and
      # fromJSON('') would fail.
      - name: Build deploy matrix from cmdb.json
        id: matrix
        working-directory: ${{ env.ENV_DIR }}
        env:
          TF_ACTION: ${{ github.event.inputs.terraform_action || 'apply' }}
        run: |
          set -euo pipefail
          if [ "${TF_ACTION}" != "apply" ]; then
            echo "hosts=[]" >> "$GITHUB_OUTPUT"
            echo "count=0" >> "$GITHUB_OUTPUT"
            echo "matrix hosts: [] (terraform_action=${TF_ACTION})"
            exit 0
          fi
          hosts="$(jq -c 'keys' cmdb.json)"
          echo "hosts=${hosts}" >> "$GITHUB_OUTPUT"
          echo "count=$(jq 'length' cmdb.json)" >> "$GITHUB_OUTPUT"
          echo "matrix hosts: ${hosts}"

      - name: Upload CMDB + inventory artifact
        if: ${{ (github.event.inputs.terraform_action || 'apply') == 'apply' }}
        uses: actions/upload-artifact@v4
        with:
          name: ai-workspace-cmdb
          path: |
            ${{ env.ENV_DIR }}/cmdb.json
            ${{ env.ENV_DIR }}/inventory.ini
          if-no-files-found: error
# ---------------------------------------------------------------------------
  # deploy: one matrix leg per host published by provision. Each leg loads the
  # SSH key and LLM provider keys from Vault, waits for port 22 on the host IP
  # recorded in cmdb.json, then runs scripts/run-on-host-bootstrap.sh.
  deploy:
    name: Deploy ${{ matrix.host }} (on-host bootstrap)
    needs: provision
    # `count` is an empty string when provision never built the matrix (e.g.
    # terraform_action=destroy). Guard against it explicitly: '' != '0' is
    # true, and fromJSON('') below would then fail the job.
    if: >-
      ${{ needs.provision.outputs.count != ''
      && needs.provision.outputs.count != '0'
      && (github.event.inputs.run_deploy == 'true' || github.event.inputs.run_deploy == null) }}
    runs-on: ubuntu-latest
    strategy:
      fail-fast: false
      matrix:
        host: ${{ fromJSON(needs.provision.outputs.hosts) }}
    steps:
      # all-in-one is a "run locally on the target host" model (the host itself
      # runs ansible-playbook -c local and uses the offline bundle when
      # available). Running it remotely from the runner trips over
      # roles/agent_skills' delegate_to: localhost (it writes to the runner's
      # /root), so deploy instead SSHes to the host and runs the official
      # bootstrap script there, the same curl|bash path self-hosting users take.
      - name: Load Vault secrets (OIDC)
        id: vault
        uses: hashicorp/vault-action@v2
        with:
          url: ${{ env.VAULT_ADDR }}
          method: jwt
          role: ${{ env.VAULT_ROLE }}
          jwtGithubAudience: vault
          # Missing secrets do not fail here; the next step reports them all at once.
          ignoreNotFound: true
          secrets: |
            ${{ env.VAULT_KV }} SSH_PRIVATE_DEPLOY_KEY | ANSIBLE_SSH_KEY ;
            ${{ env.VAULT_KV }} SSH_PRIVATE_DEPLOY_KEY_B64 | ANSIBLE_SSH_KEY_B64 ;
            ${{ env.VAULT_KV_OPENCLAW }} DEEPSEEK_API_KEY | DEEPSEEK_API_KEY ;
            ${{ env.VAULT_KV_OPENCLAW }} NVIDIA_API_KEY | NVIDIA_API_KEY ;
            ${{ env.VAULT_KV_OPENCLAW }} OLLAMA_API_KEY | OLLAMA_API_KEY

      - name: Validate required secrets
        env:
          ANSIBLE_SSH_KEY: ${{ steps.vault.outputs.ANSIBLE_SSH_KEY }}
          ANSIBLE_SSH_KEY_B64: ${{ steps.vault.outputs.ANSIBLE_SSH_KEY_B64 }}
          DEEPSEEK_API_KEY: ${{ steps.vault.outputs.DEEPSEEK_API_KEY }}
          NVIDIA_API_KEY: ${{ steps.vault.outputs.NVIDIA_API_KEY }}
          OLLAMA_API_KEY: ${{ steps.vault.outputs.OLLAMA_API_KEY }}
        run: |
          set -euo pipefail
          # Only REQUIRED secrets are checked for emptiness; values are never printed.
          missing=0
          # SSH private key: at least one of the base64 and raw forms must be set.
          if [ -z "${ANSIBLE_SSH_KEY_B64:-}" ] && [ -z "${ANSIBLE_SSH_KEY:-}" ]; then
            echo "::error::缺少必需机密 SSH 私钥 (Vault: ${VAULT_KV}/SSH_PRIVATE_DEPLOY_KEY_B64 或 ${VAULT_KV}/SSH_PRIVATE_DEPLOY_KEY至少一个)"
            missing=1
          fi
          # LLM provider keys: all three are required.
          for k in DEEPSEEK_API_KEY NVIDIA_API_KEY OLLAMA_API_KEY; do
            if [ -z "${!k:-}" ]; then
              echo "::error::缺少必需机密 ${k} (Vault: ${VAULT_KV_OPENCLAW}/${k})"
              missing=1
            fi
          done
          [ "$missing" -eq 0 ] || { echo "::error::必需机密缺失,终止 deploy"; exit 1; }

      - name: Checkout xworkspace-console helpers
        uses: actions/checkout@v4

      - name: Download CMDB (host IP source)
        uses: actions/download-artifact@v4
        with:
          name: ai-workspace-cmdb
          path: cmdb

      - name: Configure SSH (prefer base64 key, fall back to raw)
        env:
          ANSIBLE_SSH_KEY: ${{ steps.vault.outputs.ANSIBLE_SSH_KEY }}
          ANSIBLE_SSH_KEY_B64: ${{ steps.vault.outputs.ANSIBLE_SSH_KEY_B64 }}
        run: |
          set -euo pipefail
          # Create the key file private from the start instead of tightening
          # its permissions only after the secret has been written.
          umask 077
          mkdir -p ~/.ssh
          chmod 700 ~/.ssh
          # Historical convention: decode the single-line *_B64 value first and
          # fall back to the raw multi-line key, which avoids the
          # "Load key ... error in libcrypto" failure seen with multi-line keys.
          if [ -n "${ANSIBLE_SSH_KEY_B64:-}" ]; then
            printf '%s' "${ANSIBLE_SSH_KEY_B64}" | base64 -d > ~/.ssh/id_deploy
          elif [ -n "${ANSIBLE_SSH_KEY:-}" ]; then
            printf '%s\n' "${ANSIBLE_SSH_KEY}" > ~/.ssh/id_deploy
          else
            echo "::error::Vault 未提供 ANSIBLE_SSH_KEY[_B64]"; exit 1
          fi
          chmod 600 ~/.ssh/id_deploy
          # Fails the step if the file is not a parseable private key.
          ssh-keygen -y -f ~/.ssh/id_deploy >/dev/null

      - name: Wait for host SSH
        env:
          # Passed via env and `jq --arg` so the host name is treated as data,
          # never as part of the shell script or the jq program.
          MATRIX_HOST: ${{ matrix.host }}
        run: |
          set -euo pipefail
          ip="$(jq -r --arg host "${MATRIX_HOST}" '.[$host].ip // empty' cmdb/cmdb.json)"
          # Fail immediately on a missing entry instead of polling "null" for
          # ten minutes.
          if [ -z "${ip}" ]; then
            echo "::error::No ip found for ${MATRIX_HOST} in cmdb/cmdb.json"
            exit 1
          fi
          echo "Waiting for ${MATRIX_HOST} (${ip}:22) ..."
          # Up to 60 probes, 10 s apart, each with a 5 s connect timeout.
          for _ in $(seq 1 60); do
            if nc -z -w 5 "$ip" 22; then echo "SSH up"; exit 0; fi
            sleep 10
          done
          echo "::error::Timed out waiting for ${ip}:22"; exit 1

      - name: Run on-host bootstrap (curl | bash, local-mode install)
        env:
          MATRIX_HOST: ${{ matrix.host }}
          CMDB_PATH: cmdb/cmdb.json
          # When the offline bundle lags behind main, online mode pulls the
          # latest playbook (see run-on-host-bootstrap.sh); set to auto once
          # the bundle has been republished.
          AI_WORKSPACE_OFFLINE_MODE: ${{ github.event.inputs.offline_mode || 'off' }}
          XWORKMATE_BRIDGE_DOMAIN: ${{ github.event.inputs.bridge_domain }}
          DEEPSEEK_API_KEY: ${{ steps.vault.outputs.DEEPSEEK_API_KEY }}
          NVIDIA_API_KEY: ${{ steps.vault.outputs.NVIDIA_API_KEY }}
          OLLAMA_API_KEY: ${{ steps.vault.outputs.OLLAMA_API_KEY }}
        run: |
          set -euo pipefail
          # A "~" inside a workflow `env:` value is a literal character (no
          # shell expands it), so export an absolute path instead. This is
          # safe whether or not the script expands "~" itself.
          export SSH_KEY_PATH="${HOME}/.ssh/id_deploy"
          bash scripts/run-on-host-bootstrap.sh
# ---------------------------------------------------------------------------
  # dns: after provisioning (and deployment, when it ran), reconcile Cloudflare
  # DNS records from the inventory produced by provision.
  dns:
    name: Sync Cloudflare DNS
    needs: [provision, deploy]
    # Without a status function, a job is skipped whenever a job it `needs` was
    # skipped, so run_deploy=false would silently disable DNS sync as well.
    # The results are therefore checked explicitly: provision must have
    # succeeded and deploy must have succeeded or been skipped; a failed deploy
    # still blocks DNS. The empty-string guard covers runs where provision
    # published no matrix (e.g. terraform_action=destroy).
    # NOTE(review): this assumes run_dns is meant to work independently of
    # run_deploy; confirm that is the intended behaviour.
    if: >-
      ${{ !cancelled()
      && needs.provision.result == 'success'
      && (needs.deploy.result == 'success' || needs.deploy.result == 'skipped')
      && needs.provision.outputs.count != ''
      && needs.provision.outputs.count != '0'
      && (github.event.inputs.run_dns == 'true' || github.event.inputs.run_dns == null) }}
    runs-on: ubuntu-latest
    steps:
      - name: Load Vault secrets (OIDC)
        id: vault
        uses: hashicorp/vault-action@v2
        with:
          url: ${{ env.VAULT_ADDR }}
          method: jwt
          role: ${{ env.VAULT_ROLE }}
          jwtGithubAudience: vault
          # A missing secret does not fail here; the next step reports it.
          ignoreNotFound: true
          secrets: |
            ${{ env.VAULT_KV }} CLOUDFLARE_DNS_API_TOKEN | CLOUDFLARE_DNS_API_TOKEN

      - name: Validate required secrets
        env:
          CLOUDFLARE_DNS_API_TOKEN: ${{ steps.vault.outputs.CLOUDFLARE_DNS_API_TOKEN }}
        run: |
          set -euo pipefail
          # Only REQUIRED secrets are checked for emptiness; values are never
          # printed. (An earlier note here mentioned an optional
          # INFRA_REPO_TOKEN; this job does not load such a secret.)
          missing=0
          if [ -z "${CLOUDFLARE_DNS_API_TOKEN:-}" ]; then
            echo "::error::缺少必需机密 CLOUDFLARE_DNS_API_TOKEN (Vault: ${VAULT_KV}/CLOUDFLARE_DNS_API_TOKEN)"
            missing=1
          fi
          [ "$missing" -eq 0 ] || { echo "::error::必需机密缺失,终止 dns"; exit 1; }

      - name: Checkout playbooks
        uses: actions/checkout@v4
        with:
          repository: ai-workspace-infra/playbooks
          ref: ${{ github.event.inputs.infra_ref || 'main' }}
          path: infra/playbooks

      - name: Download CMDB + inventory
        uses: actions/download-artifact@v4
        with:
          name: ai-workspace-cmdb
          path: cmdb

      - uses: actions/setup-python@v5
        with:
          python-version: "3.12"

      - name: Install Ansible
        run: pip install --quiet ansible

      - name: Reconcile Cloudflare DNS from inventory
        working-directory: ${{ env.PLAYBOOKS_DIR }}
        env:
          CLOUDFLARE_DNS_API_TOKEN: ${{ steps.vault.outputs.CLOUDFLARE_DNS_API_TOKEN }}
        run: |
          set -euo pipefail
          # Sync A records only for hosts in the ai_workspace group (the domain
          # comes from each host's service_domains hostvar, the content from
          # its public IP); the static record list is passed empty so other
          # records are left alone.
          ansible-playbook \
            -i "${GITHUB_WORKSPACE}/cmdb/inventory.ini" \
            update_cloudflare_dns.yml \
            -e '{"cloudflare_dns_source_hosts":["ai_workspace"],"cloudflare_dns_static_records":[]}'