# ---------------------------------------------------------------------------
# Command helpers: Pulumi backend login with retry on transient failures
# ---------------------------------------------------------------------------

# Lower-cased substrings that mark a `pulumi login` failure as transient.
# "timeout" also matches the more specific "i/o timeout" and
# "tls handshake timeout" messages, so those are not listed separately.
_TRANSIENT_LOGIN_ERRORS = (
    "timeout",
    "timed out",
    "deadline exceeded",
    "temporarily unavailable",
    "connection reset",
    "connection refused",
    "requesterror",
)


def _emit_process_output(result: subprocess.CompletedProcess[str]) -> None:
    """Replay a captured subprocess result on this process's own streams.

    Captured stdout goes to ``sys.stdout`` and captured stderr to
    ``sys.stderr``. Nothing is appended to the text, and empty or ``None``
    streams are skipped.
    """
    if result.stdout:
        print(result.stdout, end="")
    if result.stderr:
        print(result.stderr, file=sys.stderr, end="")


def _should_retry_login(message: str) -> bool:
    """Return True when *message* looks like a transient network failure.

    The check is a case-insensitive substring match against
    ``_TRANSIENT_LOGIN_ERRORS``; an empty message never matches.
    """
    lowered = message.lower()
    return any(term in lowered for term in _TRANSIENT_LOGIN_ERRORS)


def _read_login_retry_settings() -> tuple[int, float]:
    """Read the retry count and initial delay from the environment.

    ``PULUMI_LOGIN_RETRIES`` is the total number of login attempts (default 3,
    clamped to at least 1). ``PULUMI_LOGIN_RETRY_DELAY`` is the pause in
    seconds before the first retry (default 2, clamped to at least 0).
    Unset or empty variables use the default silently; unparsable values use
    the default and emit a warning.
    """
    attempts = 3
    delay = 2.0

    raw_attempts = os.environ.get("PULUMI_LOGIN_RETRIES")
    if raw_attempts:
        try:
            attempts = max(1, int(raw_attempts))
        except ValueError:
            _warn(
                "PULUMI_LOGIN_RETRIES 必须为整数,已回退至默认值 3。",
            )

    raw_delay = os.environ.get("PULUMI_LOGIN_RETRY_DELAY")
    if raw_delay:
        try:
            delay = max(0.0, float(raw_delay))
        except ValueError:
            _warn(
                "PULUMI_LOGIN_RETRY_DELAY 必须为数字,已回退至默认值 2 秒。",
            )

    return attempts, delay


def _combine_process_output(result: subprocess.CompletedProcess[str]) -> str:
    """Join the stripped stderr and stdout of *result*, stderr first.

    Streams that are ``None``, empty, or whitespace-only are dropped, so the
    result is ``""`` exactly when the process produced no visible text and the
    first line of a non-empty result is never blank.
    """
    stripped = (text.strip() for text in (result.stderr, result.stdout) if text)
    return "\n".join(chunk for chunk in stripped if chunk)


def _login_retry_wait(delay: float, attempt: int, cap: float = 30.0) -> float:
    """Return the pause, in seconds, after failed attempt number *attempt*.

    The pause doubles with every attempt (``delay``, ``2*delay``, ...) and is
    limited to *cap*.
    """
    # 2.0 ** 1023 is the largest finite power of two; clamping the exponent
    # keeps very large attempt numbers from raising OverflowError. The float
    # product may still saturate to inf, which min() then reduces to *cap*.
    exponent = min(max(attempt - 1, 0), 1023)
    return min(delay * 2.0 ** exponent, cap)


def _login_backend_with_retry(context: "PulumiContext", backend: str) -> None:
    """Run ``pulumi login <backend>``, retrying transient failures.

    Drop-in replacement for the bare ``context.run("login", backend)`` call in
    ``_require_backend``. Each attempt runs with captured output; on success
    the captured output is replayed and the function returns.

    A failure is retried only while attempts remain and its message matches
    ``_should_retry_login``. Between attempts the function warns and sleeps
    with exponential backoff (see ``_login_retry_wait``).

    Args:
        context: Object whose ``run("login", backend, check=False,
            capture_output=True)`` returns a result exposing ``returncode``,
            ``stdout`` and ``stderr``.
        backend: Backend URL handed to ``pulumi login``.

    Raises:
        CLIError: On a non-transient failure, or when the last allowed attempt
            fails. The message embeds the combined stderr/stdout text.
    """
    attempts, delay = _read_login_retry_settings()

    # attempts >= 1, and the final iteration always returns or raises, so
    # control never falls out of the bottom of this loop.
    for attempt in range(1, attempts + 1):
        result = context.run(
            "login",
            backend,
            check=False,
            capture_output=True,
        )
        if result.returncode == 0:
            _emit_process_output(result)
            return

        message = _combine_process_output(result)

        if attempt == attempts or not _should_retry_login(message):
            _emit_process_output(result)
            detail = message or "Pulumi 未返回详细错误信息"
            raise CLIError(
                "Pulumi 登录 S3 backend 失败,请检查网络连通性、代理设置或 S3 权限。"
                f" 原始错误:{detail}",
            )

        wait_seconds = _login_retry_wait(delay, attempt)
        # Only the first line goes into the warning to keep it on one line.
        short_error = message.splitlines()[0] if message else "未知错误"
        _warn(
            f"Pulumi 登录失败 (第 {attempt} 次)。将在 {wait_seconds:.1f}s 后重试。"
            f"错误: {short_error}"
        )
        time.sleep(wait_seconds)