From 43c10370eecfa37ac353b2df4135b0028113c66b Mon Sep 17 00:00:00 2001 From: Yassin Kortam Date: Sat, 6 Jun 2026 13:21:06 -0700 Subject: [PATCH] fix(terraform/gcp): prompt for image_registry in DeployStack one-click (#29852) * fix(terraform/gcp): prompt for image_registry in DeployStack one-click The four litellm-* images live on GHCR and Cloud Run rejects ghcr.io URIs at apply time, so every deploy has to point image_registry at an Artifact Registry remote repo. The DeployStack installer didn't surface image_registry as a prompt, so a click-through user landed on the ghcr.io/berriai default and the apply failed ~20 min in, after Cloud SQL had already provisioned. Add image_registry to custom_settings with a PROJECT_ID-placeholder default and a description that flags the ghcr.io rejection so the failure happens at the prompt, not after billing the slow path. TUTORIAL.md is reworded to tell the user what to enter at the new prompt instead of "edit terraform.tfvars before applying". * fix(terraform/gcp): generalize image_registry default to any region Per Greptile feedback on #29852, the prior default hardcoded us-central1 and would silently produce a Cloud Run-incompatible image path for any deployment in another region. The user would substitute PROJECT_ID, miss the region segment, and reproduce the original late-apply failure. Use REGION as a second placeholder and tighten the prompt copy so both substitutions are mandatory. * fix(terraform/gcp): make destroy work without manual intervention Three Cloud Run v2 services and the migrations Cloud Run v2 job all default to deletion_protection=true at the provider level, which has no data-safety value on stateless resources and blocks terraform destroy with an error that can only be unstuck with a tfvars edit + apply roundtrip. Wire deletion_protection=false directly on all four; the operator-facing tripwire that matters is cloudsql_deletion_protection, which guards the only resource that actually holds data. 
The litellm Cloud SQL database also drops cleanly only if every connection is closed first. Cloud Run services and the migrations job hold connections open until they're torn down, so destroy races and fails with "database is being accessed by other users". Setting deletion_policy=ABANDON on the database resource lets terraform skip the explicit drop; the Cloud SQL instance deletion takes the database with it anyway. Together these turn destroy into a single command, matching the AWS stack's behavior. --- terraform/litellm/gcp/cloudrun.tf | 34 +++++++++++-------- terraform/litellm/gcp/cloudsql.tf | 2 ++ .../litellm/gcp/examples/default/TUTORIAL.md | 4 +-- .../gcp/examples/default/deploystack.json | 5 +++ 4 files changed, 28 insertions(+), 17 deletions(-) diff --git a/terraform/litellm/gcp/cloudrun.tf b/terraform/litellm/gcp/cloudrun.tf index 3e3f5f6924..7b1bb901e2 100644 --- a/terraform/litellm/gcp/cloudrun.tf +++ b/terraform/litellm/gcp/cloudrun.tf @@ -138,10 +138,11 @@ locals { # ---------- Gateway ---------- resource "google_cloud_run_v2_service" "gateway" { - name = "${local.name}-gateway" - location = var.region - ingress = "INGRESS_TRAFFIC_INTERNAL_LOAD_BALANCER" - labels = local.labels + name = "${local.name}-gateway" + location = var.region + ingress = "INGRESS_TRAFFIC_INTERNAL_LOAD_BALANCER" + labels = local.labels + deletion_protection = false template { service_account = google_service_account.runtime.email @@ -251,10 +252,11 @@ resource "google_cloud_run_v2_service" "gateway" { # ---------- Backend ---------- resource "google_cloud_run_v2_service" "backend" { - name = "${local.name}-backend" - location = var.region - ingress = "INGRESS_TRAFFIC_INTERNAL_LOAD_BALANCER" - labels = local.labels + name = "${local.name}-backend" + location = var.region + ingress = "INGRESS_TRAFFIC_INTERNAL_LOAD_BALANCER" + labels = local.labels + deletion_protection = false template { service_account = google_service_account.runtime.email @@ -366,10 +368,11 @@ resource 
"google_cloud_run_v2_service" "backend" { # with zero IAM bindings, so a compromised UI container can't pivot to # Secret Manager / Cloud SQL via the metadata service. resource "google_cloud_run_v2_service" "ui" { - name = "${local.name}-ui" - location = var.region - ingress = "INGRESS_TRAFFIC_INTERNAL_LOAD_BALANCER" - labels = local.labels + name = "${local.name}-ui" + location = var.region + ingress = "INGRESS_TRAFFIC_INTERNAL_LOAD_BALANCER" + labels = local.labels + deletion_protection = false template { service_account = google_service_account.ui_runtime.email @@ -441,9 +444,10 @@ resource "google_cloud_run_v2_service_iam_member" "ui_allusers" { # assembles DATABASE_URL from the DATABASE_* env vars and runs `prisma # migrate deploy`. No proxy_config, no master key, no shell wrapper. resource "google_cloud_run_v2_job" "migrations" { - name = "${local.name}-migrations" - location = var.region - labels = local.labels + name = "${local.name}-migrations" + location = var.region + labels = local.labels + deletion_protection = false template { template { diff --git a/terraform/litellm/gcp/cloudsql.tf b/terraform/litellm/gcp/cloudsql.tf index 0af15eec22..5d45721cea 100644 --- a/terraform/litellm/gcp/cloudsql.tf +++ b/terraform/litellm/gcp/cloudsql.tf @@ -92,6 +92,8 @@ resource "google_sql_database_instance" "reader" { resource "google_sql_database" "this" { name = var.db_name instance = google_sql_database_instance.writer.name + + deletion_policy = "ABANDON" } resource "random_password" "db_password" { diff --git a/terraform/litellm/gcp/examples/default/TUTORIAL.md b/terraform/litellm/gcp/examples/default/TUTORIAL.md index 9026207563..5c7144619d 100644 --- a/terraform/litellm/gcp/examples/default/TUTORIAL.md +++ b/terraform/litellm/gcp/examples/default/TUTORIAL.md @@ -42,7 +42,7 @@ gcloud artifacts repositories create litellm \ --remote-docker-repo=https://ghcr.io ``` -If the repo already exists, this command exits with a clear error and you can move on. 
Then set `image_registry` in `terraform.tfvars` to `<REGION>-docker.pkg.dev/<PROJECT_ID>/litellm/berriai` before applying. +If the repo already exists, this command exits with a clear error and you can move on. When `deploystack install` prompts for `image_registry`, enter `<REGION>-docker.pkg.dev/<PROJECT_ID>/litellm/berriai` (substituting your region and project). The shipped default contains `REGION` and `PROJECT_ID` placeholders that will fail at apply time if left unedited. ## (Optional) Set tenant secrets @@ -58,7 +58,7 @@ Skip this step entirely for a trial deploy. ## Run the installer -DeployStack will prompt for project, region, tenant, env, image tag, and TLS posture, then run `terraform apply`. Open `deploystack.json` if you want to see the prompt definitions first. +DeployStack will prompt for project, region, tenant, env, image tag, `image_registry`, and TLS posture, then run `terraform apply`. Open `deploystack.json` if you want to see the prompt definitions first. ```bash deploystack install diff --git a/terraform/litellm/gcp/examples/default/deploystack.json b/terraform/litellm/gcp/examples/default/deploystack.json index 6e5339272e..47d1fd914c 100644 --- a/terraform/litellm/gcp/examples/default/deploystack.json +++ b/terraform/litellm/gcp/examples/default/deploystack.json @@ -27,6 +27,11 @@ "description": "Tag for the four litellm-* images (gateway, backend, ui, migrations). Bump together when bumping LiteLLM", "default": "v1.86.0-dev" }, + { + "name": "image_registry", + "description": "Artifact Registry path prefix for the four litellm-* images. Format: <REGION>-docker.pkg.dev/<PROJECT_ID>/litellm/berriai, pointing at the remote repo you created above. Substitute BOTH REGION and PROJECT_ID in the default to match the AR repo you just created (REGION must match the region you picked above). 
The ghcr.io/berriai default in the module does NOT work; Cloud Run rejects ghcr.io URIs at apply time", + "default": "REGION-docker.pkg.dev/PROJECT_ID/litellm/berriai" + }, { "name": "allow_plaintext_lb", "description": "Skip TLS on the load balancer (HTTP-only). Set true for trial/dev. For production, leave false and add lb_domains to terraform.tfvars after the first apply",