From 43c10370eecfa37ac353b2df4135b0028113c66b Mon Sep 17 00:00:00 2001
From: Yassin Kortam <yassin@berri.ai>
Date: Sat, 6 Jun 2026 13:21:06 -0700
Subject: [PATCH] fix(terraform/gcp): prompt for image_registry in DeployStack
 one-click (#29852)

* fix(terraform/gcp): prompt for image_registry in DeployStack one-click

The four litellm-* images live on GHCR and Cloud Run rejects ghcr.io URIs
at apply time, so every deploy has to point image_registry at an Artifact
Registry remote repo. The DeployStack installer didn't surface
image_registry as a prompt, so a click-through user landed on the
ghcr.io/berriai default and the apply failed ~20 min in, after Cloud SQL
had already provisioned. Add image_registry to custom_settings with a
PROJECT_ID-placeholder default and a description that flags the ghcr.io
rejection so the mistake surfaces at the prompt, not after the slow
(and billed) Cloud SQL provisioning. TUTORIAL.md is reworded to tell the
user what to enter at the new prompt instead of "edit terraform.tfvars
before applying".

* fix(terraform/gcp): generalize image_registry default to any region

Per Greptile feedback on #29852, the prior default hardcoded us-central1
and would silently produce a Cloud Run-incompatible image path for any
deployment in another region. The user would substitute PROJECT_ID, miss
the region segment, and reproduce the original late-apply failure. Use
REGION as a second placeholder and tighten the prompt copy so both
substitutions are mandatory.

* fix(terraform/gcp): make destroy work without manual intervention

Three Cloud Run v2 services and the migrations Cloud Run v2 job all
default to deletion_protection=true at the provider level, which has no
data-safety value on stateless resources and blocks terraform destroy
with an error that can only be unstuck with a tfvars edit + apply
roundtrip. Wire deletion_protection=false directly on all four; the
operator-facing tripwire that matters is cloudsql_deletion_protection,
which guards the only resource that actually holds data.

The litellm Cloud SQL database also drops cleanly only if every
connection is closed first. Cloud Run services and the migrations job
hold connections open until they're torn down, so destroy races and
fails with "database is being accessed by other users". Setting
deletion_policy=ABANDON on the database resource lets terraform skip
the explicit drop; the Cloud SQL instance deletion takes the database
with it anyway.

Together these turn destroy into a single command, matching the AWS
stack's behavior.
---
 terraform/litellm/gcp/cloudrun.tf             | 34 +++++++++++--------
 terraform/litellm/gcp/cloudsql.tf             |  2 ++
 .../litellm/gcp/examples/default/TUTORIAL.md  |  4 +--
 .../gcp/examples/default/deploystack.json     |  5 +++
 4 files changed, 28 insertions(+), 17 deletions(-)

diff --git a/terraform/litellm/gcp/cloudrun.tf b/terraform/litellm/gcp/cloudrun.tf
index 3e3f5f6924..7b1bb901e2 100644
--- a/terraform/litellm/gcp/cloudrun.tf
+++ b/terraform/litellm/gcp/cloudrun.tf
@@ -138,10 +138,11 @@ locals {
 
 # ---------- Gateway ----------
 resource "google_cloud_run_v2_service" "gateway" {
-  name     = "${local.name}-gateway"
-  location = var.region
-  ingress  = "INGRESS_TRAFFIC_INTERNAL_LOAD_BALANCER"
-  labels   = local.labels
+  name                = "${local.name}-gateway"
+  location            = var.region
+  ingress             = "INGRESS_TRAFFIC_INTERNAL_LOAD_BALANCER"
+  labels              = local.labels
+  deletion_protection = false
 
   template {
     service_account                  = google_service_account.runtime.email
@@ -251,10 +252,11 @@ resource "google_cloud_run_v2_service" "gateway" {
 
 # ---------- Backend ----------
 resource "google_cloud_run_v2_service" "backend" {
-  name     = "${local.name}-backend"
-  location = var.region
-  ingress  = "INGRESS_TRAFFIC_INTERNAL_LOAD_BALANCER"
-  labels   = local.labels
+  name                = "${local.name}-backend"
+  location            = var.region
+  ingress             = "INGRESS_TRAFFIC_INTERNAL_LOAD_BALANCER"
+  labels              = local.labels
+  deletion_protection = false
 
   template {
     service_account                  = google_service_account.runtime.email
@@ -366,10 +368,11 @@ resource "google_cloud_run_v2_service" "backend" {
 # with zero IAM bindings, so a compromised UI container can't pivot to
 # Secret Manager / Cloud SQL via the metadata service.
 resource "google_cloud_run_v2_service" "ui" {
-  name     = "${local.name}-ui"
-  location = var.region
-  ingress  = "INGRESS_TRAFFIC_INTERNAL_LOAD_BALANCER"
-  labels   = local.labels
+  name                = "${local.name}-ui"
+  location            = var.region
+  ingress             = "INGRESS_TRAFFIC_INTERNAL_LOAD_BALANCER"
+  labels              = local.labels
+  deletion_protection = false
 
   template {
     service_account                  = google_service_account.ui_runtime.email
@@ -441,9 +444,10 @@ resource "google_cloud_run_v2_service_iam_member" "ui_allusers" {
 # assembles DATABASE_URL from the DATABASE_* env vars and runs `prisma
 # migrate deploy`. No proxy_config, no master key, no shell wrapper.
 resource "google_cloud_run_v2_job" "migrations" {
-  name     = "${local.name}-migrations"
-  location = var.region
-  labels   = local.labels
+  name                = "${local.name}-migrations"
+  location            = var.region
+  labels              = local.labels
+  deletion_protection = false
 
   template {
     template {
diff --git a/terraform/litellm/gcp/cloudsql.tf b/terraform/litellm/gcp/cloudsql.tf
index 0af15eec22..5d45721cea 100644
--- a/terraform/litellm/gcp/cloudsql.tf
+++ b/terraform/litellm/gcp/cloudsql.tf
@@ -92,6 +92,8 @@ resource "google_sql_database_instance" "reader" {
 resource "google_sql_database" "this" {
   name     = var.db_name
   instance = google_sql_database_instance.writer.name
+
+  deletion_policy = "ABANDON"
 }
 
 resource "random_password" "db_password" {
diff --git a/terraform/litellm/gcp/examples/default/TUTORIAL.md b/terraform/litellm/gcp/examples/default/TUTORIAL.md
index 9026207563..5c7144619d 100644
--- a/terraform/litellm/gcp/examples/default/TUTORIAL.md
+++ b/terraform/litellm/gcp/examples/default/TUTORIAL.md
@@ -42,7 +42,7 @@ gcloud artifacts repositories create litellm \
   --remote-docker-repo=https://ghcr.io
 ```
 
-If the repo already exists, this command exits with a clear error and you can move on. Then set `image_registry` in `terraform.tfvars` to `<region>-docker.pkg.dev/<your-project>/litellm/berriai` before applying.
+If the repo already exists, this command exits with a clear error and you can move on. When `deploystack install` prompts for `image_registry`, enter `<region>-docker.pkg.dev/<your-project>/litellm/berriai` (substituting your region and project). The shipped default contains `REGION` and `PROJECT_ID` placeholders and will fail at apply time if either one is left unedited.
 
 ## (Optional) Set tenant secrets
 
@@ -58,7 +58,7 @@ Skip this step entirely for a trial deploy.
 
 ## Run the installer
 
-DeployStack will prompt for project, region, tenant, env, image tag, and TLS posture, then run `terraform apply`. Open `<walkthrough-editor-open-file filePath="terraform/litellm/gcp/examples/default/deploystack.json">deploystack.json</walkthrough-editor-open-file>` if you want to see the prompt definitions first.
+DeployStack will prompt for project, region, tenant, env, image tag, `image_registry`, and TLS posture, then run `terraform apply`. Open `<walkthrough-editor-open-file filePath="terraform/litellm/gcp/examples/default/deploystack.json">deploystack.json</walkthrough-editor-open-file>` if you want to see the prompt definitions first.
 
 ```bash
 deploystack install
diff --git a/terraform/litellm/gcp/examples/default/deploystack.json b/terraform/litellm/gcp/examples/default/deploystack.json
index 6e5339272e..47d1fd914c 100644
--- a/terraform/litellm/gcp/examples/default/deploystack.json
+++ b/terraform/litellm/gcp/examples/default/deploystack.json
@@ -27,6 +27,11 @@
       "description": "Tag for the four litellm-* images (gateway, backend, ui, migrations). Bump together when bumping LiteLLM",
       "default": "v1.86.0-dev"
     },
+    {
+      "name": "image_registry",
+      "description": "Artifact Registry path prefix for the four litellm-* images. Format: <region>-docker.pkg.dev/<project>/litellm/berriai, pointing at the remote repo you created above. Substitute BOTH REGION and PROJECT_ID in the default to match the AR repo you just created (REGION must match the region you picked above). The ghcr.io/berriai default in the module does NOT work; Cloud Run rejects ghcr.io URIs at apply time",
+      "default": "REGION-docker.pkg.dev/PROJECT_ID/litellm/berriai"
+    },
     {
       "name": "allow_plaintext_lb",
       "description": "Skip TLS on the load balancer (HTTP-only). Set true for trial/dev. For production, leave false and add lb_domains to terraform.tfvars after the first apply",