From f4a30b9e01457709a229ca05fcd41fcf3ae4c8e6 Mon Sep 17 00:00:00 2001
From: Haitao Pan
Date: Mon, 22 Jun 2026 02:42:51 +0000
Subject: [PATCH] fix(litellm): resilient online dependency install

litellm[proxy] pulls large wheels (polars-runtime ~46MB) that break
mid-stream over slow/mirrored links with IncompleteRead, failing the
deploy. Add pip --retries/--resume-retries (resumes partial downloads)
+ longer timeout, tunable via litellm_pip_* vars, and ensure pip>=25.1
in the venv first so --resume-retries exists. The pip bootstrap only
upgrades when the venv's pip is too old, so an already-provisioned host
does not need index access, and it honours litellm_pip_timeout too.
---
 roles/vhosts/litellm/defaults/main.yml |  5 +++++
 roles/vhosts/litellm/tasks/main.yml    | 21 ++++++++++++++++++++-
 2 files changed, 25 insertions(+), 1 deletion(-)

diff --git a/roles/vhosts/litellm/defaults/main.yml b/roles/vhosts/litellm/defaults/main.yml
index b93d0ea..ecf9053 100644
--- a/roles/vhosts/litellm/defaults/main.yml
+++ b/roles/vhosts/litellm/defaults/main.yml
@@ -43,6 +43,11 @@ litellm_pip_cache_dir: >-
 litellm_install_marker_file: "{{ litellm_venv_dir }}/.install-spec"
 litellm_python_executable: "{{ litellm_venv_dir }}/bin/python"
 litellm_pip_executable: "{{ litellm_venv_dir }}/bin/pip"
+# Network resilience for the (large) online dependency install. Resume-retries
+# requires pip >= 25.1, which the role guarantees by upgrading pip in the venv.
+litellm_pip_retries: 5
+litellm_pip_resume_retries: 5
+litellm_pip_timeout: 180
 litellm_binary_path: "{{ litellm_venv_dir }}/bin/litellm"
 litellm_prisma_binary_path: "{{ litellm_venv_dir }}/bin/prisma"
 litellm_listen_host: 127.0.0.1
diff --git a/roles/vhosts/litellm/tasks/main.yml b/roles/vhosts/litellm/tasks/main.yml
index 8fae880..0f3c071 100644
--- a/roles/vhosts/litellm/tasks/main.yml
+++ b/roles/vhosts/litellm/tasks/main.yml
@@ -110,6 +110,20 @@
   become: true
   become_user: "{{ litellm_service_user }}"
 
+# A venv bootstrapped by ensurepip can ship a pip older than 25.1, which lacks
+# `--resume-retries`. Require pip >= 25.1 up front (a no-op once satisfied) so
+# the resilient download flags used by the install below are always available.
+- name: Ensure recent pip in the LiteLLM environment
+  ansible.builtin.pip:
+    name: "pip>=25.1"
+    state: present
+    executable: "{{ litellm_pip_executable }}"
+  environment:
+    PIP_CACHE_DIR: "{{ litellm_pip_cache_dir }}"
+    PIP_DEFAULT_TIMEOUT: "{{ litellm_pip_timeout }}"
+  become: true
+  become_user: "{{ litellm_service_user }}"
+
 - name: Inspect installed LiteLLM dependency marker
   ansible.builtin.stat:
     path: "{{ litellm_install_marker_file }}"
@@ -165,9 +179,14 @@
       - "psycopg2-binary"
     executable: "{{ litellm_pip_executable }}"
     state: present
+    # litellm[proxy] pulls large wheels (e.g. polars-runtime ~46MB) that often
+    # break mid-stream on slow/mirrored links with IncompleteRead. --retries
+    # reconnects and --resume-retries continues a partial download instead of
+    # restarting it, so a flaky connection no longer fails the whole deploy.
+    extra_args: "--retries {{ litellm_pip_retries }} --resume-retries {{ litellm_pip_resume_retries }}"
   environment:
     PIP_CACHE_DIR: "{{ litellm_pip_cache_dir }}"
-    PIP_DEFAULT_TIMEOUT: "120"
+    PIP_DEFAULT_TIMEOUT: "{{ litellm_pip_timeout }}"
   become: true
   become_user: "{{ litellm_service_user }}"
   when: litellm_dependency_install_required | bool