fix(litellm): resilient online dependency install

litellm[proxy] pulls large wheels (polars-runtime is ~46MB) whose downloads break mid-stream over slow or mirrored links with IncompleteRead, failing the deploy. Add pip --retries (reconnect on failure) and --resume-retries (resume a partial download instead of restarting it) plus a longer timeout, all tunable via the litellm_pip_* vars, and upgrade pip in the venv first so that --resume-retries (pip >= 25.1) is available.
This commit is contained in:
Haitao Pan 2026-06-22 02:42:51 +00:00
parent 6a2f05f435
commit f4a30b9e01
2 changed files with 25 additions and 1 deletions

View File

@ -43,6 +43,11 @@ litellm_pip_cache_dir: >-
# Paths inside the LiteLLM virtualenv. The marker file is the one the role's
# "Inspect installed LiteLLM dependency marker" task stats.
litellm_install_marker_file: '{{ litellm_venv_dir }}/.install-spec'
litellm_python_executable: '{{ litellm_venv_dir }}/bin/python'
litellm_pip_executable: '{{ litellm_venv_dir }}/bin/pip'

# pip download tuning for the (large) online dependency install:
#   litellm_pip_retries        -> passed to pip as `--retries`
#   litellm_pip_resume_retries -> passed to pip as `--resume-retries`; this
#                                 flag needs pip >= 25.1, which the role
#                                 provides by upgrading pip in the venv first
#   litellm_pip_timeout        -> exported to pip as PIP_DEFAULT_TIMEOUT
litellm_pip_retries: 5
litellm_pip_resume_retries: 5
litellm_pip_timeout: 180

# Entry points installed into the venv, and the address LiteLLM listens on.
litellm_binary_path: '{{ litellm_venv_dir }}/bin/litellm'
litellm_prisma_binary_path: '{{ litellm_venv_dir }}/bin/prisma'
litellm_listen_host: '127.0.0.1'

View File

@ -110,6 +110,20 @@
become: true
become_user: "{{ litellm_service_user }}"
# A venv bootstrapped by ensurepip can ship a pip older than 25.1, which lacks
# `--resume-retries`. Require pip >= 25.1 up front so the resilient download
# flags used by the dependency install below are always available.
#
# The version floor is stated explicitly (instead of `state: latest`) so that:
#   * the task is idempotent and does not re-resolve pip on every run once the
#     requirement is already satisfied;
#   * an environment that cannot obtain pip >= 25.1 fails here, with a clear
#     resolver error, rather than later with "no such option".
#
# Only `--retries` is passed here: the pip doing this bootstrap may itself
# predate `--resume-retries`. The timeout follows the same tunable as the
# main dependency install.
- name: Ensure recent pip in the LiteLLM environment
  ansible.builtin.pip:
    name: "pip>=25.1"
    state: present
    executable: "{{ litellm_pip_executable }}"
    extra_args: "--retries {{ litellm_pip_retries }}"
  environment:
    PIP_CACHE_DIR: "{{ litellm_pip_cache_dir }}"
    PIP_DEFAULT_TIMEOUT: "{{ litellm_pip_timeout }}"
  become: true
  become_user: "{{ litellm_service_user }}"
- name: Inspect installed LiteLLM dependency marker
ansible.builtin.stat:
path: "{{ litellm_install_marker_file }}"
@ -165,9 +179,14 @@
- "psycopg2-binary"
executable: "{{ litellm_pip_executable }}"
state: present
# litellm[proxy] pulls large wheels (e.g. polars-runtime ~46MB) that often
# break mid-stream on slow/mirrored links with IncompleteRead. --retries
# reconnects and --resume-retries continues a partial download instead of
# restarting it, so a flaky connection no longer fails the whole deploy.
extra_args: "--retries {{ litellm_pip_retries }} --resume-retries {{ litellm_pip_resume_retries }}"
environment:
PIP_CACHE_DIR: "{{ litellm_pip_cache_dir }}"
PIP_DEFAULT_TIMEOUT: "120"
PIP_DEFAULT_TIMEOUT: "{{ litellm_pip_timeout }}"
become: true
become_user: "{{ litellm_service_user }}"
when: litellm_dependency_install_required | bool