# Source: playbooks/roles/charts/vllm_service/templates/deployment.yaml.j2
#
# Original listing: 66 lines, 2.0 KiB
# Template language: Django/Jinja

---
# Deployment that runs a single replica of the vLLM OpenAI API server
# (python3 -m vllm.entrypoints.openai.api_server) with the Ray distributed
# executor backend. Every value written in double curly braces is a Jinja
# variable that must be supplied by whatever renders this template.
apiVersion: apps/v1
kind: Deployment
metadata:
  # Templated strings are quoted so that an empty, boolean-looking or
  # number-looking value still renders as a YAML string.
  name: "{{ vllm_service_name }}"
  namespace: "{{ vllm_namespace }}"
spec:
  replicas: 1
  selector:
    matchLabels:
      # Uses the same variable as the pod template label below, so the
      # selector and the pods it manages cannot drift apart.
      app: "{{ vllm_service_name }}"
  template:
    metadata:
      labels:
        app: "{{ vllm_service_name }}"
    spec:
      containers:
        - name: vllm
          image: "{{ vllm_image }}"
          command: ["python3", "-m", "vllm.entrypoints.openai.api_server"]
          args:
            - "--model={{ vllm_model }}"
            - "--tensor-parallel-size={{ vllm_tensor_parallel_size }}"
            - "--pipeline-parallel-size={{ vllm_pipeline_parallel_size }}"
            - "--gpu-memory-utilization={{ vllm_gpu_memory_utilization }}"
            - "--max-model-len={{ vllm_max_model_len }}"
            - "--max-num-seqs={{ vllm_max_num_seqs }}"
            - "--port={{ vllm_port }}"
            - "--distributed-executor-backend=ray"
            # NOTE(review): this flag looks redundant with
            # --distributed-executor-backend=ray above and is reported as
            # deprecated in newer vLLM releases; confirm against the vLLM
            # version in the image before removing it.
            - "--worker-use-ray"
          env:
            # Environment values are always strings in Kubernetes, hence the
            # quotes around every templated value.
            - name: RAY_ADDRESS
              value: "{{ ray_address }}"
            - name: NCCL_SOCKET_IFNAME
              value: "{{ nccl_socket_ifname }}"
            - name: GLOO_SOCKET_IFNAME
              value: "{{ gloo_socket_ifname }}"
            - name: NCCL_IB_DISABLE
              value: "{{ nccl_ib_disable }}"
            - name: VLLM_LOGGING_LEVEL
              value: "{{ vllm_logging_level }}"
            - name: TORCH_DISTRIBUTED_INIT_TIMEOUT
              value: "{{ torch_distributed_init_timeout }}"
            # NOTE(review): the token is rendered in plain text into the
            # manifest; consider sourcing it from a Secret (valueFrom ->
            # secretKeyRef) so it does not appear in the Deployment spec.
            - name: HUGGING_FACE_HUB_TOKEN
              value: "{{ huggingface_token }}"
          ports:
            # Left unquoted on purpose: containerPort must be an integer.
            - containerPort: {{ vllm_port }}
          readinessProbe:
            httpGet:
              path: /health
              # Same port the server is told to listen on via --port.
              port: {{ vllm_port }}
            initialDelaySeconds: 30
            periodSeconds: 10
          volumeMounts:
            - name: dshm
              mountPath: /dev/shm
            - name: models
              mountPath: "{{ model_mount_path }}"
      volumes:
        # Memory-backed emptyDir mounted at /dev/shm in the container.
        - name: dshm
          emptyDir:
            medium: Memory
        # Host directory mounted at the model path; DirectoryOrCreate makes
        # the kubelet create it when it does not exist yet.
        - name: models
          hostPath:
            path: "{{ model_host_path }}"
            type: DirectoryOrCreate