From 85caf2f3b078fa10651b76ae49e84752c46f26ba Mon Sep 17 00:00:00 2001 From: shenlan Date: Wed, 25 Jun 2025 20:28:19 +0800 Subject: [PATCH] Fix NVIDIA runtime install --- docs/gpu-k8s-role.md | 2 +- playbooks/roles/vhosts/common/tasks/include_gpu.yaml | 10 ++++++---- .../roles/vhosts/gpu-k8s/tasks/install_driver.yml | 10 ++++++---- 3 files changed, 13 insertions(+), 9 deletions(-) diff --git a/docs/gpu-k8s-role.md b/docs/gpu-k8s-role.md index 37b65e1..26c493d 100644 --- a/docs/gpu-k8s-role.md +++ b/docs/gpu-k8s-role.md @@ -7,7 +7,7 @@ This document describes how to use the `gpu-k8s` role to deploy a simple Kuberne The role performs three main tasks: 1. **Create the Kubernetes cluster** using [sealos](https://github.com/labring/sealos). It runs the provided `sealos run` command to bootstrap the master and worker nodes. -2. **Install NVIDIA drivers and container runtime** on the target hosts so that Kubernetes can access GPU resources. +2. **Install NVIDIA drivers and the NVIDIA container toolkit** on the target hosts so that Kubernetes can access GPU resources. 3. **Verify GPU access** by deploying the official NVIDIA device plugin and running a small CUDA workload. diff --git a/playbooks/roles/vhosts/common/tasks/include_gpu.yaml b/playbooks/roles/vhosts/common/tasks/include_gpu.yaml index cb55513..09f86bf 100644 --- a/playbooks/roles/vhosts/common/tasks/include_gpu.yaml +++ b/playbooks/roles/vhosts/common/tasks/include_gpu.yaml @@ -1,9 +1,11 @@ -- name: Add NVIDIA repository +- name: Add NVIDIA repositories shell: | add-apt-repository -y ppa:graphics-drivers - curl -s -L https://nvidia.github.io/nvidia-container-runtime/gpgkey | apt-key add - distribution=$(. /etc/os-release;echo $ID$VERSION_ID) - curl -s -L https://nvidia.github.io/nvidia-container-runtime/$distribution/nvidia-container-runtime.list | tee /etc/apt/sources.list.d/nvidia-container-runtime.list + curl -s -L https://nvidia.github.io/libnvidia-container/gpgkey | apt-key add - + curl -s -L https://nvidia.github.io/libnvidia-container/$distribution/libnvidia-container.list | tee /etc/apt/sources.list.d/nvidia-container-runtime.list + curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | apt-key add - + curl -s -L https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.list | tee /etc/apt/sources.list.d/nvidia-docker.list apt-get update - name: Install NVIDIA driver and container runtime @@ -12,6 +14,6 @@ - nvidia-modprobe - nvidia-driver-535 - nvidia-headless-535 - - nvidia-container-runtime + - nvidia-container-toolkit state: present update_cache: yes diff --git a/playbooks/roles/vhosts/gpu-k8s/tasks/install_driver.yml b/playbooks/roles/vhosts/gpu-k8s/tasks/install_driver.yml index d18656d..8e828eb 100644 --- a/playbooks/roles/vhosts/gpu-k8s/tasks/install_driver.yml +++ b/playbooks/roles/vhosts/gpu-k8s/tasks/install_driver.yml @@ -1,9 +1,11 @@ -- name: Add NVIDIA repository +- name: Add NVIDIA repositories shell: | add-apt-repository -y ppa:graphics-drivers - curl -s -L https://nvidia.github.io/nvidia-container-runtime/gpgkey | apt-key add - distribution=$(. /etc/os-release;echo $ID$VERSION_ID) - curl -s -L https://nvidia.github.io/nvidia-container-runtime/$distribution/nvidia-container-runtime.list | tee /etc/apt/sources.list.d/nvidia-container-runtime.list + curl -s -L https://nvidia.github.io/libnvidia-container/gpgkey | apt-key add - + curl -s -L https://nvidia.github.io/libnvidia-container/$distribution/libnvidia-container.list | tee /etc/apt/sources.list.d/nvidia-container-runtime.list + curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | apt-key add - + curl -s -L https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.list | tee /etc/apt/sources.list.d/nvidia-docker.list apt-get update args: executable: /bin/bash @@ -15,7 +17,7 @@ - nvidia-modprobe - nvidia-driver-535 - nvidia-headless-535 - - nvidia-container-runtime + - nvidia-container-toolkit state: present update_cache: yes become: true