From 8c501051d3118b7f6657ac66ca008c26e4ec6186 Mon Sep 17 00:00:00 2001
From: shenlan
Date: Fri, 27 Jun 2025 12:15:47 +0800
Subject: [PATCH] Support sealos GPU runtime

---
 docs/gpu-containerd.md | 52 ++++++++++++++++++++++++++++++++++++++++++
 scripts/gpu-k8s.sh     | 11 +++++++--
 2 files changed, 61 insertions(+), 2 deletions(-)
 create mode 100644 docs/gpu-containerd.md

diff --git a/docs/gpu-containerd.md b/docs/gpu-containerd.md
new file mode 100644
index 0000000..d186c1d
--- /dev/null
+++ b/docs/gpu-containerd.md
@@ -0,0 +1,52 @@
+# 在 Ubuntu 上安装 NVIDIA 驱动和 nvidia-container-toolkit
+
+以下步骤演示如何在主机安装 NVIDIA 驱动及 nvidia-container-toolkit，并将 containerd 配置为能够使用 GPU。末尾还补充了 sealos 自带 containerd 的配置方式。
+
+## 1. 安装 NVIDIA 驱动和 nvidia-container-toolkit
+
+```bash
+# 添加 NVIDIA 容器工具箱仓库
+sudo apt-get update
+sudo apt-get install -y ca-certificates curl gnupg
+curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | \
+  sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg
+
+distribution=$(. /etc/os-release; echo "$ID$VERSION_ID")
+curl -s -L "https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.list" | \
+  sed 's#^deb #deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] #' | \
+  sudo tee /etc/apt/sources.list.d/nvidia-docker.list
+
+sudo apt-get update
+sudo apt-get install -y nvidia-driver-535 nvidia-container-toolkit
+```
+
+安装完成后可通过 `nvidia-smi` 验证驱动是否正常：
+
+```bash
+nvidia-smi
+```
+
+## 2. 配置 containerd 使用 GPU
+
+使用 `nvidia-ctk` 工具可以快速生成配置并设置为默认运行时：
+
+```bash
+sudo nvidia-ctk runtime configure --runtime=containerd --set-as-default
+sudo systemctl restart containerd
+```
+
+以上命令会在 `/etc/containerd/config.toml` 中新增 `nvidia` 运行时，使 kubelet 或其他工具可以直接调度 GPU 容器。
+
+## 3. sealos-containerd 支持
+
+若主机通过 [sealos](https://github.com/labring/sealos) 部署集群，其内置的 containerd 服务名通常为 `sealos-containerd`，配置文件位于 sealos 数据目录，例如：
+`/var/lib/sealos/data/default/rootfs/etc/containerd/config.toml`。可按以下方式配置：
+
+```bash
+sudo nvidia-ctk runtime configure --runtime=containerd \
+  --config /var/lib/sealos/data/default/rootfs/etc/containerd/config.toml \
+  --set-as-default
+sudo systemctl restart sealos-containerd
+```
+
+完成后即可在 sealos 集群内运行需要 GPU 的容器或 Pod。
diff --git a/scripts/gpu-k8s.sh b/scripts/gpu-k8s.sh
index 64e521a..043d115 100644
--- a/scripts/gpu-k8s.sh
+++ b/scripts/gpu-k8s.sh
@@ -118,8 +118,15 @@ install_nvidia() {
     sudo apt-get update -y
     sudo apt-get install -y ${NVIDIA_DRIVER_VERSION} nvidia-container-toolkit
   fi
-  sudo nvidia-ctk runtime configure --runtime=containerd --set-as-default
-  sudo systemctl restart containerd
+  if [ "$DEPLOY_MODE" = "sealos" ]; then
+    sudo nvidia-ctk runtime configure --runtime=containerd \
+      --config /var/lib/sealos/data/default/rootfs/etc/containerd/config.toml \
+      --set-as-default
+    sudo systemctl restart sealos-containerd
+  else
+    sudo nvidia-ctk runtime configure --runtime=containerd --set-as-default
+    sudo systemctl restart containerd
+  fi
   if ! command -v nvidia-smi >/dev/null; then echo "❌ nvidia-smi 未找到"; exit 1; fi
   nvidia-smi || { echo "❌ NVIDIA 驱动有问题"; exit 1; }
 }