From 9c054fe027a1a9ed1e9354efcee08ec499053ac6 Mon Sep 17 00:00:00 2001
From: shenlan
Date: Fri, 27 Jun 2025 17:19:00 +0800
Subject: [PATCH] Add GPU status checker and enhance offline packaging

---
Reviewer notes (this area is ignored by git am):
* The line structure was rebuilt from a whitespace-collapsed copy of this
  patch. Indentation is assumed to be two spaces. TODO: confirm the context
  lines against the target tree, or apply with `git am --ignore-whitespace`.
* Added lines differ from the commit named in the first line, so the
  post-image hashes on the `index` lines no longer describe the result.

 docs/gpu-offline-package.md               |  3 +-
 scripts/check-gpu-status.sh               | 89 ++++++++++++++++++++++
 scripts/create-gpu-k8s-offline-package.sh | 32 +++++++-
 scripts/gpu-k8s.sh                        | 55 ++++++++++++--
 4 files changed, 170 insertions(+), 9 deletions(-)
 create mode 100644 scripts/check-gpu-status.sh

diff --git a/docs/gpu-offline-package.md b/docs/gpu-offline-package.md
index ab9797d..c503240 100644
--- a/docs/gpu-offline-package.md
+++ b/docs/gpu-offline-package.md
@@ -17,9 +17,10 @@ bash scripts/create-gpu-k8s-offline-package.sh
 
 - Kubernetes 二进制镜像
 - Cilium、Helm 等依赖镜像
-- NVIDIA 驱动(nvidia-driver-535)及 nvidia-container-toolkit deb 包
+- NVIDIA 驱动(nvidia-driver-535)及 nvidia-container-toolkit 离线包(deb/rpm)
 - nerdctl CLI(v${NERDCTL_VERSION:-2.1.2})
 - 必要的容器镜像,包括 `registry.k8s.io/pause:3.8`
+- GPU 环境检测脚本 `check-gpu-status.sh`
 
 该离线包用于基于 `sealos` 部署 Kubernetes,最低推荐版本为 **1.29**,也可以使用更新的 `1.30` 等稳定版本。
 
diff --git a/scripts/check-gpu-status.sh b/scripts/check-gpu-status.sh
new file mode 100644
index 0000000..4bc11ad
--- /dev/null
+++ b/scripts/check-gpu-status.sh
@@ -0,0 +1,89 @@
+#!/bin/bash
+# Diagnose NVIDIA GPU readiness on this host: PCI device, kernel module,
+# device node, nvidia-smi, recent dmesg lines and an optional nerdctl test.
+# Usage: check-gpu-status.sh [--fix]
+#   --fix  install nvidia-driver-535 via apt-get when nvidia-smi is missing
+set -euo pipefail
+
+AUTO_FIX=false
+
+# Enable auto-fix mode when the first argument is --fix
+if [[ "${1:-}" == "--fix" ]]; then
+  AUTO_FIX=true
+fi
+
+echo "🔍 Checking NVIDIA GPU status..."
+
+# 1. Is an NVIDIA device visible on the PCI bus?
+echo -e "\n📦 [1] PCI 设备检测:"
+if lspci | grep -i nvidia; then
+  echo "✅ 已检测到 NVIDIA GPU"
+else
+  echo "❌ 未检测到 GPU,请检查硬件绑定或云平台配置"
+  exit 1
+fi
+
+# 2. Is the nvidia kernel module loaded?
+echo -e "\n📦 [2] 内核模块检测:"
+if lsmod | grep -q nvidia; then
+  echo "✅ nvidia 模块已加载"
+else
+  echo "❌ nvidia 模块未加载"
+  echo "👉 尝试执行:sudo modprobe nvidia"
+fi
+
+# 3. Does the first GPU device node exist?
+echo -e "\n📦 [3] 设备节点检测:"
+if [[ -e /dev/nvidia0 ]]; then
+  echo "✅ /dev/nvidia0 存在"
+else
+  echo "❌ 缺少 /dev/nvidia0,驱动可能未成功加载"
+fi
+
+# 4. Does nvidia-smi exist and run?
+echo -e "\n📦 [4] 驱动状态检测 (nvidia-smi):"
+if command -v nvidia-smi &>/dev/null; then
+  if nvidia-smi; then
+    echo "✅ nvidia-smi 正常"
+  else
+    echo "❌ nvidia-smi 执行失败,驱动可能未正确绑定设备"
+  fi
+else
+  echo "❌ 未安装 nvidia-smi 工具"
+  echo "👉 需安装驱动包 nvidia-driver-535、nvidia-utils-535 等"
+  if [[ "$AUTO_FIX" == true ]]; then
+    # The automatic fix is apt-only; stop with a clear message elsewhere
+    # instead of dying on "apt-get: command not found" under set -e.
+    if ! command -v apt-get &>/dev/null; then
+      echo "❌ --fix 仅支持使用 apt 的系统,请手动安装 NVIDIA 驱动"
+      exit 1
+    fi
+    echo -e "\n⚙️ 正在自动安装驱动..."
+    sudo apt-get update
+    sudo apt-get install -y nvidia-driver-535 nvidia-utils-535 dkms \
+      "linux-headers-$(uname -r)"
+    echo -e "\n✅ 驱动安装完成,请重启后再运行本脚本确认"
+    exit 0
+  else
+    echo -e "\n👉 可执行以下命令安装推荐驱动:"
+    echo "sudo apt-get update && sudo apt-get install -y nvidia-driver-535 nvidia-utils-535 dkms linux-headers-\$(uname -r)"
+  fi
+fi
+
+# 5. Recent kernel log lines mentioning NVIDIA
+echo -e "\n📦 [5] dmesg 日志(最近 NVIDIA 行):"
+dmesg | grep -i nvidia | tail -n 20 || echo "ℹ️ 无 NVIDIA 错误日志"
+
+# 6. Optional: run nvidia-smi in a GPU container through nerdctl
+if command -v nerdctl &>/dev/null; then
+  echo -e "\n📦 [6] nerdctl GPU 容器测试:"
+  if nerdctl run --rm --gpus all nvidia/cuda:12.2.0-base-ubuntu22.04 nvidia-smi; then
+    echo "✅ nerdctl 能访问 GPU"
+  else
+    echo "❌ nerdctl 无法访问 GPU"
+  fi
+else
+  echo -e "\n📦 [6] nerdctl 未安装,跳过容器测试"
+fi
+
+echo -e "\n🎉 GPU 检查完成"
diff --git a/scripts/create-gpu-k8s-offline-package.sh b/scripts/create-gpu-k8s-offline-package.sh
index e2b5683..b26d80d 100755
--- a/scripts/create-gpu-k8s-offline-package.sh
+++ b/scripts/create-gpu-k8s-offline-package.sh
@@ -40,16 +40,43 @@ curl -s -L "https://nvidia.github.io/nvidia-docker/${distribution}/nvidia-docker
   sudo tee /etc/apt/sources.list.d/nvidia-docker.list
 
 sudo apt-get update -y
-export NVIDIA_CONTAINER_TOOLKIT_VERSION=1.17.8-1
-sudo apt-get install --download-only -y \
+
+if [ -n "${NVIDIA_CONTAINER_TOOLKIT_VERSION:-}" ]; then
+  sudo apt-get install --download-only -y \
   nvidia-container-toolkit=${NVIDIA_CONTAINER_TOOLKIT_VERSION} \
   nvidia-container-toolkit-base=${NVIDIA_CONTAINER_TOOLKIT_VERSION} \
   libnvidia-container-tools=${NVIDIA_CONTAINER_TOOLKIT_VERSION} \
   libnvidia-container1=${NVIDIA_CONTAINER_TOOLKIT_VERSION}
+else
+  sudo apt-get install --download-only -y \
+    nvidia-container-toolkit nvidia-container-toolkit-base \
+    libnvidia-container-tools libnvidia-container1
+fi
+
 sudo apt-get install --download-only -y "${APT_PACKAGES[@]}"
 cp /var/cache/apt/archives/*.deb "$WORKDIR/packages/"
 sudo apt-get clean
 
+# Additionally collect RPM packages when dnf is available on the build host
+if command -v dnf >/dev/null; then
+  distribution=$(. /etc/os-release; echo "${ID}${VERSION_ID}")
+  curl -s -L "https://nvidia.github.io/nvidia-docker/${distribution}/nvidia-docker.repo" \
+    | sudo tee /etc/yum.repos.d/nvidia-docker.repo
+  sudo dnf -y install 'dnf-command(download)'
+  sudo dnf -y makecache
+  RPM_PACKAGES=(nvidia-container-toolkit nvidia-container-toolkit-base \
+    libnvidia-container-tools libnvidia-container1)
+  if [ -n "${NVIDIA_CONTAINER_TOOLKIT_VERSION:-}" ]; then
+    for pkg in "${RPM_PACKAGES[@]}"; do
+      sudo dnf download --resolve --destdir "$WORKDIR/packages" \
+        "${pkg}-${NVIDIA_CONTAINER_TOOLKIT_VERSION}"
+    done
+  else
+    sudo dnf download --resolve --destdir "$WORKDIR/packages" "${RPM_PACKAGES[@]}"
+  fi
+  sudo rm -f /etc/yum.repos.d/nvidia-docker.repo
+fi
+
 # Download sealos deb
 curl -L -o "$WORKDIR/sealos_5.0.1_linux_amd64.deb" \
   https://github.com/labring/sealos/releases/download/v5.0.1/sealos_5.0.1_linux_amd64.deb
@@ -75,6 +102,7 @@ curl -L -o "$WORKDIR/nvidia-gpgkey" https://nvidia.github.io/nvidia-docker/gpgke
 
 # Include deployment script
 cp "$(dirname "$0")/gpu-k8s.sh" "$WORKDIR/"
+cp "$(dirname "$0")/check-gpu-status.sh" "$WORKDIR/"
 
 # Create final archive
 TAR_NAME="gpu_k8s_offline_packages.tar.gz"
diff --git a/scripts/gpu-k8s.sh b/scripts/gpu-k8s.sh
index 2144d6c..23e2717 100644
--- a/scripts/gpu-k8s.sh
+++ b/scripts/gpu-k8s.sh
@@ -79,12 +79,52 @@ load_offline_images() {
 }
 
+# Install every package bundled under ${OFFLINE_DIR}/packages.
+# Globals: OFFLINE_DIR (read)
+# Returns: 0 when a package manager available on this host installed the
+#          bundled .deb or .rpm files; 1 otherwise, so that the caller can
+#          fall back to the online installation path.
 install_all_offline_packages() {
-  if [ -d "${OFFLINE_DIR}/packages" ]; then
-    echo "📦 Using offline deb packages"
-    sudo dpkg -i ${OFFLINE_DIR}/packages/*.deb 2>/dev/null || sudo apt-get -f install -y
-    return 0
+  if [ ! -d "${OFFLINE_DIR}/packages" ]; then
+    return 1
   fi
-  return 1
+
+  echo "📦 Using offline packages"
+
+  local -a debs=() rpms=()
+  local had_nullglob=false attempted=false failed=false
+
+  # Glob with nullglob so a missing package format yields an empty array,
+  # then restore the option to whatever the caller had set.
+  if shopt -q nullglob; then
+    had_nullglob=true
+  fi
+  shopt -s nullglob
+  debs=("${OFFLINE_DIR}"/packages/*.deb)
+  rpms=("${OFFLINE_DIR}"/packages/*.rpm)
+  if [[ "$had_nullglob" == false ]]; then
+    shopt -u nullglob
+  fi
+
+  if (( ${#debs[@]} > 0 )) && command -v dpkg >/dev/null 2>&1; then
+    attempted=true
+    # dpkg -i fails on unmet dependencies; apt-get -f install then finishes
+    # the job, so only a failure of both commands counts as an error.
+    sudo dpkg -i "${debs[@]}" 2>/dev/null \
+      || sudo apt-get -f install -y \
+      || failed=true
+  fi
+
+  if (( ${#rpms[@]} > 0 )) && command -v dnf >/dev/null 2>&1; then
+    attempted=true
+    sudo dnf install -y "${rpms[@]}" || failed=true
+  fi
+
+  # Report success only if something was actually installed; otherwise the
+  # caller must not skip its online fallback.
+  if [[ "$attempted" == true && "$failed" == false ]]; then
+    return 0
+  fi
+  return 1
 }
 
 install_base() {
@@ -120,6 +160,7 @@ install_nerdctl() {
 install_nvidia() {
   echo "[3/8] 安装 NVIDIA 驱动和容器工具"
   distribution="ubuntu22.04"
+
   if [ -f "${OFFLINE_DIR}/nvidia-gpgkey" ]; then
     sudo install -m 0644 "${OFFLINE_DIR}/nvidia-gpgkey" /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg
   else
@@ -127,7 +168,9 @@ install_nvidia() {
       sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg
   fi
 
-  if ! install_all_offline_packages; then
+  if install_all_offline_packages; then
+    echo "✅ 离线安装 NVIDIA 相关包完成"
+  else
     proxy_curl -sL https://nvidia.github.io/nvidia-docker/${distribution}/nvidia-docker.list | \
       sed 's|^deb |deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] |' | \
       sudo tee /etc/apt/sources.list.d/nvidia-docker.list