Merge pull request #120 from svc-design/grntcf-codex/修复离线安装支持问题

Fix GPU offline packaging
This commit is contained in:
shenlan 2025-06-27 17:19:19 +08:00 committed by GitHub
commit 3478e05d1d
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 99 additions and 0 deletions

View File

@ -20,6 +20,7 @@ bash scripts/create-gpu-k8s-offline-package.sh
- NVIDIA 驱动(nvidia-driver-535)及 nvidia-container-toolkit 离线包(deb/rpm)
- nerdctl CLI(v${NERDCTL_VERSION:-2.1.2})
- 必要的容器镜像,包括 `registry.k8s.io/pause:3.8`
- GPU 环境检测脚本 `check-gpu-status.sh`
该离线包用于基于 `sealos` 部署 Kubernetes,最低推荐版本为 **1.29**,也可以使用更新的 `1.30` 等稳定版本。

View File

@ -0,0 +1,78 @@
#!/bin/bash
# Strict mode: abort on errors, on unset variables, and on failures in any
# stage of a pipeline.
set -euo pipefail

# AUTO_FIX becomes "true" only when the first argument is exactly --fix;
# it is read later in the script to decide whether to install drivers.
AUTO_FIX=false
case "${1:-}" in
  --fix)
    AUTO_FIX=true
    ;;
esac
echo "🔍 Checking NVIDIA GPU status..."

# 1. Check whether an NVIDIA device is visible on the PCI bus.
echo -e "\n📦 [1] PCI 设备检测:"
# If lspci is not installed, the pipeline below fails and the script would
# claim that no GPU exists. Report the missing tool instead of a wrong
# hardware diagnosis.
if ! command -v lspci &>/dev/null; then
  echo "❌ 未找到 lspci 命令, 请先安装 pciutils 后重试"
  exit 1
fi
# grep prints the matching PCI lines so the user can see which device was found.
if lspci | grep -i nvidia; then
  echo "✅ 已检测到 NVIDIA GPU"
else
  echo "❌ 未检测到 GPU请检查硬件绑定或云平台配置"
  # Without a GPU none of the later checks are meaningful.
  exit 1
fi
# 2. Check whether an nvidia kernel module is loaded.
echo -e "\n📦 [2] 内核模块检测:"
# grep deliberately runs without -q here. This script enables pipefail, and
# `grep -q` exits on the first match; if lsmod is still writing at that
# moment it is killed by SIGPIPE, the pipeline counts as failed, and the
# module would be reported as missing even though it was found. Letting grep
# consume all input (output discarded) avoids that race.
if lsmod | grep nvidia >/dev/null; then
  echo "✅ nvidia 模块已加载"
else
  echo "❌ nvidia 模块未加载"
  echo "👉 尝试执行sudo modprobe nvidia"
fi
# 3. Check that the first GPU device node exists.
echo -e "\n📦 [3] 设备节点检测:"
# Use the shell's own file test instead of spawning ls and discarding its output.
if [[ -e /dev/nvidia0 ]]; then
  echo "✅ /dev/nvidia0 存在"
else
  echo "❌ 缺少 /dev/nvidia0驱动可能未成功加载"
fi
# 4. Check that nvidia-smi is installed and runs successfully.
#    With --fix (AUTO_FIX=true) and nvidia-smi missing, install the driver
#    packages via apt-get and exit so the user can reboot.
echo -e "\n📦 [4] 驱动状态检测 (nvidia-smi):"
if command -v nvidia-smi &>/dev/null; then
  if nvidia-smi; then
    echo "✅ nvidia-smi 正常"
  else
    echo "❌ nvidia-smi 执行失败,驱动可能未正确绑定设备"
  fi
else
  echo "❌ 未安装 nvidia-smi 工具"
  echo "👉 需安装驱动包 nvidia-driver-535、nvidia-utils-535 等"
  # Compare the flag as a string rather than executing its value as a command.
  if [[ "$AUTO_FIX" == true ]]; then
    # The automatic install path only knows apt-get; on hosts without it
    # (e.g. dnf-based systems) fail with a clear message instead of a
    # "command not found" error from sudo.
    if ! command -v apt-get &>/dev/null; then
      echo "❌ 未找到 apt-get, 无法自动安装驱动, 请手动安装"
      exit 1
    fi
    echo -e "\n⚙ 正在自动安装驱动..."
    sudo apt-get update
    # Headers for the running kernel are required so dkms can build the module.
    sudo apt-get install -y nvidia-driver-535 nvidia-utils-535 dkms \
      "linux-headers-$(uname -r)"
    echo -e "\n✅ 驱动安装完成,请重启后再运行本脚本确认"
    # Remaining checks cannot pass until the new driver is loaded after a reboot.
    exit 0
  else
    echo -e "\n👉 可执行以下命令安装推荐驱动:"
    # \$(uname -r) is escaped so the literal command is printed for copy/paste.
    echo "sudo apt-get update && sudo apt-get install -y nvidia-driver-535 nvidia-utils-535 dkms linux-headers-\$(uname -r)"
  fi
fi
# 5. Show the most recent NVIDIA-related kernel log lines.
echo -e "\n📦 [5] dmesg 日志(最近 NVIDIA 行):"
# Read the kernel log first so that a dmesg failure (for example when reading
# it is restricted to root) is reported as such, instead of being mistaken
# for "no NVIDIA lines found".
if ! kernel_log=$(dmesg); then
  echo "⚠️ 无法读取 dmesg 日志 (可能需要 root 权限, 请使用 sudo 重新运行)"
elif nvidia_lines=$(grep -i nvidia <<<"$kernel_log"); then
  # Only the last 20 matching lines are shown.
  tail -n 20 <<<"$nvidia_lines"
else
  echo " 无 NVIDIA 错误日志"
fi
# 6. Optional container test: run nvidia-smi inside a CUDA image via nerdctl.
#    Skipped when nerdctl is not on PATH.
if ! command -v nerdctl >/dev/null 2>&1; then
  printf '\n%s\n' "📦 [6] nerdctl 未安装,跳过容器测试"
else
  printf '\n%s\n' "📦 [6] nerdctl GPU 容器测试:"
  if nerdctl run --rm --gpus all \
    nvidia/cuda:12.2.0-base-ubuntu22.04 nvidia-smi; then
    echo "✅ nerdctl 能访问 GPU"
  else
    echo "❌ nerdctl 无法访问 GPU"
  fi
fi

# Final banner: reached whenever the script did not exit earlier.
printf '\n%s\n' "🎉 GPU 检查完成"

View File

@ -57,6 +57,25 @@ sudo apt-get install --download-only -y "${APT_PACKAGES[@]}"
# Copy every .deb currently in apt's archive cache into the offline package
# directory under $WORKDIR.
cp /var/cache/apt/archives/*.deb "$WORKDIR/packages/"
# Empty the apt archive cache now that the files have been copied out.
sudo apt-get clean
# When dnf is available, also download the NVIDIA container toolkit RPMs
# (with resolved dependencies) into $WORKDIR/packages. A temporary repo file
# is installed for the download and removed afterwards.
if command -v dnf >/dev/null; then
  # Repo path component: ID and VERSION_ID from os-release, concatenated.
  # Sourced in the command-substitution subshell so os-release variables do
  # not leak into this script.
  distribution=$(. /etc/os-release; echo "$ID$VERSION_ID")
  # -f: fail on HTTP errors instead of writing the server's error page into
  # the .repo file; -S: still print the error even though -s hides progress.
  # tee's copy of the repo file on stdout is discarded to keep the log clean.
  # NOTE(review): this repo file is left behind if a later step aborts the
  # script before the rm below — confirm whether a cleanup trap is wanted.
  curl -fsSL "https://nvidia.github.io/nvidia-docker/${distribution}/nvidia-docker.repo" \
    | sudo tee /etc/yum.repos.d/nvidia-docker.repo >/dev/null
  # Provides the `dnf download` subcommand used below.
  sudo dnf -y install 'dnf-command(download)'
  sudo dnf -y makecache
  RPM_PACKAGES=(nvidia-container-toolkit nvidia-container-toolkit-base \
    libnvidia-container-tools libnvidia-container1)
  if [ -n "${NVIDIA_CONTAINER_TOOLKIT_VERSION:-}" ]; then
    # Pin every package to the requested version (name-version spec).
    for pkg in "${RPM_PACKAGES[@]}"; do
      sudo dnf download --resolve --destdir "$WORKDIR/packages" \
        "${pkg}-${NVIDIA_CONTAINER_TOOLKIT_VERSION}"
    done
  else
    # No version requested: download whatever the repo currently offers.
    sudo dnf download --resolve --destdir "$WORKDIR/packages" "${RPM_PACKAGES[@]}"
  fi
  # Do not leave the temporary repo configured on the build host.
  sudo rm -f /etc/yum.repos.d/nvidia-docker.repo
fi
# Download the sealos v5.0.1 deb into the package working directory.
# -f makes curl exit non-zero on an HTTP error instead of saving the server's
# error page under the .deb file name; -L follows the GitHub release redirect.
curl -fL -o "$WORKDIR/sealos_5.0.1_linux_amd64.deb" \
  https://github.com/labring/sealos/releases/download/v5.0.1/sealos_5.0.1_linux_amd64.deb
@ -82,6 +101,7 @@ curl -L -o "$WORKDIR/nvidia-gpgkey" https://nvidia.github.io/nvidia-docker/gpgke
# Bundle the deployment script and the GPU check script, both taken from the
# directory this script lives in, into the package working directory.
for helper_script in gpu-k8s.sh check-gpu-status.sh; do
  cp "$(dirname "$0")/${helper_script}" "$WORKDIR/"
done
# Create final archive
TAR_NAME="gpu_k8s_offline_packages.tar.gz"