Add GPU status checker and enhance offline packaging

This commit is contained in:
shenlan 2025-06-27 17:19:00 +08:00
parent 3552407a92
commit 9c054fe027
4 changed files with 132 additions and 9 deletions

View File

@ -17,9 +17,10 @@ bash scripts/create-gpu-k8s-offline-package.sh
- Kubernetes 二进制镜像
- Cilium、Helm 等依赖镜像
- NVIDIA 驱动(nvidia-driver-535)及 nvidia-container-toolkit deb 包
- NVIDIA 驱动(nvidia-driver-535)及 nvidia-container-toolkit 离线包(deb/rpm)
- nerdctl CLI(v${NERDCTL_VERSION:-2.1.2})
- 必要的容器镜像,包括 `registry.k8s.io/pause:3.8`
- GPU 环境检测脚本 `check-gpu-status.sh`
该离线包用于基于 `sealos` 部署 Kubernetes,最低推荐版本为 **1.29**,也可以使用更新的 `1.30` 等稳定版本。

View File

@ -0,0 +1,78 @@
#!/bin/bash
#
# check-gpu-status.sh - walk through the NVIDIA GPU stack on this host and
# report the state of each layer.
#
# Usage:
#   check-gpu-status.sh [--fix]
#
# Options:
#   --fix   If nvidia-smi is not installed, install the recommended driver
#           packages with apt-get and exit 0 (a reboot is needed afterwards).
#
# Checks performed, in order:
#   1. NVIDIA device visible to lspci   (fatal: exits 1 when absent)
#   2. nvidia kernel module loaded
#   3. /dev/nvidia0 device node present
#   4. nvidia-smi installed and working
#   5. recent NVIDIA lines in dmesg
#   6. GPU access from a nerdctl container (skipped when nerdctl is absent)
#
# Checks 2-6 only report their result; they do not abort the script.
set -euo pipefail

AUTO_FIX=false

# Check whether the --fix argument was given
if [[ "${1:-}" == "--fix" ]]; then
  AUTO_FIX=true
fi

echo "🔍 Checking NVIDIA GPU status..."

# 1. Check whether a GPU is visible on the PCI bus
echo -e "\n📦 [1] PCI 设备检测:"
if ! command -v lspci &>/dev/null; then
  # Without lspci the pipeline below would fail and be misreported as
  # "no GPU detected", so name the real problem instead.
  echo "❌ 未找到 lspci 命令,请先安装 pciutils"
  exit 1
fi
if lspci | grep -i nvidia; then
  echo "✅ 已检测到 NVIDIA GPU"
else
  echo "❌ 未检测到 GPU请检查硬件绑定或云平台配置"
  exit 1
fi

# 2. Check the kernel module
echo -e "\n📦 [2] 内核模块检测:"
# lsmod is read through process substitution instead of a pipe: under
# `pipefail`, `grep -q` quitting at the first match could SIGPIPE lsmod and
# make a successful match look like a failure.
if grep -q nvidia < <(lsmod); then
  echo "✅ nvidia 模块已加载"
else
  echo "❌ nvidia 模块未加载"
  echo "👉 尝试执行sudo modprobe nvidia"
fi

# 3. Check the device node
echo -e "\n📦 [3] 设备节点检测:"
if [[ -e /dev/nvidia0 ]]; then
  echo "✅ /dev/nvidia0 存在"
else
  echo "❌ 缺少 /dev/nvidia0驱动可能未成功加载"
fi

# 4. Check nvidia-smi
echo -e "\n📦 [4] 驱动状态检测 (nvidia-smi):"
if command -v nvidia-smi &>/dev/null; then
  if nvidia-smi; then
    echo "✅ nvidia-smi 正常"
  else
    echo "❌ nvidia-smi 执行失败,驱动可能未正确绑定设备"
  fi
else
  echo "❌ 未安装 nvidia-smi 工具"
  echo "👉 需安装驱动包 nvidia-driver-535、nvidia-utils-535 等"
  if [[ "$AUTO_FIX" == true ]]; then
    if ! command -v apt-get &>/dev/null; then
      # The automatic fix is apt-based; fail with a clear message elsewhere.
      echo "❌ 未找到 apt-get,无法自动安装驱动,请手动安装"
      exit 1
    fi
    echo -e "\n⚙ 正在自动安装驱动..."
    sudo apt-get update
    sudo apt-get install -y nvidia-driver-535 nvidia-utils-535 dkms \
      "linux-headers-$(uname -r)"
    echo -e "\n✅ 驱动安装完成,请重启后再运行本脚本确认"
    exit 0
  else
    echo -e "\n👉 可执行以下命令安装推荐驱动:"
    echo "sudo apt-get update && sudo apt-get install -y nvidia-driver-535 nvidia-utils-535 dkms linux-headers-\$(uname -r)"
  fi
fi

# 5. dmesg log lines
echo -e "\n📦 [5] dmesg 日志(最近 NVIDIA 行):"
# Relies on `pipefail`: when grep matches nothing (or dmesg itself fails)
# the pipeline is non-zero and the fallback message is printed.
dmesg | grep -i nvidia | tail -n 20 || echo " 无 NVIDIA 错误日志"

# 6. nerdctl test (optional)
if command -v nerdctl &>/dev/null; then
  echo -e "\n📦 [6] nerdctl GPU 容器测试:"
  if nerdctl run --rm --gpus all nvidia/cuda:12.2.0-base-ubuntu22.04 nvidia-smi; then
    echo "✅ nerdctl 能访问 GPU"
  else
    echo "❌ nerdctl 无法访问 GPU"
  fi
else
  echo -e "\n📦 [6] nerdctl 未安装,跳过容器测试"
fi

echo -e "\n🎉 GPU 检查完成"

View File

@ -40,16 +40,42 @@ curl -s -L "https://nvidia.github.io/nvidia-docker/${distribution}/nvidia-docker
sudo tee /etc/apt/sources.list.d/nvidia-docker.list
sudo apt-get update -y
# Pin the NVIDIA container toolkit to a known version by default, but honour
# a value supplied by the caller. Exporting the variable as an empty string
# selects the unpinned branch below (whatever version apt resolves).
export NVIDIA_CONTAINER_TOOLKIT_VERSION="${NVIDIA_CONTAINER_TOOLKIT_VERSION-1.17.8-1}"
if [ -n "${NVIDIA_CONTAINER_TOOLKIT_VERSION:-}" ]; then
  # All four packages are requested at the same version.
  sudo apt-get install --download-only -y \
    "nvidia-container-toolkit=${NVIDIA_CONTAINER_TOOLKIT_VERSION}" \
    "nvidia-container-toolkit-base=${NVIDIA_CONTAINER_TOOLKIT_VERSION}" \
    "libnvidia-container-tools=${NVIDIA_CONTAINER_TOOLKIT_VERSION}" \
    "libnvidia-container1=${NVIDIA_CONTAINER_TOOLKIT_VERSION}"
else
  sudo apt-get install --download-only -y \
    nvidia-container-toolkit nvidia-container-toolkit-base \
    libnvidia-container-tools libnvidia-container1
fi
sudo apt-get install --download-only -y "${APT_PACKAGES[@]}"
# --download-only leaves the .deb files in the apt cache; copy them into the
# bundle, then empty the cache.
cp /var/cache/apt/archives/*.deb "$WORKDIR/packages/"
sudo apt-get clean
# When the build host has dnf, also download RPM builds of the NVIDIA
# container toolkit into the bundle.
if command -v dnf >/dev/null; then
  # os-release is sourced inside the command substitution's subshell, so its
  # variables do not leak into this script; the result is ID and VERSION_ID
  # concatenated.
  distribution=$(. /etc/os-release; echo "${ID}${VERSION_ID}")
  curl -s -L "https://nvidia.github.io/nvidia-docker/${distribution}/nvidia-docker.repo" \
    | sudo tee /etc/yum.repos.d/nvidia-docker.repo
  # Provides the `dnf download` subcommand used below.
  sudo dnf -y install 'dnf-command(download)'
  sudo dnf -y makecache
  RPM_PACKAGES=(nvidia-container-toolkit nvidia-container-toolkit-base \
    libnvidia-container-tools libnvidia-container1)
  if [ -n "${NVIDIA_CONTAINER_TOOLKIT_VERSION:-}" ]; then
    # Pinned: request each package as name-version.
    for pkg in "${RPM_PACKAGES[@]}"; do
      sudo dnf download --resolve --destdir "$WORKDIR/packages" \
        "${pkg}-${NVIDIA_CONTAINER_TOOLKIT_VERSION}"
    done
  else
    sudo dnf download --resolve --destdir "$WORKDIR/packages" "${RPM_PACKAGES[@]}"
  fi
  # The repo definition was only needed for the download; remove it again.
  sudo rm -f /etc/yum.repos.d/nvidia-docker.repo
fi
# Download sealos deb
# -f makes curl fail on an HTTP error instead of saving the server's error
# page under the .deb file name.
curl -fL -o "$WORKDIR/sealos_5.0.1_linux_amd64.deb" \
  https://github.com/labring/sealos/releases/download/v5.0.1/sealos_5.0.1_linux_amd64.deb
@ -75,6 +101,7 @@ curl -L -o "$WORKDIR/nvidia-gpgkey" https://nvidia.github.io/nvidia-docker/gpgke
# Ship the deployment script and the GPU status checker inside the bundle;
# both are taken from the directory this script lives in.
for bundled_script in gpu-k8s.sh check-gpu-status.sh; do
  cp "$(dirname "$0")/${bundled_script}" "$WORKDIR/"
done
# Create final archive
TAR_NAME="gpu_k8s_offline_packages.tar.gz"

View File

@ -79,12 +79,26 @@ load_offline_images() {
}
#######################################
# Install every .deb / .rpm shipped in the offline bundle.
# Globals:   OFFLINE_DIR (read) - root directory of the unpacked bundle
# Arguments: none
# Outputs:   a progress line on stdout, plus package-manager output
# Returns:   1 when ${OFFLINE_DIR}/packages does not exist, 0 otherwise
#######################################
install_all_offline_packages() {
  if [ ! -d "${OFFLINE_DIR}/packages" ]; then
    return 1
  fi

  echo "📦 Using offline packages"

  # Enable nullglob only while collecting files so that an unmatched pattern
  # yields an empty array, then put the caller's setting back.
  local had_nullglob=false
  if shopt -q nullglob; then
    had_nullglob=true
  fi
  shopt -s nullglob
  local -a debs=("${OFFLINE_DIR}"/packages/*.deb)
  local -a rpms=("${OFFLINE_DIR}"/packages/*.rpm)
  if [ "$had_nullglob" = false ]; then
    shopt -u nullglob
  fi

  # Each package format is installed only when its package manager exists.
  if [ "${#debs[@]}" -gt 0 ] && command -v dpkg >/dev/null; then
    # If dpkg fails, apt-get -f is run as the fallback.
    sudo dpkg -i "${debs[@]}" 2>/dev/null || sudo apt-get -f install -y
  fi

  if [ "${#rpms[@]}" -gt 0 ] && command -v dnf >/dev/null; then
    sudo dnf install -y "${rpms[@]}"
  fi

  return 0
}
install_base() {
@ -120,6 +134,7 @@ install_nerdctl() {
install_nvidia() {
echo "[3/8] 安装 NVIDIA 驱动和容器工具"
distribution="ubuntu22.04"
if [ -f "${OFFLINE_DIR}/nvidia-gpgkey" ]; then
sudo install -m 0644 "${OFFLINE_DIR}/nvidia-gpgkey" /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg
else
@ -127,7 +142,9 @@ install_nvidia() {
sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg
fi
if ! install_all_offline_packages; then
if install_all_offline_packages; then
echo "✅ 离线安装 NVIDIA 相关包完成"
else
proxy_curl -sL https://nvidia.github.io/nvidia-docker/${distribution}/nvidia-docker.list | \
sed 's|^deb |deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] |' | \
sudo tee /etc/apt/sources.list.d/nvidia-docker.list