Add GPU status checker and enhance offline packaging
This commit is contained in:
parent
3552407a92
commit
9c054fe027
@ -17,9 +17,10 @@ bash scripts/create-gpu-k8s-offline-package.sh
|
||||
|
||||
- Kubernetes 二进制镜像
|
||||
- Cilium、Helm 等依赖镜像
|
||||
- NVIDIA 驱动(nvidia-driver-535)及 nvidia-container-toolkit deb 包
|
||||
- NVIDIA 驱动(nvidia-driver-535)及 nvidia-container-toolkit 离线包(deb/rpm)
|
||||
- nerdctl CLI(v${NERDCTL_VERSION:-2.1.2})
|
||||
- 必要的容器镜像,包括 `registry.k8s.io/pause:3.8`
|
||||
- GPU 环境检测脚本 `check-gpu-status.sh`
|
||||
|
||||
该离线包用于基于 `sealos` 部署 Kubernetes,最低推荐版本为 **1.29**,也可以使用更新的 `1.30` 等稳定版本。
|
||||
|
||||
|
||||
78
scripts/check-gpu-status.sh
Normal file
78
scripts/check-gpu-status.sh
Normal file
@ -0,0 +1,78 @@
|
||||
#!/bin/bash
# check-gpu-status.sh - step-by-step diagnosis of the NVIDIA GPU stack on a host.
#
# Usage: check-gpu-status.sh [--fix]
#   --fix  when nvidia-smi is missing, install the 535 driver packages with
#          apt-get and exit 0 (a reboot is expected afterwards).
#
# Checks, in order: PCI device, kernel module, /dev/nvidia0, nvidia-smi,
# recent NVIDIA kernel log lines, and (only if nerdctl is installed) a GPU
# container run. The script exits 1 when no NVIDIA PCI device can be found;
# every later check only reports its result and lets the script continue.
set -euo pipefail

AUTO_FIX=false

# Enable automatic driver installation when the first argument is --fix.
if [[ "${1:-}" == "--fix" ]]; then
  AUTO_FIX=true
fi

echo "🔍 Checking NVIDIA GPU status..."

# 1. Is an NVIDIA device visible on the PCI bus?
echo -e "\n📦 [1] PCI 设备检测:"
if ! command -v lspci &>/dev/null; then
  # Without lspci the check below would wrongly claim that no GPU exists.
  echo "❌ 未找到 lspci 命令,请先安装 pciutils 后重试"
  exit 1
fi
if lspci | grep -i nvidia; then
  echo "✅ 已检测到 NVIDIA GPU"
else
  echo "❌ 未检测到 GPU,请检查硬件绑定或云平台配置"
  exit 1
fi

# 2. Is the nvidia kernel module loaded?
echo -e "\n📦 [2] 内核模块检测:"
# Capture the module list first: piping lsmod straight into `grep -q` under
# `pipefail` can fail spuriously, because grep exits on the first match and
# lsmod may then be killed by SIGPIPE.
loaded_modules="$(lsmod || true)"
if grep -q nvidia <<<"$loaded_modules"; then
  echo "✅ nvidia 模块已加载"
else
  echo "❌ nvidia 模块未加载"
  echo "👉 尝试执行:sudo modprobe nvidia"
fi

# 3. Does the first GPU device node exist?
echo -e "\n📦 [3] 设备节点检测:"
if [[ -e /dev/nvidia0 ]]; then
  echo "✅ /dev/nvidia0 存在"
else
  echo "❌ 缺少 /dev/nvidia0,驱动可能未成功加载"
fi

# 4. Does nvidia-smi exist and run successfully?
echo -e "\n📦 [4] 驱动状态检测 (nvidia-smi):"
if command -v nvidia-smi &>/dev/null; then
  if nvidia-smi; then
    echo "✅ nvidia-smi 正常"
  else
    echo "❌ nvidia-smi 执行失败,驱动可能未正确绑定设备"
  fi
else
  echo "❌ 未安装 nvidia-smi 工具"
  echo "👉 需安装驱动包 nvidia-driver-535、nvidia-utils-535 等"
  if [[ "$AUTO_FIX" == true ]]; then
    # The automatic fix is apt-only; say so instead of dying on a missing command.
    if ! command -v apt-get &>/dev/null; then
      echo "❌ 未找到 apt-get,无法自动安装驱动,请手动安装"
      exit 1
    fi
    echo -e "\n⚙️ 正在自动安装驱动..."
    sudo apt-get update
    sudo apt-get install -y nvidia-driver-535 nvidia-utils-535 dkms "linux-headers-$(uname -r)"
    echo -e "\n✅ 驱动安装完成,请重启后再运行本脚本确认"
    exit 0
  else
    echo -e "\n👉 可执行以下命令安装推荐驱动:"
    echo "sudo apt-get update && sudo apt-get install -y nvidia-driver-535 nvidia-utils-535 dkms linux-headers-\$(uname -r)"
  fi
fi

# 5. Last 20 kernel log lines that mention NVIDIA.
echo -e "\n📦 [5] dmesg 日志(最近 NVIDIA 行):"
if kernel_log="$(dmesg 2>/dev/null)"; then
  # grep exits non-zero when nothing matches; pipefail turns that into the
  # "no NVIDIA lines" message.
  grep -i nvidia <<<"$kernel_log" | tail -n 20 || echo "ℹ️ 无 NVIDIA 错误日志"
else
  # An unreadable kernel log must not be reported as "no NVIDIA lines".
  echo "⚠️ 无法读取 dmesg(可能需要 root 权限)"
fi

# 6. Optional: run nvidia-smi inside a container through nerdctl.
if command -v nerdctl &>/dev/null; then
  echo -e "\n📦 [6] nerdctl GPU 容器测试:"
  if nerdctl run --rm --gpus all nvidia/cuda:12.2.0-base-ubuntu22.04 nvidia-smi; then
    echo "✅ nerdctl 能访问 GPU"
  else
    echo "❌ nerdctl 无法访问 GPU"
  fi
else
  echo -e "\n📦 [6] nerdctl 未安装,跳过容器测试"
fi

echo -e "\n🎉 GPU 检查完成"
|
||||
@ -40,16 +40,42 @@ curl -s -L "https://nvidia.github.io/nvidia-docker/${distribution}/nvidia-docker
|
||||
sudo tee /etc/apt/sources.list.d/nvidia-docker.list
|
||||
|
||||
sudo apt-get update -y

# Pin the container toolkit to a known version by default. A value already
# present in the environment is kept, and an explicitly empty value selects
# the unpinned branch below (whatever version the repository currently offers).
export NVIDIA_CONTAINER_TOOLKIT_VERSION="${NVIDIA_CONTAINER_TOOLKIT_VERSION-1.17.8-1}"

if [ -n "${NVIDIA_CONTAINER_TOOLKIT_VERSION:-}" ]; then
  # All four packages are requested at the same version.
  sudo apt-get install --download-only -y \
    "nvidia-container-toolkit=${NVIDIA_CONTAINER_TOOLKIT_VERSION}" \
    "nvidia-container-toolkit-base=${NVIDIA_CONTAINER_TOOLKIT_VERSION}" \
    "libnvidia-container-tools=${NVIDIA_CONTAINER_TOOLKIT_VERSION}" \
    "libnvidia-container1=${NVIDIA_CONTAINER_TOOLKIT_VERSION}"
else
  sudo apt-get install --download-only -y \
    nvidia-container-toolkit nvidia-container-toolkit-base \
    libnvidia-container-tools libnvidia-container1
fi

# Download the remaining packages, then collect everything apt cached.
sudo apt-get install --download-only -y "${APT_PACKAGES[@]}"
cp /var/cache/apt/archives/*.deb "$WORKDIR/packages/"
sudo apt-get clean
|
||||
|
||||
# On hosts that provide dnf, also fetch the NVIDIA container toolkit RPMs
# into the offline package directory.
if command -v dnf >/dev/null; then
  distribution=$(. /etc/os-release; echo $ID$VERSION_ID)

  # Register the NVIDIA repository for the duration of the download only;
  # the repo file is deleted again at the end of this block.
  curl -s -L "https://nvidia.github.io/nvidia-docker/${distribution}/nvidia-docker.repo" \
    | sudo tee /etc/yum.repos.d/nvidia-docker.repo
  sudo dnf -y install 'dnf-command(download)'
  sudo dnf -y makecache

  RPM_PACKAGES=(
    nvidia-container-toolkit
    nvidia-container-toolkit-base
    libnvidia-container-tools
    libnvidia-container1
  )

  if [ -z "${NVIDIA_CONTAINER_TOOLKIT_VERSION:-}" ]; then
    # No version pinned: download all packages in a single call.
    sudo dnf download --resolve --destdir "$WORKDIR/packages" "${RPM_PACKAGES[@]}"
  else
    # Version pinned: request each package as <name>-<version>.
    for pkg in "${RPM_PACKAGES[@]}"; do
      sudo dnf download --resolve --destdir "$WORKDIR/packages" \
        "${pkg}-${NVIDIA_CONTAINER_TOOLKIT_VERSION}"
    done
  fi

  sudo rm -f /etc/yum.repos.d/nvidia-docker.repo
fi

# Fetch the sealos v5.0.1 amd64 deb from the GitHub release page.
curl -L -o "$WORKDIR/sealos_5.0.1_linux_amd64.deb" \
  https://github.com/labring/sealos/releases/download/v5.0.1/sealos_5.0.1_linux_amd64.deb
|
||||
@ -75,6 +101,7 @@ curl -L -o "$WORKDIR/nvidia-gpgkey" https://nvidia.github.io/nvidia-docker/gpgke
|
||||
|
||||
# Ship the deployment script and the GPU status checker inside the package.
# Both are copied from the directory that holds this script.
for bundled_script in gpu-k8s.sh check-gpu-status.sh; do
  cp "$(dirname "$0")/${bundled_script}" "$WORKDIR/"
done
unset bundled_script
|
||||
|
||||
# Create final archive
|
||||
TAR_NAME="gpu_k8s_offline_packages.tar.gz"
|
||||
|
||||
@ -79,12 +79,26 @@ load_offline_images() {
|
||||
}
|
||||
|
||||
#######################################
# Install every .deb and .rpm found under ${OFFLINE_DIR}/packages.
# Globals:   OFFLINE_DIR (read)
# Arguments: none
# Outputs:   one progress line on stdout, plus package-manager output
# Returns:   1 if the packages directory does not exist, 0 otherwise
#######################################
install_all_offline_packages() {
  if [ ! -d "${OFFLINE_DIR}/packages" ]; then
    return 1
  fi

  echo "📦 Using offline packages"

  local -a debs=() rpms=()
  local nullglob_was_set=false
  if shopt -q nullglob; then
    nullglob_was_set=true
  fi

  # nullglob makes an unmatched pattern expand to nothing instead of to the
  # literal pattern. Restore the caller's setting right after globbing so it
  # is put back no matter how the install commands below end.
  shopt -s nullglob
  debs=("${OFFLINE_DIR}"/packages/*.deb)
  rpms=("${OFFLINE_DIR}"/packages/*.rpm)
  if [ "$nullglob_was_set" = false ]; then
    shopt -u nullglob
  fi

  # The package directory may hold both formats; only use the tool that
  # exists on this host.
  if [ ${#debs[@]} -gt 0 ] && command -v dpkg >/dev/null; then
    # dpkg fails on unmet dependencies; apt-get -f then completes the install.
    sudo dpkg -i "${debs[@]}" 2>/dev/null || sudo apt-get -f install -y
  fi

  if [ ${#rpms[@]} -gt 0 ] && command -v dnf >/dev/null; then
    sudo dnf install -y "${rpms[@]}"
  fi

  return 0
}
|
||||
|
||||
install_base() {
|
||||
@ -120,6 +134,7 @@ install_nerdctl() {
|
||||
install_nvidia() {
|
||||
echo "[3/8] 安装 NVIDIA 驱动和容器工具"
|
||||
distribution="ubuntu22.04"
|
||||
|
||||
if [ -f "${OFFLINE_DIR}/nvidia-gpgkey" ]; then
|
||||
sudo install -m 0644 "${OFFLINE_DIR}/nvidia-gpgkey" /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg
|
||||
else
|
||||
@ -127,7 +142,9 @@ install_nvidia() {
|
||||
sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg
|
||||
fi
|
||||
|
||||
if ! install_all_offline_packages; then
|
||||
if install_all_offline_packages; then
|
||||
echo "✅ 离线安装 NVIDIA 相关包完成"
|
||||
else
|
||||
proxy_curl -sL https://nvidia.github.io/nvidia-docker/${distribution}/nvidia-docker.list | \
|
||||
sed 's|^deb |deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] |' | \
|
||||
sudo tee /etc/apt/sources.list.d/nvidia-docker.list
|
||||
|
||||
Loading…
Reference in New Issue
Block a user