79 lines
2.2 KiB
Bash
79 lines
2.2 KiB
Bash
#!/bin/bash
|
||
set -euo pipefail
|
||
|
||
AUTO_FIX=false
|
||
|
||
# 检查是否带 --fix 参数
|
||
if [[ "${1:-}" == "--fix" ]]; then
|
||
AUTO_FIX=true
|
||
fi
|
||
|
||
echo "🔍 Checking NVIDIA GPU status..."
|
||
|
||
# 1. 检查是否识别 GPU
|
||
echo -e "\n📦 [1] PCI 设备检测:"
|
||
if lspci | grep -i nvidia; then
|
||
echo "✅ 已检测到 NVIDIA GPU"
|
||
else
|
||
echo "❌ 未检测到 GPU,请检查硬件绑定或云平台配置"
|
||
exit 1
|
||
fi
|
||
|
||
# 2. 检查内核模块
|
||
echo -e "\n📦 [2] 内核模块检测:"
|
||
if lsmod | grep -q nvidia; then
|
||
echo "✅ nvidia 模块已加载"
|
||
else
|
||
echo "❌ nvidia 模块未加载"
|
||
echo "👉 尝试执行:sudo modprobe nvidia"
|
||
fi
|
||
|
||
# 3. 检查设备节点
|
||
echo -e "\n📦 [3] 设备节点检测:"
|
||
if ls /dev/nvidia0 &>/dev/null; then
|
||
echo "✅ /dev/nvidia0 存在"
|
||
else
|
||
echo "❌ 缺少 /dev/nvidia0,驱动可能未成功加载"
|
||
fi
|
||
|
||
# 4. 检查 nvidia-smi
|
||
echo -e "\n📦 [4] 驱动状态检测 (nvidia-smi):"
|
||
if command -v nvidia-smi &>/dev/null; then
|
||
if nvidia-smi; then
|
||
echo "✅ nvidia-smi 正常"
|
||
else
|
||
echo "❌ nvidia-smi 执行失败,驱动可能未正确绑定设备"
|
||
fi
|
||
else
|
||
echo "❌ 未安装 nvidia-smi 工具"
|
||
echo "👉 需安装驱动包 nvidia-driver-535、nvidia-utils-535 等"
|
||
if $AUTO_FIX; then
|
||
echo -e "\n⚙️ 正在自动安装驱动..."
|
||
sudo apt-get update
|
||
sudo apt-get install -y nvidia-driver-535 nvidia-utils-535 dkms linux-headers-$(uname -r)
|
||
echo -e "\n✅ 驱动安装完成,请重启后再运行本脚本确认"
|
||
exit 0
|
||
else
|
||
echo -e "\n👉 可执行以下命令安装推荐驱动:"
|
||
echo "sudo apt-get update && sudo apt-get install -y nvidia-driver-535 nvidia-utils-535 dkms linux-headers-\$(uname -r)"
|
||
fi
|
||
fi
|
||
|
||
# 5. dmesg 错误日志
|
||
echo -e "\n📦 [5] dmesg 日志(最近 NVIDIA 行):"
|
||
dmesg | grep -i nvidia | tail -n 20 || echo "ℹ️ 无 NVIDIA 错误日志"
|
||
|
||
# 6. nerdctl 测试(可选)
|
||
if command -v nerdctl &>/dev/null; then
|
||
echo -e "\n📦 [6] nerdctl GPU 容器测试:"
|
||
if nerdctl run --rm --gpus all nvidia/cuda:12.2.0-base-ubuntu22.04 nvidia-smi; then
|
||
echo "✅ nerdctl 能访问 GPU"
|
||
else
|
||
echo "❌ nerdctl 无法访问 GPU"
|
||
fi
|
||
else
|
||
echo -e "\n📦 [6] nerdctl 未安装,跳过容器测试"
|
||
fi
|
||
|
||
echo -e "\n🎉 GPU 检查完成"
|