add deepflow_agent role

This commit is contained in:
Haitao Pan 2025-07-07 13:32:16 +08:00
parent 88955b78a3
commit 998effa5a8
6 changed files with 550 additions and 0 deletions

View File

@ -0,0 +1,7 @@
---
# Playbook that applies the deepflow_agent role to the selected hosts.
- name: Deploy or Upgrade DeepFlow Agent
# Targets every host in the inventory, not only a specific group.
hosts: all
# Run the role's tasks with privilege escalation.
become: true
roles:
# Role path as written here; where it resolves from is not visible in this file.
- role: roles/vhosts/deepflow_agent

22
playbooks/inventory.ini Normal file
View File

@ -0,0 +1,22 @@
# Hosts that receive the DeepFlow agent. Per-host connection settings given
# here override the defaults in [all:vars] below.
# NOTE(review): SSH passwords are stored in plain text in this file; consider
# moving them to an encrypted store (e.g. Ansible Vault) or using key auth.
[deepflow_agents]
192.168.1.101 ansible_user=root ansible_ssh_pass=pass101
192.168.1.102 ansible_user=admin ansible_ssh_pass=pass102
192.168.1.103 ansible_user=root ansible_ssh_pass=pass103 ansible_port=2222
192.168.1.104 ansible_user=ubuntu ansible_ssh_private_key_file=~/.ssh/id_rsa_ubuntu
# Defaults applied to every host unless overridden on the host line.
[all:vars]
ansible_port=22
ansible_user=root
# Host key verification is switched off for all hosts.
ansible_host_key_checking=False
# SSH key or password (choose one)
# ansible_ssh_private_key_file=~/.ssh/id_rsa
# ansible_ssh_pass=your_password
# DeepFlow agent configuration variables (consumed by the config template)
controller_ips=["10.10.10.10", "10.10.10.11"]
vtap_group_id="g-P22vLIMdB6"
# Location of the DeepFlow agent installation package (used as the copy source)
agent_base_dir="deepflow-agent-for-linux"
agent_package_name="deepflow-agent-1.0-5407.systemd.x86_64.rpm"

View File

@ -0,0 +1,49 @@
# Tasks for the deepflow_agent role: upload the package, install it, render
# the agent configuration, restart the service and report its state.
# Inputs: agent_base_dir, agent_package_name, controller_ips, vtap_group_id.

- name: Copy agent package to /tmp on target
  copy:
    src: "{{ agent_base_dir }}/{{ agent_package_name }}"
    dest: "/tmp/{{ agent_package_name }}"
    mode: '0644'

# The installer is chosen from the package file extension, not from the
# target's package manager.
- name: Install agent package
  become: true
  shell: |
    case "{{ agent_package_name }}" in
      *.rpm)
        rpm -Uvh --force "/tmp/{{ agent_package_name }}"
        ;;
      *.deb)
        dpkg -i "/tmp/{{ agent_package_name }}"
        ;;
      *)
        echo "Unsupported package format"
        exit 1
        ;;
    esac
  args:
    executable: /bin/bash

# Render straight to the final location: the template module already writes
# the file in place and reports "changed" only when the content differs, so
# the former /tmp staging file and the always-changed `mv` step are not needed.
- name: Render config to /etc
  become: true
  template:
    src: deepflow-agent.yaml.j2
    dest: /etc/deepflow-agent.yaml
    owner: root
    group: root
    mode: '0644'

- name: Restart deepflow-agent service
  become: true
  systemd:
    name: deepflow-agent
    state: restarted
    enabled: true

# `systemctl is-active` exits non-zero when the unit is not active, which
# fails this task and therefore the play for that host.
- name: Show service status
  shell: systemctl is-active deepflow-agent
  register: agent_status
  changed_when: false

- name: Report Agent status
  debug:
    msg: "DeepFlow Agent status on {{ inventory_hostname }}: {{ agent_status.stdout }}"

View File

@ -0,0 +1,7 @@
{#- DeepFlow agent configuration template.
    Emits one list entry per address in `controller_ips` and, only when
    `vtap_group_id` is defined, the `vtap-group-id` key. -#}
controller-ips:
{% for ip in controller_ips %}
- {{ ip }}
{% endfor %}
{% if vtap_group_id is defined %}
vtap-group-id: "{{ vtap_group_id }}"
{% endif %}

View File

@ -0,0 +1,266 @@
#!/bin/bash
# Batch deploy / upgrade / verify the DeepFlow agent on the hosts listed in
# $IP_LIST (one "ip user password" triple per line), over sshpass + ssh.
set -e

####################################
# 🌐 Configuration
####################################
IP_LIST="./ip.list"
SERVICE_NAME="deepflow-agent"
PKG_DIR="deepflow-agent-for-linux"
MAX_PARALLEL=5
CONTROLLER_IP=""
VTAP_GROUP_ID=""
LIMIT=""
SSH_OPTS="-o StrictHostKeyChecking=no -o ConnectTimeout=15"
FAILED_FILE="failed_hosts.txt"
SUCCESS_FILE="success_hosts.txt"

# Print the one-line usage summary.
usage() {
  echo "用法: $0 {deploy|upgrade|verify} --controller <ip> --group <id> [--limit ip1,ip2]"
}

####################################
# Argument parsing
####################################
if [[ $# -eq 0 ]]; then
  usage
  exit 1
fi

ACTION="$1"
shift

while [[ $# -gt 0 ]]; do
  case "$1" in
    --controller | --group | --limit)
      # Without this check `shift 2` fails and `set -e` ends the script
      # without any message.
      if [[ $# -lt 2 ]]; then
        echo "❗ 参数 $1 缺少取值"
        exit 1
      fi
      case "$1" in
        --controller) CONTROLLER_IP="$2" ;;
        --group) VTAP_GROUP_ID="$2" ;;
        --limit) LIMIT="$2" ;;
      esac
      shift 2
      ;;
    *)
      echo "未知参数: $1"
      exit 1
      ;;
  esac
done

if [[ "$ACTION" != "deploy" && "$ACTION" != "upgrade" && "$ACTION" != "verify" ]]; then
  usage
  exit 1
fi

# verify only inspects the service; deploy/upgrade also write the config.
if [[ "$ACTION" != "verify" && (-z "$CONTROLLER_IP" || -z "$VTAP_GROUP_ID") ]]; then
  echo "❗ deploy/upgrade 必须传入 --controller 和 --group 参数"
  exit 1
fi

# Reset the result files only once the invocation is known to be valid, so a
# mistyped command line does not wipe the results of the previous run.
: > "$FAILED_FILE"
: > "$SUCCESS_FILE"
####################################
# Core functions
####################################

#######################################
# Process a single host according to $ACTION and record the outcome.
# Globals:   ACTION, SUCCESS_FILE, FAILED_FILE (read; files are appended to)
# Arguments: $1 - host address, $2 - SSH user, $3 - SSH password
# Outputs:   progress messages on stdout; the host address is appended to
#            SUCCESS_FILE or FAILED_FILE
#######################################
worker() {
  local ip="$1"
  local user="$2"
  local pass="$3"
  local remote_info arch init pkg_path

  echo "🔧 [$ACTION] 处理主机 $ip ($user)"

  # verify only checks the service state; nothing is installed.
  if [[ "$ACTION" == "verify" ]]; then
    if verify_agent "$ip" "$user" "$pass"; then
      echo "$ip" >> "$SUCCESS_FILE"
    else
      echo "$ip" >> "$FAILED_FILE"
    fi
    return
  fi

  if ! remote_info=$(fetch_remote_info "$ip" "$user" "$pass"); then
    echo "$ip 获取远程信息失败"
    echo "$ip" >> "$FAILED_FILE"
    return
  fi

  # remote_info has the form "<arch>|<init>".
  arch=$(cut -d'|' -f1 <<< "$remote_info")
  init=$(cut -d'|' -f2 <<< "$remote_info")

  if [[ "$init" == "unknown" ]]; then
    echo "$ip 不支持的初始化系统: $init"
    echo "$ip" >> "$FAILED_FILE"
    return
  fi

  pkg_path=$(choose_agent_package "$arch" "$init")
  if [[ "$pkg_path" == "UNSUPPORTED" ]]; then
    echo "$ip 无匹配安装包: $arch/$init"
    echo "$ip" >> "$FAILED_FILE"
    return
  fi

  # A real if/else: with `a && b && {…} || {…}` a failing command inside the
  # success group would also run the failure group and list the host twice.
  if install_agent "$ip" "$user" "$pass" "$pkg_path" \
    && update_config "$ip" "$user" "$pass"; then
    echo "$ip $ACTION 完成"
    echo "$ip" >> "$SUCCESS_FILE"
  else
    echo "$ip 安装或配置失败"
    echo "$ip" >> "$FAILED_FILE"
  fi
  echo "-------------------------------------------"
}
#######################################
# Detect the CPU family and init system of a remote host.
# Globals:   SSH_OPTS (read)
# Arguments: $1 - host address, $2 - SSH user, $3 - SSH password
# Outputs:   "<arch>|<init>" on stdout, where arch is "arm" or "x86" and
#            init is "systemd", "upstart" or "unknown"
# Returns:   exit status of the sshpass/ssh invocation
#######################################
fetch_remote_info() {
  local ip="$1" user="$2" pass="$3"
  # The password goes through the SSHPASS environment variable (sshpass -e)
  # so it does not show up in the process list the way `-p <pass>` does.
  # shellcheck disable=SC2086 # SSH_OPTS is deliberately split into options
  SSHPASS="$pass" sshpass -e ssh $SSH_OPTS "$user@$ip" bash <<'EOF'
arch=$(uname -m)
# Anything that is not an ARM64 name is reported as x86.
case "$arch" in
aarch64|arm64) arch="arm" ;;
*) arch="x86" ;;
esac
if command -v systemctl >/dev/null; then init=systemd;
elif command -v initctl >/dev/null; then init=upstart;
else init=unknown; fi
echo "${arch}|${init}"
EOF
}
#######################################
# Pick the newest local agent package for an architecture / init system.
# Globals:   PKG_DIR (read)
# Arguments: $1 - architecture ("arm", anything else is treated as x86)
#            $2 - init system name embedded in the package file name
# Outputs:   the chosen package path on stdout, or the literal
#            "UNSUPPORTED" when nothing matches; a notice on stderr
# NOTE(review): candidates of every package format (.rpm, .deb, ...) compete
# in one version sort; the target's package manager is not considered here.
#######################################
choose_agent_package() {
  local arch="$1" init="$2"
  local -a candidates=()
  local latest
  local restore_nullglob=0

  # Unmatched patterns must expand to nothing; put the option back afterwards
  # so a direct (non-subshell) caller is not affected.
  if ! shopt -q nullglob; then
    shopt -s nullglob
    restore_nullglob=1
  fi

  # The globs expand right here, into array elements, so paths containing
  # spaces stay intact and no second unquoted expansion is needed.
  if [[ "$arch" == "arm" ]]; then
    candidates=(
      "$PKG_DIR"/deepflow-agent-*."$init"-arm.*
      "$PKG_DIR"/deepflow-agent-*."$init"-arm64.*
      "$PKG_DIR"/deepflow-agent-*."$init"-aarch64.*
    )
  else
    candidates=(
      "$PKG_DIR"/deepflow-agent-*."$init"-x86.*
      "$PKG_DIR"/deepflow-agent-*."$init".*
    )
  fi

  if ((restore_nullglob)); then
    shopt -u nullglob
  fi

  if ((${#candidates[@]} == 0)); then
    echo "UNSUPPORTED"
    return
  fi

  # Version sort so that e.g. build 100 ranks above build 99.
  latest=$(printf '%s\n' "${candidates[@]}" | sort -V | tail -n 1)
  echo "🎯 选择安装包: $latest" >&2
  echo "$latest"
}
#######################################
# Upload an agent package to a host, install it and (re)start the service.
# Globals:   SSH_OPTS, SERVICE_NAME (read)
# Arguments: $1 - host address, $2 - SSH user, $3 - SSH password,
#            $4 - local package path (.rpm or .deb)
# Returns:   non-zero when the upload fails or the remote script fails
#######################################
install_agent() {
  local ip="$1" user="$2" pass="$3" pkg_path="$4"
  # The remote name keeps only the extension of the local package.
  local remote_pkg="/tmp/agent.${pkg_path##*.}"

  # This function is called inside an `&&` chain, where `set -e` is not in
  # effect, so the upload must be checked explicitly; otherwise a stale
  # package left in /tmp by an earlier run could be installed instead.
  # The password is passed via SSHPASS (sshpass -e) to keep it out of argv.
  # shellcheck disable=SC2086 # SSH_OPTS is deliberately split into options
  SSHPASS="$pass" sshpass -e scp $SSH_OPTS "$pkg_path" "$user@$ip:$remote_pkg" || return 1

  # Unquoted heredoc: $remote_pkg and $SERVICE_NAME are filled in locally,
  # while \$SUDO is left for the remote shell to expand.
  # shellcheck disable=SC2086
  SSHPASS="$pass" sshpass -e ssh $SSH_OPTS "$user@$ip" bash <<EOF
set -e
if command -v sudo >/dev/null; then SUDO="sudo"; else SUDO=""; fi
if [[ "$remote_pkg" == *.rpm ]]; then
\$SUDO rpm -Uvh --replacepkgs "$remote_pkg"
elif [[ "$remote_pkg" == *.deb ]]; then
\$SUDO dpkg -i "$remote_pkg" || \$SUDO apt-get install -f -y
else
echo "❌ 不支持的安装包格式"
exit 1
fi
if command -v systemctl &>/dev/null; then
\$SUDO systemctl enable $SERVICE_NAME
\$SUDO systemctl restart $SERVICE_NAME
elif command -v service &>/dev/null; then
\$SUDO service $SERVICE_NAME restart
\$SUDO chkconfig $SERVICE_NAME on
elif command -v initctl &>/dev/null; then
\$SUDO initctl restart $SERVICE_NAME || \$SUDO initctl start $SERVICE_NAME
else
echo "❌ 无法识别服务管理方式"
fi
EOF
}
#######################################
# Write /etc/deepflow-agent.yaml on a remote host.
# Globals:   SSH_OPTS, CONTROLLER_IP, VTAP_GROUP_ID (read)
# Arguments: $1 - host address, $2 - SSH user, $3 - SSH password
# Returns:   exit status of the sshpass/ssh invocation
#######################################
update_config() {
  local ip="$1" user="$2" pass="$3"
  # The password is passed via SSHPASS (sshpass -e) so it is not exposed on
  # the command line. The outer heredoc is unquoted: $CONTROLLER_IP and
  # $VTAP_GROUP_ID are substituted locally, \$-escaped names remotely.
  # shellcheck disable=SC2086 # SSH_OPTS is deliberately split into options
  SSHPASS="$pass" sshpass -e ssh $SSH_OPTS "$user@$ip" bash <<EOF
set -e
if command -v sudo >/dev/null; then SUDO="sudo"; else SUDO=""; fi
CONFIG_FILE="/etc/deepflow-agent.yaml"
\$SUDO mkdir -p \$(dirname \$CONFIG_FILE)
cat <<CFG | \$SUDO tee "\$CONFIG_FILE" >/dev/null
controller-ips:
- $CONTROLLER_IP
vtap-group-id: "$VTAP_GROUP_ID"
CFG
\$SUDO chmod 644 "\$CONFIG_FILE"
\$SUDO chown root:root "\$CONFIG_FILE"
EOF
}
#######################################
# Report whether the agent service is running on a remote host.
# Globals:   SSH_OPTS, SERVICE_NAME (read)
# Arguments: $1 - host address, $2 - SSH user, $3 - SSH password
# Outputs:   a header line, then whatever the remote status command prints
# Returns:   exit status of the remote status check (via sshpass/ssh)
#######################################
verify_agent() {
  local ip="$1" user="$2" pass="$3"
  echo "🔍 $ip 状态检查:"
  # Tries systemd, then SysV `service`, then upstart. The password is passed
  # via SSHPASS (sshpass -e) so it is not exposed on the command line.
  # shellcheck disable=SC2086 # SSH_OPTS is deliberately split into options
  SSHPASS="$pass" sshpass -e ssh $SSH_OPTS "$user@$ip" "
    systemctl is-active $SERVICE_NAME 2>/dev/null || \
    service $SERVICE_NAME status || \
    initctl status $SERVICE_NAME
  "
}
####################################
# Concurrency control and main logic
####################################

# Block until fewer than MAX_PARALLEL background jobs are running, polling
# the shell's job table every half second.
sem() {
  until (($(jobs -r | wc -l) < MAX_PARALLEL)); do
    sleep 0.5
  done
}
# Split the --limit list once instead of on every input line.
LIMIT_IPS=()
if [[ -n "$LIMIT" ]]; then
  IFS=',' read -ra LIMIT_IPS <<< "$LIMIT"
fi

# One worker per "ip user pass" line. `|| [[ -n "$ip" ]]` keeps a final line
# that has no trailing newline.
while read -r ip user pass || [[ -n "$ip" ]]; do
  # Blank lines and '#' comment lines are not hosts.
  if [[ -z "$ip" || "$ip" == \#* ]]; then
    continue
  fi

  if [[ -n "$LIMIT" ]]; then
    skip=true
    for lim_ip in "${LIMIT_IPS[@]}"; do
      if [[ "$ip" == "$lim_ip" ]]; then
        skip=false
        break
      fi
    done
    if $skip; then
      continue
    fi
  fi

  # Throttle, then hand the host to a background worker.
  sem
  worker "$ip" "$user" "$pass" &
done < "$IP_LIST"

# Barrier: every worker must finish before the summary is computed.
wait

TOTAL_SUCCESS=$(wc -l < "$SUCCESS_FILE")
TOTAL_FAIL=$(wc -l < "$FAILED_FILE")
echo "🎯 全部任务执行完成: 成功 $TOTAL_SUCCESS 台,失败 $TOTAL_FAIL"
if [[ -s "$FAILED_FILE" ]]; then
  echo "❗ 失败主机列表已保存: $FAILED_FILE"
fi

View File

@ -0,0 +1,199 @@
#!/bin/bash
# Staged installer for a single-node GPU Kubernetes setup; each stage is a
# function selected by a command-line flag (see show_help).
# Abort on errors, unset variables and failures anywhere in a pipeline.
set -euo pipefail
# === Global variables ===
# First address printed by `hostname -I`; passed to sealos as the master.
MASTER_IP=$(hostname -I | awk '{print $1}')
# Falls back to `whoami` when USER is unset or empty.
USER=${USER:-$(whoami)}
# Private key used by setup_ssh and handed to sealos via --pk.
SSH_KEY="${HOME}/.ssh/id_rsa"
# Image references passed to `sealos run`.
K8S_VERSION="labring/kubernetes:v1.25.16"
CILIUM_VERSION="labring/cilium:v1.13.4"
HELM_VERSION="labring/helm:v3.9.4"
# apt package name of the NVIDIA driver to install.
NVIDIA_DRIVER_VERSION="nvidia-driver-535"
# Version segment of the NVIDIA device plugin manifest URL.
NVIDIA_PLUGIN_VERSION="v0.17.1"
# nerdctl release downloaded by install_containerd.
NERDCTL_VERSION="2.1.2"
# Proxy endpoint, used only when USE_PROXY is "true".
PROXY_ADDR="http://127.0.0.1:1081"
# Overridable from the environment; defaults to "false".
USE_PROXY=${USE_PROXY:-false}
# === Optional proxy ===

# Export the four proxy variables when USE_PROXY is "true"; otherwise remove
# them from the environment. Prints which mode is in effect.
configure_proxy() {
  local var_name

  if [ "$USE_PROXY" != true ]; then
    unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY
    echo "🌐 代理已关闭"
    return
  fi

  for var_name in http_proxy https_proxy HTTP_PROXY HTTPS_PROXY; do
    export "${var_name}=${PROXY_ADDR}"
  done
  echo "🌐 代理已启用: $PROXY_ADDR"
}
# Run curl with the given arguments, adding --proxy "$PROXY_ADDR" in front
# of them when USE_PROXY is "true".
proxy_curl() {
  case "$USE_PROXY" in
    true) curl --proxy "$PROXY_ADDR" "$@" ;;
    *) curl "$@" ;;
  esac
}
# Stage 1: refresh the apt index and install the base packages the later
# stages rely on.
install_base() {
  local -a base_packages=(
    curl gnupg2 ca-certificates lsb-release
    apt-transport-https software-properties-common openssh-client
    openssh-server uidmap
  )

  echo "[1/8] 安装基础依赖"
  sudo apt-get update -y
  sudo apt-get install -y "${base_packages[@]}"
}
# Stage 2: install containerd from apt and unpack the nerdctl "full" bundle
# into /usr/local, then write the default containerd config and start it.
# Globals: NERDCTL_VERSION (read)
# Returns: the failing command's status if the download or extraction fails
install_containerd() {
  local tmpdir archive url rc

  echo "[2/8] 安装 containerd + nerdctl"
  # Docker packages may not be installed at all, so a failed purge is fine.
  sudo apt-get purge -y docker.io docker-ce docker-ce-cli containerd.io || true
  sudo apt-get install -y containerd

  tmpdir=$(mktemp -d) || return 1
  archive="nerdctl-full-${NERDCTL_VERSION}-linux-amd64.tar.gz"
  url="https://github.com/containerd/nerdctl/releases/download/v${NERDCTL_VERSION}/${archive}"

  echo "🔽 下载 nerdctl: $url"
  # The scratch directory is removed on every path, success or failure.
  proxy_curl -fLo "${tmpdir}/${archive}" "$url" || {
    rc=$?
    rm -rf -- "$tmpdir"
    return "$rc"
  }

  echo "📆 解压 nerdctl 到 /usr/local"
  sudo tar -xzf "${tmpdir}/${archive}" -C /usr/local || {
    rc=$?
    rm -rf -- "$tmpdir"
    return "$rc"
  }
  rm -rf -- "$tmpdir"

  sudo mkdir -p /etc/containerd
  sudo containerd config default | sudo tee /etc/containerd/config.toml > /dev/null
  sudo systemctl enable --now containerd

  # Informational only: a missing nerdctl is reported but does not fail the stage.
  if nerdctl --version; then
    echo "✅ nerdctl 安装成功"
  else
    echo "❌ nerdctl 安装失败"
  fi
}
# Stage 3: add the NVIDIA container repository, install the driver and the
# container toolkit, make the NVIDIA runtime containerd's default and check
# that nvidia-smi works. Exits the script when the driver check fails.
# Globals: NVIDIA_DRIVER_VERSION (read)
install_nvidia() {
  # The repository path is pinned to this distribution string.
  local distribution="ubuntu22.04"
  local keyring="/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg"

  echo "[3/8] 安装 NVIDIA 驱动和容器工具"

  # --yes overwrites a keyring left by an earlier run; without it gpg asks
  # for confirmation (or fails when it cannot ask) once the file exists.
  proxy_curl -sL https://nvidia.github.io/nvidia-docker/gpgkey \
    | sudo gpg --dearmor --yes -o "$keyring"

  # Tie every "deb" entry of the downloaded list to the keyring above.
  proxy_curl -sL "https://nvidia.github.io/nvidia-docker/${distribution}/nvidia-docker.list" \
    | sed "s|^deb |deb [signed-by=${keyring}] |" \
    | sudo tee /etc/apt/sources.list.d/nvidia-docker.list

  sudo apt-get update -y
  sudo apt-get install -y "${NVIDIA_DRIVER_VERSION}" nvidia-container-toolkit
  sudo nvidia-ctk runtime configure --runtime=containerd --set-as-default
  sudo systemctl restart containerd

  if ! command -v nvidia-smi > /dev/null; then
    echo "❌ nvidia-smi 未找到"
    exit 1
  fi
  nvidia-smi || {
    echo "❌ NVIDIA 驱动有问题"
    exit 1
  }
}
# Stage 4: install Sealos through its upstream install script, unless a
# `sealos` command is already available.
install_sealos() {
  echo "[4/8] 安装 Sealos"
  if command -v sealos &> /dev/null; then
    return 0
  fi
  proxy_curl -sfL https://raw.githubusercontent.com/labring/sealos/main/scripts/install.sh | bash
}
# Stage 5: make sure a key pair exists at SSH_KEY, authorize its public key
# for the current user and enable the SSH daemon.
# Globals: SSH_KEY (read)
setup_ssh() {
  echo "[5/8] 配置 SSH 免密"

  if [ ! -f "${SSH_KEY}" ]; then
    ssh-keygen -f "${SSH_KEY}" -N ""
  fi

  mkdir -p ~/.ssh
  # Append the key only when that exact line is not already present, so
  # repeated runs do not pile up duplicate entries.
  if ! grep -qxF -f "${SSH_KEY}.pub" ~/.ssh/authorized_keys 2> /dev/null; then
    cat "${SSH_KEY}.pub" >> ~/.ssh/authorized_keys
  fi
  chmod 600 ~/.ssh/authorized_keys && chmod 700 ~/.ssh

  # The unit is named "ssh" on some systems and "sshd" on others.
  sudo systemctl enable --now ssh || sudo systemctl enable --now sshd
}
# Stage 6: bring up the cluster with `sealos run` on MASTER_IP, then switch
# from the sealos-containerd unit to the system containerd unit and print
# containerd's "Active" status line.
deploy_k8s() {
  local -a cluster_images=("${K8S_VERSION}" "${CILIUM_VERSION}" "${HELM_VERSION}")

  echo "[6/8] 使用 Sealos 部署 K8s"
  sealos run "${cluster_images[@]}" \
    --masters "${MASTER_IP}" \
    --user "${USER}" \
    --pk "${SSH_KEY}" \
    --env '{}' \
    --cmd "kubeadm init --skip-phases=addon/kube-proxy"

  echo "[6.1] 禁用 sealos containerd, 启用系统 containerd"
  # Failure to disable the sealos unit is tolerated.
  sudo systemctl disable --now sealos-containerd || true
  sudo systemctl enable --now containerd
  # Short pause before querying the unit status.
  sleep 3
  sudo systemctl status containerd --no-pager | grep Active
}
# Stage 7: apply the NVIDIA device plugin manifest for NVIDIA_PLUGIN_VERSION,
# wait briefly, then show the plugin pods and the node's NVIDIA capacity line.
# The function's status is that of the final capacity grep.
deploy_plugin() {
  echo "[7/8] 部署 NVIDIA Device Plugin"
  plugin_url="https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/${NVIDIA_PLUGIN_VERSION}/deployments/static/nvidia-device-plugin.yml"

  # With the proxy enabled, kubectl gets the proxy variables for this one call.
  case "$USE_PROXY" in
    true)
      HTTPS_PROXY=$PROXY_ADDR HTTP_PROXY=$PROXY_ADDR \
        kubectl apply -f "$plugin_url"
      ;;
    *)
      kubectl apply -f "$plugin_url"
      ;;
  esac

  # Fixed pause before looking for the plugin pods.
  sleep 15
  if ! kubectl -n kube-system get pods | grep nvidia; then
    echo "⚠️ 插件未启动"
  fi
  kubectl describe node | grep -A10 Capacity | grep -i nvidia
}
# Stage 8: run the CUDA vectoradd sample as a one-shot pod requesting one
# GPU, wait up to 120s for it to finish, then print its logs.
run_test() {
  echo "[8/8] 运行 CUDA vectoradd GPU 测试"
  kubectl apply -f - <<'EOF'
apiVersion: v1
kind: Pod
metadata:
  name: gpu-pod
spec:
  restartPolicy: Never
  containers:
    - name: cuda-test
      image: nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda12.5.0
      resources: { limits: { "nvidia.com/gpu": 1 } }
  tolerations:
    - key: nvidia.com/gpu
      operator: Exists
      effect: NoSchedule
EOF
  # "Succeeded" is a pod phase, not a pod condition, so wait on
  # .status.phase; `--for=condition=Succeeded` never matches and always
  # runs into the timeout. A timeout is tolerated so the logs still print.
  kubectl wait pod/gpu-pod --for=jsonpath='{.status.phase}'=Succeeded --timeout=120s || true
  kubectl logs gpu-pod || echo "⚠️ 未获取日志"
}
# Print the usage text: the available stage flags followed by example
# invocations.
show_help() {
  printf '%s\n\n' "用法: ./gpu-k8s.sh [阶段参数...]"
  printf '%s\n' \
    "可用阶段:" \
    " --install-base 安装基础依赖" \
    " --install-containerd 安装 containerd + nerdctl" \
    " --install-nvidia 安装 NVIDIA 驱动和工具" \
    " --install-sealos 安装 Sealos" \
    " --setup-ssh 配置 SSH 免密" \
    " --deploy-k8s 使用 Sealos 部署 Kubernetes" \
    " --deploy-plugin 部署 NVIDIA Device Plugin" \
    " --run-test 运行 GPU 测试" \
    " --all 全部步骤执行"
  printf '\n%s\t\t\t\t%s\n' "示例命令" "说明"
  printf '%s\n' \
    "USE_PROXY=true ./gpu-k8s.sh --install-nvidia # 只安装 NVIDIA 工具包并走代理" \
    "USE_PROXY=false ./gpu-k8s.sh --all # 全流程执行但不使用代理" \
    "./gpu-k8s.sh --install-sealos --deploy-k8s # 默认关闭代理执行指定阶段"
}
# === Entry point ===
# Apply the proxy setting first, then run the requested stages in the order
# given on the command line; a blank line is printed after each argument.
configure_proxy

if (($# == 0)); then
  show_help
  exit 1
fi

for arg in "$@"; do
  case "$arg" in
    --all)
      # Every stage, in pipeline order.
      for stage in \
        install_base install_containerd install_nvidia install_sealos \
        setup_ssh deploy_k8s deploy_plugin run_test; do
        "$stage"
      done
      ;;
    --install-base | --install-containerd | --install-nvidia | --install-sealos | \
      --setup-ssh | --deploy-k8s | --deploy-plugin | --run-test)
      # Each flag maps to the function of the same name: drop the leading
      # "--" and turn the remaining dashes into underscores.
      stage=${arg#--}
      "${stage//-/_}"
      ;;
    *)
      show_help
      exit 1
      ;;
  esac
  echo
done