diff --git a/playbooks/deploy_deepflow_agent b/playbooks/deploy_deepflow_agent
new file mode 100644
index 0000000..f8ba686
--- /dev/null
+++ b/playbooks/deploy_deepflow_agent
@@ -0,0 +1,9 @@
+---
+# Installs or upgrades the DeepFlow agent on every inventory host by applying
+# the deepflow_agent role with privilege escalation.
+- name: Deploy or Upgrade DeepFlow Agent
+  hosts: all
+  become: true
+
+  roles:
+    - role: roles/vhosts/deepflow_agent
diff --git a/playbooks/inventory.ini b/playbooks/inventory.ini
new file mode 100644
index 0000000..c57cf81
--- /dev/null
+++ b/playbooks/inventory.ini
@@ -0,0 +1,22 @@
+[deepflow_agents]
+192.168.1.101 ansible_user=root ansible_ssh_pass=pass101
+192.168.1.102 ansible_user=admin ansible_ssh_pass=pass102
+192.168.1.103 ansible_user=root ansible_ssh_pass=pass103 ansible_port=2222
+192.168.1.104 ansible_user=ubuntu ansible_ssh_private_key_file=~/.ssh/id_rsa_ubuntu
+
+[all:vars]
+ansible_port=22
+ansible_user=root
+ansible_host_key_checking=False
+
+# SSH key or password (choose one)
+# ansible_ssh_private_key_file=~/.ssh/id_rsa
+# ansible_ssh_pass=your_password
+
+# DeepFlow agent configuration variables
+controller_ips=["10.10.10.10", "10.10.10.11"]
+vtap_group_id="g-P22vLIMdB6"
+
+# Location of the DeepFlow agent installation package
+agent_base_dir="deepflow-agent-for-linux"
+agent_package_name="deepflow-agent-1.0-5407.systemd.x86_64.rpm"
diff --git a/playbooks/roles/vhosts/deepflow_agent/tasks/main.yml b/playbooks/roles/vhosts/deepflow_agent/tasks/main.yml
new file mode 100644
index 0000000..1110175
--- /dev/null
+++ b/playbooks/roles/vhosts/deepflow_agent/tasks/main.yml
@@ -0,0 +1,53 @@
+# Upload the agent package, install it, write /etc/deepflow-agent.yaml,
+# restart the service and report its state.
+- name: Copy agent package to /tmp on target
+  copy:
+    src: "{{ agent_base_dir }}/{{ agent_package_name }}"
+    dest: "/tmp/{{ agent_package_name }}"
+    mode: '0644'
+
+- name: Install agent package
+  become: true
+  shell: |
+    case "{{ agent_package_name }}" in
+      *.rpm)
+        rpm -Uvh --force "/tmp/{{ agent_package_name }}"
+        ;;
+      *.deb)
+        dpkg -i "/tmp/{{ agent_package_name }}"
+        ;;
+      *)
+        echo "Unsupported package format"
+        exit 1
+        ;;
+    esac
+  args:
+    executable: /bin/bash
+
+# Render straight to the final path: the template module only reports a change
+# when the content differs, and nothing is staged under a fixed name in /tmp.
+- name: Render config to /etc
+  become: true
+  template:
+    src: deepflow-agent.yaml.j2
+    dest: /etc/deepflow-agent.yaml
+    owner: root
+    group: root
+    mode: '0644'
+
+- name: Restart deepflow-agent service
+  become: true
+  systemd:
+    name: deepflow-agent
+    state: restarted
+    enabled: true
+
+# is-active exits non-zero when the unit is not active, which fails the play.
+- name: Show service status
+  command: systemctl is-active deepflow-agent
+  register: agent_status
+  changed_when: false
+
+- name: Report Agent status
+  debug:
+    msg: "DeepFlow Agent status on {{ inventory_hostname }}: {{ agent_status.stdout }}"
diff --git a/playbooks/roles/vhosts/deepflow_agent/templates/deepflow-agent.yaml.j2 b/playbooks/roles/vhosts/deepflow_agent/templates/deepflow-agent.yaml.j2
new file mode 100644
index 0000000..4c5111b
--- /dev/null
+++ b/playbooks/roles/vhosts/deepflow_agent/templates/deepflow-agent.yaml.j2
@@ -0,0 +1,7 @@
+controller-ips:
+{% for ip in controller_ips %}
+  - {{ ip }}
+{% endfor %}
+{% if vtap_group_id is defined %}
+vtap-group-id: "{{ vtap_group_id }}"
+{% endif %}
diff --git a/scripts/deploy_deepflow_agent.sh b/scripts/deploy_deepflow_agent.sh
new file mode 100644
index 0000000..8986d5b
--- /dev/null
+++ b/scripts/deploy_deepflow_agent.sh
@@ -0,0 +1,351 @@
+#!/bin/bash
+#
+# Deploy, upgrade or verify the DeepFlow agent on every host listed in
+# ./ip.list (one "ip user password" entry per line) over password SSH.
+#
+# Usage:
+#   deploy_deepflow_agent.sh {deploy|upgrade|verify} --controller <ip> \
+#     --group <id> [--limit ip1,ip2]
+#
+# Needs sshpass, ssh and scp locally. Host IPs are appended to
+# success_hosts.txt or failed_hosts.txt in the current directory.
+
+set -e
+
+####################################
+# 🌐 Configuration
+####################################
+
+IP_LIST="./ip.list"
+SERVICE_NAME="deepflow-agent"
+PKG_DIR="deepflow-agent-for-linux"
+MAX_PARALLEL=5
+
+CONTROLLER_IP=""
+VTAP_GROUP_ID=""
+LIMIT=""
+
+# An array keeps each option a separate word without unquoted expansion.
+SSH_OPTS=(-o StrictHostKeyChecking=no -o ConnectTimeout=15)
+
+FAILED_FILE="failed_hosts.txt"
+SUCCESS_FILE="success_hosts.txt"
+
+####################################
+# Argument parsing
+####################################
+
+# Print the usage line and exit with status 1.
+usage() {
+  echo "用法: $0 {deploy|upgrade|verify} --controller <ip> --group <id> [--limit ip1,ip2]"
+  exit 1
+}
+
+[[ $# -gt 0 ]] || usage
+
+ACTION="$1"
+shift
+
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    --controller | --group | --limit)
+      # A flag without a value would make "shift 2" fail, and set -e would
+      # then end the script without any message.
+      [[ $# -ge 2 ]] || usage
+      case "$1" in
+        --controller) CONTROLLER_IP="$2" ;;
+        --group) VTAP_GROUP_ID="$2" ;;
+        --limit) LIMIT="$2" ;;
+      esac
+      shift 2
+      ;;
+    *)
+      echo "未知参数: $1"
+      exit 1
+      ;;
+  esac
+done
+
+if [[ "$ACTION" != "deploy" && "$ACTION" != "upgrade" && "$ACTION" != "verify" ]]; then
+  usage
+fi
+
+if [[ "$ACTION" != "verify" && ( -z "$CONTROLLER_IP" || -z "$VTAP_GROUP_ID" ) ]]; then
+  echo "❗ deploy/upgrade 必须传入 --controller 和 --group 参数"
+  exit 1
+fi
+
+# Split --limit once, not once per host line.
+LIMIT_IPS=()
+if [[ -n "$LIMIT" ]]; then
+  IFS=',' read -ra LIMIT_IPS <<< "$LIMIT"
+fi
+
+# Truncate the result files only once the arguments are valid, so a usage
+# error does not wipe the results of an earlier run.
+: > "$FAILED_FILE"
+: > "$SUCCESS_FILE"
+
+####################################
+# Core functions
+####################################
+
+# Run ssh/scp through sshpass with the password passed in the SSHPASS
+# environment variable (sshpass -e) rather than on the command line.
+# Arguments: $1 - password, $2... - command to run and its arguments
+with_pass() {
+  local pass="$1"
+  shift
+  SSHPASS="$pass" sshpass -e "$@"
+}
+
+# Process one host for the current ACTION and record the outcome.
+# Globals:   ACTION (read); SUCCESS_FILE, FAILED_FILE (appended to)
+# Arguments: $1 - host IP, $2 - SSH user, $3 - SSH password
+worker() {
+  local ip="$1"
+  local user="$2"
+  local pass="$3"
+  local remote_info arch init pkg_fmt pkg_path
+
+  echo "🔧 [$ACTION] 处理主机 $ip ($user)"
+
+  if [[ "$ACTION" == "verify" ]]; then
+    if verify_agent "$ip" "$user" "$pass"; then
+      echo "$ip" >> "$SUCCESS_FILE"
+    else
+      echo "$ip" >> "$FAILED_FILE"
+    fi
+    return
+  fi
+
+  if ! remote_info=$(fetch_remote_info "$ip" "$user" "$pass"); then
+    echo "❌ $ip 获取远程信息失败"
+    echo "$ip" >> "$FAILED_FILE"
+    return
+  fi
+
+  # Parse "arch|init|pkg" from the last output line.
+  IFS='|' read -r arch init pkg_fmt <<< "${remote_info##*$'\n'}"
+
+  if [[ "$init" == "unknown" ]]; then
+    echo "❌ $ip 不支持的初始化系统: $init"
+    echo "$ip" >> "$FAILED_FILE"
+    return
+  fi
+
+  pkg_path=$(choose_agent_package "$arch" "$init" "$pkg_fmt")
+
+  if [[ "$pkg_path" == "UNSUPPORTED" ]]; then
+    echo "❌ $ip 无匹配安装包: $arch/$init"
+    echo "$ip" >> "$FAILED_FILE"
+    return
+  fi
+
+  if install_agent "$ip" "$user" "$pass" "$pkg_path" \
+    && update_config "$ip" "$user" "$pass"; then
+    echo "✅ $ip $ACTION 完成"
+    echo "$ip" >> "$SUCCESS_FILE"
+  else
+    echo "❌ $ip 安装或配置失败"
+    echo "$ip" >> "$FAILED_FILE"
+  fi
+
+  echo "-------------------------------------------"
+}
+
+# Print "<arch>|<init>|<pkg>" for a host: arch is arm or x86, init is systemd,
+# upstart or unknown, pkg is deb, rpm or unknown.
+# Arguments: $1 - host IP, $2 - SSH user, $3 - SSH password
+fetch_remote_info() {
+  local ip="$1" user="$2" pass="$3"
+
+  with_pass "$pass" ssh "${SSH_OPTS[@]}" "$user@$ip" bash <<'EOF'
+arch=$(uname -m)
+case "$arch" in
+  aarch64|arm64) arch="arm" ;;
+  *) arch="x86" ;;
+esac
+
+if command -v systemctl >/dev/null; then init=systemd;
+elif command -v initctl >/dev/null; then init=upstart;
+else init=unknown; fi
+
+# Report which package format the host can install.
+if command -v dpkg >/dev/null 2>&1; then pkg=deb;
+elif command -v rpm >/dev/null 2>&1; then pkg=rpm;
+else pkg=unknown; fi
+
+echo "${arch}|${init}|${pkg}"
+EOF
+}
+
+# Print the newest package under PKG_DIR that matches the host, or the literal
+# UNSUPPORTED when none does. The choice is also logged to stderr.
+# Arguments: $1 - arch (arm|x86), $2 - init system,
+#            $3 - optional format (deb|rpm); anything else disables the
+#                 format filter
+choose_agent_package() {
+  local arch="$1" init="$2" pkg_fmt="${3:-}"
+  local -a candidates=() files=()
+  local file latest
+
+  # Patterns without a match expand to nothing instead of themselves.
+  shopt -s nullglob
+
+  if [[ "$arch" == "arm" ]]; then
+    candidates=(
+      "$PKG_DIR"/deepflow-agent-*."$init"-arm.*
+      "$PKG_DIR"/deepflow-agent-*."$init"-arm64.*
+      "$PKG_DIR"/deepflow-agent-*."$init"-aarch64.*
+    )
+  else
+    candidates=(
+      "$PKG_DIR"/deepflow-agent-*."$init"-x86.*
+      "$PKG_DIR"/deepflow-agent-*."$init".*
+    )
+  fi
+
+  for file in "${candidates[@]}"; do
+    # Without this filter, version sorting could pick an .rpm for a dpkg host.
+    if [[ "$pkg_fmt" == "deb" || "$pkg_fmt" == "rpm" ]] \
+      && [[ "$file" != *."$pkg_fmt" ]]; then
+      continue
+    fi
+    files+=("$file")
+  done
+
+  if [[ ${#files[@]} -gt 0 ]]; then
+    latest=$(printf "%s\n" "${files[@]}" | sort -V | tail -1)
+    echo "🎯 选择安装包: $latest" >&2
+    echo "$latest"
+  else
+    echo "UNSUPPORTED"
+  fi
+}
+
+# Copy a package to the host, install it, then enable and restart the service.
+# Globals:   SERVICE_NAME, SSH_OPTS (read)
+# Arguments: $1 - host IP, $2 - SSH user, $3 - SSH password,
+#            $4 - local package path
+# Returns:   non-zero when the copy or a remote step fails
+install_agent() {
+  local ip="$1" user="$2" pass="$3" pkg_path="$4"
+  local remote_pkg="/tmp/agent.${pkg_path##*.}"
+
+  # worker calls this function as an if-condition, where set -e is ignored,
+  # so a failed copy has to stop the function explicitly.
+  with_pass "$pass" scp "${SSH_OPTS[@]}" "$pkg_path" "$user@$ip:$remote_pkg" \
+    || return 1
+
+  # Unquoted delimiter: $remote_pkg and $SERVICE_NAME expand locally, while
+  # \$-escaped variables are evaluated on the remote host.
+  with_pass "$pass" ssh "${SSH_OPTS[@]}" "$user@$ip" bash <<EOF
+set -e
+if command -v sudo &>/dev/null; then SUDO="sudo"; else SUDO=""; fi
+
+if [[ "$remote_pkg" == *.rpm ]]; then
+  \$SUDO rpm -Uvh --replacepkgs "$remote_pkg"
+elif [[ "$remote_pkg" == *.deb ]]; then
+  \$SUDO dpkg -i "$remote_pkg" || \$SUDO apt-get install -f -y
+else
+  echo "❌ 不支持的安装包格式"
+  exit 1
+fi
+
+if command -v systemctl &>/dev/null; then
+  \$SUDO systemctl enable $SERVICE_NAME
+  \$SUDO systemctl restart $SERVICE_NAME
+elif command -v service &>/dev/null; then
+  \$SUDO service $SERVICE_NAME restart
+  if command -v chkconfig &>/dev/null; then \$SUDO chkconfig $SERVICE_NAME on; fi
+elif command -v initctl &>/dev/null; then
+  \$SUDO initctl restart $SERVICE_NAME || \$SUDO initctl start $SERVICE_NAME
+else
+  echo "❌ 无法识别服务管理方式"
+  exit 1
+fi
+EOF
+}
+
+# Write /etc/deepflow-agent.yaml on the host from CONTROLLER_IP and
+# VTAP_GROUP_ID, then restart the service.
+# Arguments: $1 - host IP, $2 - SSH user, $3 - SSH password
+update_config() {
+  local ip="$1" user="$2" pass="$3"
+
+  with_pass "$pass" ssh "${SSH_OPTS[@]}" "$user@$ip" bash <<EOF
+set -e
+if command -v sudo &>/dev/null; then SUDO="sudo"; else SUDO=""; fi
+CONFIG_FILE="/etc/deepflow-agent.yaml"
+\$SUDO mkdir -p \$(dirname \$CONFIG_FILE)
+cat <<CFG | \$SUDO tee "\$CONFIG_FILE" >/dev/null
+controller-ips:
+  - $CONTROLLER_IP
+vtap-group-id: "$VTAP_GROUP_ID"
+CFG
+\$SUDO chmod 644 "\$CONFIG_FILE"
+\$SUDO chown root:root "\$CONFIG_FILE"
+
+# install_agent restarts the service before this file is written, so
+# restart it again to run with the new settings.
+if command -v systemctl &>/dev/null; then
+  \$SUDO systemctl restart $SERVICE_NAME
+elif command -v service &>/dev/null; then
+  \$SUDO service $SERVICE_NAME restart
+elif command -v initctl &>/dev/null; then
+  \$SUDO initctl restart $SERVICE_NAME || \$SUDO initctl start $SERVICE_NAME
+fi
+EOF
+}
+
+# Print the service state reported by systemctl, service or initctl.
+# Arguments: $1 - host IP, $2 - SSH user, $3 - SSH password
+# Returns:   the exit status of the remote check
+verify_agent() {
+  local ip="$1" user="$2" pass="$3"
+  echo "🔍 $ip 状态检查:"
+  with_pass "$pass" ssh "${SSH_OPTS[@]}" "$user@$ip" "
+    systemctl is-active $SERVICE_NAME 2>/dev/null || \
+    service $SERVICE_NAME status || \
+    initctl status $SERVICE_NAME
+  "
+}
+
+####################################
+# Concurrency control and main loop
+####################################
+
+# Block until fewer than MAX_PARALLEL background jobs are running.
+sem(){
+  while [[ $(jobs -r | wc -l) -ge $MAX_PARALLEL ]]; do
+    sleep 0.5
+  done
+}
+
+# "|| [[ -n $ip ]]" keeps a last line that has no trailing newline.
+while read -r ip user pass || [[ -n "$ip" ]]; do
+  # Skip blank lines and # comments in the host list.
+  [[ -z "$ip" || "$ip" == \#* ]] && continue
+
+  if [[ -n "$LIMIT" ]]; then
+    skip=true
+    for lim_ip in "${LIMIT_IPS[@]}"; do
+      [[ "$ip" == "$lim_ip" ]] && skip=false
+    done
+    $skip && continue
+  fi
+
+  sem
+  worker "$ip" "$user" "$pass" &
+done < "$IP_LIST"
+
+wait
+
+TOTAL_SUCCESS=$(wc -l < "$SUCCESS_FILE")
+TOTAL_FAIL=$(wc -l < "$FAILED_FILE")
+
+echo "🎯 全部任务执行完成: 成功 $TOTAL_SUCCESS 台,失败 $TOTAL_FAIL 台"
+if [[ -s "$FAILED_FILE" ]]; then
+  echo "❗ 失败主机列表已保存: $FAILED_FILE"
+fi
diff --git a/scripts/install-single-gpu-k8s.sh b/scripts/install-single-gpu-k8s.sh
new file mode 100644
index 0000000..8da3e45
--- /dev/null
+++ b/scripts/install-single-gpu-k8s.sh
@@ -0,0 +1,246 @@
+#!/bin/bash
+#
+# Set up a GPU-enabled Kubernetes node on this machine: containerd + nerdctl,
+# the NVIDIA driver and container toolkit, a Sealos-deployed cluster with
+# Cilium and Helm, and the NVIDIA device plugin.
+# Export USE_PROXY=true to send downloads through PROXY_ADDR.
+set -euo pipefail
+
+# === Global variables ===
+MASTER_IP=$(hostname -I | awk '{print $1}')
+USER=${USER:-$(whoami)}
+SSH_KEY="${HOME}/.ssh/id_rsa"
+K8S_VERSION="labring/kubernetes:v1.25.16"
+CILIUM_VERSION="labring/cilium:v1.13.4"
+HELM_VERSION="labring/helm:v3.9.4"
+NVIDIA_DRIVER_VERSION="nvidia-driver-535"
+NVIDIA_PLUGIN_VERSION="v0.17.1"
+NERDCTL_VERSION="2.1.2"
+PROXY_ADDR="http://127.0.0.1:1081"
+USE_PROXY=${USE_PROXY:-false}
+
+# === Optional proxy ===
+# Export or clear the proxy environment variables according to USE_PROXY.
+configure_proxy() {
+  if [ "$USE_PROXY" = true ]; then
+    export http_proxy=$PROXY_ADDR
+    export https_proxy=$PROXY_ADDR
+    export HTTP_PROXY=$PROXY_ADDR
+    export HTTPS_PROXY=$PROXY_ADDR
+    echo "🌐 代理已启用: $PROXY_ADDR"
+  else
+    unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY
+    echo "🌐 代理已关闭"
+  fi
+}
+
+# Run curl with "$@", adding --proxy PROXY_ADDR when USE_PROXY is true.
+proxy_curl() {
+  if [ "$USE_PROXY" = true ]; then
+    curl --proxy "$PROXY_ADDR" "$@"
+  else
+    curl "$@"
+  fi
+}
+
+# Install the apt packages the later steps rely on.
+install_base() {
+  echo "[1/8] 安装基础依赖"
+  sudo apt-get update -y
+  sudo apt-get install -y curl gnupg2 ca-certificates lsb-release \
+    apt-transport-https software-properties-common openssh-client \
+    openssh-server uidmap
+}
+
+# Install containerd from apt, unpack the nerdctl "full" bundle into
+# /usr/local and write containerd's default configuration.
+install_containerd() {
+  local tmpdir archive url
+
+  echo "[2/8] 安装 containerd + nerdctl"
+  sudo apt-get purge -y docker.io docker-ce docker-ce-cli containerd.io || true
+  sudo apt-get install -y containerd
+
+  tmpdir=$(mktemp -d)
+  archive="nerdctl-full-${NERDCTL_VERSION}-linux-amd64.tar.gz"
+  url="https://github.com/containerd/nerdctl/releases/download/v${NERDCTL_VERSION}/${archive}"
+  echo "🔽 下载 nerdctl: $url"
+  proxy_curl -fLo "${tmpdir}/${archive}" "$url"
+
+  echo "📆 解压 nerdctl 到 /usr/local"
+  sudo tar -xzf "${tmpdir}/${archive}" -C /usr/local
+  # The downloaded archive is not needed once it has been unpacked.
+  rm -rf -- "${tmpdir}"
+
+  sudo mkdir -p /etc/containerd
+  sudo containerd config default | sudo tee /etc/containerd/config.toml > /dev/null
+  sudo systemctl enable --now containerd
+  if nerdctl --version; then
+    echo "✅ nerdctl 安装成功"
+  else
+    echo "❌ nerdctl 安装失败"
+  fi
+}
+
+# Add the NVIDIA apt repository, install the driver and container toolkit,
+# make nvidia the default containerd runtime and check nvidia-smi.
+install_nvidia() {
+  local distribution="ubuntu22.04"
+  local keyring="/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg"
+
+  echo "[3/8] 安装 NVIDIA 驱动和容器工具"
+  # --batch --yes lets gpg overwrite a keyring left by an earlier run instead
+  # of stopping to ask for confirmation.
+  proxy_curl -sL https://nvidia.github.io/nvidia-docker/gpgkey | \
+    sudo gpg --batch --yes --dearmor -o "${keyring}"
+
+  proxy_curl -sL "https://nvidia.github.io/nvidia-docker/${distribution}/nvidia-docker.list" | \
+    sed "s|^deb |deb [signed-by=${keyring}] |" | \
+    sudo tee /etc/apt/sources.list.d/nvidia-docker.list
+
+  sudo apt-get update -y
+  sudo apt-get install -y "${NVIDIA_DRIVER_VERSION}" nvidia-container-toolkit
+  sudo nvidia-ctk runtime configure --runtime=containerd --set-as-default
+  sudo systemctl restart containerd
+
+  if ! command -v nvidia-smi >/dev/null; then echo "❌ nvidia-smi 未找到"; exit 1; fi
+  nvidia-smi || { echo "❌ NVIDIA 驱动有问题"; exit 1; }
+}
+
+# Install Sealos with its upstream install script unless it is already on PATH.
+install_sealos() {
+  echo "[4/8] 安装 Sealos"
+  if ! command -v sealos &>/dev/null; then
+    proxy_curl -sfL https://raw.githubusercontent.com/labring/sealos/main/scripts/install.sh | bash
+  fi
+}
+
+# Create the SSH key if needed, authorise it for the current user and enable
+# the SSH daemon.
+setup_ssh() {
+  local pub_key
+
+  echo "[5/8] 配置 SSH 免密"
+  # Ensure ~/.ssh exists before anything is written into it.
+  mkdir -p ~/.ssh
+  chmod 700 ~/.ssh
+  if [ ! -f "${SSH_KEY}" ]; then
+    ssh-keygen -f "${SSH_KEY}" -N ""
+  fi
+  # Append the key only when it is missing, so reruns add no duplicates.
+  pub_key=$(cat "${SSH_KEY}.pub")
+  touch ~/.ssh/authorized_keys
+  if ! grep -qxF -- "${pub_key}" ~/.ssh/authorized_keys; then
+    printf '%s\n' "${pub_key}" >> ~/.ssh/authorized_keys
+  fi
+  chmod 600 ~/.ssh/authorized_keys
+  sudo systemctl enable --now ssh || sudo systemctl enable --now sshd
+}
+
+# Deploy Kubernetes, Cilium and Helm with Sealos on MASTER_IP, then switch
+# from the sealos-containerd unit to the system containerd unit.
+deploy_k8s() {
+  echo "[6/8] 使用 Sealos 部署 K8s"
+  sealos run "${K8S_VERSION}" "${CILIUM_VERSION}" "${HELM_VERSION}" \
+    --masters "${MASTER_IP}" --user "${USER}" --pk "${SSH_KEY}" \
+    --env '{}' --cmd "kubeadm init --skip-phases=addon/kube-proxy"
+
+  echo "[6.1] 禁用 sealos containerd, 启用系统 containerd"
+  sudo systemctl disable --now sealos-containerd || true
+  sudo systemctl enable --now containerd
+  sleep 3
+  sudo systemctl status containerd --no-pager | grep Active
+}
+
+# Apply the NVIDIA device plugin manifest and show the related pods and node
+# capacity.
+deploy_plugin() {
+  local plugin_url
+
+  echo "[7/8] 部署 NVIDIA Device Plugin"
+  plugin_url="https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/${NVIDIA_PLUGIN_VERSION}/deployments/static/nvidia-device-plugin.yml"
+  if [ "$USE_PROXY" = true ]; then
+    HTTPS_PROXY=$PROXY_ADDR HTTP_PROXY=$PROXY_ADDR \
+      kubectl apply -f "$plugin_url"
+  else
+    kubectl apply -f "$plugin_url"
+  fi
+  sleep 15
+  kubectl -n kube-system get pods | grep nvidia || echo "⚠️ 插件未启动"
+  # NOTE(review): with set -e and pipefail this line ends the script when
+  # no NVIDIA entry shows up in the node capacity - confirm that is intended.
+  kubectl describe node | grep -A10 Capacity | grep -i nvidia
+}
+
+run_test() {
+  echo "[8/8] 运行 CUDA vectoradd GPU 测试"
+  kubectl apply -f - <