From ad0960d036c1c1e8d4a42d700117e32e605614d4 Mon Sep 17 00:00:00 2001
From: shenlan
Date: Tue, 1 Jul 2025 11:47:11 +0800
Subject: [PATCH] Add roles for GPU operator and monitoring charts

---
Review notes (this section is not part of the commit message):
 - gpu-operator: the nodeSelector label key is now escaped and set with
   --set-string, consistent with the other scripts in this patch.
 - setup.sh scripts: fail fast (set -euo pipefail), require the node
   hostname argument where one is used, and quote it.
 - prometheus: node selectors for the pushgateway and kube-state-metrics
   sub-charts use the sub-chart value keys.
 - helm-repos: repo registration no longer swallows errors.
 - The blob ids on the "index" lines below were not regenerated after these
   changes; re-run git format-patch if exact ids matter.

 playbooks/roles/charts/gpu-operator/files/setup.sh | 17 +++++++++++++++++
 playbooks/roles/charts/gpu-operator/tasks/main.yml |  4 ++++
 playbooks/roles/charts/helm-repos/tasks/main.yml   | 12 ++++++++++++
 .../charts/kubernetes-dashboard/files/setup.sh     | 18 ++++++++++++++++++
 .../charts/kubernetes-dashboard/tasks/main.yml     |  4 ++++
 .../roles/charts/metrics-server/files/setup.sh     | 11 +++++++++++
 .../roles/charts/metrics-server/tasks/main.yml     |  4 ++++
 playbooks/roles/charts/prometheus/files/setup.sh   | 22 ++++++++++++++++++++++
 playbooks/roles/charts/prometheus/tasks/main.yml   |  4 ++++
 9 files changed, 96 insertions(+)
 create mode 100755 playbooks/roles/charts/gpu-operator/files/setup.sh
 create mode 100644 playbooks/roles/charts/gpu-operator/tasks/main.yml
 create mode 100644 playbooks/roles/charts/helm-repos/tasks/main.yml
 create mode 100755 playbooks/roles/charts/kubernetes-dashboard/files/setup.sh
 create mode 100644 playbooks/roles/charts/kubernetes-dashboard/tasks/main.yml
 create mode 100755 playbooks/roles/charts/metrics-server/files/setup.sh
 create mode 100644 playbooks/roles/charts/metrics-server/tasks/main.yml
 create mode 100755 playbooks/roles/charts/prometheus/files/setup.sh
 create mode 100644 playbooks/roles/charts/prometheus/tasks/main.yml

diff --git a/playbooks/roles/charts/gpu-operator/files/setup.sh b/playbooks/roles/charts/gpu-operator/files/setup.sh
new file mode 100755
index 0000000..302d75c
--- /dev/null
+++ b/playbooks/roles/charts/gpu-operator/files/setup.sh
@@ -0,0 +1,17 @@
+#!/bin/bash
+# Install or upgrade the NVIDIA GPU Operator release in the gpu-operator namespace.
+# Takes no arguments; the chart is pulled from the "nvidia" Helm repo.
+set -euo pipefail
+
+# The dots in the label key are escaped so Helm treats "kubernetes.io/gpu" as
+# a single key, and --set-string keeps the label value a string.
+helm upgrade --install gpu-operator nvidia/gpu-operator \
+  --namespace gpu-operator \
+  --create-namespace \
+  --set-string 'nodeSelector.kubernetes\.io/gpu=true' \
+  --set driver.enabled=true \
+  --set toolkit.enabled=true \
+  --set devicePlugin.enabled=true \
+  --set operator.runtimeClass="nvidia-container-runtime" \
+  --set operator.defaultRuntime=containerd \
+  --set containerRuntime.socketPath=/var/snap/microk8s/common/run/containerd.sock
diff --git a/playbooks/roles/charts/gpu-operator/tasks/main.yml b/playbooks/roles/charts/gpu-operator/tasks/main.yml
new file mode 100644
index 0000000..e98dc40
--- /dev/null
+++ b/playbooks/roles/charts/gpu-operator/tasks/main.yml
@@ -0,0 +1,4 @@
+# Runs the role's setup.sh, only on the host where is_primary is true.
+- name: Install GPU Operator
+  script: files/setup.sh
+  when: is_primary | bool
diff --git a/playbooks/roles/charts/helm-repos/tasks/main.yml b/playbooks/roles/charts/helm-repos/tasks/main.yml
new file mode 100644
index 0000000..3b02e75
--- /dev/null
+++ b/playbooks/roles/charts/helm-repos/tasks/main.yml
@@ -0,0 +1,12 @@
+# Register the Helm repositories used by the chart roles and refresh their indexes.
+# Runs only on the host where is_primary is true.
+- name: Enable community plugins and third-party helm charts
+  shell: |
+    set -e
+    # --force-update lets re-adding an existing repo succeed, so real failures are not masked.
+    helm repo add kubernetes-dashboard https://kubernetes.github.io/dashboard/ --force-update
+    helm repo add nvidia https://helm.ngc.nvidia.com/nvidia --force-update
+    helm repo add prometheus-community https://prometheus-community.github.io/helm-charts --force-update
+    helm repo add metrics-server https://kubernetes-sigs.github.io/metrics-server/ --force-update
+    helm repo update
+  when: is_primary | bool
diff --git a/playbooks/roles/charts/kubernetes-dashboard/files/setup.sh b/playbooks/roles/charts/kubernetes-dashboard/files/setup.sh
new file mode 100755
index 0000000..e29a439
--- /dev/null
+++ b/playbooks/roles/charts/kubernetes-dashboard/files/setup.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+# Install or upgrade the Kubernetes Dashboard release, pinning its components
+# to a single node via the kubernetes.io/hostname label.
+# Usage: setup.sh NODE_HOSTNAME
+set -euo pipefail
+
+node="${1:?usage: $0 NODE_HOSTNAME}"
+
+helm upgrade --install kubernetes-dashboard kubernetes-dashboard/kubernetes-dashboard \
+  --create-namespace \
+  --namespace kubernetes-dashboard \
+  --set app.scheduling.nodeSelector."kubernetes\.io/hostname"="$node" \
+  --set auth.nodeSelector."kubernetes\.io/hostname"="$node" \
+  --set api.nodeSelector."kubernetes\.io/hostname"="$node" \
+  --set web.nodeSelector."kubernetes\.io/hostname"="$node" \
+  --set metricsScraper.nodeSelector."kubernetes\.io/hostname"="$node" \
+  --set kong.nodeSelector."kubernetes\.io/hostname"="$node" \
+  --set persistence.enabled=false
diff --git a/playbooks/roles/charts/kubernetes-dashboard/tasks/main.yml b/playbooks/roles/charts/kubernetes-dashboard/tasks/main.yml
new file mode 100644
index 0000000..0dcaee4
--- /dev/null
+++ b/playbooks/roles/charts/kubernetes-dashboard/tasks/main.yml
@@ -0,0 +1,4 @@
+# Runs the role's setup.sh with the inventory hostname as the node to pin to.
+- name: Install kubernetes dashboard
+  script: files/setup.sh {{ inventory_hostname }}
+  when: is_primary | bool
diff --git a/playbooks/roles/charts/metrics-server/files/setup.sh b/playbooks/roles/charts/metrics-server/files/setup.sh
new file mode 100755
index 0000000..36f59e4
--- /dev/null
+++ b/playbooks/roles/charts/metrics-server/files/setup.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+# Install or upgrade the metrics-server release in kube-system, pinned to one node.
+# Usage: setup.sh NODE_HOSTNAME
+set -euo pipefail
+
+node="${1:?usage: $0 NODE_HOSTNAME}"
+
+helm upgrade --install metrics-server metrics-server/metrics-server \
+  --namespace kube-system \
+  --set nodeSelector."kubernetes\.io/hostname"="$node" \
+  --set persistence.enabled=false
diff --git a/playbooks/roles/charts/metrics-server/tasks/main.yml b/playbooks/roles/charts/metrics-server/tasks/main.yml
new file mode 100644
index 0000000..5c293d4
--- /dev/null
+++ b/playbooks/roles/charts/metrics-server/tasks/main.yml
@@ -0,0 +1,4 @@
+# Runs the role's setup.sh with the inventory hostname as the node to pin to.
+- name: Install metrics server
+  script: files/setup.sh {{ inventory_hostname }}
+  when: is_primary | bool
diff --git a/playbooks/roles/charts/prometheus/files/setup.sh b/playbooks/roles/charts/prometheus/files/setup.sh
new file mode 100755
index 0000000..e07702b
--- /dev/null
+++ b/playbooks/roles/charts/prometheus/files/setup.sh
@@ -0,0 +1,22 @@
+#!/bin/bash
+# Install or upgrade the Prometheus release in the chutes namespace with
+# persistent volumes disabled and its components pinned to one node.
+# Usage: setup.sh NODE_HOSTNAME
+set -euo pipefail
+
+node="${1:?usage: $0 NODE_HOSTNAME}"
+
+# Sub-chart values are addressed by sub-chart name (prometheus-pushgateway,
+# kube-state-metrics), matching the persistentVolume setting below.
+helm upgrade --install prometheus prometheus-community/prometheus \
+  --namespace chutes \
+  --create-namespace \
+  --set server.persistentVolume.enabled=false \
+  --set alertmanager.persistentVolume.enabled=false \
+  --set prometheus-pushgateway.persistentVolume.enabled=false \
+  --set prometheus-server.persistentVolume.enabled=false \
+  --set alertmanager.persistence.enabled=false \
+  --set server.nodeSelector."kubernetes\.io/hostname"="$node" \
+  --set alertmanager.nodeSelector."kubernetes\.io/hostname"="$node" \
+  --set prometheus-pushgateway.nodeSelector."kubernetes\.io/hostname"="$node" \
+  --set kube-state-metrics.nodeSelector."kubernetes\.io/hostname"="$node"
diff --git a/playbooks/roles/charts/prometheus/tasks/main.yml b/playbooks/roles/charts/prometheus/tasks/main.yml
new file mode 100644
index 0000000..4f97501
--- /dev/null
+++ b/playbooks/roles/charts/prometheus/tasks/main.yml
@@ -0,0 +1,4 @@
+# Runs the role's setup.sh with the inventory hostname as the node to pin to.
+- name: Install Prometheus
+  script: files/setup.sh {{ inventory_hostname }}
+  when: is_primary | bool