Compare commits

..

No commits in common. "main" and "release/v0.1" have entirely different histories.

312 changed files with 17243 additions and 2392 deletions

136
.github/workflows/bootstrap-env.yaml vendored Normal file
View File

@ -0,0 +1,136 @@
# Bootstrap Environment Orchestrator
#
# Manually triggered pipeline that bootstraps one environment in three ordered
# jobs: two cross-repository preflight dispatches, then Ansible provisioning
# from the playbooks and inventories in this repository.
name: Bootstrap Environment Orchestrator

on:
  workflow_dispatch:
    inputs:
      env:
        description: "Environment lifecycle (dev / staging / prod)"
        required: true
        type: string
      workspace:
        description: "Workspace / region / cluster (e.g. cn-shanghai)"
        required: true
        type: string
      identity_playbook:
        description: "Identity service deployment playbook"
        required: false
        default: "deploy_zitadel_docker.yaml"
        type: choice
        options:
          - deploy_zitadel_docker.yaml
          - deploy_keycloak_docker.yaml
          - skip

# The jobs only need to read this repository; cross-repository calls use the
# dedicated dispatch token, not GITHUB_TOKEN.
permissions:
  contents: read

jobs:
  # =================================================
  # Step 1: Preflight - Infrastructure Readiness
  # =================================================
  # NOTE(review): repository-dispatch only sends the event; this job does not
  # observe the result of the remote check, so `needs:` below orders the
  # dispatches rather than gating on readiness - confirm this is intended.
  preflight-infra:
    name: Preflight - Infrastructure Readiness
    runs-on: ubuntu-latest
    steps:
      - name: Dispatch infrastructure readiness check
        uses: peter-evans/repository-dispatch@v4
        with:
          # The token is passed only to the steps that need it instead of
          # being exported as a workflow-wide environment variable.
          token: ${{ secrets.CROSS_REPO_DISPATCH_TOKEN }}
          repository: cloud-neutral-toolkit/Modern-Container-Application-Reference-Architecture
          event-type: bootstrap.preflight.infra
          # toJSON() emits a correctly escaped JSON string, so quotes or
          # backslashes in the inputs cannot break or alter the payload.
          client-payload: |
            {
              "env": ${{ toJSON(inputs.env) }},
              "workspace": ${{ toJSON(inputs.workspace) }}
            }

  # =================================================
  # Step 2: Preflight - Artifact / Image Check
  # =================================================
  preflight-artifacts:
    name: Preflight - Artifact & Image Check
    needs: preflight-infra
    runs-on: ubuntu-latest
    steps:
      - name: Dispatch artifact validation
        uses: peter-evans/repository-dispatch@v4
        with:
          token: ${{ secrets.CROSS_REPO_DISPATCH_TOKEN }}
          repository: cloud-neutral-toolkit/XControl
          event-type: bootstrap.preflight.artifacts
          client-payload: |
            {
              "env": ${{ toJSON(inputs.env) }},
              "workspace": ${{ toJSON(inputs.workspace) }}
            }

  # =================================================
  # Step 3: Provision - Runtime & Core Services
  # =================================================
  provision-runtime:
    name: Provision - Runtime & Core Services
    needs: preflight-artifacts
    runs-on: ubuntu-latest
    # Inputs are handed to the shell through environment variables rather than
    # being interpolated into the scripts.
    env:
      ENV: ${{ inputs.env }}
      WORKSPACE: ${{ inputs.workspace }}
      IDENTITY_PLAYBOOK: ${{ inputs.identity_playbook }}
    steps:
      - name: Checkout deployment repository
        uses: actions/checkout@v4

      - name: Install Ansible
        run: |
          sudo apt-get update
          sudo apt-get install -y ansible

      # NOTE(review): the three provisioning steps below pass `-D -C`
      # (--diff --check), so Ansible reports what would change without
      # applying it. Drop `-C` if the workflow is meant to provision for real.

      # -----------------------------
      # DNS
      # -----------------------------
      - name: Register DNS Records
        run: |
          ansible-playbook \
            -i "inventory/${ENV}/${WORKSPACE}/hosts.ini" \
            playbooks/alicloud_dns_record.yml \
            --extra-vars "env=${ENV} workspace=${WORKSPACE}" \
            -D -C

      # -----------------------------
      # Runtime / Base Layer
      # -----------------------------
      - name: Provision Runtime (Docker / Base Services)
        run: |
          ansible-playbook \
            -i "inventory/${ENV}/${WORKSPACE}/hosts.ini" \
            playbooks/setup-docker.yml \
            --extra-vars "env=${ENV} workspace=${WORKSPACE}" \
            -D -C

      # -----------------------------
      # Identity (pluggable)
      # -----------------------------
      # Skipped entirely when the operator selects the `skip` option.
      - name: Deploy or Update Identity Service
        if: ${{ env.IDENTITY_PLAYBOOK != 'skip' }}
        run: |
          ansible-playbook \
            -i "inventory/${ENV}/${WORKSPACE}/hosts.ini" \
            "playbooks/${IDENTITY_PLAYBOOK}" \
            --extra-vars "env=${ENV} workspace=${WORKSPACE}" \
            -D -C

      # -----------------------------
      # Post-check
      # -----------------------------
      - name: Post-Provision Status Check
        run: |
          ansible-playbook \
            -i "inventory/${ENV}/${WORKSPACE}/hosts.ini" \
            playbooks/check-runtime-status.yml \
            --extra-vars "env=${ENV} workspace=${WORKSPACE}"

4
.gitignore vendored
View File

@ -7,7 +7,3 @@ playbooks/deepflow/*.tar.gz
playbooks/deepflow/deepflow-agent-playbook/*.zip
remotes.before.txt
# Python
__pycache__/
*.pyc

106
README.md
View File

@ -1,18 +1,100 @@
# Cloud-Neutral Toolkit GitOps
# ansible-playbook
This repository is the GitOps declaration layer for the Cloud-Neutral Toolkit.
This repository contains a collection of Ansible playbooks and roles for various infrastructure setups and service management tasks.
## Scope
For a quick overview of the directory layout see [docs/repo-structure.md](docs/repo-structure.md).
Additional documentation is stored under the `docs/` folder.
- Store declarative Kubernetes resources, Flux Kustomizations, and non-sensitive multi-environment values.
- Keep application charts and Helm templates in the dedicated chart repository.
- Keep imperative automation such as Ansible playbooks and inventories out of this repository.
## Playbook 角色说明
## Layout
1. `playbooks/roles/docker`:适用于简单的单机环境部署,主要使用 Docker 和 Docker Compose 进行容器化管理。
2. `playbooks/roles/charts`:面向大规模的 Kubernetes 集群,使用 Helm 和标准化 Chart 部署模式进行高可用和可扩展的管理。
3. `playbooks/roles/vhosts`:传统的非容器化部署方式,通常涉及手动配置服务器和虚拟主机,适用于不使用容器的应用场景。
- `infra/`: platform, infrastructure, and shared service declarations
- `apps/`: application release declarations
- `clusters/`: cluster-level overlays and entrypoints
- `docs/`: repository conventions and operational documentation
For a quick structure overview, see [docs/repo-structure.md](docs/repo-structure.md).
## Role Summary
| Role Name | Description | Docker | Charts | VHosts | CICD | Validate | Last Update |
|-------------------------|-------------------------------------------------------|--------|--------|--------|--------|--------|------------|
| `common` | 通用角色,包含一些常用的功能,如日志记录、监控等。 | | | ✔ | | yes | 2025-02-14 |
| `keycloak` | 用于管理身份认证和授权服务。 | ✔ | | | github | yes | 2024-11-10 |
| `harbor` | 容器镜像仓库角色,用于存储和管理容器镜像。 | ✔ | | | github | yes | 2024-11-14 |
| `app` | 参考模板。 | | | | | | |
| `nginx` | 用于设置 Nginx | | ✔ | ✔ | | | |
| `grafana` | 用于设置 Grafana | | ✔ | ✔ | | | |
| `grafana-loki` | 用于设置 Grafana-loki | | ✔ | ✔ | | | |
| `Grafana-tempo` | 用于设置 Grafana-tempo | | ✔ | ✔ | | | |
| `prometheus` | 用于设置 Prometheus | | ✔ | ✔ | | | |
| `prometheus-transfer` | 用于 Prometheus 数据传输设置。 | | | ✔ | | | |
| `vector` | 用于配置日志收集代理。 | | | ✔ | | | |
| `node-exporter` | 用于导出系统和硬件的监控数据。 | | ✔ | | | | |
| `observability-agent` | 用于管理 Observability 代理。 | | ✔ | ✔ | | | |
| `observability-server` | 用于设置 Observability 服务端。 | | ✔ | ✔ | | | |
| `wireguard-client` | 用于设置 WireGuard 客户端。 | | | ✔ | | | |
| `wireguard-gateway` | 用于设置 WireGuard 网关。 | | | ✔ | | | |
| `vault` | 用于管理敏感数据和密钥。 | | | ✔ | | | |
| `postgresql` | PostgreSQL 数据库角色,用于提供 PostgreSQL 数据库服务。 | | ✔ | | | | |
| `redis` | Redis 数据库角色,用于提供 Redis 数据库服务。 | | ✔ | | | | |
| `chartmuseum` | 图表仓库角色,用于存储和管理 Kubernetes 图表。 | | ✔ | | | | |
| `gitlab` | 代码仓库角色,用于存储和管理代码。 | | ✔ | | | | |
| `mysql` | MySQL 数据库角色,用于提供 MySQL 数据库服务。 | | ✔ | | | | |
| `argo-server` | 用于设置和管理 Argo Server。 | | ✔ | | | | |
| `deepflow` | 用于流量监控与网络性能分析的 DeepFlow 服务。 | | ✔ | | | | |
| `jenkins` | Jenkins 自动化构建工具角色,用于 CI/CD 管道。 | | ✔ | | | | |
| `chaos-mesh` | 用于 Chaos Engineering 测试的 Chaos Mesh 角色。 | | ✔ | | | | |
| `flagger-loadtester` | 用于负载测试的 Flagger Loadtester 角色。 | | ✔ | | | | |
| `splunk-otel-collector` | 用于配置 Splunk OpenTelemetry Collector。 | | ✔ | | | | |
| `openldap` | 用于设置和管理 OpenLDAP 身份认证服务。 | | ✔ | | | | |
| `alerting` | 用于设置和管理警报系统。 | | | ✔ | | | |
| `k3s` | 用于创建 Kubernetes 集群。 | | | ✔ | | | |
| `k3s-reset` | 用于重置 Kubernetes 集群。 | | | ✔ | | | |
| `k3s-addon` | 用于安装 Kubernetes 集群插件。 | | | ✔ | | | |
| `secret-manger` | 密钥管理角色,用于管理密钥。 | | | ✔ | | | |
| `cert-manager` | 证书管理角色,用于管理证书。 | | | ✔ | | | |
| `ssh-trust` | 配置 ops 主机与节点的 SSH 互信。 | | | ✔ | | | |
表格说明:
- Docker:是否属于 Docker 角色。
- Charts:是否属于 Helm Chart 角色。
- VHosts:是否属于虚拟主机管理相关角色。
- CICD:是否启用 CICD 管道,标明是否集成了自动化流程。
- Validate:是否经过验证测试。
- Last Update:最后更新时间。
## Usage Examples
- Linux OS Setup
ansible-playbook -i inventory/hosts/all playbooks/common -D -C
ansible-playbook -i inventory/hosts/all playbooks/common -D
- Gather Network Information
ansible-playbook -i inventory gather_network_info.yml -e target_group=master
- Display network information on all nodes
ansible -i inventory all -m script -a 'roles/network_info/tasks/files/display_network_info.sh'
- Deploy Keycloak Server
ansible-playbook -i inventory/hosts/core playbooks/keycloak_server -D
- Set up WireGuard Gateway
ansible-playbook -i inventory/hosts/vpn playbooks/wireguard_gateway.yaml -D
- Set up Grafana Alloy
ansible-playbook -i inventory/k3s-cluster playbooks/init_grafana_alloy -D -C -l cn-k3s-server.svc.plus -e @playbooks/roles/alloy/files/loki_journal_sources_k3s_server.yml -e "ansible_become_pass='xxxx'"
- Setup VPN gateway
ansible-playbook -i inventory/hosts/all playbooks/common -l gateway -D
## Documentation
- [docs/gpu-k8s-role.md](docs/gpu-k8s-role.md) - How to run the GPU-enabled Kubernetes role.
- [docs/repo-structure.md](docs/repo-structure.md) - Overview of repository layout.

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,332 @@
{
"__inputs": [
{
"name": "DS_LOKI",
"label": "Loki",
"description": "",
"type": "datasource",
"pluginId": "loki",
"pluginName": "Loki"
}
],
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": "-- Grafana --",
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"target": {
"limit": 100,
"matchAny": false,
"tags": [],
"type": "dashboard"
},
"type": "dashboard"
},
{
"datasource": {
"type": "datasource",
"uid": "grafana"
},
"enable": true,
"iconColor": "red",
"name": "flux events",
"target": {
"limit": 100,
"matchAny": false,
"tags": [
"flux"
],
"type": "tags"
}
}
]
},
"description": "Flux logs collected from Kubernetes, stored in Loki",
"editable": true,
"gnetId": null,
"graphTooltip": 0,
"id": 29,
"iteration": 1653748775696,
"links": [],
"liveNow": false,
"panels": [
{
"datasource": "${DS_LOKI}",
"description": "",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "bars",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 4,
"w": 24,
"x": 0,
"y": 0
},
"id": 4,
"options": {
"legend": {
"calcs": [],
"displayMode": "hidden",
"placement": "bottom"
},
"tooltip": {
"mode": "single",
"sort": "none"
}
},
"targets": [
{
"datasource": "${DS_LOKI}",
"expr": "sum(count_over_time({namespace=~\"$namespace\", stream=~\"$stream\", app =~\"$controller\"} | json | __error__!=\"JSONParserErr\" | level=~\"$level\" |= \"$query\" [$__interval]))",
"instant": false,
"legendFormat": "Log count",
"range": true,
"refId": "A"
}
],
"type": "timeseries"
},
{
"datasource": "${DS_LOKI}",
"description": "Logs from services running in Kubernetes",
"gridPos": {
"h": 25,
"w": 24,
"x": 0,
"y": 4
},
"id": 2,
"options": {
"dedupStrategy": "numbers",
"enableLogDetails": false,
"prettifyLogMessage": true,
"showCommonLabels": false,
"showLabels": false,
"showTime": false,
"sortOrder": "Descending",
"wrapLogMessage": false
},
"targets": [
{
"datasource": "${DS_LOKI}",
"expr": "{namespace=~\"$namespace\", stream=~\"$stream\", app =~\"$controller\"} | json | __error__!=\"JSONParserErr\" | level=~\"$level\" |= \"$query\"",
"refId": "A"
}
],
"type": "logs"
}
],
"refresh": "10s",
"schemaVersion": 36,
"style": "light",
"tags": [
"flux"
],
"templating": {
"list": [
{
"current": {
"selected": false,
"text": "",
"value": ""
},
"description": "String to search for",
"hide": 0,
"label": "Search Query",
"name": "query",
"options": [
{
"selected": true,
"text": "",
"value": ""
}
],
"query": "",
"skipUrlSync": false,
"type": "textbox"
},
{
"allValue": "info|error",
"current": {
"selected": false,
"text": "All",
"value": "$__all"
},
"hide": 0,
"includeAll": true,
"multi": false,
"name": "level",
"options": [
{
"selected": true,
"text": "All",
"value": "$__all"
},
{
"selected": false,
"text": "info",
"value": "info"
},
{
"selected": false,
"text": "error",
"value": "error"
}
],
"query": "info,error",
"queryValue": "",
"skipUrlSync": false,
"type": "custom"
},
{
"allValue": ".+",
"current": {
"selected": true,
"text": [
"All"
],
"value": [
"$__all"
]
},
"datasource": "${DS_LOKI}",
"definition": "label_values(app)",
"hide": 0,
"includeAll": true,
"multi": true,
"name": "controller",
"options": [],
"query": "label_values(app)",
"refresh": 1,
"regex": "",
"skipUrlSync": false,
"sort": 0,
"type": "query"
},
{
"allValue": ".+",
"current": {
"selected": true,
"text": [
"flux-system"
],
"value": [
"flux-system"
]
},
"datasource": "${DS_LOKI}",
"definition": "label_values(namespace)",
"hide": 0,
"includeAll": true,
"multi": true,
"name": "namespace",
"options": [],
"query": "label_values(namespace)",
"refresh": 1,
"regex": "",
"skipUrlSync": false,
"sort": 0,
"type": "query"
},
{
"allValue": ".+",
"current": {
"selected": false,
"text": "All",
"value": "$__all"
},
"datasource": "${DS_LOKI}",
"definition": "label_values(stream)",
"hide": 0,
"includeAll": true,
"multi": true,
"name": "stream",
"options": [],
"query": "label_values(stream)",
"refresh": 1,
"regex": "",
"skipUrlSync": false,
"sort": 0,
"type": "query"
},
{
"current": {
"selected": false,
"text": "Loki",
"value": "Loki"
},
"hide": 0,
"includeAll": false,
"label": "Datasource",
"multi": false,
"name": "DS_LOKI",
"options": [],
"query": "loki",
"refresh": 1,
"regex": "",
"skipUrlSync": false,
"type": "datasource"
}
]
},
"time": {
"from": "now-6h",
"to": "now"
},
"timepicker": {},
"timezone": "",
"title": "Flux Logs",
"uid": "flux-logs",
"version": 2
}

View File

@ -0,0 +1,16 @@
# Kustomization for the `monitoring` namespace: includes the PodMonitor and
# generates one ConfigMap holding the three dashboard JSON files.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: monitoring

resources:
  - podmonitor.yaml

configMapGenerator:
  - name: flux-grafana-dashboards
    files:
      - dashboards/control-plane.json
      - dashboards/cluster.json
      - dashboards/logs.json
    options:
      # Labels applied to the generated ConfigMap.
      labels:
        grafana_dashboard: "1"
        app.kubernetes.io/part-of: flux
        app.kubernetes.io/component: monitoring

View File

@ -1,5 +1,5 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: core-prod
resources:
- ../base
- namespace.yaml
- release.yaml

View File

@ -0,0 +1,6 @@
# Namespace for the demo-c application.
apiVersion: v1
kind: Namespace
metadata:
  name: demo-c
  labels:
    app.kubernetes.io/component: demo-c

View File

@ -0,0 +1,30 @@
---
# Helm chart source for the demo-c namespace.
apiVersion: source.toolkit.fluxcd.io/v1beta2
kind: HelmRepository
metadata:
  name: stable
  namespace: demo-c
spec:
  interval: 1m
  url: https://charts.onwalk.net/
---
# Installs chart `app` 0.1.1 from the `stable` repository above, pinned to a
# specific image tag.
apiVersion: helm.toolkit.fluxcd.io/v2beta1
kind: HelmRelease
metadata:
  name: cp-app
  namespace: demo-c
spec:
  interval: 1m
  chart:
    spec:
      chart: app
      version: "0.1.1"
      sourceRef:
        kind: HelmRepository
        name: stable
        namespace: demo-c
  values:
    image:
      repository: artifact.onwalk.net/base/scaffolding-design/c
      tag: "dee1c17b11822997e16e71244b1a1e98fe919688"
    ingress:
      className: "nginx"
ingress:
className: "nginx"

View File

@ -1,5 +1,5 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: database
resources:
- certificate.yaml
- namespace.yaml
- release.yaml

View File

@ -0,0 +1,6 @@
# Namespace for the demo-go application.
apiVersion: v1
kind: Namespace
metadata:
  name: demo-go
  labels:
    app.kubernetes.io/component: demo-go

View File

@ -0,0 +1,30 @@
---
# apps/go-demo/release.yaml
# Helm chart source for the demo-go namespace.
apiVersion: source.toolkit.fluxcd.io/v1beta2
kind: HelmRepository
metadata:
  name: stable
  namespace: demo-go
spec:
  interval: 1m
  url: https://charts.onwalk.net/
---
# Installs chart `app` 0.1.1 from the `stable` repository above, pinned to a
# specific image tag.
apiVersion: helm.toolkit.fluxcd.io/v2beta1
kind: HelmRelease
metadata:
  name: stable
  namespace: demo-go
spec:
  interval: 1m
  chart:
    spec:
      chart: app
      version: "0.1.1"
      sourceRef:
        kind: HelmRepository
        name: stable
        namespace: demo-go
  values:
    image:
      repository: artifact.onwalk.net/base/scaffolding-design/go
      tag: "fe2a0fba3014709b26d8acd75bacb661bf2522a4"
    ingress:
      className: "nginx"

View File

@ -0,0 +1,5 @@
# Kustomize entrypoint bundling the namespace and release manifests.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - namespace.yaml
  - release.yaml

View File

@ -0,0 +1,6 @@
# Namespace for the demo-js application.
apiVersion: v1
kind: Namespace
metadata:
  name: demo-js
  labels:
    # Component label matches the namespace name, as in the sibling demo-*
    # namespaces (previously the generic value `demo`).
    app.kubernetes.io/component: demo-js

View File

@ -0,0 +1,30 @@
---
# Helm chart source for the demo-js namespace.
apiVersion: source.toolkit.fluxcd.io/v1beta2
kind: HelmRepository
metadata:
  name: stable
  namespace: demo-js
spec:
  interval: 1m
  url: https://charts.onwalk.net/
---
# Installs chart `app` 0.1.1, pinned to a specific frontend image tag.
apiVersion: helm.toolkit.fluxcd.io/v2beta1
kind: HelmRelease
metadata:
  name: stable
  namespace: demo-js
spec:
  interval: 1m
  chart:
    spec:
      chart: app
      version: "0.1.1"
      sourceRef:
        kind: HelmRepository
        name: stable
        # Must point at the HelmRepository declared above in demo-js; the
        # previous value `demo-python` made this release depend on another
        # application's repository object.
        namespace: demo-js
  values:
    image:
      repository: artifact.onwalk.net/base/scaffolding-design/javascript-frontend
      tag: "fc998a6d433c45986dc7d51ab62bf7aa48613d62"
    ingress:
      className: "nginx"

View File

@ -0,0 +1,5 @@
# Kustomize entrypoint bundling the namespace and release manifests.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - namespace.yaml
  - release.yaml

View File

@ -0,0 +1,6 @@
# Namespace for the demo-python application.
apiVersion: v1
kind: Namespace
metadata:
  name: demo-python
  labels:
    app.kubernetes.io/component: demo-python

View File

@ -0,0 +1,30 @@
---
# Helm chart source for the demo-python namespace.
apiVersion: source.toolkit.fluxcd.io/v1beta2
kind: HelmRepository
metadata:
  name: stable
  namespace: demo-python
spec:
  interval: 1m
  url: https://charts.onwalk.net/
---
# Installs chart `app` 0.1.1 from the `stable` repository above, pinned to a
# specific image tag.
apiVersion: helm.toolkit.fluxcd.io/v2beta1
kind: HelmRelease
metadata:
  name: stable
  namespace: demo-python
spec:
  interval: 1m
  chart:
    spec:
      chart: app
      version: "0.1.1"
      sourceRef:
        kind: HelmRepository
        name: stable
        namespace: demo-python
  values:
    image:
      repository: artifact.onwalk.net/base/scaffolding-design/python
      tag: "d72ba38f7a3a76b71eb50f00fe46a94497e6ecaa"
    ingress:
      className: "nginx"

View File

@ -0,0 +1,5 @@
# Kustomize entrypoint bundling the namespace and release manifests.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - namespace.yaml
  - release.yaml

View File

@ -0,0 +1,6 @@
# Namespace for the demo-rust application.
apiVersion: v1
kind: Namespace
metadata:
  name: demo-rust
  labels:
    app.kubernetes.io/component: demo-rust

View File

@ -0,0 +1,30 @@
---
# Helm chart source for the demo-rust namespace.
apiVersion: source.toolkit.fluxcd.io/v1beta2
kind: HelmRepository
metadata:
  name: stable
  namespace: demo-rust
spec:
  interval: 1m
  url: https://charts.onwalk.net/
---
# Installs chart `app` 0.1.1 from the `stable` repository above, pinned to a
# specific image tag.
apiVersion: helm.toolkit.fluxcd.io/v2beta1
kind: HelmRelease
metadata:
  name: stable
  namespace: demo-rust
spec:
  interval: 1m
  chart:
    spec:
      chart: app
      version: "0.1.1"
      sourceRef:
        kind: HelmRepository
        name: stable
        namespace: demo-rust
  values:
    image:
      repository: artifact.onwalk.net/base/scaffolding-design/rust
      tag: "84a66d19f29c20c57127f5c896d00b0b84dcd986"
    ingress:
      className: "nginx"

View File

@ -0,0 +1,40 @@
# Flux HelmRelease installing chart `itsm` version 0.1.16 from the `stable`
# HelmRepository in namespace itsm-dev.
# NOTE(review): indentation was lost in this view; the nesting of
# `etcd-adapter` and `mysql` relative to `apisix` must be checked against the
# chart's values schema before re-indenting.
apiVersion: helm.toolkit.fluxcd.io/v2beta2
kind: HelmRelease
metadata:
name: itsm-dev
namespace: itsm-dev
spec:
interval: 1m
chart:
spec:
version: "0.1.16"
chart: itsm
sourceRef:
kind: HelmRepository
name: stable
namespace: itsm-dev
interval: 1m
values:
# Novu web UI exposed through the nginx ingress class.
novu:
web:
ingress:
enabled: true
hostname: novu-web.onwalk.net
ingressClassName: 'nginx'
# APISIX dashboard exposed through the nginx ingress class.
apisix:
dashboard:
ingress:
enabled: true
className: "nginx"
hosts:
- host: apisix-dashboard.onwalk.net
paths:
- /*
etcd-adapter:
enabled: true
# NOTE(review): database credentials are committed in plaintext below; prefer
# referencing a Kubernetes Secret (e.g. via HelmRelease `valuesFrom`).
mysql:
host: mysql
port: 3306
username: apisix
password: apisix
database: apisix

View File

@ -0,0 +1,5 @@
# Kustomize entrypoint: applies release.yaml into the itsm-dev namespace.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: itsm-dev
resources:
  - release.yaml

37
apps/minio/release.yaml Normal file
View File

@ -0,0 +1,37 @@
# Flux HelmRelease installing chart `minio` version 5.0.15 from the `stable`
# HelmRepository in namespace itsm-dev.
# NOTE(review): indentation was lost in this view; confirm against the chart
# whether `existingSecret` is a top-level value or belongs under `persistence`.
apiVersion: helm.toolkit.fluxcd.io/v2beta2
kind: HelmRelease
metadata:
name: minio
namespace: itsm-dev
spec:
interval: 1m
chart:
spec:
version: "5.0.15"
chart: minio
sourceRef:
kind: HelmRepository
name: stable
namespace: itsm-dev
interval: 1m
values:
enabled: true
nameOverride: minio
# NOTE(review): `mode: standalone` is combined with `replicas: 2`; verify the
# chart honours `replicas` in standalone mode, otherwise one of the two
# settings is misleading.
mode: standalone
replicas: 2
ingress:
enabled: true
ingressClassName: "nginx"
hosts:
- minio.local
persistence:
enabled: true
size: 10Gi
# Credentials come from a pre-existing Secret rather than inline values.
existingSecret: minio-secret
resources:
requests:
memory: 50Mi
cpu: 50m
limits:
cpu: "100m"
memory: "100Mi"

View File

@ -0,0 +1,5 @@
# Kustomize entrypoint: applies release.yaml into the itsm-dev namespace.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: itsm-dev
resources:
  - release.yaml

42
apps/mongodb/release.yaml Normal file
View File

@ -0,0 +1,42 @@
# Flux HelmRelease installing chart `mongodb` version 14.8.3 from the `stable`
# HelmRepository in namespace itsm-dev, as a standalone StatefulSet with
# persistence and authentication enabled.
apiVersion: helm.toolkit.fluxcd.io/v2beta2
kind: HelmRelease
metadata:
name: mongodb
namespace: itsm-dev
spec:
interval: 1m
chart:
spec:
version: "14.8.3"
chart: mongodb
sourceRef:
kind: HelmRepository
name: stable
namespace: itsm-dev
interval: 1m
values:
enabled: true
nameOverride: "mongodb"
architecture: standalone
useStatefulSet: true
global:
imageRegistry: ""
persistence:
enabled: true
# NOTE(review): the root password and the application user's password are
# committed in plaintext below; prefer supplying them from a Kubernetes Secret
# (chart `existingSecret`-style option or HelmRelease `valuesFrom`) - confirm
# which mechanism this chart version supports.
auth:
enabled: true
rootUser: root
rootPassword: "mongodb"
# One application user (`novu`) is created with access to `novu-db`.
usernames:
- novu
passwords:
- novu
databases:
- novu-db
resources:
requests:
memory: 100Mi
cpu: 100m
limits:
cpu: "500m"
memory: "500Mi"

View File

@ -0,0 +1,6 @@
# Kustomize entrypoint: applies the chart repository and release manifests
# into the monitoring namespace.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: monitoring
resources:
  - repository.yaml
  - release.yaml

View File

@ -0,0 +1,16 @@
# Installs chart `flagger-loadtester` 0.30.0 from the `flaggerload`
# HelmRepository in the monitoring namespace, with chart default values.
apiVersion: helm.toolkit.fluxcd.io/v2beta2
kind: HelmRelease
metadata:
  name: flaggerloadtester
  namespace: monitoring
spec:
  interval: 1m
  chart:
    spec:
      chart: flagger-loadtester
      version: "0.30.0"
      interval: 1m
      sourceRef:
        kind: HelmRepository
        name: flaggerload
        namespace: monitoring

View File

@ -0,0 +1,8 @@
# Helm repository for the Flagger charts, declared in the monitoring namespace.
apiVersion: source.toolkit.fluxcd.io/v1beta2
kind: HelmRepository
metadata:
  name: flaggerload
  namespace: monitoring
spec:
  url: https://flagger.app
  interval: 1m0s

View File

@ -1,5 +1,6 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: ingress
resources:
- repositories.yaml
- observability-stack
- repository.yaml
- release.yaml

View File

@ -0,0 +1,24 @@
# Installs chart `flagger` 1.35.0 in the ingress namespace, configured for the
# nginx provider and an external Prometheus as metrics server.
apiVersion: helm.toolkit.fluxcd.io/v2beta2
kind: HelmRelease
metadata:
  name: flagger
  namespace: ingress
spec:
  interval: 1m
  chart:
    spec:
      chart: flagger
      version: "1.35.0"
      interval: 1m
      sourceRef:
        kind: HelmRepository
        name: flagger
        namespace: ingress
  values:
    meshProvider: nginx
    metricsServer: "https://prometheus.svc-dev.ink"
    # The bundled Prometheus is not installed; metrics come from metricsServer.
    prometheus:
      install: false
    serviceMonitor:
      enabled: true
      namespace: monitoring

View File

@ -0,0 +1,8 @@
# Helm repository for the Flagger charts, declared in the ingress namespace.
apiVersion: source.toolkit.fluxcd.io/v1beta2
kind: HelmRepository
metadata:
  name: flagger
  namespace: ingress
spec:
  url: https://flagger.app
  interval: 1m0s

View File

@ -0,0 +1,275 @@
kube-state-metrics:
# For kube-prometheus-stacks that are already installed and configured with
# custom collectors, commenting out the collectors and extraArgs below will
# retain any existing kube-state-metrics configuration.
collectors: [ ]
extraArgs:
- --custom-resource-state-only=true
rbac:
extraRules:
- apiGroups:
- source.toolkit.fluxcd.io
- kustomize.toolkit.fluxcd.io
- helm.toolkit.fluxcd.io
- notification.toolkit.fluxcd.io
- image.toolkit.fluxcd.io
resources:
- gitrepositories
- buckets
- helmrepositories
- helmcharts
- ocirepositories
- kustomizations
- helmreleases
- alerts
- providers
- receivers
- imagerepositories
- imagepolicies
- imageupdateautomations
verbs: [ "list", "watch" ]
customResourceState:
enabled: true
config:
spec:
resources:
- groupVersionKind:
group: kustomize.toolkit.fluxcd.io
version: v1
kind: Kustomization
metricNamePrefix: gotk
metrics:
- name: "resource_info"
help: "The current state of a GitOps Toolkit resource."
each:
type: Info
info:
labelsFromPath:
name: [ metadata, name ]
labelsFromPath:
exported_namespace: [ metadata, namespace ]
ready: [ status, conditions, "[type=Ready]", status ]
suspended: [ spec, suspend ]
revision: [ status, lastAppliedRevision ]
source_name: [ spec, sourceRef, name ]
- groupVersionKind:
group: helm.toolkit.fluxcd.io
version: v2beta2
kind: HelmRelease
metricNamePrefix: gotk
metrics:
- name: "resource_info"
help: "The current state of a GitOps Toolkit resource."
each:
type: Info
info:
labelsFromPath:
name: [ metadata, name ]
labelsFromPath:
exported_namespace: [ metadata, namespace ]
ready: [ status, conditions, "[type=Ready]", status ]
suspended: [ spec, suspend ]
revision: [ status, lastAppliedRevision ]
chart_name: [ spec, chart, spec, chart ]
chart_source_name: [ spec, chart, spec, sourceRef, name ]
- groupVersionKind:
group: source.toolkit.fluxcd.io
version: v1
kind: GitRepository
metricNamePrefix: gotk
metrics:
- name: "resource_info"
help: "The current state of a GitOps Toolkit resource."
each:
type: Info
info:
labelsFromPath:
name: [ metadata, name ]
labelsFromPath:
exported_namespace: [ metadata, namespace ]
ready: [ status, conditions, "[type=Ready]", status ]
suspended: [ spec, suspend ]
revision: [ status, artifact, revision ]
url: [ spec, url ]
- groupVersionKind:
group: source.toolkit.fluxcd.io
version: v1beta2
kind: Bucket
metricNamePrefix: gotk
metrics:
- name: "resource_info"
help: "The current state of a GitOps Toolkit resource."
each:
type: Info
info:
labelsFromPath:
name: [ metadata, name ]
labelsFromPath:
exported_namespace: [ metadata, namespace ]
ready: [ status, conditions, "[type=Ready]", status ]
suspended: [ spec, suspend ]
revision: [ status, artifact, revision ]
endpoint: [ spec, endpoint ]
bucket_name: [ spec, bucketName ]
- groupVersionKind:
group: source.toolkit.fluxcd.io
version: v1beta2
kind: HelmRepository
metricNamePrefix: gotk
metrics:
- name: "resource_info"
help: "The current state of a GitOps Toolkit resource."
each:
type: Info
info:
labelsFromPath:
name: [ metadata, name ]
labelsFromPath:
exported_namespace: [ metadata, namespace ]
ready: [ status, conditions, "[type=Ready]", status ]
suspended: [ spec, suspend ]
revision: [ status, artifact, revision ]
url: [ spec, url ]
- groupVersionKind:
group: source.toolkit.fluxcd.io
version: v1beta2
kind: HelmChart
metricNamePrefix: gotk
metrics:
- name: "resource_info"
help: "The current state of a GitOps Toolkit resource."
each:
type: Info
info:
labelsFromPath:
name: [ metadata, name ]
labelsFromPath:
exported_namespace: [ metadata, namespace ]
ready: [ status, conditions, "[type=Ready]", status ]
suspended: [ spec, suspend ]
revision: [ status, artifact, revision ]
chart_name: [ spec, chart ]
chart_version: [ spec, version ]
- groupVersionKind:
group: source.toolkit.fluxcd.io
version: v1beta2
kind: OCIRepository
metricNamePrefix: gotk
metrics:
- name: "resource_info"
help: "The current state of a GitOps Toolkit resource."
each:
type: Info
info:
labelsFromPath:
name: [ metadata, name ]
labelsFromPath:
exported_namespace: [ metadata, namespace ]
ready: [ status, conditions, "[type=Ready]", status ]
suspended: [ spec, suspend ]
revision: [ status, artifact, revision ]
url: [ spec, url ]
- groupVersionKind:
group: notification.toolkit.fluxcd.io
version: v1beta3
kind: Alert
metricNamePrefix: gotk
metrics:
- name: "resource_info"
help: "The current state of a GitOps Toolkit resource."
each:
type: Info
info:
labelsFromPath:
name: [ metadata, name ]
labelsFromPath:
exported_namespace: [ metadata, namespace ]
suspended: [ spec, suspend ]
- groupVersionKind:
group: notification.toolkit.fluxcd.io
version: v1beta3
kind: Provider
metricNamePrefix: gotk
metrics:
- name: "resource_info"
help: "The current state of a GitOps Toolkit resource."
each:
type: Info
info:
labelsFromPath:
name: [ metadata, name ]
labelsFromPath:
exported_namespace: [ metadata, namespace ]
suspended: [ spec, suspend ]
- groupVersionKind:
group: notification.toolkit.fluxcd.io
version: v1
kind: Receiver
metricNamePrefix: gotk
metrics:
- name: "resource_info"
help: "The current state of a GitOps Toolkit resource."
each:
type: Info
info:
labelsFromPath:
name: [ metadata, name ]
labelsFromPath:
exported_namespace: [ metadata, namespace ]
ready: [ status, conditions, "[type=Ready]", status ]
suspended: [ spec, suspend ]
webhook_path: [ status, webhookPath ]
- groupVersionKind:
group: image.toolkit.fluxcd.io
version: v1beta2
kind: ImageRepository
metricNamePrefix: gotk
metrics:
- name: "resource_info"
help: "The current state of a GitOps Toolkit resource."
each:
type: Info
info:
labelsFromPath:
name: [ metadata, name ]
labelsFromPath:
exported_namespace: [ metadata, namespace ]
ready: [ status, conditions, "[type=Ready]", status ]
suspended: [ spec, suspend ]
image: [ spec, image ]
- groupVersionKind:
group: image.toolkit.fluxcd.io
version: v1beta2
kind: ImagePolicy
metricNamePrefix: gotk
metrics:
- name: "resource_info"
help: "The current state of a GitOps Toolkit resource."
each:
type: Info
info:
labelsFromPath:
name: [ metadata, name ]
labelsFromPath:
exported_namespace: [ metadata, namespace ]
ready: [ status, conditions, "[type=Ready]", status ]
suspended: [ spec, suspend ]
source_name: [ spec, imageRepositoryRef, name ]
- groupVersionKind:
group: image.toolkit.fluxcd.io
version: v1beta1
kind: ImageUpdateAutomation
metricNamePrefix: gotk
metrics:
- name: "resource_info"
help: "The current state of a GitOps Toolkit resource."
each:
type: Info
info:
labelsFromPath:
name: [ metadata, name ]
labelsFromPath:
exported_namespace: [ metadata, namespace ]
ready: [ status, conditions, "[type=Ready]", status ]
suspended: [ spec, suspend ]
source_name: [ spec, sourceRef, name ]

View File

@ -0,0 +1,18 @@
# Kustomization for the monitoring namespace: chart repository, release and
# PodMonitors, plus a generated ConfigMap carrying the kube-state-metrics
# values file.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: monitoring

resources:
  - repository.yaml
  - release.yaml
  - podmonitor-gitops-system.yaml
  - podmonitor-ingress.yaml

configMapGenerator:
  - name: flux-kube-state-metrics-config
    files:
      - kube-state-metrics-config.yaml
    options:
      labels:
        app.kubernetes.io/part-of: flux
        app.kubernetes.io/component: monitoring

# Extra name-reference rules loaded from kustomizeconfig.yaml.
configurations:
  - kustomizeconfig.yaml

View File

@ -0,0 +1,6 @@
# Name-reference rule: ConfigMap names referenced at
# `spec/valuesFrom/name` of a HelmRelease are treated as references to
# ConfigMap resources by Kustomize.
nameReference:
  - kind: ConfigMap
    version: v1
    fieldSpecs:
      - kind: HelmRelease
        path: spec/valuesFrom/name

View File

@ -0,0 +1,30 @@
# PodMonitor scraping the Flux controllers running in the gitops-system
# namespace. Pods are selected by their `app` label.
apiVersion: monitoring.coreos.com/v1
kind: PodMonitor
metadata:
  name: gitops-system
  namespace: monitoring
  labels:
    app.kubernetes.io/part-of: flux
    app.kubernetes.io/component: monitoring
spec:
  namespaceSelector:
    matchNames:
      - gitops-system
  selector:
    matchExpressions:
      - key: app
        operator: In
        values:
          - helm-controller
          - source-controller
          - kustomize-controller
          - notification-controller
          - image-automation-controller
          - image-reflector-controller
  podMetricsEndpoints:
    # `port` selects a container port by NAME. A purely numeric value such as
    # "8080" can never match, because Kubernetes port names must contain a
    # letter. Flux controllers expose metrics on the port named `http-prom`
    # (containerPort 8080).
    # NOTE(review): confirm the port name if the controller manifests are
    # customised.
    - port: http-prom
      relabelings:
        # https://github.com/prometheus-operator/prometheus-operator/issues/4816
        - sourceLabels: [__meta_kubernetes_pod_phase]
          action: keep
          regex: Running

View File

@ -0,0 +1,18 @@
# PodMonitor scraping the nginx ingress pods (instance label `nginx`) in the
# ingress namespace every 30s at /metrics.
apiVersion: monitoring.coreos.com/v1
kind: PodMonitor
metadata:
  name: nginx-ingress-podmonitor
  labels:
    app.kubernetes.io/part-of: nginx
    app.kubernetes.io/component: monitoring
spec:
  selector:
    matchLabels:
      app.kubernetes.io/instance: nginx
  namespaceSelector:
    matchNames:
      - ingress
  podMetricsEndpoints:
    # `port` matches a container port by NAME, so the numeric string "9113"
    # could never select anything. `targetPort` selects by port number and
    # keeps the original intent without guessing the port's name.
    # NOTE(review): `targetPort` is marked deprecated by prometheus-operator;
    # switch to `port: <name>` once the exporter port name is confirmed.
    - targetPort: 9113
      interval: 30s
      path: /metrics

View File

@ -0,0 +1,60 @@
# Installs kube-prometheus-stack (55.x) as a Prometheus agent that forwards
# metrics to a remote Prometheus; Grafana, Alertmanager and default rules are
# disabled.
apiVersion: helm.toolkit.fluxcd.io/v2beta2
kind: HelmRelease
metadata:
  name: prometheus-agent
spec:
  interval: 1m
  chart:
    spec:
      chart: kube-prometheus-stack
      version: "55.x"
      interval: 10m
      sourceRef:
        kind: HelmRepository
        name: prometheus-community
  install:
    crds: Create
  upgrade:
    crds: CreateReplace
  driftDetection:
    mode: enabled
    ignore:
      # Ignore "validated" annotation which is not inserted during install
      - paths: ["/metadata/annotations/prometheus-operator-validated"]
        target:
          kind: PrometheusRule
  # kube-state-metrics custom-resource configuration is read from the
  # ConfigMap named below.
  valuesFrom:
    - kind: ConfigMap
      name: flux-kube-state-metrics-config
      valuesKey: kube-state-metrics-config.yaml
  # https://github.com/prometheus-community/helm-charts/blob/main/charts/kube-prometheus-stack/values.yaml
  values:
    global:
      imageRegistry: "artifact.onwalk.net/base"
    prometheus:
      agentMode: true
      prometheusSpec:
        remoteWrite:
          - name: remote_prometheus
            url: 'https://prometheus.svc-dev.ink/api/v1/write'
        retention: 24h
        resources:
          requests:
            cpu: 200m
            memory: 200Mi
        # Empty namespace selector; PodMonitors are filtered by the label below.
        podMonitorNamespaceSelector: {}
        podMonitorSelector:
          matchLabels:
            app.kubernetes.io/component: monitoring
    defaultRules:
      create: false
    grafana:
      enabled: false
    prometheus-windows-exporter:
      enabled: false
    alertmanager:
      enabled: false
    nodeExporter:
      enabled: true
    kubeStateMetrics:
      enabled: true

View File

@ -0,0 +1,8 @@
# OCI Helm repository hosting the prometheus-community charts.
apiVersion: source.toolkit.fluxcd.io/v1beta2
kind: HelmRepository
metadata:
  name: prometheus-community
spec:
  type: oci
  url: oci://ghcr.io/prometheus-community/charts
  interval: 12h

View File

@ -0,0 +1,6 @@
# Kustomize entry point: chart source plus release, both placed in the
# `monitoring` namespace.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: monitoring
resources:
  - repository.yaml
  - release.yaml

View File

@ -0,0 +1,60 @@
# Flux HelmRelease for the grafana/loki-stack chart: Loki (ingress, ruler,
# ServiceMonitor) together with Promtail.
apiVersion: helm.toolkit.fluxcd.io/v2beta2
kind: HelmRelease
metadata:
  name: loki-stack
spec:
  interval: 1m
  # dependsOn:
  #   - name: kube-prometheus-stack
  chart:
    spec:
      version: "2.x"
      chart: loki-stack
      sourceRef:
        kind: HelmRepository
        name: grafana-charts
      interval: 60m
  # https://github.com/grafana/helm-charts/blob/main/charts/loki-stack/values.yaml
  # https://github.com/grafana/loki/blob/main/production/helm/loki/values.yaml
  values:
    promtail:
      enabled: true
    loki:
      enabled: true
      isDefault: false
      ingress:
        enabled: true
        ingressClassName: nginx
        hosts:
          - host: loki.svc-dev.ink
            paths:
              - "/"
        tls:
          - secretName: obs-tls
            hosts:
              - loki.svc-dev.ink
      # NOTE(review): nesting of `ruler` was reconstructed from key order;
      # confirm whether the chart expects it here or under `config`.
      ruler:
        storage:
          type: local
          local:
            directory: /rules
        rule_path: /tmp/scratch
        alertmanager_url: https://alertmanager.svc-dev.ink
        ring:
          kvstore:
            store: inmemory
        enable_api: true
        remote_write:
          enabled: true
          client:
            # HTTPS, consistent with the other remote-write clients that
            # target prometheus.svc-dev.ink.
            url: https://prometheus.svc-dev.ink/api/v1/write
      serviceMonitor:
        enabled: true
        additionalLabels:
          app.kubernetes.io/part-of: kube-prometheus-stack
      config:
        chunk_store_config:
          max_look_back_period: 0s
        table_manager:
          retention_deletes_enabled: true
          # NOTE(review): verify 12h is accepted for the configured index
          # table period.
          retention_period: 12h

View File

@ -0,0 +1,7 @@
# Flux chart source for the Grafana Helm charts (HTTP index repository).
apiVersion: source.toolkit.fluxcd.io/v1beta2
kind: HelmRepository
metadata:
  name: grafana-charts
spec:
  url: https://grafana.github.io/helm-charts
  interval: 120m0s

View File

@ -1,4 +1,4 @@
# Kustomize entry point bundling the ClusterIssuer with its Helm release.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - clusterissuer.yaml
  - release.yaml

View File

@ -0,0 +1,101 @@
# Flux chart source "stable" (charts.onwalk.net) in the monitoring namespace;
# re-indexed every 10 minutes.
apiVersion: source.toolkit.fluxcd.io/v1beta2
kind: HelmRepository
metadata:
  name: stable
  namespace: monitoring
spec:
  url: https://charts.onwalk.net/
  interval: 10m
---
# Flux HelmRelease for the observabilityagent umbrella chart: Telegraf,
# DeepFlow agent, a short-retention Prometheus that remote-writes upstream,
# and Promtail shipping logs to Loki. fluent-bit is disabled.
apiVersion: helm.toolkit.fluxcd.io/v2beta1
kind: HelmRelease
metadata:
  name: observabilityagent
  namespace: monitoring
spec:
  chart:
    spec:
      chart: observabilityagent
      version: "0.1.7"
      sourceRef:
        kind: HelmRepository
        name: stable
        namespace: monitoring
  interval: 1m
  values:
    fluent-bit:
      enabled: false
    telegraf:
      enabled: true
      config:
        agent:
          interval: "10s"
          round_interval: true
          metric_batch_size: 1000
          metric_buffer_limit: 10000
          collection_jitter: "0s"
          flush_interval: "10s"
          flush_jitter: "0s"
          precision: ""
          debug: false
          quiet: false
          logfile: ""
          hostname: "$HOSTNAME"
          omit_hostname: true
        processors:
          # Map the textual "status" field onto a numeric "status_code".
          - enum:
              mapping:
                field: "status"
                dest: "status_code"
                value_mappings:
                  healthy: 1
                  problem: 2
                  critical: 3
        outputs:
          - influxdb:
              urls:
                - "https://influxdb.svc-dev.ink"
              database: "telegraf"
        inputs:
          - net:
              # A bare `*` is a YAML alias indicator and fails to parse, so
              # the glob is quoted and given as a list.
              # NOTE(review): assumes the plugin expects a list of interface
              # names/globs — confirm against the Telegraf net input docs.
              interfaces:
                - "*"
          - statsd:
              service_address: ":8125"
              percentiles:
                - 50
                - 95
                - 99
              metric_separator: "_"
              allowed_pending_messages: 10000
              percentile_limit: 1000
    deepflow-agent:
      enabled: true
      deepflowServerNodeIPS:
        - 10.0.1.3
      deepflowK8sClusterID: d-rUJ4CUKMUt
    prometheus:
      enabled: true
      server:
        name: agent
        retention: "30m"
        extraFlags:
          - web.enable-lifecycle
          - enable-feature=expand-external-labels
        remoteWrite:
          - name: remote_prometheus
            url: 'https://prometheus.svc-dev.ink/api/v1/write'
        persistentVolume:
          enabled: false
      alertmanager:
        enabled: false
      prometheus-pushgateway:
        enabled: false
      kube-state-metrics:
        enabled: false
      prometheus-node-exporter:
        enabled: false
    promtail:
      enabled: true
      config:
        clients:
          - url: https://loki.svc-dev.ink/loki/api/v1/push

View File

@ -0,0 +1,94 @@
# PrometheusAgent resource as rendered by the prometheus-agent Helm release.
# Server-populated fields (creationTimestamp, generation, resourceVersion,
# uid and the whole status stanza) are intentionally not kept here: they are
# written by the API server/operator and do not belong in a stored manifest.
apiVersion: monitoring.coreos.com/v1alpha1
kind: PrometheusAgent
metadata:
  annotations:
    meta.helm.sh/release-name: prometheus-agent
    meta.helm.sh/release-namespace: monitoring
  labels:
    app: kube-prometheus-stack-prometheus
    app.kubernetes.io/instance: prometheus-agent
    app.kubernetes.io/managed-by: Helm
    app.kubernetes.io/part-of: kube-prometheus-stack
    app.kubernetes.io/version: 55.5.0
    chart: kube-prometheus-stack-55.5.0
    helm.toolkit.fluxcd.io/name: prometheus-agent
    helm.toolkit.fluxcd.io/namespace: monitoring
    heritage: Helm
    release: prometheus-agent
  name: prometheus-agent-kube-prom-prometheus
  namespace: monitoring
spec:
  externalUrl: http://prometheus-agent-kube-prom-prometheus.monitoring:9090
  hostNetwork: false
  image: artifact.onwalk.net/base/prometheus/prometheus:v2.48.1
  listenLocal: false
  logFormat: logfmt
  logLevel: info
  paused: false
  # PodMonitors: any namespace, but only those labelled component=monitoring.
  podMonitorNamespaceSelector: {}
  podMonitorSelector:
    matchLabels:
      app.kubernetes.io/component: monitoring
  portName: http-web
  probeNamespaceSelector: {}
  probeSelector:
    matchLabels:
      release: prometheus-agent
  remoteWrite:
    - name: remote_prometheus
      url: https://prometheus.svc-dev.ink/api/v1/write
  replicas: 1
  resources:
    requests:
      cpu: 200m
      memory: 200Mi
  routePrefix: /
  scrapeConfigNamespaceSelector: {}
  scrapeConfigSelector:
    matchLabels:
      release: prometheus-agent
  scrapeInterval: 30s
  securityContext:
    fsGroup: 2000
    runAsGroup: 2000
    runAsNonRoot: true
    runAsUser: 1000
    seccompProfile:
      type: RuntimeDefault
  serviceAccountName: prometheus-agent-kube-prom-prometheus
  serviceMonitorNamespaceSelector: {}
  serviceMonitorSelector:
    matchLabels:
      release: prometheus-agent
  shards: 1
  version: v2.48.1
  walCompression: true

View File

@ -0,0 +1,5 @@
# Kustomize entry point: deploys release.yaml into the itsm-dev namespace.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: itsm-dev
resources:
  - release.yaml

View File

@ -0,0 +1,5 @@
# Kustomize entry point: deploys release.yaml into the itsm-dev namespace.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: itsm-dev
resources:
  - release.yaml

View File

@ -0,0 +1,5 @@
# Kustomize entry point: deploys release.yaml into the itsm-dev namespace.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: itsm-dev
resources:
  - release.yaml

38
apps/redis/release.yaml Normal file
View File

@ -0,0 +1,38 @@
# Flux HelmRelease: standalone Redis 18.12.1 for itsm-dev, without
# persistence and with modest CPU/memory requests and limits.
apiVersion: helm.toolkit.fluxcd.io/v2beta2
kind: HelmRelease
metadata:
  name: redis
  namespace: itsm-dev
spec:
  interval: 1m
  chart:
    spec:
      chart: redis
      version: '18.12.1'
      sourceRef:
        kind: HelmRepository
        name: stable
        namespace: itsm-dev
      interval: 1m
  values:
    enabled: true
    nameOverride: 'redis'
    architecture: standalone
    global:
      imageRegistry: ''
      redis:
        # NOTE(review): plaintext password committed to the repository;
        # consider sourcing it from a Secret instead.
        password: 'redis'
    auth:
      enabled: true
      sentinel: false
      password: ''
    master:
      persistence:
        enabled: false
      resources:
        requests:
          cpu: 100m
          memory: 100Mi
        limits:
          cpu: 200m
          memory: 300Mi

View File

@ -0,0 +1,8 @@
# Flux chart source "stable" (charts.onwalk.net), refreshed every minute.
apiVersion: source.toolkit.fluxcd.io/v1beta2
kind: HelmRepository
metadata:
  name: stable
  namespace: monitoring
spec:
  url: https://charts.onwalk.net/
  interval: 1m

View File

@ -0,0 +1,7 @@
# Kustomize entry point for the monitoring namespace: namespace, chart
# source, then the observability agent release.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: monitoring
resources:
  - namespace.yaml
  - helmrepo.yaml
  - observability-agent.yaml

View File

@ -0,0 +1,4 @@
# Namespace that hosts the monitoring workloads.
apiVersion: v1
kind: Namespace
metadata:
  name: monitoring

View File

View File

@ -0,0 +1,12 @@
# Cluster overlay: the namespace first, then the monitoring components and
# the demo applications.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - namespace.yaml
  # Monitoring components
  - ../../apps/monitor/observability-agent/
  - ../../apps/monitor/kube-prometheus-stack/
  - ../../apps/monitor/flagger/
  # Demo workloads
  - ../../apps/demo/c-app
  - ../../apps/demo/js-app
  - ../../apps/demo/python-app
  - ../../apps/demo/go-app
  - ../../apps/demo/rust-app

View File

@ -0,0 +1,6 @@
# Monitoring namespace, labelled with component=monitoring.
apiVersion: v1
kind: Namespace
metadata:
  name: monitoring
  labels:
    app.kubernetes.io/component: monitoring

View File

@ -0,0 +1,11 @@
# Cluster overlay for the ITSM stack: namespace and chart source, followed by
# the application and its backing data services.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - namespace.yaml
  - repository.yaml
  # Application
  - ../../apps/itsm-dev/
  # Data services
  - ../../apps/redis/
  - ../../apps/mysql/
  - ../../apps/postgresql/
  - ../../apps/mongodb/
  - ../../apps/minio/

View File

@ -0,0 +1,4 @@
# Namespace that hosts the ITSM development workloads.
apiVersion: v1
kind: Namespace
metadata:
  name: itsm-dev

View File

@ -0,0 +1,8 @@
# Flux chart source "stable" (charts.onwalk.net) for the itsm-dev namespace.
apiVersion: source.toolkit.fluxcd.io/v1beta2
kind: HelmRepository
metadata:
  name: stable
  namespace: itsm-dev
spec:
  url: https://charts.onwalk.net
  interval: 1m0s

View File

@ -0,0 +1,53 @@
# ConfigMap fragment supplying host-level alerting rules (load, CPU, memory,
# disk) for the observability-server Prometheus.
apiVersion: v1
kind: ConfigMap
metadata:
  annotations:
    meta.helm.sh/release-name: observability-server
    meta.helm.sh/release-namespace: monitoring
  labels:
    app.kubernetes.io/component: server
    app.kubernetes.io/instance: observability-server
    app.kubernetes.io/managed-by: Helm
    app.kubernetes.io/name: prometheus
    app.kubernetes.io/part-of: prometheus
    app.kubernetes.io/version: v2.48.1
    helm.sh/chart: prometheus-25.8.2
  name: observability-server-prometheus-server
  namespace: monitoring
data:
  alerting_rules.yml: |
    groups:
      - name: host-monitoring
        rules:
          # 1-minute load average above 2.0 for 5 minutes.
          - alert: HighLoad
            expr: node_load1 > 2.0
            for: 5m
            labels:
              severity: warning
            annotations:
              summary: High load on {{ $labels.instance }}
              description: "Load is High (threshold: 2.0)"
          # Non-idle CPU above 90% for 5 minutes; the description states the
          # same threshold the expression enforces.
          - alert: HighCpuUsage
            expr: 100 - (avg by (instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 90
            for: 5m
            labels:
              severity: critical
            annotations:
              summary: High CPU usage on {{ $labels.instance }}
              description: "CPU usage is > 90%"
          # Used memory (total minus free, buffers and cache) above 90%.
          - alert: HighMemoryUsage
            expr: (node_memory_MemTotal_bytes - node_memory_MemFree_bytes - node_memory_Buffers_bytes - node_memory_Cached_bytes) / node_memory_MemTotal_bytes * 100 > 90
            for: 5m
            labels:
              severity: warning
            annotations:
              summary: High memory usage on {{ $labels.instance }}
              description: "Memory usage is High"
          # Less than 10% space available on ext4 filesystems.
          - alert: HighDiskUsage
            expr: node_filesystem_avail_bytes{fstype="ext4"} / node_filesystem_size_bytes{fstype="ext4"} * 100 < 10
            for: 5m
            labels:
              severity: critical
            annotations:
              summary: High disk usage on {{ $labels.instance }}
              description: "Disk usage is High"

View File

@ -0,0 +1,8 @@
# Flux chart source "stable" (charts.onwalk.net), refreshed every minute.
apiVersion: source.toolkit.fluxcd.io/v1beta2
kind: HelmRepository
metadata:
  name: stable
  namespace: monitoring
spec:
  url: https://charts.onwalk.net/
  interval: 1m

View File

@ -0,0 +1,24 @@
# Ingress routing flaggerloadtester.svc-dev.ink to the flagger-loadtester
# Service on port 80, with TLS from the obs-tls secret.
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: flagger
  namespace: monitoring
  annotations:
    nginx.ingress.kubernetes.io/ssl-redirect: 'true'
spec:
  ingressClassName: nginx
  tls:
    - secretName: obs-tls
      hosts:
        - flaggerloadtester.svc-dev.ink
  rules:
    - host: flaggerloadtester.svc-dev.ink
      http:
        paths:
          - path: /
            pathType: Prefix
            backend:
              service:
                name: flagger-loadtester
                port:
                  number: 80

View File

@ -0,0 +1,14 @@
# Kustomize entry point for the monitoring namespace: shared app bases first,
# then the local manifests. The ConfigMap patches are currently switched off.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: monitoring
resources:
  # Shared bases
  - ../../apps/monitor/flagger/
  - ../../apps/monitor/flagger-loadtester/
  - ../../apps/monitor/loki-stack
  # Local manifests
  - namespace.yaml
  - helmrepo.yaml
  - observability-agent.yaml
  - ingress-flagger.yaml
  # - prometheus-server-configmap.yaml
  # - alert-rules-patch.yaml
  # - recording-rules-patch.yaml

View File

@ -0,0 +1,4 @@
# Namespace that hosts the monitoring workloads.
apiVersion: v1
kind: Namespace
metadata:
  name: monitoring

View File

@ -0,0 +1,27 @@
# Flux HelmRelease for the observabilityagent chart with only Promtail
# enabled; logs are pushed to loki.svc-dev.ink.
apiVersion: helm.toolkit.fluxcd.io/v2beta1
kind: HelmRelease
metadata:
  name: observabilityagent
  namespace: monitoring
spec:
  interval: 1m
  chart:
    spec:
      chart: observabilityagent
      version: '0.1.7'
      sourceRef:
        kind: HelmRepository
        name: stable
        namespace: monitoring
  values:
    # Disabled sub-charts
    fluent-bit:
      enabled: false
    deepflow-agent:
      enabled: false
    prometheus:
      enabled: false
    # Log shipping
    promtail:
      enabled: true
      config:
        clients:
          - url: https://loki.svc-dev.ink/loki/api/v1/push

View File

@ -0,0 +1,339 @@
apiVersion: v1
kind: ConfigMap
metadata:
annotations:
meta.helm.sh/release-name: observability-server
meta.helm.sh/release-namespace: monitoring
labels:
app.kubernetes.io/component: server
app.kubernetes.io/instance: observability-server
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/name: prometheus
app.kubernetes.io/part-of: prometheus
app.kubernetes.io/version: v2.48.1
helm.sh/chart: prometheus-25.8.2
name: observability-server-prometheus-server
namespace: monitoring
data:
alerting_rules.yml: |
{}
alerts: |
{}
allow-snippet-annotations: "false"
prometheus.yml: |
global:
evaluation_interval: 1m
scrape_interval: 1m
scrape_timeout: 10s
rule_files:
- /etc/config/recording_rules.yml
- /etc/config/alerting_rules.yml
scrape_configs:
- job_name: prometheus
static_configs:
- targets:
- localhost:9090
- bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
job_name: kubernetes-apiservers
kubernetes_sd_configs:
- role: endpoints
relabel_configs:
- action: keep
regex: default;kubernetes;https
source_labels:
- __meta_kubernetes_namespace
- __meta_kubernetes_service_name
- __meta_kubernetes_endpoint_port_name
scheme: https
tls_config:
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
insecure_skip_verify: true
- bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
job_name: kubernetes-nodes
kubernetes_sd_configs:
- role: node
relabel_configs:
- action: labelmap
regex: __meta_kubernetes_node_label_(.+)
- replacement: kubernetes.default.svc:443
target_label: __address__
- regex: (.+)
replacement: /api/v1/nodes/$1/proxy/metrics
source_labels:
- __meta_kubernetes_node_name
target_label: __metrics_path__
scheme: https
tls_config:
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
insecure_skip_verify: true
- bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
job_name: kubernetes-nodes-cadvisor
kubernetes_sd_configs:
- role: node
relabel_configs:
- action: labelmap
regex: __meta_kubernetes_node_label_(.+)
- replacement: kubernetes.default.svc:443
target_label: __address__
- regex: (.+)
replacement: /api/v1/nodes/$1/proxy/metrics/cadvisor
source_labels:
- __meta_kubernetes_node_name
target_label: __metrics_path__
scheme: https
tls_config:
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
insecure_skip_verify: true
- honor_labels: true
job_name: kubernetes-service-endpoints
kubernetes_sd_configs:
- role: endpoints
relabel_configs:
- action: keep
regex: true
source_labels:
- __meta_kubernetes_service_annotation_prometheus_io_scrape
- action: drop
regex: true
source_labels:
- __meta_kubernetes_service_annotation_prometheus_io_scrape_slow
- action: replace
regex: (https?)
source_labels:
- __meta_kubernetes_service_annotation_prometheus_io_scheme
target_label: __scheme__
- action: replace
regex: (.+)
source_labels:
- __meta_kubernetes_service_annotation_prometheus_io_path
target_label: __metrics_path__
- action: replace
regex: (.+?)(?::\d+)?;(\d+)
replacement: $1:$2
source_labels:
- __address__
- __meta_kubernetes_service_annotation_prometheus_io_port
target_label: __address__
- action: labelmap
regex: __meta_kubernetes_service_annotation_prometheus_io_param_(.+)
replacement: __param_$1
- action: labelmap
regex: __meta_kubernetes_service_label_(.+)
- action: replace
source_labels:
- __meta_kubernetes_namespace
target_label: namespace
- action: replace
source_labels:
- __meta_kubernetes_service_name
target_label: service
- action: replace
source_labels:
- __meta_kubernetes_pod_node_name
target_label: node
- honor_labels: true
job_name: kubernetes-service-endpoints-slow
kubernetes_sd_configs:
- role: endpoints
relabel_configs:
- action: keep
regex: true
source_labels:
- __meta_kubernetes_service_annotation_prometheus_io_scrape_slow
- action: replace
regex: (https?)
source_labels:
- __meta_kubernetes_service_annotation_prometheus_io_scheme
target_label: __scheme__
- action: replace
regex: (.+)
source_labels:
- __meta_kubernetes_service_annotation_prometheus_io_path
target_label: __metrics_path__
- action: replace
regex: (.+?)(?::\d+)?;(\d+)
replacement: $1:$2
source_labels:
- __address__
- __meta_kubernetes_service_annotation_prometheus_io_port
target_label: __address__
- action: labelmap
regex: __meta_kubernetes_service_annotation_prometheus_io_param_(.+)
replacement: __param_$1
- action: labelmap
regex: __meta_kubernetes_service_label_(.+)
- action: replace
source_labels:
- __meta_kubernetes_namespace
target_label: namespace
- action: replace
source_labels:
- __meta_kubernetes_service_name
target_label: service
- action: replace
source_labels:
- __meta_kubernetes_pod_node_name
target_label: node
scrape_interval: 5m
scrape_timeout: 30s
- honor_labels: true
job_name: prometheus-pushgateway
kubernetes_sd_configs:
- role: service
relabel_configs:
- action: keep
regex: pushgateway
source_labels:
- __meta_kubernetes_service_annotation_prometheus_io_probe
- honor_labels: true
job_name: kubernetes-services
kubernetes_sd_configs:
- role: service
metrics_path: /probe
params:
module:
- http_2xx
relabel_configs:
- action: keep
regex: true
source_labels:
- __meta_kubernetes_service_annotation_prometheus_io_probe
- source_labels:
- __address__
target_label: __param_target
- replacement: blackbox
target_label: __address__
- source_labels:
- __param_target
target_label: instance
- action: labelmap
regex: __meta_kubernetes_service_label_(.+)
- source_labels:
- __meta_kubernetes_namespace
target_label: namespace
- source_labels:
- __meta_kubernetes_service_name
target_label: service
- honor_labels: true
job_name: kubernetes-pods
kubernetes_sd_configs:
- role: pod
relabel_configs:
- action: keep
regex: true
source_labels:
- __meta_kubernetes_pod_annotation_prometheus_io_scrape
- action: drop
regex: true
source_labels:
- __meta_kubernetes_pod_annotation_prometheus_io_scrape_slow
- action: replace
regex: (https?)
source_labels:
- __meta_kubernetes_pod_annotation_prometheus_io_scheme
target_label: __scheme__
- action: replace
regex: (.+)
source_labels:
- __meta_kubernetes_pod_annotation_prometheus_io_path
target_label: __metrics_path__
- action: replace
regex: (\d+);(([A-Fa-f0-9]{1,4}::?){1,7}[A-Fa-f0-9]{1,4})
replacement: '[$2]:$1'
source_labels:
- __meta_kubernetes_pod_annotation_prometheus_io_port
- __meta_kubernetes_pod_ip
target_label: __address__
- action: replace
regex: (\d+);((([0-9]+?)(\.|$)){4})
replacement: $2:$1
source_labels:
- __meta_kubernetes_pod_annotation_prometheus_io_port
- __meta_kubernetes_pod_ip
target_label: __address__
- action: labelmap
regex: __meta_kubernetes_pod_annotation_prometheus_io_param_(.+)
replacement: __param_$1
- action: labelmap
regex: __meta_kubernetes_pod_label_(.+)
- action: replace
source_labels:
- __meta_kubernetes_namespace
target_label: namespace
- action: replace
source_labels:
- __meta_kubernetes_pod_name
target_label: pod
- action: drop
regex: Pending|Succeeded|Failed|Completed
source_labels:
- __meta_kubernetes_pod_phase
- action: replace
source_labels:
- __meta_kubernetes_pod_node_name
target_label: node
- honor_labels: true
job_name: kubernetes-pods-slow
kubernetes_sd_configs:
- role: pod
relabel_configs:
- action: keep
regex: true
source_labels:
- __meta_kubernetes_pod_annotation_prometheus_io_scrape_slow
- action: replace
regex: (https?)
source_labels:
- __meta_kubernetes_pod_annotation_prometheus_io_scheme
target_label: __scheme__
- action: replace
regex: (.+)
source_labels:
- __meta_kubernetes_pod_annotation_prometheus_io_path
target_label: __metrics_path__
- action: replace
regex: (\d+);(([A-Fa-f0-9]{1,4}::?){1,7}[A-Fa-f0-9]{1,4})
replacement: '[$2]:$1'
source_labels:
- __meta_kubernetes_pod_annotation_prometheus_io_port
- __meta_kubernetes_pod_ip
target_label: __address__
- action: replace
regex: (\d+);((([0-9]+?)(\.|$)){4})
replacement: $2:$1
source_labels:
- __meta_kubernetes_pod_annotation_prometheus_io_port
- __meta_kubernetes_pod_ip
target_label: __address__
- action: labelmap
regex: __meta_kubernetes_pod_annotation_prometheus_io_param_(.+)
replacement: __param_$1
- action: labelmap
regex: __meta_kubernetes_pod_label_(.+)
- action: replace
source_labels:
- __meta_kubernetes_namespace
target_label: namespace
- action: replace
source_labels:
- __meta_kubernetes_pod_name
target_label: pod
- action: drop
regex: Pending|Succeeded|Failed|Completed
source_labels:
- __meta_kubernetes_pod_phase
- action: replace
source_labels:
- __meta_kubernetes_pod_node_name
target_label: node
scrape_interval: 5m
scrape_timeout: 30s
alerting:
alertmanagers:
- static_configs:
- targets:
- alertmanager.svc-dev.ink
recording_rules.yml: |
{}
rules: |
{}

View File

@ -0,0 +1,29 @@
# ConfigMap fragment supplying host-level recording rules (load, CPU, memory,
# disk) for the observability-server Prometheus.
apiVersion: v1
kind: ConfigMap
metadata:
  name: observability-server-prometheus-server
  namespace: monitoring
  annotations:
    meta.helm.sh/release-name: observability-server
    meta.helm.sh/release-namespace: monitoring
  labels:
    app.kubernetes.io/component: server
    app.kubernetes.io/instance: observability-server
    app.kubernetes.io/managed-by: Helm
    app.kubernetes.io/name: prometheus
    app.kubernetes.io/part-of: prometheus
    app.kubernetes.io/version: v2.48.1
    helm.sh/chart: prometheus-25.8.2
data:
  recording_rules.yml: |
    groups:
      - name: host-monitoring
        rules:
          # NOTE(review): this rule records node_load1 under its own name;
          # confirm that writing back into the source series is intended.
          - record: node_load1
            expr: node_load1
          # Percentage of non-idle CPU per instance.
          - record: node_cpu_usage
            expr: 100 - (avg by (instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)
          # Percentage of memory in use (total minus free, buffers and cache).
          - record: node_memory_usage
            expr: (node_memory_MemTotal_bytes - node_memory_MemFree_bytes - node_memory_Buffers_bytes - node_memory_Cached_bytes) / node_memory_MemTotal_bytes * 100
          # Percentage of ext4 space in use, averaged per instance.
          - record: node_disk_usage
            expr: 100 - (avg by (instance) (node_filesystem_avail_bytes{fstype="ext4"} / node_filesystem_size_bytes{fstype="ext4"}) * 100)

View File

@ -0,0 +1,12 @@
# Cluster overlay: the namespace first, then the monitoring components and
# the demo applications.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - namespace.yaml
  # Monitoring components
  - ../../apps/monitor/observability-agent/
  - ../../apps/monitor/kube-prometheus-stack/
  - ../../apps/monitor/flagger/
  # Demo workloads
  - ../../apps/demo/c-app
  - ../../apps/demo/js-app
  - ../../apps/demo/python-app
  - ../../apps/demo/go-app
  - ../../apps/demo/rust-app

View File

@ -0,0 +1,6 @@
# Monitoring namespace, labelled with component=monitoring.
apiVersion: v1
kind: Namespace
metadata:
  name: monitoring
  labels:
    app.kubernetes.io/component: monitoring

View File

@ -0,0 +1,12 @@
# Cluster overlay: the namespace first, then the monitoring components and
# the demo applications.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - namespace.yaml
  # Monitoring components
  - ../../apps/monitor/observability-agent/
  - ../../apps/monitor/kube-prometheus-stack/
  - ../../apps/monitor/flagger/
  # Demo workloads
  - ../../apps/demo/c-app
  - ../../apps/demo/js-app
  - ../../apps/demo/python-app
  - ../../apps/demo/go-app
  - ../../apps/demo/rust-app

View File

@ -0,0 +1,6 @@
# Monitoring namespace, labelled with component=monitoring.
apiVersion: v1
kind: Namespace
metadata:
  name: monitoring
  labels:
    app.kubernetes.io/component: monitoring

View File

@ -0,0 +1,12 @@
# Cluster overlay: the namespace first, then the monitoring components and
# the demo applications.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - namespace.yaml
  # Monitoring components
  - ../../apps/monitor/observability-agent/
  - ../../apps/monitor/kube-prometheus-stack/
  - ../../apps/monitor/flagger/
  # Demo workloads
  - ../../apps/demo/c-app
  - ../../apps/demo/js-app
  - ../../apps/demo/python-app
  - ../../apps/demo/go-app
  - ../../apps/demo/rust-app

View File

@ -0,0 +1,6 @@
# Monitoring namespace, labelled with component=monitoring.
apiVersion: v1
kind: Namespace
metadata:
  name: monitoring
  labels:
    app.kubernetes.io/component: monitoring

72
config/README.md Normal file
View File

@ -0,0 +1,72 @@
# Configuration Layout
This repository keeps environment-specific infrastructure configuration under the `config/` directory. The layout follows a consistent project → environment → cloud/provider → resource-module hierarchy so that each stack can be managed independently.
## Recommended standard layout
```
config/
└── <project>/
    └── <env>/
        └── <cloud>/
            ├── base.yaml
            ├── identity.yaml
            ├── network.yaml
            ├── security.yaml
            ├── storage.yaml
            ├── compute.yaml
            ├── observability.yaml
            └── <feature>.yaml
```
- **Project**: top-level application or platform (for example `modern-container-app`, `cloudneutral-platform`, or `ai-infra-lab`).
- **Environment**: fully isolated deployment stages such as `dev`, `sit`, `uat`, and `prod`.
- **Cloud/Provider**: clear provider identifiers such as `aws-cloud`, `gcp-cloud`, or `vultr-vps`.
- **Resource modules**: YAML slices for base settings, identity, network, security, storage, compute, observability, and feature-specific needs.
## Applied layout for this repository
The current repo uses the `xzerolab` project with a `sit` environment. Provider-specific configurations are organized per cloud alongside shared assets for future environments.
```
config/
├── bootstrap.yaml
└── xzerolab/
    └── sit/
        ├── aws-cloud/
        │   ├── accounts/
        │   │   ├── bootstrap.yaml
        │   │   ├── dev-landingzone.yaml
        │   │   └── dev.yaml
        │   ├── provider_backend.yaml
        │   └── resources/
        │       ├── dev-alb/alb.yaml
        │       ├── dev-kafka/msk.yaml
        │       ├── dev-nlb/nlb.yaml
        │       ├── dev-object/bucket.yaml
        │       ├── dev-rds/rds.yaml
        │       ├── dev-redis/redis.yaml
        │       ├── ec2/dev.yaml
        │       └── vpc/dev.yaml
        ├── gcp-cloud/
        │   ├── accounts/
        │   │   ├── bootstrap.yaml
        │   │   ├── dev-landingzone.yaml
        │   │   └── dev.yaml
        │   └── resources/
        │       ├── dev-alb/alb.yaml
        │       ├── dev-kafka/msk.yaml
        │       ├── dev-nlb/nlb.yaml
        │       ├── dev-object/bucket.yaml
        │       ├── dev-rds/rds.yaml
        │       ├── dev-redis/redis.yaml
        │       ├── ec2/dev.yaml
        │       └── vpc/dev.yaml
        └── vultr-vps/
            ├── accounts/
            │   └── .gitkeep
            └── resources/
                └── .gitkeep
```
Use this layout to keep each environment and provider self-contained, making it easy for CI/CD workflows to target the exact configuration needed for a deployment.

View File

@ -0,0 +1,8 @@
# ActionTrail audit trail: records all read/write events and delivers them to
# the referenced OSS bucket under the "actiontrail" prefix.
audit:
  actiontrail:
    name: lz-mvp-actiontrail
    enabled: true
    trail_region: cn-hangzhou
    event_rw: All
    oss_bucket_ref: lz-mvp-actiontrail-logs
    oss_key_prefix: actiontrail

View File

@ -0,0 +1,5 @@
# Alibaba Cloud provider defaults: region plus the default tag set.
alicloud:
  region: cn-hangzhou
  default_tags:
    owner: your-github-handle
    project: landingzone-mvp

View File

@ -0,0 +1,27 @@
# Cloud Config baseline: which resource types are recorded, where evaluation
# results are delivered, and the compliance rules that are applied.
config_service:
  recorder:
    name: lz-config-recorder
    resource_types:
      - ACS::ECS::Instance
      - ACS::OSS::Bucket
      - ACS::VPC::VSwitch
  delivery_channel:
    name: lz-config-delivery
    display_name: LandingZoneBaseline
    description: Deliver baseline compliance evaluations to OSS
    type: OSS
    # ${AliUid} is left as a placeholder in both ARNs.
    target_arn: acs:oss:cn-hangzhou:${AliUid}:lz-mvp-actiontrail-logs
    assume_role_arn: acs:ram::${AliUid}:role/aliyunconfigdefaultrole
    status: 1
  rules:
    # Require an `env` tag on ECS instances.
    - name: lz-required-env-tag
      description: Ensure env tag exists on core resources
      source_owner: ALIYUN
      source_identifier: ecs-instance-required-tag
      risk_level: 2
      trigger_types: ConfigurationItemChangeNotification
      resource_types_scopes:
        - ACS::ECS::Instance
      input_parameters:
        tagKey: env
      maximum_execution_frequency: TwentyFour_Hours

View File

@ -0,0 +1,28 @@
# RAM identities for the landing zone: an automation user, a read-only
# auditor, and an operations group containing the automation user.
identity:
  users:
    - name: ops-automation
      display_name: Landing Zone Automation
      comments: Dedicated RAM user for IaC pipelines
      policies:
        - type: System
          name: AliyunOSSFullAccess
        - type: System
          name: AliyunVPCFullAccess
        - type: System
          name: AliyunConfigFullAccess
    - name: audit-viewer
      display_name: Landing Zone Auditor
      comments: Read-only access for monitoring
      policies:
        - type: System
          name: ReadOnlyAccess
  groups:
    - name: ops-admins
      comments: Baseline operations team
      policies:
        - type: System
          name: AliyunConfigFullAccess
        - type: System
          name: AliyunVPCFullAccess
      users:
        - ops-automation

View File

@ -0,0 +1,18 @@
# Landing zone network: one shared VPC with a prod and a test vSwitch, both
# in zone cn-hangzhou-h.
network:
  vpcs:
    - name: lz-main-vpc
      description: Landing zone baseline VPC
      cidr_block: 10.10.0.0/16
      tags:
        env: shared
      vswitches:
        - name: lz-prod-subnet
          zone_id: cn-hangzhou-h
          cidr_block: 10.10.1.0/24
          tags:
            env: prod
        - name: lz-test-subnet
          zone_id: cn-hangzhou-h
          cidr_block: 10.10.2.0/24
          tags:
            env: test

View File

@ -0,0 +1,18 @@
# Baseline security group for lz-main-vpc. It permits all egress and one
# inbound rule: SSH (22) from anywhere, kept as a temporary break-glass path.
security:
  groups:
    - name: lz-base-sg
      vpc: lz-main-vpc
      # The description mentions the inbound SSH rule so it matches the rules
      # actually defined below.
      description: Baseline security group allowing outbound traffic and temporary break-glass SSH access
      tags:
        env: shared
      ingress:
        # NOTE(review): SSH is open to 0.0.0.0/0; restrict the CIDR or remove
        # the rule once break-glass access is no longer required.
        - protocol: tcp
          port_range: "22/22"
          cidr_ip: 0.0.0.0/0
          description: Temporary SSH access for break-glass
          policy: accept
      egress:
        - protocol: all
          port_range: "-1/-1"
          cidr_ip: 0.0.0.0/0
          policy: accept

View File

@ -0,0 +1,17 @@
# OSS bucket for ActionTrail logs: versioned, moved to IA after 180 days and
# to Archive after 365 days.
storage:
  oss_buckets:
    - name: lz-mvp-actiontrail-logs
      bucket: lz-mvp-actiontrail-logs
      storage_class: Standard
      tags:
        env: prod
      versioning:
        status: Enabled
      lifecycle_rules:
        - id: archive-audit-logs
          enabled: true
          transitions:
            - days: 180
              storage_class: IA
            - days: 365
              storage_class: Archive

View File

@ -0,0 +1,6 @@
# AWS provider defaults: region plus the default tag set.
aws:
  region: us-east-1
  default_tags:
    environment: shared
    owner: your-github-handle
    project: landingzone-global

View File

@ -0,0 +1,27 @@
# IAM identities for the landing zone: an automation user and an auditor,
# plus an operations group and an audit group, all under /landingzone/.
identity:
  users:
    - name: lz-automation
      path: /landingzone/
      policies:
        - arn: arn:aws:iam::aws:policy/AdministratorAccess
      tags:
        role: automation
    - name: lz-auditor
      path: /landingzone/
      policies:
        - arn: arn:aws:iam::aws:policy/SecurityAudit
      tags:
        role: audit
  groups:
    - name: lz-operations
      path: /landingzone/
      users:
        - lz-automation
      policies:
        - arn: arn:aws:iam::aws:policy/PowerUserAccess
    - name: lz-audit
      path: /landingzone/
      users:
        - lz-auditor
      policies:
        - arn: arn:aws:iam::aws:policy/SecurityAudit

View File

@ -0,0 +1,29 @@
# Landing zone VPC with one public and two private subnets; public subnets
# get a default route through the internet gateway.
network:
  vpcs:
    - name: lz-global-vpc
      cidr_block: 10.20.0.0/16
      tags:
        env: shared
      subnets:
        - name: lz-public-a
          type: public
          availability_zone: us-east-1a
          cidr_block: 10.20.1.0/24
          tags:
            tier: ingress
        - name: lz-private-a
          type: private
          availability_zone: us-east-1a
          cidr_block: 10.20.11.0/24
          tags:
            tier: application
        - name: lz-private-b
          type: private
          availability_zone: us-east-1b
          cidr_block: 10.20.21.0/24
          tags:
            tier: application
      routes:
        - subnet_type: public
          gateway: internet_gateway
          destination_cidr_block: 0.0.0.0/0

View File

@ -0,0 +1,24 @@
# Baseline security group for lz-global-vpc: SSH and HTTPS inbound from
# anywhere, all traffic outbound.
security:
  groups:
    - name: lz-base-sg
      vpc: lz-global-vpc
      description: Baseline security group allowing outbound traffic and limited inbound access
      tags:
        tier: baseline
      ingress:
        # NOTE(review): this rule uses from_port/to_port while the others use
        # port_range; confirm the consumer accepts both forms.
        - description: Temporary SSH access for break-glass
          protocol: tcp
          from_port: 22
          to_port: 22
          cidr_blocks:
            - 0.0.0.0/0
        - description: HTTPS access for shared services
          protocol: tcp
          port_range: '443/443'
          cidr_blocks:
            - 0.0.0.0/0
      egress:
        - protocol: all
          port_range: '-1/-1'
          cidr_blocks:
            - 0.0.0.0/0

View File

@ -0,0 +1,18 @@
# Private, versioned log bucket with AES256 encryption; objects move to
# GLACIER after 90 days and expire after 365 days.
storage:
  buckets:
    - name: lz-global-logs
      acl: private
      block_public_access: true
      force_destroy: false
      versioning: true
      server_side_encryption:
        sse_algorithm: AES256
      lifecycle_rules:
        - id: expire-old-logs
          enabled: true
          transitions:
            - days: 90
              storage_class: GLACIER
          expiration_days: 365
      tags:
        purpose: audit-logs

17
config/bootstrap.yaml Normal file
View File

@ -0,0 +1,17 @@
# Bootstrap settings for the IaC state backend and the deploy identity of
# the xzerolab AWS account.
region: ap-northeast-1
environment: bootstrap
account_name: xzerolab
# Quoted so the account ID stays a string: it is an identifier, not a number,
# and an unquoted value would be parsed as an integer.
account_id: "950604983695"
# Remote state storage and its lock table.
state:
  bucket_name: aws-cloud-iac-state
  dynamodb_table_name: aws-cloud-iac-state-dynamodb-lock
# Role and user names used by the deployment pipeline.
iam:
  role_name: IacDeployRole
  terraform_user_name: github-ci-runner
tags:
  Owner: Platform
  Project: CloudNeutral

6
config/sit/base.yaml Normal file
View File

@ -0,0 +1,6 @@
aws:
profile: default
region: ap-northeast-1
key_pairs:
- name: dev_key
key_file: ~/.ssh/id_rsa.pub

37
config/sit/firewall.yaml Normal file
View File

@ -0,0 +1,37 @@
# Inbound firewall rules for the SIT VPCs: one web rule plus a default
# inbound rule per VPC.
firewall_rules:
  # HTTP/HTTPS from anywhere into dev-vpc-1.
  - name: allow-web-inbound
    enabled: true
    vpc_name: dev-vpc-1
    source_ranges: ["0.0.0.0/0"]
    egress_ranges: ["10.0.0.0/16"]
    allow:
      - protocol: tcp
        ports: ["80", "443"]
  # The descriptions list every protocol/port the rule actually opens,
  # including UDP 51820.
  # NOTE(review): UDP 51820 is presumed to be WireGuard — confirm.
  - name: dev-vpc-1-default-inbound
    enabled: true
    vpc_name: dev-vpc-1
    description: Allow ICMP, SSH, VXLAN (UDP 4789), and WireGuard (UDP 51820) from all sources
    source_ranges: ["0.0.0.0/0"]
    allow:
      - protocol: icmp
      - protocol: tcp
        ports: ["22"]
      - protocol: udp
        ports: ["4789"]
      - protocol: udp
        ports: ["51820"]
  - name: dev-vpc-2-default-inbound
    enabled: true
    vpc_name: dev-vpc-2
    description: Allow ICMP, SSH, VXLAN (UDP 4789), and WireGuard (UDP 51820) from all sources
    source_ranges: ["0.0.0.0/0"]
    allow:
      - protocol: icmp
      - protocol: tcp
        ports: ["22"]
      - protocol: udp
        ports: ["4789"]
      - protocol: udp
        ports: ["51820"]

48
config/sit/instances.yaml Normal file
View File

@ -0,0 +1,48 @@
# Compute instances for the SIT environment. Every instance shares the same
# AMI keyword, disk size, spot lifecycle and tag set; they differ in name,
# instance type, subnet and security group.
instances:
  - name: master-1
    # Accepts an AMI ID (ami-xxx) or a keyword such as ubuntu-22.04.
    ami: ubuntu-24.04
    type: t3a.xlarge
    disk_size_gb: 20
    subnet: dev-vpc-1-public-subnet-1
    sg_names: ['dev-vpc-1-default-inbound']
    # Explicitly states whether a public IP is required.
    associate_public_ip: true
    # Optional: ondemand (default) or spot.
    lifecycle: spot
    # Optional: lifetime marker only; it does not trigger automatic teardown.
    ttl: 1h
    # Optional: environment tag such as dev/sit/prod.
    env: sit
    # Optional: resource owner tag.
    owner: devops
  - name: slave-1
    ami: ubuntu-24.04
    type: t3.small
    disk_size_gb: 20
    subnet: dev-vpc-2-public-subnet-1
    sg_names: ['dev-vpc-2-default-inbound']
    associate_public_ip: true
    lifecycle: spot
    ttl: 1h
    env: sit
    owner: devops
  - name: agent-1
    ami: ubuntu-24.04
    type: t3.micro
    disk_size_gb: 20
    subnet: dev-vpc-1-public-subnet-1
    sg_names: ['dev-vpc-1-default-inbound']
    associate_public_ip: true
    lifecycle: spot
    ttl: 1h
    env: sit
    owner: devops
  - name: agent-2
    ami: ubuntu-24.04
    type: t3.micro
    disk_size_gb: 20
    subnet: dev-vpc-2-public-subnet-1
    sg_names: ['dev-vpc-2-default-inbound']
    associate_public_ip: true
    lifecycle: spot
    ttl: 1h
    env: sit
    owner: devops

44
config/sit/vpc.yaml Normal file
View File

@ -0,0 +1,44 @@
# VPC layout for the SIT environment: two VPCs, each with one public and one
# private subnet and a default route for the public subnets.
# NOTE(review): routes and peering are nested per VPC here; confirm against the
# consumer's schema.
vpcs:
  - name: dev-vpc-1
    cidr_block: 10.1.0.0/16
    subnets:
      - name: dev-vpc-1-public-subnet-1
        cidr_block: 10.1.1.0/24
        availability_zone: ap-northeast-1a
        type: public
      - name: dev-vpc-1-private-subnet-1
        cidr_block: 10.1.101.0/24
        availability_zone: ap-northeast-1c
        type: private
    routes:
      - name: dev-vpc-1-public-route
        destination_cidr_block: 0.0.0.0/0
        subnet_type: public
        gateway: internet_gateway
    # Peering is switched off; the peer fields are explicit nulls.
    peering:
      enabled: false
      peer_vpc_id: null
      peer_region: null
      auto_accept: false
  - name: dev-vpc-2
    cidr_block: 10.2.0.0/16
    subnets:
      - name: dev-vpc-2-public-subnet-1
        cidr_block: 10.2.1.0/24
        availability_zone: ap-northeast-1a
        type: public
      - name: dev-vpc-2-private-subnet-1
        cidr_block: 10.2.101.0/24
        availability_zone: ap-northeast-1c
        type: private
    routes:
      - name: dev-vpc-2-public-route
        destination_cidr_block: 0.0.0.0/0
        subnet_type: public
        gateway: internet_gateway
    peering:
      enabled: false
      peer_vpc_id: null
      peer_region: null
      auto_accept: false

19
config/sit/vpn-keys.md Normal file
View File

@ -0,0 +1,19 @@
只加密 private_key 字段(public_key 如需加密,方法相同)。

1. 原始 vpn-keys.yaml

   ```yaml
   keys:
     - name: master-1
       private_key: <master_private_key>
       public_key: <master_public_key>
   ```

2. 使用 ansible-vault encrypt_string 加密 private_key(public_key 同理)

   - ansible-vault encrypt_string 'private-key-xxxx' --name 'private_key'
   - ansible-vault encrypt_string 'public_key-xxxx' --name 'public_key'

   示例输出(加密后是 YAML 结构):

   ```yaml
   private_key: !vault |
     $ANSIBLE_VAULT;1.1;AES256
     62326432376162336462343864333933356363373235623262306463326432363737623732613763
     3962613662616565393463343030653733623066626137610a313465323462623261303031323337
   ```

145
config/sit/vpn-keys.yaml Normal file
View File

@ -0,0 +1,145 @@
# Per-node VPN key pairs. Entry names match the hubs/sites in vpn-overlay.yaml.
# Both private_key and public_key are stored as inline Ansible Vault values
# (`!vault`, $ANSIBLE_VAULT;1.1;AES256), so the vault password is needed to
# read them.
keys:
- name: cn-hub
private_key: !vault |
$ANSIBLE_VAULT;1.1;AES256
33643635306332303761356562383035353333373234393132313162613834323963313635326562
3932356235303234356561623762393862666438386565310a376235306238343139386532336162
65623164666665353435653432396530303634666438656566656466643866366139613961363631
6363306631393038320a613163313338313237383837303966356333303737643331616433396430
33316331333766613438356462313130326433363961316162313761616561616466363939613033
3837623938376434656434386135333739613939653133373733
public_key: !vault |
$ANSIBLE_VAULT;1.1;AES256
38336537383061383333643431643261343739323864316235303366623930633366336139386636
6162336232336533636134353863386233303631626363360a376533336664636661373933623230
34333765346661383335663034393561646436333135613838373438396336633061396533613061
3031326364353036630a373862396266653961346334663139626633313362656131663163383563
34376231306239636536313830333962323934343035333263643234363363396164626366353061
3833613132373666303563623863373735396566666239316536
- name: global-hub
private_key: !vault |
$ANSIBLE_VAULT;1.1;AES256
63343838666530633031313536616535313936373634396165376132333661616534663937626632
3530646463663462383130323930356239636438643035380a343433303064383531663332303839
32613733323263623836346266383363336361323036383536313031386435386534646661616463
6631346431316334620a643831313033326261333365623037306565663131373664343930623665
31346564363635323765336465646466663631376538626237386165326464326632323438663038
3937363832363731353834633663646538666232336239353936
public_key: !vault |
$ANSIBLE_VAULT;1.1;AES256
34666430316566393939656436323231623935316331373264383830653934323261656136373666
3630356330396362323763383832376538633163636331650a376339326661363431353532303831
37336134303235633334643036326564313163626433613261333062336238316333363165386263
3666386330343261340a333662636630356635373938623335656462633039353565383133613935
35643661363334313733346430633432353736343463613264393433623135613833376435333661
3462643164356563346166656237613334616130386532393565
- name: deepflow-demo
private_key: !vault |
$ANSIBLE_VAULT;1.1;AES256
36316136663466336564383766626434626338356130626537663163373530326332366335306136
3266383533373032623366396139653063626338646237310a353439346238653832646437313663
62623239623761326436613833313739386662356263353338666461363438613766663962386162
3539343836623936370a313439316335346235306633333333643738333461323963313038313161
62376566626335306335623134346361326364346433626234383162616636326265356364313938
6534613330643764613733333266313365633635663138636633
public_key: !vault |
$ANSIBLE_VAULT;1.1;AES256
32326461646138373831356335656664643737313032656134663138323439313164353766363134
6232303034663064303235303363663661326433313536660a666133616438316436306463303163
64646530633639616266396563383362306235313662373565323963633039653931376431303565
6136396164346563660a643235646232353061323463396539383266333133343532396139373035
39653262653638363930383861353262303030373332313538383362393633663562303566373737
3062336434313031613534393033616330333363613863613464
- name: icp-aliyun
private_key: !vault |
$ANSIBLE_VAULT;1.1;AES256
34383966663239613361363535616332303432393165643433663461633934363535626137326664
6532646433306636393734666164613864636636626630660a636636306435343661366234343661
30326362306537633561636265666232373437353034643462656538653835653831303263306662
3361323333353935350a316539303863646434336136333862626261363031336232666562326434
39303961383563623736383962363330363439313064613632383061313438373330356366323534
6533613662373736373131363463663734656261643839383862
public_key: !vault |
$ANSIBLE_VAULT;1.1;AES256
65393861336537646335613534376635343838656233646333386438653766636539333436623665
6562396637666365613562373565383263353534343931350a323563346239666534303162353432
63646562363362396333333738333664376136303066316135633633323466326233613264623366
6166613531623135660a363465636137643337626137386661306237323731353839303734653436
32643065663739303161626261393062613764346662633365336162613134633131383062646133
6437313463376164386465663365386436633466363633383366
- name: tky-proxy
private_key: !vault |
$ANSIBLE_VAULT;1.1;AES256
39303737303631303963613131373734373338663232366534303832646664326365353730313665
3664643734336466613839663239613433373837633064300a373634343034323739646565326464
32343237303731656666323332656138643533323338626631626630316435623564616330333237
6339626537376163360a376663653533663332353163303363386564373233666230323735343863
66363730653134343037363739353464663834373134656639303932646635336664303537376665
3961393930616464343632363039333465633364626433363761
public_key: !vault |
$ANSIBLE_VAULT;1.1;AES256
61366364303934343039356565643939613032373932356264393739343832366231653335373132
3732346666336566396133343836393961336533323530310a636131316266653132346663306461
39613036396330376235623765313166303163393264373436316236366234666532343866383235
3230366539313162310a323130663530653339623366613336616433666136336463306237326461
36363536376230313135336463386566393964613238353134663432353762626166303938323266
3963383862363236643361346165373538323332363764633131
- name: us-proxy
private_key: !vault |
$ANSIBLE_VAULT;1.1;AES256
34373039646561366365363831636438633462633536343834356263396331333864396161363630
6631373964666239663064633936333135653663306464320a316463363362313336373437383937
38663665323531346536363030333637663631623765373466386664623332616432613334623933
6362353736396662300a343430633865363637313732383065613836363231623862616535383033
38333861393761633437316435306263356131353133376532323661366465616130616332366436
3430663134636430613139333238343265613764616234383362
public_key: !vault |
$ANSIBLE_VAULT;1.1;AES256
62316266633037313333333966646331613830633733616438666533303735613763376632336562
3864333538333535323862333230663664306561386534340a343038356565643530323061323034
35353663643465616633346363626430623435396263646339373137303830303031326462653966
3266313038373466300a643833373063363862643533393838613266666363326363383034653366
34633063616361653762323130363832353132613531326131323336353339616166396464303337
6338353132333964376163333537363337316438313266623933
- name: ca-proxy
private_key: !vault |
$ANSIBLE_VAULT;1.1;AES256
39303965663333646238656661376238653732363366653264353234396635313464316563613761
3937323936393363373265653864313034343462626633360a303036643838366465623965623365
35646332626232356661343966623637613037666336376562323864306630396536646230623664
6431636530326362320a383965356336313563336261633030666534613936653037393737356637
30323935393662333533373561303661366437626264383837376562323466323531616165643233
3233643237303764346130323139613537666132646532643864
public_key: !vault |
$ANSIBLE_VAULT;1.1;AES256
61343962366534343435356236663132656636313634393563663164323630646363666264626434
3439316233626333656362623332613433313130396430610a633839393561326438636533666162
63663330313934353462663334643365323766376337363835633439653064386237373531323637
6338333364366239350a313636636438653736336563383665366661343066373761333431343933
36303062643639613632383565383534306438363461336634343662646435666231343565616333
6239326436633462346466393862336332383665313134393738
- name: icp-huawei
private_key: !vault |
$ANSIBLE_VAULT;1.1;AES256
34356563313165386632656365393865356631663936656337316136343437363538393463363639
3562343736663335643230626335346265336365613835370a373361633064356264623932393232
63386433643761373634333232393136316333353165336463323736366363313662333863656462
3136323033626666340a623730346234396664343863656335303263376562613230373363343938
36633838303966303434336165393838346531383362316161366431393765373765396137316466
3866643163393061613732623938613035396536333837353363
public_key: !vault |
$ANSIBLE_VAULT;1.1;AES256
63383631656563313335646566356237333737653232656439336230633037346566626663653333
6533663536666464616537376236383734313231393762640a643962666334326261386462653233
39386632343965346161623761393034313532633236613430663261366530363638653430383864
3535323031663634320a366134323832323034373430383264353066333666323932663230336333
65643263363538653033326236623434366631366339313964646263316536643237643535313663
3062623634613961636532636438393830613132656266306539

155
config/sit/vpn-overlay.yaml Normal file
View File

@ -0,0 +1,155 @@
# Base network parameters.
# NOTE(review): global-hub and the *-proxy sites below use 172.31.0.x
# addresses, which fall outside wg_network — confirm.
wg_network: 172.30.0.0/16
bridge_network: 10.253.0.0/16
vxlan_id: 100
hub_port: 51820
# Global feature switches.
features:
  enable_vless: true  # relay WireGuard traffic through VLESS
  enable_multi_hub: true  # multi-hub architecture (false = single-hub star)
  # NOTE(review): "sits" looks like a typo for "sites"; the key is left as-is
  # because consumers read it by this name.
  enable_vxlan_between_sits: true  # VXLAN bridging (sites attach to a hub)
  enable_vxlan_between_hubs: true  # VXLAN mesh between hubs
  # If true, use WireGuard point-to-point only and ignore gretap/vxlan.
  only_wireguard: false
# WireGuard hub node configuration.
# NOTE(review): the xray uuid is committed in plain text, unlike the
# vault-encrypted keys in vpn-keys.yaml — confirm it is not sensitive.
hubs:
  - name: cn-hub
    interface: eth0
    public_ip: 1.15.155.245
    pod_cidr: 10.42.0.0/16
    wireguard_cidr: 172.30.0.0/16
    wg_ip: 172.30.0.1
    br_ip: 10.253.253.1
    local_ip: 172.30.0.1
    # NOTE(review): global-hub's wg_ip/local_ip is 172.31.0.1, not .10 — confirm.
    remote_ip: 172.31.0.10
    xray:
      uuid: "18d270a9-533d-4b13-b3f1-e7f55540a9b2"
      relay_address: "global-proxy.onwalk.net"
      relay_port: '51820'
      remote_domain: "global-proxy.onwalk.net"
      cert_path: "/etc/ssl/onwalk.net.pem"
      key_path: "/etc/ssl/onwalk.net.key"
    # The last entry was a second "agent-1"; changed to agent-2 to match the
    # instance list in instances.yaml (master-1, slave-1, agent-1, agent-2).
    wireguard_peer:
      - master-1
      - slave-1
      - agent-1
      - agent-2
  - name: global-hub
    interface: ens5
    # NOTE(review): identical to cn-hub's public_ip — looks like a copy-paste
    # leftover; confirm the real address.
    public_ip: 1.15.155.245
    wg_ip: 172.31.0.1
    br_ip: 10.253.253.2
    local_ip: 172.31.0.1
    remote_ip: 172.30.0.1
    xray:
      uuid: "18d270a9-533d-4b13-b3f1-e7f55540a9b2"
      cert_path: "/etc/ssl/onwalk.net.pem"
      key_path: "/etc/ssl/onwalk.net.key"
      relay_address: "cn-proxy.onwalk.net"
      relay_port: '51820'
      remote_domain: "cn-proxy.onwalk.net"
    # Same duplicate "agent-1" correction as for cn-hub.
    wireguard_peer:
      - master-1
      - slave-1
      - agent-1
      - agent-2
# Site (spoke) nodes. Each site names the hub it peers with in wireguard_peer.
# NOTE(review): wireguard_peer is a list on the *-proxy sites but a plain
# string on deepflow-demo / icp-aliyun / icp-huawei; values are kept as found —
# confirm which shape the consumer expects and make them uniform.
sites:
  - name: tky-proxy
    interface: ens5
    public_ip: 52.196.108.28
    wg_ip: 172.31.0.2
    br_ip: 10.253.254.2
    local_ip: 172.31.0.2
    remote_ip: 172.31.0.1
    wireguard_peer:
      - global-hub
    allowed_ips: "172.30.0.0/16,172.31.0.0/16"
    xray:
      uuid: "18d270a9-533d-4b13-b3f1-e7f55540a9b2"
      cert_path: "/etc/ssl/onwalk.net.pem"
      key_path: "/etc/ssl/onwalk.net.key"
      relay_address: "global-proxy.onwalk.net"
      relay_port: '51820'
      remote_domain: "global-proxy.onwalk.net"
  - name: us-proxy
    interface: enX0
    public_ip: 54.183.32.0
    wg_ip: 172.31.0.3
    br_ip: 10.253.254.3
    local_ip: 172.31.0.3
    remote_ip: 172.31.0.1
    wireguard_peer:
      - global-hub
    allowed_ips: "172.30.0.0/16,172.31.0.0/16"
    xray:
      uuid: "18d270a9-533d-4b13-b3f1-e7f55540a9b2"
      remote_domain: "global-proxy.onwalk.net"
      cert_path: "/etc/ssl/onwalk.net.pem"
      key_path: "/etc/ssl/onwalk.net.key"
  - name: ca-proxy
    interface: ens5
    # NOTE(review): no public_ip is set for this site, unlike its siblings —
    # confirm whether that is intentional.
    wg_ip: 172.31.0.4
    br_ip: 10.253.254.4
    local_ip: 172.31.0.4
    remote_ip: 172.31.0.1
    wireguard_peer:
      - global-hub
    allowed_ips: "172.30.0.0/16,172.31.0.0/16"
    xray:
      uuid: "18d270a9-533d-4b13-b3f1-e7f55540a9b2"
      remote_domain: "global-proxy.onwalk.net"
      cert_path: "/etc/ssl/onwalk.net.pem"
      key_path: "/etc/ssl/onwalk.net.key"
  - name: deepflow-demo
    interface: wlp0s20f3
    # NOTE(review): public_ip equals the WireGuard address — confirm.
    public_ip: 172.30.0.10
    wg_ip: 172.30.0.10
    # NOTE(review): 10.253.253.2 is also global-hub's br_ip; the sibling sites
    # pair wg .11/.12 with br .11/.12, so .10 may have been intended — confirm.
    br_ip: 10.253.253.2
    local_ip: 172.30.0.10
    remote_ip: 172.30.0.1
    wireguard_peer: cn-hub
    allowed_ips: "172.30.0.0/16"
  - name: icp-aliyun
    interface: eth0
    public_ip: 47.120.61.35
    wg_ip: 172.30.0.11
    pod_cidr: 10.42.0.0/16
    wireguard_cidr: 172.30.0.0/16
    br_ip: 10.253.253.11
    local_ip: 172.30.0.11
    remote_ip: 172.30.0.1
    wireguard_peer: cn-hub
    allowed_ips: "172.30.0.0/16"
    xray:
      uuid: "18d270a9-533d-4b13-b3f1-e7f55540a9b2"
      cert_path: "/etc/ssl/onwalk.net.pem"
      key_path: "/etc/ssl/onwalk.net.key"
      relay_address: "cn-proxy.onwalk.net"
      relay_port: '51820'
      remote_domain: "cn-proxy.onwalk.net"
  - name: icp-huawei
    interface: eth0
    public_ip: 139.9.139.22
    pod_cidr: 10.42.0.0/16
    wireguard_cidr: 172.30.0.0/16
    wg_ip: 172.30.0.12
    br_ip: 10.253.253.12
    local_ip: 172.30.0.12
    remote_ip: 172.30.0.1
    wireguard_peer: cn-hub
    allowed_ips: "172.30.0.0/16"
    xray:
      uuid: "18d270a9-533d-4b13-b3f1-e7f55540a9b2"
      cert_path: "/etc/ssl/onwalk.net.pem"
      key_path: "/etc/ssl/onwalk.net.key"
      relay_address: "cn-proxy.onwalk.net"
      relay_port: '51820'
      remote_domain: "cn-proxy.onwalk.net"

5
config/vultr/base.yaml Normal file
View File

@ -0,0 +1,5 @@
# Vultr baseline settings: the default region and the tags applied by default.
vultr:
  region: ewr
  default_tags:
    environment: baseline
    project: modern-container-app

16
config/vultr/compute.yaml Normal file
View File

@ -0,0 +1,16 @@
# Vultr compute instances. firewall_group and vpcs refer to the baseline-fw
# firewall group and baseline-vpc VPC defined in the sibling config files.
compute:
  instances:
    - name: baseline-bastion
      plan: vc2-1c-1gb
      region: ewr
      os_id: 1743
      hostname: baseline-bastion
      label: baseline-bastion
      enable_ipv6: false
      backups: disabled
      firewall_group: baseline-fw
      vpcs:
        - baseline-vpc
      tags:
        - bastion
        - baseline

View File

@ -0,0 +1,7 @@
# Vultr VPC definitions. The IPv4 range is given as a base address plus a
# separate prefix length (10.50.0.0 with mask 16).
network:
  vpcs:
    - name: baseline-vpc
      description: Baseline landing zone VPC
      region: ewr
      v4_subnet: 10.50.0.0
      v4_subnet_mask: 16

View File

@ -0,0 +1,16 @@
# Vultr firewall groups and their rules.
security:
  firewall_groups:
    - name: baseline-fw
      description: Baseline perimeter firewall rules
      rules:
        # NOTE(review): SSH is open to 0.0.0.0/0 — confirm this is intended
        # for the bastion.
        - name: allow-ssh
          protocol: tcp
          ip_type: v4
          cidr: 0.0.0.0/0
          port: "22"
          notes: Allow SSH for operations
        - name: allow-icmp
          protocol: icmp
          ip_type: v4
          cidr: 0.0.0.0/0
          notes: Allow ICMP diagnostics

View File

@ -0,0 +1,19 @@
# AWS account descriptor for the dev environment.
# account_id is quoted so the 12-digit ID is read as a string (an unquoted
# value is parsed as an integer, which would drop any leading zeros), matching
# how shared_vpc_account is written below.
account_id: "950604983695"
name: dev
environment: dev
region: ap-northeast-1
role_to_assume: "arn:aws:iam::950604983695:role/IacDeployRole"
logging_bucket: org-dev-logs
shared_vpc_account: "950604983695"  # single-account setup, so same as account_id
tags:
  Environment: dev
  Owner: Platform
  CostCenter: "DEV"
  Project: CloudNeutral
# Remote state backend: bucket name and DynamoDB lock table name.
backend:
  bucket: aws-cloud-iac-state
  dynamodb_table: aws-cloud-iac-state-dynamodb-lock

Some files were not shown because too many files have changed in this diff Show More