diff --git a/apps/monitoring/alloy-config.yaml b/apps/monitoring/alloy-config.yaml
new file mode 100644
index 0000000..e95ce78
--- /dev/null
+++ b/apps/monitoring/alloy-config.yaml
@@ -0,0 +1,165 @@
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: alloy-config
+  namespace: monitoring
+data:
+  config.alloy: |
+    // Kubernetes pod discovery (all namespaces); feeds the annotation-based
+    // relabeling below and the pod log collection.
+    discovery.kubernetes "k8s_pods" {
+      role = "pod"
+    }
+
+    // Relabel for Prometheus scrape: keep pods annotated
+    // prometheus.io/scrape=true and apply their path/port annotations.
+    // NOTE(review): no prometheus.scrape in this config consumes
+    // discovery.relabel.prometheus_pods.output, so these targets are unused.
+    discovery.relabel "prometheus_pods" {
+      targets = discovery.kubernetes.k8s_pods.targets
+      rule {
+        source_labels = ["__meta_kubernetes_pod_annotation_prometheus_io_scrape"]
+        action = "keep"
+        regex = "true"
+      }
+      rule {
+        source_labels = ["__meta_kubernetes_pod_annotation_prometheus_io_path"]
+        action = "replace"
+        target_label = "__metrics_path__"
+        regex = "(.+)"
+      }
+      rule {
+        source_labels = ["__address__", "__meta_kubernetes_pod_annotation_prometheus_io_port"]
+        action = "replace"
+        regex = "([^:]+)(?::\\d+)?;(\\d+)"
+        replacement = "$1:$2"
+        target_label = "__address__"
+      }
+      rule {
+        source_labels = ["__meta_kubernetes_namespace"]
+        action = "replace"
+        target_label = "namespace"
+      }
+      rule {
+        source_labels = ["__meta_kubernetes_pod_name"]
+        action = "replace"
+        target_label = "pod"
+      }
+    }
+
+    // Scrape Flux controllers (flux-system namespace, port 8080)
+    discovery.kubernetes "flux_pods" {
+      role = "pod"
+      namespaces {
+        names = ["flux-system"]
+      }
+    }
+
+    discovery.relabel "flux_scrape" {
+      targets = discovery.kubernetes.flux_pods.targets
+      rule {
+        source_labels = ["__meta_kubernetes_pod_container_port_number"]
+        action = "keep"
+        regex = "8080"
+      }
+      rule {
+        source_labels = ["__meta_kubernetes_namespace"]
+        action = "replace"
+        target_label = "namespace"
+      }
+      rule {
+        source_labels = ["__meta_kubernetes_pod_name"]
+        action = "replace"
+        target_label = "pod"
+      }
+    }
+
+    // Discover node-exporter pods individually. The DaemonSet runs one pod per
+    // node, so scraping the ClusterIP Service would reach a single arbitrary
+    // node on each scrape and mix every node's series under one instance.
+    discovery.kubernetes "node_exporter_pods" {
+      role = "pod"
+      namespaces {
+        names = ["monitoring"]
+      }
+      selectors {
+        role  = "pod"
+        label = "app.kubernetes.io/name=prometheus-node-exporter"
+      }
+    }
+
+    discovery.relabel "node_exporter" {
+      targets = discovery.kubernetes.node_exporter_pods.targets
+      rule {
+        source_labels = ["__meta_kubernetes_pod_container_port_number"]
+        action = "keep"
+        regex = "9100"
+      }
+      rule {
+        source_labels = ["__meta_kubernetes_pod_node_name"]
+        action = "replace"
+        target_label = "node"
+      }
+    }
+
+    // Scrape kube-state-metrics
+    prometheus.scrape "kube_state_metrics" {
+      targets = [{
+        __address__ = "kube-state-metrics.monitoring.svc.cluster.local:8080",
+      }]
+      forward_to = [prometheus.remote_write.selendis.receiver]
+      scrape_interval = "30s"
+      scrape_timeout = "10s"
+    }
+
+    // Scrape Flux controllers
+    prometheus.scrape "flux" {
+      targets = discovery.relabel.flux_scrape.output
+      forward_to = [prometheus.remote_write.selendis.receiver]
+      scrape_interval = "30s"
+      scrape_timeout = "10s"
+      job_name = "flux"
+    }
+
+    // Scrape node-exporter DaemonSet (one target per node)
+    prometheus.scrape "node_exporter" {
+      targets = discovery.relabel.node_exporter.output
+      forward_to = [prometheus.remote_write.selendis.receiver]
+      scrape_interval = "30s"
+      scrape_timeout = "10s"
+    }
+
+    // Scrape Synapse metrics
+    prometheus.scrape "synapse" {
+      targets = [{
+        __address__ = "matrix-stack-synapse-main.matrix.svc.cluster.local:9000",
+      }]
+      forward_to = [prometheus.remote_write.selendis.receiver]
+      scrape_interval = "30s"
+      scrape_timeout = "10s"
+    }
+
+    // Kubernetes pod logs to Loki
+    loki.source.kubernetes "k8s_logs" {
+      targets = discovery.kubernetes.k8s_pods.targets
+      forward_to = [loki.write.selendis.receiver]
+    }
+
+    // Remote write to Selendis Prometheus
+    prometheus.remote_write "selendis" {
+      endpoint {
+        url = "http://10.0.0.3:9090/api/v1/write"
+        write_relabel_config {
+          source_labels = ["__name__"]
+          regex = "go_.*|process_.*"
+          action = "drop"
+        }
+      }
+    }
+
+    // Remote write logs to Selendis Loki
+    loki.write "selendis" {
+      endpoint {
+        url = "http://10.0.0.3:3100/loki/api/v1/push"
+      }
+    }
diff --git a/apps/monitoring/alloy.yaml b/apps/monitoring/alloy.yaml
new file mode 100644
index 0000000..3a2e11b
--- /dev/null
+++ b/apps/monitoring/alloy.yaml
@@ -0,0 +1,41 @@
+apiVersion: helm.toolkit.fluxcd.io/v2
+kind: HelmRelease
+metadata:
+  name: alloy
+  namespace: monitoring
+spec:
+  interval: 1h
+  chart:
+    spec:
+      chart: alloy
+      version: "0.x"
+      sourceRef:
+        kind: HelmRepository
+        name: grafana
+        namespace: flux-system
+  values:
+    alloy:
+      configMap:
+        # Use the ConfigMap from alloy-config.yaml; the chart must not
+        # render its own ConfigMap under the same name.
+        create: false
+        name: alloy-config
+        key: config.alloy
+      resources:
+        limits:
+          memory: 512Mi
+        requests:
+          cpu: 100m
+          memory: 256Mi
+    controller:
+      # A single Deployment replica: the config scrapes fixed targets and
+      # tails logs for every pod, so one instance per node would duplicate.
+      type: deployment
+      replicas: 1
+      podAnnotations:
+        prometheus.io/scrape: "false"
+    serviceAccount:
+      create: true
+      name: alloy
+    rbac:
+      create: true
diff --git a/apps/monitoring/helm-repos.yaml b/apps/monitoring/helm-repos.yaml
new file mode 100644
index 0000000..a87c018
--- /dev/null
+++ b/apps/monitoring/helm-repos.yaml
@@ -0,0 +1,18 @@
+apiVersion: source.toolkit.fluxcd.io/v1
+kind: HelmRepository
+metadata:
+  name: prometheus-community
+  namespace: flux-system
+spec:
+  interval: 1h
+  url: https://prometheus-community.github.io/helm-charts
+
+---
+apiVersion: source.toolkit.fluxcd.io/v1
+kind: HelmRepository
+metadata:
+  name: grafana
+  namespace: flux-system
+spec:
+  interval: 1h
+  url: https://grafana.github.io/helm-charts
diff --git a/apps/monitoring/kube-state-metrics.yaml b/apps/monitoring/kube-state-metrics.yaml
new file mode 100644
index 0000000..22d452d
--- /dev/null
+++ b/apps/monitoring/kube-state-metrics.yaml
@@ -0,0 +1,22 @@
+apiVersion: helm.toolkit.fluxcd.io/v2
+kind: HelmRelease
+metadata:
+  name: kube-state-metrics
+  namespace: monitoring
+spec:
+  interval: 1h
+  chart:
+    spec:
+      chart: kube-state-metrics
+      version: "5.x"
+      sourceRef:
+        kind: HelmRepository
+        name: prometheus-community
+        namespace: flux-system
+  values:
+    replicas: 1
+    service:
+      port: 8080
+    prometheus:
+      monitor:
+        enabled: false
diff --git a/apps/monitoring/kustomization.yaml b/apps/monitoring/kustomization.yaml
new file mode 100644
index 0000000..eb57c15
--- /dev/null
+++ b/apps/monitoring/kustomization.yaml
@@ -0,0 +1,9 @@
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+resources:
+  - namespace.yaml
+  - helm-repos.yaml
+  - kube-state-metrics.yaml
+  - node-exporter.yaml
+  - alloy-config.yaml
+  - alloy.yaml
diff --git a/apps/monitoring/namespace.yaml b/apps/monitoring/namespace.yaml
new file mode 100644
index 0000000..d325236
--- /dev/null
+++ b/apps/monitoring/namespace.yaml
@@ -0,0 +1,4 @@
+apiVersion: v1
+kind: Namespace
+metadata:
+  name: monitoring
diff --git a/apps/monitoring/node-exporter.yaml b/apps/monitoring/node-exporter.yaml
new file mode 100644
index 0000000..e5df563
--- /dev/null
+++ b/apps/monitoring/node-exporter.yaml
@@ -0,0 +1,29 @@
+apiVersion: helm.toolkit.fluxcd.io/v2
+kind: HelmRelease
+metadata:
+  name: prometheus-node-exporter
+  namespace: monitoring
+spec:
+  interval: 1h
+  chart:
+    spec:
+      chart: prometheus-node-exporter
+      version: "4.x"
+      sourceRef:
+        kind: HelmRepository
+        name: prometheus-community
+        namespace: flux-system
+  values:
+    hostNetwork: true
+    hostPID: true
+    hostRootFsMount:
+      enabled: true
+    service:
+      port: 9100
+      targetPort: 9100
+    prometheus:
+      monitor:
+        enabled: false
+    tolerations:
+      - effect: NoSchedule
+        operator: Exists
diff --git a/clusters/matrix/flux-system/kustomization.yaml b/clusters/matrix/flux-system/kustomization.yaml
index 948d3a4..61edcce 100644
--- a/clusters/matrix/flux-system/kustomization.yaml
+++ b/clusters/matrix/flux-system/kustomization.yaml
@@ -4,4 +4,5 @@ resources:
   - gotk-components.yaml
   - gotk-sync.yaml
   - infra-sync.yaml
+  - monitoring-sync.yaml
   - production-sync.yaml
diff --git a/clusters/matrix/flux-system/monitoring-sync.yaml b/clusters/matrix/flux-system/monitoring-sync.yaml
new file mode 100644
index 0000000..d9f921b
--- /dev/null
+++ b/clusters/matrix/flux-system/monitoring-sync.yaml
@@ -0,0 +1,14 @@
+apiVersion: kustomize.toolkit.fluxcd.io/v1
+kind: Kustomization
+metadata:
+  name: monitoring-apps
+  namespace: flux-system
+spec:
+  interval: 10m
+  path: ./apps/monitoring
+  prune: true
+  sourceRef:
+    kind: GitRepository
+    name: flux-system
+  dependsOn:
+    - name: infra-apps