diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index 01a7aee..f1b6f82 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -26,7 +26,10 @@ "redhat.vscode-yaml", "redhat.vscode-commons", "monokai.theme-monokai-pro-vscode", - "eamodio.gitlens" + "eamodio.gitlens", + "gruntfuggly.todo-tree", + "ms-vscode.makefile-tools", + "GitHub.copilot" ], "settings": { "[yaml]": { @@ -38,7 +41,22 @@ "https://json.schemastore.org/kustomization.json": "**/kustomization.yaml", "https://json.schemastore.org/helmrelease.json": "**/*helmrelease*.yaml" }, - "editor.theme": "Monokai Pro" + "editor.theme": "Monokai Pro", + "todo-tree.general.showActivityBarBadge": true, + "todo-tree.general.tags": [ + "TODO", + "FIXME", + "BUG", + "HACK", + "NOTE", + "XXX", + "DONE" + ], + "todo-tree.tree.showScanModeButton": true, + "todo-tree.filtering.includeGlobs": [ + "**/docs/TASKS.md", + "**/docs/deployment-guides/*.md" + ] } } }, diff --git a/docs/TASKS.md b/docs/TASKS.md new file mode 100644 index 0000000..a7e8416 --- /dev/null +++ b/docs/TASKS.md @@ -0,0 +1,421 @@ +# aXion1337.Chat – Task List & Meilensteine + +**Statusübersicht**: [✅ 6 Abgeschlossen] [🔄 1 In Progress] [📋 15+ Pending] [🔒 10 Security] + +--- + +## ✅ Abgeschlossene Aufgaben (Chronologisch) + +### Phase 1: Basis-Setup +- [x] **K3S Cluster aufsetzen** – Single-Node auf Hetzner Cloud (49.13.132.245) + - Commit: `initial-setup` (vor Projekt) + - Status: ✅ Läuft + +- [x] **Flux CD Installation** + - SOPS + age Encryption + - GitOps Repository konfigurieren + - Commit: `setup-flux` (vor Projekt) + - Status: ✅ Läuft + +- [x] **Element Server Suite v26.4.0 Deployment** + - Synapse Homeserver (`matrix.axion1337.chat`) + - Matrix Authentication Service (`account.axion1337.chat`) + - Element Web (`axion1337.chat`) + - Element Admin (`admin.axion1337.chat`) + - MatrixRTC/Element Call (`mrtc.axion1337.chat`) + - Commit: `deploy-ess-matrix-stack` + - Status: ✅ Running + +### Phase 
2: Core Features +- [x] **7 Custom Element Web Themes** + - aXion1337 Dark, Deep Purple, Discord Dark, Electric Blue, Everforest, Gruvbox, Wal + - Alphabetisch sortiert + - Commit: `add-custom-element-themes` + - Status: ✅ Deployed + +- [x] **Element Desktop Setup Scripts** (Windows/macOS/Linux) + - Auto-Download + Install + Config + - Hosted auf `axion1337.chat/docs/setup/` + - Commits: `add-element-desktop-setup-scripts`, `fix-element-setup-script-hosting` + - Status: ✅ Deployed + +- [x] **Room Policies** + - Message Retention (1d–1y lifecycle) + - Room Publication Rules (allow all) + - Auto-Join Rooms für Onboarding + - Commit: `add-synapse-retention-publication-autojoin` + - Status: ✅ Deployed + +### Phase 3: WebRTC & Medienübertragung +- [x] **TURN Server (coturn) für Video-Calls** + - Domain: `turn.axion1337.chat` + - HMAC-Auth mit Shared Secret + - Ports: 3478/udp, 3478/tcp, 5349/tcp, 49152-65535/udp + - Commit: `implement-turn-server-coturn-for-webrtc-video-calls` + - Status: ✅ Deployed + - Manual: DNS A-Record + Firewall-Ports öffnen (noch erforderlich) + +### Phase 4: Monitoring & Observability +- [x] **Monitoring Stack Integration** + - Alloy (Grafana Agent) als Collector + - Remote Write zu Selendis (10.0.0.3:9090 Prometheus, :3100 Loki) + - kube-state-metrics, node-exporter DaemonSet + - Commits: `integrate-monitoring-alloy-prometheus-loki`, `fix-prometheus-remote-write-docker` + - Status: ✅ Deployed + +### Phase 5: Identity Provider (Authentik) +- [x] **Authentik Stage 1 Deployment** + - HelmRelease v2026.x in `authentik` namespace + - Embedded PostgreSQL + Alloy-compatible + - Cert-Manager für TLS + - Commit: `deploy-authentik-as-identity-provider-for-matrix-stage-1` + - Status: ✅ Deployed + - Manual: Admin-Passwort setzen + OIDC Provider erstellen (erforderlich) + +🔄 **[IN PROGRESS] Authentik Stage 2 – MAS Integration** +- [ ] **MAS Upstream OIDC Konfiguration** + - Client ID/Secret aus Authentik Admin UI kopieren + - `upstream_oauth2_config` in 
`mas-secret.yaml` einfügen + - `passwords: enabled: false` + - Commit: (pending) + - Status: ⏳ Wartet auf manuelle Authentik-Konfiguration + +### Phase 6: Dokumentation +- [x] **Deployment Guides erstellen** + - 5 Markdown-Dateien in `docs/deployment-guides/` + - Chronologisch geordnet + - Troubleshooting + Best Practices + - Commit: `add-comprehensive-deployment-configuration-documentation` + - Status: ✅ Deployed + +--- + +## 🔄 In Progress / Blocked + +### Authentik Stage 2 – MAS Integration (⏳ Depends on Manual Config) +**Beschreibung**: Authentik OIDC Provider muss manuell im Authentik Admin UI konfiguriert werden, bevor das Stage-2-Deployment möglich ist. + +**Schritte**: +1. ✅ Authentik Stage 1 Deployment (done) +2. ⏳ Authentik Admin UI: OIDC Provider erstellen (MANUAL - user action) +3. ⏳ Authentik Admin UI: Application mit Slug `matrix` erstellen (MANUAL - user action) +4. ⏳ Authentik Admin UI: Enrollment Flow mit Invitation Stage (MANUAL - user action) +5. ⏳ Authentik Admin UI: Client ID + Secret kopieren (MANUAL - user action) +6. 📋 MAS `upstream_oauth2_config` mit Client Credentials aktualisieren +7. 📋 `passwords: enabled: false` setzen (lokalen Passwort-Login deaktivieren) +8. 📋 Commit + Push + +**Blocker**: Manuelle Authentik-Konfiguration (wartet auf Benutzer) + +--- + +## 📋 Backlog (Weitere Aufgaben) + +### Authentik Completion +- [ ] **Finish Authentik Stage 2 – MAS Integration** + - Prerequisites: Authentik OIDC Provider vollständig konfiguriert + - Task: Update `mas-secret.yaml`, disable local password login (`passwords: enabled: false`) + - Commit: `enable-authentik-oidc-integration-in-mas` + - Est. Effort: 30 min (manual + scripted) + +- [ ] **Test End-to-End Login Flow** + - Element Web login → MAS → Authentik → Matrix User Creation + - Create test users via Authentik + - Verify password reset flow + - Commit: (implicit in Stage 2) + - Est. 
Effort: 20 min + +- [ ] **Create Invite Links für neue User** + - Authentik Admin UI → Invitations → Create + - Set expiry dates (7d) + use limits + - Document procedure + - Est. Effort: 15 min + +### Element Call Enhancement +- [ ] **Element Call Fork für Custom Constraints** + - Repository: Fork `element-hq/element-call` + - Feature: Video/Audio constraints parameter im config + - Include: Bandwidth limiting, resolution limits, frame rate control + - Integration mit Synapse well-known + - Est. Effort: 2–3 days (fork + feature + test) + - Priority: **HIGH** (user feature) + +### Database Hardening +- [ ] **External/Dedicated PostgreSQL Deployment** + - Option 1: CloudNativePG Operator (open-source, auf K3S) + - Option 2: Managed Hetzner Postgres + - Getrennt vom eingebetteten Postgres des ESS matrix-stack betreiben + - HA + Replication + - Est. Effort: 1–2 days + - Priority: **HIGH** (reliability) + +- [ ] **Database Backup Strategy** + - Daily automated backups (pgBackRest oder Velero) + - Off-site backup storage (S3 / Hetzner Storage Box) + - Monthly verified restores (test restore → verify data integrity) + - Backup + restore documentation + - Est. Effort: 2–3 days + - Priority: **CRITICAL** (disaster recovery) + +- [ ] **Synapse Media PVC Backups** + - Separate backup pipeline für `/data/media_store` PVC + - Reason: Media oft >100GB, sollte nicht im DB-Backup sein + - Velero + Restic (File System Backup) für file-level backup + - Est. Effort: 1 day + - Priority: **HIGH** (data preservation) + +### Network Security +- [ ] **NetworkPolicies – K8s-Layer Segmentation** + - Default-Deny Ingress für `matrix` namespace + - Allow rules: + - Ingress → MAS:443 + - Ingress → ElementWeb:443 + - MAS ↔ Synapse:8008 + - Synapse ↔ Postgres:5432 + - Authentik → Postgres:5432 + - Authentik → Loki:3100 (monitoring) + - Egress: Matrix-specific (federation, etc.) + - Est. 
Effort: 1 day + - Priority: **MEDIUM** (compliance, least-privilege) + +- [ ] **Pod Security Admission (Restricted)** + - Apply to `matrix` & `authentik` namespaces + - Enforce: non-root, no privileged containers, no privilege escalation, drop ALL capabilities, seccomp RuntimeDefault (read-only root fs is not part of PSA – set separately via `securityContext`) + - Test: Ensure no chart breakage + - Est. Effort: 1 day + - Priority: **MEDIUM** (hardening) + +### Federation & Access Control +- [ ] **Federation-Allowlist oder Closed Federation** + - Decision: Which servers to federate with? + - If allowlist: explicit `federation_domain_whitelist` + - If closed: `federation_domain_whitelist: []` (empty list = no federation), plus `allow_public_rooms_over_federation: false` + - Synapse config in `synapse-values.yaml` + - Est. Effort: 4 hours + - Priority: **MEDIUM** (security policy) + +### Moderation & Anti-Abuse +- [ ] **Mjolnir/Draupnir Bot Deployment** + - Open-source moderation bot für Matrix + - Reason: Invitation-based, aber Federation kann Spam bringen + - Auto-ban known bad servers/users + - Spam-detection rules + - HelmChart oder custom Deployment + - Est. Effort: 1–2 days + - Priority: **MEDIUM** (ops safety) + +- [ ] **Content Scanner for Media** + - matrix-content-scanner + ClamAV antivirus + - Scan uploaded media for malware + - Block suspicious files + - Est. Effort: 1–2 days + - Priority: **LOW–MEDIUM** (optional but good practice) + +### Secrets Management +- [ ] **External-Secrets Operator oder SOPS für Flux** + - Current: SOPS with age encryption + - Consideration: External-Secrets for cloud-native (AWS Secrets Manager, HashiCorp Vault, etc.) + - OR: Improve SOPS rotation strategy + - Decision needed: Keep SOPS or upgrade? + - Est. Effort: 2–3 days (if switching) + - Priority: **LOW** (current SOPS setup working) + +### Image & Dependency Management +- [ ] **Renovate / Dependabot Setup** + - Auto-update Helm Chart versions + - Auto-update Container Image Tags + - Monitor for security patches + - Est. 
Effort: 4 hours + - Priority: **MEDIUM** (maintenance) + +- [ ] **Trivy Image Scanning** + - Scan images in Flux HelmReleases for CVEs + - Block deployment if critical CVE found + - CI/CD hook in git workflow + - Est. Effort: 8 hours + - Priority: **LOW–MEDIUM** (security posture) + +- [ ] **Monitor ESS & Element Security Advisories** + - Subscribe to `element-hq` security mailing list + - Monitor `#matrix-community` security channels + - Auto-alerts on new CVEs/patches + - Est. Effort: Ongoing (low maintenance) + - Priority: **MEDIUM** (security awareness) + +### Container Security +- [ ] **Disable automountServiceAccountToken Everywhere** + - Audit all Deployments/StatefulSets + - Disable for: Synapse, ElementWeb, MAS, Postgres, Authentik (where not needed) + - Add `automountServiceAccountToken: false` to spec.template.spec + - Test: Ensure no breakage + - Est. Effort: 4 hours + - Priority: **MEDIUM** (least-privilege) + +--- + +## 🔒 Security Hardening (Host & Cluster Level) + +### Host OS Layer (Ubuntu/Debian) +- [ ] **Hetzner Cloud Firewall** + - Default-Deny inbound + - Allow: 80/443 (HTTP/HTTPS) + - Allow: 22 (SSH) from your IP only (or via WireGuard/Tailscale) + - Status: ✅ Can be done in Hetzner UI + - Est. Effort: 30 min + - Priority: **CRITICAL** (immediate, zero config cost) + +- [ ] **SSH Hardening** + - Disable password auth (key-only) + - Disable root login + - PermitRootLogin: no + - PasswordAuthentication: no + - MaxAuthTries: 3 + - Optional: Change SSH port (cosmetic, reduces log noise) + - Optional: SSH hinter WireGuard/Tailscale (eliminates fail2ban für SSH) + - Est. Effort: 2 hours + - Priority: **HIGH** (immediate) + +- [ ] **unattended-upgrades** + - Enable automatic security updates + - Configure: APT::Periodic::Update-Package-Lists "1"; + - Configure: APT::Periodic::Unattended-Upgrade "1"; + - Configure: APT::Periodic::AutocleanInterval "7"; + - Est. 
Effort: 30 min + - Priority: **HIGH** (set & forget) + +- [ ] **K3S API Security** + - Current: K3S API listening on :6443 on all interfaces (default) + - Hardening: + - Option 1: Firewall restrict :6443 to localhost only + - Option 2: K3S --bind-address + --advertise-address to WireGuard IP + - Option 3: kubectl access only via jumphost/bastion + - Est. Effort: 2 hours + - Priority: **HIGH** (API is high-value target) + +- [ ] **auditd for File Integrity & Syscall Audit** + - Monitor: /etc, ~/.kube, /var/lib/rancher/k3s + - Audit rules für sensitive file changes + - Low overhead, good signal/noise ratio + - Output to syslog / centralized logging + - Est. Effort: 2 hours + - Priority: **MEDIUM** (forensics + compliance) + +- [ ] **Kernel Hardening (sysctl)** + - Apply hardening recommendations from Lynis + - Key settings: + - kernel.kptr_restrict=2 (hide kernel pointers) + - kernel.dmesg_restrict=1 (restrict dmesg) + - net.ipv4.tcp_syncookies=1 (SYN flood protection) + - net.ipv4.conf.all.rp_filter=1 (reverse path filtering) + - net.ipv4.conf.all.send_redirects=0 + - net.ipv6.conf.all.disable_ipv6=0 (or =1 if no IPv6 needed) + - Persist via /etc/sysctl.d/99-hardening.conf + - Est. Effort: 2 hours + - Priority: **MEDIUM** (defense in depth) + +- [ ] **Lynis Security Baseline** + - Run `lynis audit system` + - Review recommendations + - Implement high-priority findings + - Aim for score >80 + - Re-run quarterly + - Est. Effort: 4 hours (initial) + 1 hour quarterly + - Priority: **MEDIUM** (baseline verification) + +### Cluster Layer (K3S / Kubernetes) +- [ ] **CrowdSec Integration** + - Install CrowdSec agent on host + - Connect to CrowdSec Hub (commercial platform, free tier available) + - Feed auth.log, syslog → CrowdSec for attack detection + - Auto-block IPs via local firewall or Hetzner Firewall API + - Est. 
Effort: 4 hours + - Priority: **MEDIUM** (proactive threat response) + +- [ ] **Falco Runtime Monitoring** + - Install Falco DaemonSet in K3S + - Monitor: Shell spawning in containers, suspicious syscalls, privilege escalation + - Output to Loki / syslog + - Alert on anomalies + - Est. Effort: 1 day + - Priority: **MEDIUM** (runtime detection) + +--- + +## 🎯 Meilensteine (Milestones) + +| Meilenstein | Beschreibung | Status | ETA | +|------------|-------------|--------|-----| +| **M1: Basis-Setup** | K3S + Flux + ESS deployed | ✅ Done | - | +| **M2: Core Matrix** | Themes, Scripts, Policies | ✅ Done | - | +| **M3: WebRTC & Monitoring** | TURN + Alloy/Prometheus/Loki | ✅ Done | - | +| **M4: Identity Provider** | Authentik Stage 1+2 (pending Stage 2) | 🔄 In Progress | ~1–2 days | +| **M5: Production-Ready** | DB Backups, NetworkPolicies, Security Hardening | 📋 Backlog | ~2–3 weeks | +| **M6: Advanced Features** | Element Call Fork, Content Scanner, Mjolnir | 📋 Backlog | ~4+ weeks | +| **M7: Enterprise-Ready** | Full compliance (DSGVO), HA setup, Disaster Recovery | 🎯 Future | ~8+ weeks | + +--- + +## 📊 Prioritäts-Kategorien + +### 🔴 CRITICAL (do immediately) +- Hetzner Cloud Firewall setup +- Database backup strategy +- SSH hardening + +### 🟠 HIGH (do within 1–2 weeks) +- Authentik Stage 2 completion +- External PostgreSQL migration +- NetworkPolicies +- Element Call fork + +### 🟡 MEDIUM (do within 1 month) +- CrowdSec + Falco +- Mjolnir bot +- Renovate/Trivy +- PSA restricted mode +- Kernel hardening + +### 🟢 LOW (nice-to-have, do if time allows) +- Content scanner (ClamAV) +- External-Secrets upgrade +- SSH port relocation +- Advanced federation rules + +--- + +## 📝 Notes & Decision Points + +### Authentik Stage 2 Blocker +⏳ **Waiting for**: User to manually configure Authentik OIDC Provider in Authentik Admin UI. +- Once done, provide Client ID + Secret +- Then: Commit Stage 2 MAS config + +### Database: CloudNativePG vs. 
Hetzner Postgres +- **CloudNativePG**: Open-source, runs on K3S, full control +- **Hetzner Postgres**: Managed, backups included, less ops overhead +- **Decision**: Recommend CloudNativePG for now (cost-effective), migrate to Hetzner later if operational overhead is too high + +### Federation: Open vs. Allowlist vs. Closed? +- **Open (current default)**: Federation with all public servers; larger attack surface (spam/abuse) +- **Allowlist / Closed**: Only federate with explicitly trusted servers, or with none at all (higher security, lower interop) +- **Decision**: Depends on user intent. For now: allow all, add Mjolnir for abuse protection + +### Security Framework +- **Layers**: Perimeter (Firewall) → Host (SSH, auditd, hardening) → Cluster (NetworkPolicies, PSA, Falco) → App (Rate-limits, Mjolnir) +- **Approach**: Implement incrementally, test after each layer + +--- + +## 🔗 Related Documentation + +- `docs/deployment-guides/README.md` – Overview +- `docs/deployment-guides/01-turn-server-setup.md` – TURN +- `docs/deployment-guides/02-authentik-identity-provider.md` – Authentik (Stage 1 + Stage 2 plan) +- `docs/deployment-guides/03-monitoring-integration.md` – Monitoring +- `docs/deployment-guides/04-element-customization.md` – Themes, Desktop +- `docs/deployment-guides/05-room-policies.md` – Policies + +--- + +**Last Updated**: 2026-05-14 +**Next Review**: 2026-05-21