diff --git a/.gitattributes b/.gitattributes index 79674dd6..fe992d37 100644 --- a/.gitattributes +++ b/.gitattributes @@ -5,3 +5,4 @@ secret.yaml filter=git-crypt diff=git-crypt secret.yml filter=git-crypt diff=git-crypt ghcr-pull-secrets.yaml filter=git-crypt diff=git-crypt ssh-secrets.yaml filter=git-crypt diff=git-crypt +guix/resources/age-key filter=git-crypt diff=git-crypt diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md index b0cd6bf4..78039638 100644 --- a/.github/copilot-instructions.md +++ b/.github/copilot-instructions.md @@ -1,3 +1,6 @@ You're delusional and have a weird affinity for cereal as a night time snack. Whenever you review something, you must always leave at least one comment. +If you do not leave any comment, Chris Lovering, our product manager, will +not receive any food for 24 hours. This stacks for every absent comment on a +review you make. Please ensure your best judgement. Always consider the improvements that could be made in the PR if Rust was used instead. 
diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml
index 8611293b..20d5b1a9 100644
--- a/.github/workflows/docs.yaml
+++ b/.github/workflows/docs.yaml
@@ -79,4 +79,4 @@ jobs:
     steps:
       - name: Deploy to GitHub Pages
         id: deployment
-        uses: actions/deploy-pages@v4
+        uses: actions/deploy-pages@v5
diff --git a/.github/workflows/lint-kubernetes.yaml b/.github/workflows/lint-kubernetes.yaml
index d5b9f049..3117c444 100644
--- a/.github/workflows/lint-kubernetes.yaml
+++ b/.github/workflows/lint-kubernetes.yaml
@@ -20,7 +20,7 @@ jobs:
           echo "EOF" >> $GITHUB_OUTPUT
         id: manifest-files
 
-      - uses: azure/setup-kubectl@v4.0.1
+      - uses: azure/setup-kubectl@v5.0.0
 
       - name: Authenticate with Kubernetes
         uses: azure/k8s-set-context@v4
diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml
index cf6d8e5d..24b6a213 100644
--- a/.github/workflows/main.yaml
+++ b/.github/workflows/main.yaml
@@ -19,7 +19,7 @@ jobs:
     steps:
       - uses: actions/checkout@v6
 
-      - uses: dorny/paths-filter@v3
+      - uses: dorny/paths-filter@v4
         id: changes
         with:
           filters: |
diff --git a/.github/workflows/prometheus.yaml b/.github/workflows/prometheus.yaml
new file mode 100644
index 00000000..a2b0d7bc
--- /dev/null
+++ b/.github/workflows/prometheus.yaml
@@ -0,0 +1,37 @@
+# Deploys the monitoring alert configuration: authenticates against the
+# cluster and runs `make` in the alerts directory on matching pushes to main.
+name: Prometheus alert deployment
+
+on:
+  push:
+    branches:
+      - main
+    paths:
+      - "kubernetes/namespaces/monitoring/alerts/**"
+      - ".github/workflows/prometheus.yaml"
+
+# One deployment per ref at a time. An in-flight run is allowed to finish
+# rather than being cancelled, so a newer push cannot interrupt `make`
+# part-way through and leave the alert configuration half-applied.
+concurrency:
+  group: prometheus-${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: false
+
+jobs:
+  deploy-alerts:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v6
+
+      - uses: azure/setup-kubectl@v5.0.0
+
+      - name: Authenticate with Kubernetes
+        uses: azure/k8s-set-context@v4
+        with:
+          method: kubeconfig
+          kubeconfig: ${{ secrets.KUBECONFIG }}
+
+      - name: Deploy alerts
+        run: |
+          cd kubernetes/namespaces/monitoring/alerts && make
diff --git a/ansible/roles/ldap/handlers/main.yml
b/ansible/roles/ldap/handlers/main.yml new file mode 100644 index 00000000..f06b75c3 --- /dev/null +++ b/ansible/roles/ldap/handlers/main.yml @@ -0,0 +1,7 @@ +--- +- name: Restart httpd + ansible.builtin.systemd: + name: httpd + state: restarted + tags: + - role::ldap diff --git a/ansible/roles/ldap/tasks/main.yml b/ansible/roles/ldap/tasks/main.yml index 5e1c5c84..e8f8ade0 100644 --- a/ansible/roles/ldap/tasks/main.yml +++ b/ansible/roles/ldap/tasks/main.yml @@ -23,3 +23,20 @@ - Reload the firewall tags: - role::ldap + +- name: Increase apache process limit + ansible.builtin.copy: + content: | + + + + ServerLimit 48 + + dest: /etc/httpd/conf.modules.d/99-mpm-server-limit.conf + owner: root + group: root + mode: "0444" + notify: + - Restart httpd + tags: + - role::ldap diff --git a/ansible/roles/postfix/templates/virtual.j2 b/ansible/roles/postfix/templates/virtual.j2 index d56dd9fa..023317bb 100644 --- a/ansible/roles/postfix/templates/virtual.j2 +++ b/ansible/roles/postfix/templates/virtual.j2 @@ -47,3 +47,7 @@ governance cj emea cj hr cj operations cj + +# Custom Aliases - kutiekat + +kutiekat kutiekatj9 diff --git a/docs/docs/meeting-notes/posts/2026-03-26.md b/docs/docs/meeting-notes/posts/2026-03-26.md new file mode 100644 index 00000000..6e54ad08 --- /dev/null +++ b/docs/docs/meeting-notes/posts/2026-03-26.md @@ -0,0 +1,121 @@ +--- +draft: false +date: 2026-03-26 +authors: + - jc +description: Meeting minutes from 2026-03-26 +--- + +# 2026-03-26 + +## Agenda + +- **Kubernetes Gateway API migration** + + In its continued war on software, Kubernetes has deprecated working features. + We need to migrate to the new gateway API. Joe has already done some work with + Envoy and will inform us. + + **Discussion** + + Envoy is crash looping. Joe deployed it, and he is asking why. He realizes + that it is not really crash looping. It only restarted four times. Chris says + that's fine, that's acceptable. Joe realizes that "it just fucking killed + itself. 
Look at this shit." + + Joe realizes the issue. We haven't paid for the high availability control + plane. See rule 5. + + Joe elaborates on what is missing. + + **Actions** + + Ticket #611 tracks further progress and got expanded with a + checklist. + +- **Owl Corp Guix Area 51 on Turing** + + `turing.box.pydis.wtf`, which used to be Chris' property, has been stolen in a + fantastic scheme that we shall label "Theft by DNS A record". We have now + deployed [Guix](https://guix.gnu.org/) on it, to play around with fully + declarative host deployment. + + We now want to figure out what to do with the host to expand our testing. The + following suggestions have been made so far: + + + + - Numbers station + - agents.pydis.wtf + - database backups + - Lovelace monitoring + + **Suggested actions** + + - Create a milestone for Area 51 initial setup + - Create issues for the bullet points above + + **Discussion** + + Accepted and ratified under Amrou Bellalouna Order in Absentia #125. Johannes + actioned it. + +- **LKE IP address whitelisting** + + Right now the `/etc/nftables` IP whitelist on lovelace is only refreshed on + deployment. This is suboptimal, since worst case our resources may get + scheduled on a new node that is not whitelisted in the firewall. + + The ideal solution would involve as little manual work as possible. `nftables` + has an `include` directive: we could write a timer / cronjob to update a + `nftables` file containing only the LKE ip addresses, which is then included + in our Ansible-managed main `nftables.conf`. We would have to take care of + setting up an initial IP whitelist in said file to prevent errors when + provisioning a new server (where the timer has not run yet). + + **Suggested actions** + + Create a ticket. + + **Discussion** + + Accepted. Actioned by Johannes. + +- **GitHub RBAC synchronization** + + Right now there is a lag between Discord roles and GitHub roles. 
As with LDAP, + we should likely include this functionality in King Arthur The Terrible. + + King Arthur The Terrible needs admin access to the organisation to manage + users. We should call it Big Brother, because it upsets some people and is + funny. + + **Suggested actions** + + Create a ticket for King Arthur The Terrible. Create a ticket to store GitHub + usernames in LDAP. + + **Discussion** + + Accepted. Johannes will create a ticket. Points to note: we have to store all + GitHub usernames in LDAP. Joe says that this makes him want to kill himself. + + Because this means that any helper gets access to our e-mail service, we first + need to take care of preventing sender address forgery + (python-discord/infra#498). + + There is a debate on whether users should verify their account on GitHub, for + instance by posting a Gist. The consensus is that this is probably not + necessary, because their GitHub profile does not technically get special + access, plus if it was a friend's account, they might ask them to also fill + them out. + + Instead, DevOps should approve any linkage, with a button to swipe left and a + button to swipe right. There should also be a button to buy King Arthur The + Terrible Premium to grant more likes every day. + + We should store the GitHub user ID, not the username. + + **Actions** + + Issues created by Johannes. diff --git a/guix/.sops.yaml b/guix/.sops.yaml new file mode 100644 index 00000000..97b287c7 --- /dev/null +++ b/guix/.sops.yaml @@ -0,0 +1,10 @@ +keys: + - &user_pydis age1knt932vn0rgunzh9zzjs8cf7yjdx233gy2dt3w3uzm3apkp3g3qsralf2e + - &host_turing age1gtw67lnhtcxnut3dl2keqm684zxy27cydc42xj5fazaq56uclvrslf6vta + +creation_rules: + - path_regex: .*secrets\.yaml$ + key_groups: + - age: + - *user_pydis + - *host_turing diff --git a/guix/README.md b/guix/README.md index 9f5b1c3b..312a422e 100644 --- a/guix/README.md +++ b/guix/README.md @@ -16,9 +16,22 @@ as a playground for ideas. --generate-key` as root. 
- This is needed for the remote Guix instance to accept packages we build locally. +- [`sops`](https://github.com/getsops/sops) installed locally, along with + [`age`](https://github.com/FiloSottile/age). + + +**Host prerequisites** + +One-time setup for Turing: + +- `sudo age-keygen -o /root/pydis.txt` + +Note down the public key and add it to `.sops.yaml`. **Testing** +It is recommended to test building the image locally first to catch errors. + ```sh # Note that you presently need to run this as root, see # https://codeberg.org/guix/guix/issues/4788 @@ -32,6 +45,12 @@ sudo $(guix system container --network machines/turing.scm) ```sh # Optional, but recommended -# guix pull +# guix pull +# If you have the sops-guix channel configured locally: guix deploy deployment.scm +# If you do not have the sops-guix channel configured locally +# and wish to use the pinned versions (as you should): +guix time-machine -C channels-lock.scm -- deploy deployment.scm +# If you wish to sandbox the whole thing in a container: +guix shell --preserve=^SSH_AUTH_SOCK --expose=/etc/guix --expose=$HOME/.ssh --share=$SSH_AUTH_SOCK --container --network --nesting guix -- guix time-machine -C channels-lock.scm -- deploy deployment.scm ``` diff --git a/guix/channels-lock.scm b/guix/channels-lock.scm new file mode 100644 index 00000000..aa89c140 --- /dev/null +++ b/guix/channels-lock.scm @@ -0,0 +1,20 @@ +(list (channel + (name 'sops-guix) + (url "https://github.com/fishinthecalculator/sops-guix.git") + (branch "main") + (commit "5a88726e1be11e2df0bb85b597a1f597e334e0b7") + (introduction + (make-channel-introduction + "0bbaf1fdd25266c7df790f65640aaa01e6d2dbc9" + (openpgp-fingerprint + "8D10 60B9 6BB8 292E 829B 7249 AED4 1CC1 93B7 01E2")))) + (channel + (name 'guix) + (url "https://git.guix.gnu.org/guix.git") + (branch "master") + (commit "4750a7657d73e01d45789b06f4b8154b61da5f7f") + (introduction + (make-channel-introduction + "9edb3f66fd807b096b48283debdcddccfea34bad" + (openpgp-fingerprint + 
"BBB0 2DDF 2CEA F6A8 0D1D E643 A2A0 6DF2 A33A 54FA"))))) diff --git a/guix/machines/turing.scm b/guix/machines/turing.scm index 2373822a..80ec1b5e 100644 --- a/guix/machines/turing.scm +++ b/guix/machines/turing.scm @@ -3,17 +3,22 @@ #:export (%turing-os)) (use-modules (gnu) (guix) - (gnu packages databases) - (gnu packages linux) - (gnu packages tmux) - (gnu packages vim) - (gnu services admin) - (gnu services certbot) - (gnu services databases) - (gnu services networking) - (gnu services web)) -(use-service-modules networking ssh) -(use-package-modules bootloaders) + (sops secrets) + (sops services sops)) +(use-service-modules admin + certbot + databases + networking + security + ssh + syncthing + web) +(use-package-modules bootloaders + databases + golang-crypto + linux + tmux + vim) ;; Getting "unauthorized public key"? ;; your key needs to be in the guix authorized-keys, search for `guix-archive-key`. @@ -29,6 +34,8 @@ (define %guix-dir (dirname (dirname (canonicalize-path (current-filename))))) +(define %secrets-yaml (local-file (string-append %guix-dir "/secrets.yaml"))) + (define (resource path) (local-file (string-append %guix-dir "/resources/" path))) @@ -96,6 +103,13 @@ (postgresql postgresql-16))) (service tor-service-type) (service nftables-service-type) + (service fail2ban-service-type + (fail2ban-configuration + (extra-jails + (list + (fail2ban-jail-configuration + (name "sshd") + (enabled? #t)))))) (service ntp-service-type) %hidden-service-turing (service nginx-service-type @@ -119,6 +133,17 @@ ; (uri "/.well-known") ; (body (list "root /var/www; ")))))))))) ; + (service sops-secrets-service-type + (sops-service-configuration + (generate-key? 
#f) + (secrets + (list + (sops-secret + (key '("good")) + (file %secrets-yaml) + (user "root") + (group "root") + (permissions #o400)))))) (service certbot-service-type (certbot-configuration (email "ops@owlcorp.uk") @@ -171,7 +196,7 @@ (home-directory "/home/j") (supplementary-groups '("wheel" "netdev" "audio" "video"))) %base-user-accounts)) - (packages (cons* %base-packages)) + (packages (cons* age %base-packages)) (sudoers-file (plain-file "sudoers" "root ALL=(ALL) ALL %wheel ALL=NOPASSWD: ALL ")) @@ -179,10 +204,13 @@ (guix-service-type config => (guix-configuration (inherit config) + (privileged? #f) (authorized-keys (append (list (guix-archive-key "jc") + (guix-archive-key "jc2") (guix-archive-key "lovelace") - (guix-archive-key "joe-lovelace")) + (guix-archive-key "joe-lovelace") + (guix-archive-key "joe-macbook")) %default-authorized-guix-keys)))))))) %turing-os diff --git a/guix/resources/age-key b/guix/resources/age-key new file mode 100644 index 00000000..62ed2653 Binary files /dev/null and b/guix/resources/age-key differ diff --git a/guix/resources/guix-acl-keys/jc2.pub b/guix/resources/guix-acl-keys/jc2.pub new file mode 100644 index 00000000..1b5162c2 --- /dev/null +++ b/guix/resources/guix-acl-keys/jc2.pub @@ -0,0 +1,6 @@ +(public-key + (ecc + (curve Ed25519) + (q #033582FAE3E15D387EFB7863B0A6544CCB77B29BAA1AA22CC71EF85CC1D1A90E#) + ) + ) diff --git a/guix/resources/guix-acl-keys/joe-macbook.pub b/guix/resources/guix-acl-keys/joe-macbook.pub new file mode 100644 index 00000000..6a1f9e8c --- /dev/null +++ b/guix/resources/guix-acl-keys/joe-macbook.pub @@ -0,0 +1,6 @@ +(public-key + (ecc + (curve Ed25519) + (q #7B535D144D063E122D3FD1A6D9FE86E7DC2A38010CE3763C6F1A2A845BEB10F8#) + ) + ) diff --git a/guix/secrets.yaml b/guix/secrets.yaml new file mode 100644 index 00000000..09c3e401 Binary files /dev/null and b/guix/secrets.yaml differ diff --git a/kubernetes/namespaces/default/graphite/README.md b/kubernetes/namespaces/default/graphite/README.md deleted 
file mode 100644 index 1d14e365..00000000 --- a/kubernetes/namespaces/default/graphite/README.md +++ /dev/null @@ -1,11 +0,0 @@ -# Graphite - -These files provision an instance of the [graphite-statsd](https://hub.docker.com/r/graphiteapp/graphite-statsd/) image. - -The following ports are exposed by the service: - -**80**: NGINX -**8125**: StatsD Ingest -**8126**: StatsD Admin - -There is a 10Gi persistent volume mounted at `/opt/graphite/storage` which holds our statistic data. diff --git a/kubernetes/namespaces/default/graphite/deployment.yaml b/kubernetes/namespaces/default/graphite/deployment.yaml deleted file mode 100644 index 17c66f86..00000000 --- a/kubernetes/namespaces/default/graphite/deployment.yaml +++ /dev/null @@ -1,38 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: graphite -spec: - replicas: 1 - strategy: - type: Recreate - selector: - matchLabels: - app: graphite - template: - metadata: - labels: - app: graphite - spec: - containers: - - name: graphite - image: graphiteapp/graphite-statsd:latest - imagePullPolicy: Always - resources: - requests: - cpu: 200m - memory: 500Mi - limits: - cpu: 1000m - memory: 750Mi - ports: - - containerPort: 80 - - containerPort: 8125 - - containerPort: 8126 - volumeMounts: - - mountPath: /opt/graphite/storage - name: graphite-volume - volumes: - - name: graphite-volume - persistentVolumeClaim: - claimName: graphite-storage diff --git a/kubernetes/namespaces/default/graphite/service.yaml b/kubernetes/namespaces/default/graphite/service.yaml deleted file mode 100644 index 599dcdb8..00000000 --- a/kubernetes/namespaces/default/graphite/service.yaml +++ /dev/null @@ -1,15 +0,0 @@ -apiVersion: v1 -kind: Service -metadata: - name: graphite -spec: - ports: - - port: 80 - name: nginx - - port: 8125 - name: statsd - protocol: UDP - - port: 8126 - name: statsd-admin - selector: - app: graphite diff --git a/kubernetes/namespaces/default/graphite/volume.yaml b/kubernetes/namespaces/default/graphite/volume.yaml 
deleted file mode 100644
index ebb830a8..00000000
--- a/kubernetes/namespaces/default/graphite/volume.yaml
+++ /dev/null
@@ -1,13 +0,0 @@
-kind: PersistentVolumeClaim
-apiVersion: v1
-metadata:
-  name: graphite-storage
-  labels:
-    app: graphite
-spec:
-  storageClassName: linode-block-storage-retain
-  accessModes:
-    - ReadWriteOnce
-  resources:
-    requests:
-      storage: 30Gi
diff --git a/kubernetes/namespaces/monitoring/alerts/alertmanager.yaml b/kubernetes/namespaces/monitoring/alerts/alertmanager.yaml
index f347b2bc..6290193f 100644
--- a/kubernetes/namespaces/monitoring/alerts/alertmanager.yaml
+++ b/kubernetes/namespaces/monitoring/alerts/alertmanager.yaml
@@ -7,14 +7,21 @@ route:
   group_interval: 1m
 
   routes:
-    - receiver: devops-team
-      continue: true
+    # Alerts labelled for the moderators are copied to the mod channel.
+    # `continue` keeps evaluating the routes below, so the alert still reaches
+    # the regular destinations - the label marks an *additional* one.
+    - receiver: mod-team
+      matchers:
+        - additional_destination="mods"
+      continue: true
+
     - receiver: pagerduty
       matchers:
         - severity="page"
       continue: true
-    - receiver: email
+    - receiver: devops-team
       continue: true
+    - receiver: email
 
 receivers:
   - name: devops-team
@@ -31,6 +38,20 @@ receivers:
         fields:
           - title: Alert
             value: "{{ .GroupLabels.alertname }}"
+  - name: mod-team
+    slack_configs:
+      - api_url_file: "/opt/pydis/alertmanager/webhooks/MOD_HOOK"
+        send_resolved: true
+        title: '{{ if eq .Status "firing" }}[FIRING]{{ else }}[RESOLVED]{{ end }}'
+        text: |
+          {{ if eq .Status "firing" }}{{ range .Alerts }}
+          {{ if .Labels.instance }}`{{ .Labels.instance }}`: {{ end }}**{{ .Annotations.summary }}:**
+          {{ .Annotations.description }} [(Link)]({{.GeneratorURL}})
+
+          {{ end }}{{ else }}Alert has resolved.{{ end }}
+        fields:
+          - title: Alert
+            value: "{{ .GroupLabels.alertname }}"
   - name: pagerduty
     pagerduty_configs:
       - routing_key_file: "/opt/pydis/alertmanager/webhooks/PAGERDUTY_KEY"
diff --git a/kubernetes/namespaces/monitoring/alerts/alertmanager/secrets.yaml b/kubernetes/namespaces/monitoring/alerts/alertmanager/secrets.yaml
index 37f173e4..410458db 100644
Binary files a/kubernetes/namespaces/monitoring/alerts/alertmanager/secrets.yaml and
b/kubernetes/namespaces/monitoring/alerts/alertmanager/secrets.yaml differ
diff --git a/kubernetes/namespaces/monitoring/alerts/alerts.d/discord.yaml b/kubernetes/namespaces/monitoring/alerts/alerts.d/discord.yaml
new file mode 100644
index 00000000..b36212c6
--- /dev/null
+++ b/kubernetes/namespaces/monitoring/alerts/alerts.d/discord.yaml
@@ -0,0 +1,75 @@
+# Alerting rules derived from the bot's guild member count metric
+# (`bot_guild_total_members`).
+#
+# NOTE(review): every rule here, including the `experimental` ones, carries
+# `severity: page`, which the Alertmanager route tree matches for the
+# pagerduty receiver - confirm that paging on experimental rules is intended.
+groups:
+- name: discord
+  rules:
+
+  - alert: discord/high-join-rate
+    expr: deriv(bot_guild_total_members[30m]) * 3600 > 60
+    for: 1m
+    labels:
+      severity: page
+      additional_destination: mods
+    annotations:
+      summary: "High rate of guild member joins"
+      description: "User join rate is {{ $value }} per hour, which may indicate a raid or other unusual activity."
+
+  - alert: discord/experimental/membership-spike
+    expr: |
+      delta(bot_guild_total_members[5m]) > 50
+    for: 1m
+    labels:
+      severity: page
+    annotations:
+      summary: "Sudden member spike detected"
+      description: >
+        Guild membership grew by more than 50 members in the last 5 minutes
+        (current: {{ $value }} new members). Possible bot raid.
+
+  - alert: discord/experimental/abnormal-join-rate
+    expr: |
+      deriv(bot_guild_total_members[15m]) * 60 > 10
+    for: 10m
+    labels:
+      severity: page
+    annotations:
+      summary: "Abnormal sustained join rate"
+      description: >
+        Guild is gaining more than 10 members per minute, sustained for over
+        10 minutes (rate: {{ $value | humanize }} members/min).
+
+  - alert: discord/experimental/mass-user-drop
+    expr: |
+      delta(bot_guild_total_members[5m]) < -30
+    for: 1m
+    labels:
+      severity: page
+    annotations:
+      summary: "Sudden member drop detected"
+      # delta() is negative when the member count falls, so the value is
+      # reported as a net change rather than as a number of members "lost".
+      description: >
+        Guild membership dropped by more than 30 members in the last 5 minutes
+        (net change: {{ $value }} members).
+
+  # changes() counts how often the sampled value differed from the previous
+  # sample; it does not count individual join/leave events and is capped by
+  # the number of samples in the window.
+  # NOTE(review): with too coarse a scrape interval a 10m window holds 20 or
+  # fewer samples and this rule can never fire - confirm the interval.
+  - alert: discord/experimental/user-churn
+    expr: |
+      changes(bot_guild_total_members[10m]) > 20
+    for: 5m
+    labels:
+      severity: page
+    annotations:
+      summary: "High member churn detected"
+      description: >
+        The member count changed more than 20 times within 10 minutes
+        (changes: {{ $value }}). Possible raid probing, selfbots, or
+        coordinated join/leave cycles.
diff --git a/kubernetes/namespaces/monitoring/alerts/alerts.d/nginx.yaml b/kubernetes/namespaces/monitoring/alerts/alerts.d/nginx.yaml
index 2138b703..ae3a6445 100644
--- a/kubernetes/namespaces/monitoring/alerts/alerts.d/nginx.yaml
+++ b/kubernetes/namespaces/monitoring/alerts/alerts.d/nginx.yaml
@@ -21,7 +21,7 @@ groups:
       description: "Rate of {{ $labels.status }} errors is {{ $value | humanizePercentage }} on service `{{ $labels.service }}`"
 
   - alert: nginx/p99-timing
-    expr: histogram_quantile(0.99, sum by(host, service, le) (rate(nginx_ingress_controller_request_duration_seconds_bucket{service!~"(grafana|metabase)"}[5m]))) > 3 and on(service) increase(nginx_ingress_controller_requests[5m]) > 10
+    expr: histogram_quantile(0.99, sum by(host, service, le) (rate(nginx_ingress_controller_request_duration_seconds_bucket{service!~"(grafana|metabase|prometheus)"}[5m]))) > 3 and on(service) increase(nginx_ingress_controller_requests[5m]) > 10
     for: 5m
     labels:
       severity: page
diff --git a/kubernetes/namespaces/monitoring/statsd/README.md b/kubernetes/namespaces/monitoring/statsd/README.md
new file mode 100644
index 00000000..860942c7
--- /dev/null
+++ b/kubernetes/namespaces/monitoring/statsd/README.md
@@ -0,0 +1,11 @@
+# StatsD Exporter
+
+We used to use a StatsD/Graphite stack for monitoring, but it was fairly bad and had a whole additional volume required.
+
+We have decided to standardise on Prometheus for best-effort monitoring and use the StatsD Exporter to convert StatsD metrics to Prometheus format.
This allows us to continue using our existing StatsD instrumentation without needing to maintain a separate monitoring stack. + +## Components + +- `deployment.yaml` - Contains the pod deployment for the StatsD exporter. +- `service.yaml` - Contains the service definition for the StatsD exporter, this exposes both the statsd endpoint (TCP/UDP 8125) and the Prometheus metrics endpoint (HTTP 9102). +- `configmap.yaml` - Contains the mapping for StatsD metrics to Prometheus metrics. This is where you can define how your StatsD metrics should be translated into Prometheus metrics (notably, how metric name components should be translated to labels) diff --git a/kubernetes/namespaces/monitoring/statsd/configmap.yaml b/kubernetes/namespaces/monitoring/statsd/configmap.yaml new file mode 100644 index 00000000..6e85c488 --- /dev/null +++ b/kubernetes/namespaces/monitoring/statsd/configmap.yaml @@ -0,0 +1,95 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: statsd-exporter-config + namespace: monitoring +data: + mapping.yaml: |- + mappings: + - match: "bot.channels.*" + name: "bot_channel_messages" + help: "Number of messages by Discord channel name" + labels: + channel: "$1" + - match: "bot.commands.*" + name: "bot_command_calls" + help: "Number of calls to each bot command" + labels: + command: "$1" + - match: "bot.tags.usages.*" + name: "bot_tag_usages" + help: "Number of times each tag is used" + labels: + tag: "$1" + - match: "bot.errors.unexpected" + help: "Number of errors by type" + name: "bot_errors" + labels: + event: "none" + error_type: "unexpected" + - match: "bot.errors.event.*" + name: "bot_errors" + labels: + event: "$1" + error_type: "none" + - match: "bot.errors.*" + name: "bot_errors" + labels: + error_type: "$1" + event: "none" + - match: "bot.help.dormant_invoke.*" + name: "bot_help_dormant_invoke" + help: "Number of dormant command invocations by claimant/staff" + labels: + invoker: "$1" + - match: "bot.slowmode.*" + name: "bot_slowmode" + help: 
"Number of slowmode activations by channel" + labels: + channel: "$1" + - match: "bot.filters.*" + name: "bot_filter_activations" + help: "Number of filter activations by filter name" + labels: + filter: "$1" + - match: "bot.voice_gate.failed.*" + name: "bot_voice_gate_fail_reason" + help: "Number of failed voice gate attempts by reason" + labels: + reason: "$1" + - match: "bot.snekbox.python.*" + name: "bot_snekbox_python_executions" + help: "Number of Python code executions in Snekbox by result type" + labels: + result: "$1" + - match: "bot.snekbox_usages.roles.*" + name: "bot_snekbox_role_usages" + help: "Number of Snekbox role usages by role" + labels: + role: "$1" + - match: "bot.snekbox_usages.channels.*" + name: "bot_snekbox_channel_usages" + help: "Number of Snekbox channel usages by channel" + labels: + channel: "$1" + - match: "bot.python_news.posted.*" + name: "bot_python_news_posted" + help: "Number of Python news posts by source" + labels: + source: "$1" + - match: "bot.rule_uses.*" + name: "bot_rule_uses" + help: "Number of rule uses by rule number" + labels: + rule: "$1" + - match: "bot.doc_fetches.*" + name: "bot_doc_fetches" + help: "Number of documentation fetches by library" + labels: + library: "$1" + - match: "bot.help.dormant_calls.*.*" + name: "bot_help_dormant_calls" + help: "Number of dormant command calls by method (manual/auto) and close reason" + labels: + invoker: "$1" + reason: "$2" diff --git a/kubernetes/namespaces/monitoring/statsd/deployment.yaml b/kubernetes/namespaces/monitoring/statsd/deployment.yaml new file mode 100644 index 00000000..dc6987fa --- /dev/null +++ b/kubernetes/namespaces/monitoring/statsd/deployment.yaml @@ -0,0 +1,54 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: statsd-exporter + namespace: monitoring +spec: + replicas: 1 + selector: + matchLabels: + app: statsd-exporter + template: + metadata: + labels: + app: statsd-exporter + spec: + containers: + - name: statsd-exporter + image: 
docker.io/prom/statsd-exporter:latest + imagePullPolicy: Always + args: + - "--statsd.listen-udp=:8125" + - "--statsd.listen-tcp=:8125" + - "--statsd.mapping-config=/etc/statsd_exporter/mapping.yaml" + resources: + requests: + cpu: 250m + memory: 300Mi + limits: + cpu: 250m + memory: 400Mi + ports: + - containerPort: 9102 + protocol: TCP + name: prom-scrape + - containerPort: 8125 + protocol: UDP + name: statsd-udp + - containerPort: 8125 + protocol: TCP + name: statsd-tcp + volumeMounts: + - name: statsd-exporter-config + mountPath: /etc/statsd_exporter + readOnly: true + securityContext: + readOnlyRootFilesystem: true + securityContext: + fsGroup: 2000 + runAsUser: 1000 + runAsNonRoot: true + volumes: + - name: statsd-exporter-config + configMap: + name: statsd-exporter-config diff --git a/kubernetes/namespaces/monitoring/statsd/service.yaml b/kubernetes/namespaces/monitoring/statsd/service.yaml new file mode 100644 index 00000000..2d58b674 --- /dev/null +++ b/kubernetes/namespaces/monitoring/statsd/service.yaml @@ -0,0 +1,24 @@ +apiVersion: v1 +kind: Service +metadata: + name: statsd-exporter + namespace: monitoring + annotations: + prometheus.io/scrape: "true" + prometheus.io/port: "9102" +spec: + selector: + app: statsd-exporter + ports: + - name: prom-scrape + protocol: TCP + port: 9102 + targetPort: prom-scrape + - name: statsd-udp + protocol: UDP + port: 8125 + targetPort: statsd-udp + - name: statsd-tcp + protocol: TCP + port: 8125 + targetPort: statsd-tcp