Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
62 changes: 60 additions & 2 deletions charts/cozystack/templates/_helpers.tpl
Original file line number Diff line number Diff line change
Expand Up @@ -58,9 +58,43 @@ machine:
{{- toYaml . | nindent 6 }}
{{- end }}
{{- /* extraSysctls MUST NOT collide with the preset's built-in
sysctls; same rationale as extraKubeletExtraArgs. */ -}}
sysctls; same rationale as extraKubeletExtraArgs. $builtinSysctls
is the single source of truth for the preset-owned keys — keep
it in sync with the literal sysctls block rendered further down.

Always-on DRBD/LINSTOR tuning: Cozystack always runs DRBD (the
drbd module is loaded unconditionally below), and these knobs
resolve the TCP-port exhaustion the Cozystack team observed on
production clusters under DRBD reconnect storms (node reboots,
resync). tcp_orphan_retries/tcp_fin_timeout speed up reclamation
of orphaned and FIN-WAIT sockets so a reconnect storm cannot
outrun cleanup; netdev_* widen the receive backlog so bursty
replication traffic isn't dropped under load.

vm.nr_hugepages is treated as preset-owned even when its gate
(.Values.nr_hugepages) is inactive, so operators always route it
through the dedicated `nr_hugepages` key. The tcp_keepalive_*
triplet is preset-owned only while .Values.tcpKeepaliveTuning is
set (see below), so it can be operator-supplied via extraSysctls
when the toggle is off. */ -}}
{{- $builtinSysctls := list
"vm.nr_hugepages"
"net.ipv4.neigh.default.gc_thresh1"
"net.ipv4.neigh.default.gc_thresh2"
"net.ipv4.neigh.default.gc_thresh3"
"net.ipv4.tcp_orphan_retries"
"net.ipv4.tcp_fin_timeout"
"net.core.netdev_max_backlog"
"net.core.netdev_budget"
"net.core.netdev_budget_usecs" }}
{{- if $.Values.tcpKeepaliveTuning }}
{{- $builtinSysctls = concat $builtinSysctls (list
"net.ipv4.tcp_keepalive_time"
"net.ipv4.tcp_keepalive_intvl"
"net.ipv4.tcp_keepalive_probes") }}
{{- end }}
{{- range $k, $_ := .Values.extraSysctls }}
{{- if or (eq $k "vm.nr_hugepages") (eq $k "net.ipv4.neigh.default.gc_thresh1") (eq $k "net.ipv4.neigh.default.gc_thresh2") (eq $k "net.ipv4.neigh.default.gc_thresh3") }}
{{- if has $k $builtinSysctls }}
{{- fail (printf "values.yaml: extraSysctls.%s collides with the cozystack preset's built-in machine.sysctls — keys never override (yaml.v3 rejects duplicate map keys on decode). Remove the entry from extraSysctls, or fork the chart preset if you need a different default." $k) }}
{{- end }}
{{- end }}
Expand All @@ -71,6 +105,16 @@ machine:
net.ipv4.neigh.default.gc_thresh1: "4096"
net.ipv4.neigh.default.gc_thresh2: "8192"
net.ipv4.neigh.default.gc_thresh3: "16384"
net.ipv4.tcp_orphan_retries: "3"
net.ipv4.tcp_fin_timeout: "30"
net.core.netdev_max_backlog: "5000"
net.core.netdev_budget: "600"
net.core.netdev_budget_usecs: "8000"
{{- if $.Values.tcpKeepaliveTuning }}
net.ipv4.tcp_keepalive_time: "600"
net.ipv4.tcp_keepalive_intvl: "10"
net.ipv4.tcp_keepalive_probes: "6"
{{- end }}
{{- with .Values.extraSysctls }}
{{- toYaml . | nindent 4 }}
{{- end }}
Expand Down Expand Up @@ -187,6 +231,20 @@ cluster:
- {{ . }}
{{- end }}
{{- end }}
{{- /* etcd backend quota, tunable via values. Raises etcd's 2GiB
default backend ceiling so a LINSTOR-heavy control plane —
thousands of DRBD-resource CRDs in aggregate — does not trip
etcd's NOSPACE alarm and drop into read-only mode. This is a
ceiling, not a reservation: a small cluster's DB stays small
and costs no extra RAM/disk. 8GiB is etcd's documented upper
bound (it warns above that). Blank the value to fall back to
etcd's own default. Note: this governs total DB size, not the
size of any single object — per-object writes are still gated
by kube-apiserver's fixed 3MiB request-body limit. */ -}}
{{- with (.Values.etcd | default dict).quotaBackendBytes }}
extraArgs:
quota-backend-bytes: {{ . | quote }}
{{- end }}
{{- end }}
{{- end }}

Expand Down
41 changes: 36 additions & 5 deletions charts/cozystack/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,31 @@ certSANs: []
nr_hugepages: 0
allocateNodeCIDRs: true

# Opt-in aggressive TCP keepalive tuning, OFF by default. When true the
# preset adds net.ipv4.tcp_keepalive_time=600 / intvl=10 / probes=6,
# reaping a dead idle socket in ~660s instead of the kernel default
# ~2h. These sysctls are kernel-wide: they change failure detection for
# EVERY long-lived idle TCP connection on the node (NFS mounts, DB
# clients, MQ consumers), not just DRBD. DRBD already detects dead peers
# in seconds via its own protocol-level ping (ping-int/ping-timeout), so
# this is a generic socket backstop rather than a DRBD requirement —
# hence opt-in. Enable on clusters where faster node-wide dead-socket
# detection is worth the shorter idle timeout.
tcpKeepaliveTuning: false

# Control-plane etcd tuning. Emitted only on controlplane nodes.
etcd:
# Max etcd backend DB size before etcd raises a NOSPACE alarm and
# rejects writes. The default raises etcd's own 2GiB ceiling to 8GiB
# (etcd's documented upper bound) so a LINSTOR-heavy cluster — many
# DRBD-resource CRDs in aggregate — doesn't exhaust the backend. This
# is a ceiling, not a reservation: a small DB stays small and costs no
# extra RAM/disk, so it is safe to leave at the default. Set to "" to
# fall back to etcd's built-in default. Governs total DB size only —
# single-object writes are gated by kube-apiserver's fixed 3MiB
# request-body limit, which is not configurable.
quotaBackendBytes: "8589934592"

# Operator-supplied extension points: each `extra*` key ADDS to a
# load-bearing default the cozystack preset ships. For every key
# below: leaving it at the empty default is a no-op; the preset's
Expand Down Expand Up @@ -135,11 +160,17 @@ extraKernelModules: []
extraKubeletExtraArgs: {}

# Extra kernel sysctls added to machine.sysctls alongside the
# preset's built-ins (net.ipv4.neigh.default.gc_thresh{1,2,3} and
# vm.nr_hugepages — the latter is gated by the dedicated `nr_hugepages`
# values key above; set THAT key, not extraSysctls.vm.nr_hugepages,
# even when the gate is currently inactive — the collision check
# treats vm.nr_hugepages as preset-owned regardless of the gate state).
# preset's built-ins. Preset-owned keys (collision-protected):
# - net.ipv4.neigh.default.gc_thresh{1,2,3}
# - vm.nr_hugepages (gated by the dedicated `nr_hugepages` key above;
# set THAT key, not extraSysctls.vm.nr_hugepages — the collision
# check treats vm.nr_hugepages as preset-owned regardless of gate)
# - net.ipv4.tcp_orphan_retries, net.ipv4.tcp_fin_timeout,
# net.core.netdev_max_backlog, net.core.netdev_budget,
# net.core.netdev_budget_usecs (always-on DRBD/LINSTOR tuning)
# - net.ipv4.tcp_keepalive_{time,intvl,probes} — preset-owned ONLY
# while `tcpKeepaliveTuning` above is true; when it is false these
# three are free for you to set here.
# Operator keys must be DISJOINT from the built-in set; a collision
# fails the render. Values must be YAML strings (Talos expects
# strings even for numeric sysctls). Example:
Expand Down
47 changes: 46 additions & 1 deletion docs/manual-test-plan.md
Original file line number Diff line number Diff line change
Expand Up @@ -277,7 +277,7 @@ Expected:

- `.machine.kernel.modules` lists the built-in six (`openvswitch`, `drbd` with `usermode_helper=disabled`, `zfs`, `spl`, `vfio_pci`, `vfio_iommu_type1`) AND `nf_conntrack` — append, never override.
- `.machine.kubelet.extraConfig` carries the built-ins (`cpuManagerPolicy: static`, `maxPods: 512`) AND `feature-gates: NodeSwap=true`. Operator keys MUST NOT collide with built-ins; a collision fails the render at template time.
- `.machine.sysctls` carries the built-in `gc_thresh1/2/3` and `vm.nr_hugepages` (when set) AND `net.core.somaxconn`. Same rejection-on-collision rule.
- `.machine.sysctls` carries the built-in `gc_thresh1/2/3`, the always-on DRBD tuning (`net.ipv4.tcp_orphan_retries`, `net.ipv4.tcp_fin_timeout`, `net.core.netdev_max_backlog`, `net.core.netdev_budget`, `net.core.netdev_budget_usecs`), `vm.nr_hugepages` (when set), AND `net.core.somaxconn`. Same rejection-on-collision rule (the DRBD keys are preset-owned too). See B9 for the keepalive triplet and etcd quota.
- `.machine.files[].path` lists `/etc/cri/conf.d/20-customization.part`, `/etc/lvm/lvm.conf`, AND `/etc/example/operator.conf`.

Verify the rejection path explicitly. Set an operator key that collides with a built-in (e.g. `extraSysctls: { "net.ipv4.neigh.default.gc_thresh1": "9000" }`) and re-render:
Expand Down Expand Up @@ -316,6 +316,51 @@ Expected: all four prints "OK". Set any single `extra*` key non-empty, re-render

Regression anchor: contract tests `TestContract_Machine_Extra*_Generic_NonEmptyEmitsBlock` pin the on-state for each block; `TestContract_Machine_NoCozystackOpinionsOnGeneric` pins the off-state at default values. A regression that emits an empty `modules: []` / `sysctls: {}` / `files: []` block in the default render would fail the latter.

### B9. DRBD sysctl tuning, opt-in TCP keepalive, and etcd backend quota (cozystack)

The cozystack preset ships always-on DRBD/LINSTOR network sysctls, an opt-in aggressive TCP-keepalive triplet, and a tunable etcd backend quota. Render the cozystack controlplane preset at defaults:

```bash
talm template -f nodes/controlplane-0.yaml > /tmp/render.yaml
yq '.machine.sysctls' /tmp/render.yaml
yq '.cluster.etcd' /tmp/render.yaml
```

Expected at defaults:

- `.machine.sysctls` includes `net.ipv4.tcp_orphan_retries: "3"`, `net.ipv4.tcp_fin_timeout: "30"`, `net.core.netdev_max_backlog: "5000"`, `net.core.netdev_budget: "600"`, `net.core.netdev_budget_usecs: "8000"` (always on).
- `.machine.sysctls` does NOT contain any `net.ipv4.tcp_keepalive_*` key — the triplet is opt-in (`tcpKeepaliveTuning: false` by default).
- `.cluster.etcd.extraArgs.quota-backend-bytes` is `"8589934592"` (8 GiB) on a controlplane render.

Render the worker preset into `/tmp/render-worker.yaml` and run `yq '.cluster.etcd' /tmp/render-worker.yaml`. Expected: `null` — the whole `etcd` block, and thus `quota-backend-bytes`, is controlplane-only.

Enable the keepalive triplet and lower the etcd quota:

```yaml
# values.yaml additions:
tcpKeepaliveTuning: true
etcd:
quotaBackendBytes: "2147483648" # 2 GiB
```

Re-render and verify:

- `.machine.sysctls` now also carries `net.ipv4.tcp_keepalive_time: "600"`, `net.ipv4.tcp_keepalive_intvl: "10"`, `net.ipv4.tcp_keepalive_probes: "6"`.
- `.cluster.etcd.extraArgs.quota-backend-bytes` is now `"2147483648"`.

Blank the quota to fall back to etcd's own default:

```bash
# values.yaml: etcd: { quotaBackendBytes: "" }
talm template -f nodes/controlplane-0.yaml | yq '.cluster.etcd.extraArgs'
```

Expected: `null` — a blank quota omits the `extraArgs` block entirely rather than emitting an empty value.

Collision check: with `tcpKeepaliveTuning: true`, set `extraSysctls: { "net.ipv4.tcp_keepalive_time": "1200" }` and re-render — the render fails with the `collides with the cozystack preset's built-in machine.sysctls` error (the keepalive keys become preset-owned once the toggle is on). With `tcpKeepaliveTuning: false`, the same `extraSysctls` entry is accepted and renders as the sole keepalive sysctl.

Regression anchor: `TestContract_Machine_Sysctls_DRBDTuning_Cozystack`, `TestContract_Machine_Sysctls_TCPKeepalive_*`, and `TestContract_Cluster_Etcd_QuotaBackendBytes_*` pin every branch above; `TestContract_Machine_Sysctls_DRBDTuning_AbsentOnGeneric` / `TestContract_Cluster_Etcd_QuotaBackendBytes_AbsentOnGeneric` pin that the generic preset stays free of these opinions.

## C. Apply (auth path)

This section is the smoke-test for the apply pipe itself; the per-gate matrix lives in **Section C-safety** below.
Expand Down
59 changes: 59 additions & 0 deletions pkg/engine/contract_cluster_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -220,6 +220,7 @@ func TestContract_Cluster_NoControlplaneBlocksOnWorker(t *testing.T) {
assertNotContains(t, out, "controllerManager:")
assertNotContains(t, out, "scheduler:")
assertNotContains(t, out, "etcd:")
assertNotContains(t, out, "quota-backend-bytes")
assertNotContains(t, out, "allowSchedulingOnControlPlanes")
assertNotContains(t, out, "proxy:")
assertNotContains(t, out, "discovery:")
Expand Down Expand Up @@ -371,6 +372,64 @@ func TestContract_Cluster_DiscoveryDisabled_Cozystack(t *testing.T) {
}
}

// Contract: with default values, every cozystack control-plane render
// carries an etcd.extraArgs block whose quota-backend-bytes is
// "8589934592" (8GiB, the values.etcd.quotaBackendBytes default). That
// raises etcd's own 2GiB ceiling so a LINSTOR-heavy cluster does not
// trip the NOSPACE alarm. The value is pinned in its quoted form
// because Talos etcd extraArgs are string maps. The block lives inside
// the controlplane guard, so worker configs never carry it.
func TestContract_Cluster_Etcd_QuotaBackendBytes_Default_Cozystack(t *testing.T) {
	// Fragments are checked in this order for every matrix cell.
	wantFragments := []string{
		"etcd:",
		"extraArgs:",
		`quota-backend-bytes: "8589934592"`,
	}

	for _, tc := range cozystackControlplaneCells() {
		t.Run(tc.name, func(t *testing.T) {
			rendered := renderChartTemplate(t, tc.chartPath, tc.templateFile, tc.talosVersion)
			for _, fragment := range wantFragments {
				assertContains(t, rendered, fragment)
			}
		})
	}
}

// Contract: the quota is tunable — an operator override on
// etcd.quotaBackendBytes renders verbatim, so a small cluster can lower
// the ceiling (or a larger one raise it) without forking the chart.
func TestContract_Cluster_Etcd_QuotaBackendBytes_Tunable_Cozystack(t *testing.T) {
out := renderCozystackWith(t, helmEngineEmptyLookup, map[string]any{
"advertisedSubnets": []any{testAdvertisedSubnet},
"etcd": map[string]any{
"quotaBackendBytes": "2147483648",
},
})
assertContains(t, out, `quota-backend-bytes: "2147483648"`)
assertNotContains(t, out, `quota-backend-bytes: "8589934592"`)
}

// Contract: blanking etcd.quotaBackendBytes omits the extraArg entirely
// (falls back to etcd's built-in default) rather than emitting an empty
// or malformed value. The whole etcd.extraArgs block is gated on a
// non-empty quota.
func TestContract_Cluster_Etcd_QuotaBackendBytes_OmittedWhenBlank_Cozystack(t *testing.T) {
out := renderCozystackWith(t, helmEngineEmptyLookup, map[string]any{
"advertisedSubnets": []any{testAdvertisedSubnet},
"etcd": map[string]any{
"quotaBackendBytes": "",
},
})
assertContains(t, out, "etcd:")
assertNotContains(t, out, "quota-backend-bytes")
}

// Contract: the generic preset stays free of any etcd quota opinion.
// If the cozystack default ever leaked into the generic helper, the
// quota-backend-bytes key would show up in a generic control-plane
// render and this test would fail.
func TestContract_Cluster_Etcd_QuotaBackendBytes_AbsentOnGeneric(t *testing.T) {
	const forbiddenKey = "quota-backend-bytes"

	for _, tc := range genericControlplaneCells() {
		t.Run(tc.name, func(t *testing.T) {
			rendered := renderChartTemplate(t, tc.chartPath, tc.templateFile, tc.talosVersion)
			assertNotContains(t, rendered, forbiddenKey)
		})
	}
}

// === generic-only contracts: pin that generic stays minimal ===

// Contract: generic chart does NOT expose clusterDomain in values.yaml
Expand Down
Loading
Loading