diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml
index e9148530..d0a96a78 100644
--- a/.github/workflows/pr.yml
+++ b/.github/workflows/pr.yml
@@ -44,7 +44,7 @@ jobs:
         uses: golangci/golangci-lint-action@v7
         with:
           version: v2.12.2
-          args: --timeout=5m
+          args: --timeout=10m
 
   dco:
     runs-on: ubuntu-latest
diff --git a/README.md b/README.md
index fc3ac6fb..a0092369 100644
--- a/README.md
+++ b/README.md
@@ -170,6 +170,18 @@ cluster:
 
 > **Version compatibility (`templateOptions.talosVersion` / `--talos-version`).** This setting must match the **Talos version actually running on the target node** — i.e. the maintenance ISO/PXE the node booted from for `apply -i`, or the installed Talos for an authenticated apply. It is **not** the same as `install.image`, which only controls what gets written to disk after a successful apply. When the configured contract is newer than the running binary, machinery injects fields (e.g. `machine.install.grubUseUKICmdline` from v1.12) that the running parser does not know, and the apply fails on the node side with `failed to parse config: unknown keys found during decoding: ...`. `talm apply` runs a best-effort pre-flight check against the running version and prints a `warning: pre-flight: ...` line with a hint when it detects this mismatch; if the warning is missed, the same hint is appended to the apply error. Either reboot the node into a maintenance image that matches the configured contract, or lower `templateOptions.talosVersion` / `--talos-version` to match what is running.
 
+> **Apply-time safety gates.** `talm apply` and `talm upgrade` run additional gates around each operation:
+>
+> 1. **Declared-resource existence** (`--skip-resource-validation` opt-out, default on). Before sending the config to the node, the gate walks the rendered MachineConfig, extracts every reference to a host-side resource (network links from v1.12 multi-doc — `LinkConfig.name`, `BondConfig.links[]`, `VLANConfig.parent`, `BridgeConfig.links[]`, `Layer2VIPConfig.link`, `HCloudVIPConfig.link`, `DHCPv4Config.name` / `DHCPv6Config.name` / `EthernetConfig.name`; v1.11 legacy `machine.network.interfaces[].interface`; install disk via `machine.install.disk` literal or `machine.install.diskSelector`; `UserVolumeConfig.provisioning.diskSelector`), and verifies each against the node's COSI `LinkStatus`/`Disk` snapshots. A reference that doesn't resolve fails the apply with a `[blocker]` line listing the available names so the typo or migration miss is fixable from the values without re-running discovery. Disk selectors must match at least one (non-readonly, non-CDROM, non-virtual) disk — zero matches block, multiple matches warn (install picks the first). Virtual-link-creator documents (`BondConfig.name`, `VLANConfig.name`, `BridgeConfig.name`, `WireguardConfig.name`, `DummyLinkConfig.name`, `LinkAliasConfig.name`) are intentionally NOT validated against existing links — those `.name` fields describe new virtual links the apply is creating, not references to pre-existing host resources. Out of scope today: `machine.disks[].device` (extra-disk partitioning); track in a follow-up if you need it. Pass `--skip-resource-validation` for recovery into a maintenance image with mismatched hardware or pre-staging values for hardware that isn't installed yet.
+>
+> 2. **Pre-apply drift preview** (`--skip-drift-preview` opt-out, default on). Reads the node's current MachineConfig via COSI and prints a `+`/`-`/`~`/`=` diff of what's about to change, keyed by `(kind, name)`. Informational only — never blocks. The `-` lines are the most useful: they surface stale documents from a previous apply that the new render no longer emits (e.g. an `eth1` LinkConfig lingering after a migration to `eth0`). Reading the current config requires the auth path — `MachineConfig` is a Sensitive COSI resource and is unreachable on the `--insecure` maintenance connection; the gate prints `drift verification unavailable on maintenance connection` and proceeds in that case. **`--dry-run` runs this gate** — the diff is read-only and "show me what would change" is exactly the dry-run contract.
+>
+> 3. **Post-apply state verification** (`--skip-post-apply-verify` opt-out, **default off** until the Talos-mutated-field allowlist lands — see [#172](https://github.com/cozystack/talm/issues/172)). After `ApplyConfiguration` returns success, re-reads the on-node MachineConfig and structurally compares it against the bytes that were sent. Divergence blocks the apply chain with a per-document diff, primarily catching silent doc drops (Talos parser ignored an unknown field) and controller reverts. Disabled by default because Talos mutates a handful of leaf fields post-apply (cert hashes, timestamps) that would surface as false-positive divergence without an allowlist. The verify runs only on `--mode=no-reboot`. `--mode=staged`, `--mode=try`, `--mode=reboot`, and `--mode=auto` all skip the gate — each for a documented reason: staged stores rather than activates; try auto-rolls back; reboot kills the COSI connection mid-verify; auto is promoted by Talos to REBOOT internally when the change requires it, so the verify would race the reboot. `--dry-run` skips it too.
+>
+> 4. **Post-upgrade version verify** (`--skip-post-upgrade-verify` opt-out, default on — the gate runs). After `talm upgrade` reports success, waits 90s for the node to finish booting then reads `runtime.Version` COSI and compares the running version's `(Major, Minor)` contract against the contract parsed from the target image tag. Point releases share a minor contract; cross-minor mismatch surfaces as a hint-bearing blocker. Catches the silent A/B rollback case where the upgrade RPC acks success but Talos rolled back to the previous partition (cross-vendor image, missing extensions, failed boot readiness check, slow boot exceeding the reconcile window). Best-effort surrender on digest-pinned images and unparseable tags. See [#175](https://github.com/cozystack/talm/issues/175) for the reproduction.
+>
+> The skip flags don't suppress each other — pass them independently. On the `--insecure` (maintenance) path the gates are functionally unreachable for charts that drive discovery via `lookup` — those COSI lookups require an authenticated connection and the render itself errors before any gate runs. Charts that render fully offline (no `lookup` calls) reach the gates on `--insecure` as well, with the Phase 2 hooks degrading gracefully because the `MachineConfig` resource is Sensitive.
+
 Apply config:
 ```bash
 talm apply -f nodes/node1.yaml -i
diff --git a/docs/apply-safety-gates-test-plan.md b/docs/apply-safety-gates-test-plan.md
new file mode 100644
index 00000000..72d894c9
--- /dev/null
+++ b/docs/apply-safety-gates-test-plan.md
@@ -0,0 +1,208 @@
+# Apply-time safety gates: test plan
+
+A reference checklist for validating changes to the apply-time safety gates introduced in #172 / PR #173. Covers the contract tests that ship with the package plus the manual real-Talos validation steps that surface issues unit tests cannot.
+
+## Build under test
+
+```bash
+cd ~/git/github.com/cozystack/talm && go build -o /tmp/talm-safety ./
+```
+
+Run all matrix cells against the binary at `/tmp/talm-safety`. Use a 3-node Talos v1.12.6 cluster for live runs (any OCI / cloud / bare-metal stand with reachable talosconfig works).
+
+## Phase 1 — declared-resource existence
+
+### Link references
+
+| Case | How to trigger | Expected |
+| --- | --- | --- |
+| Typoed `LinkConfig.name` | Add `LinkConfig{name: eth9999}` to a node body | `[blocker] declared link "eth9999" not found …` plus available-links hint |
+| Typoed bond slave | Add `BondConfig{name: bond0, links: [ghost0, ens5]}` | Blocker on `ghost0` only; `bond0` (new bond) NOT flagged |
+| Typoed VLAN parent | Add `VLANConfig{name: ens5.99, parent: ghost0, vlanID: 99}` (YAML key is `parent`, NOT `link` — `vlan.go ParentLinkConfig`) | Blocker on `parent: ghost0`; `name: ens5.99` not flagged (new VLAN) |
+| Typoed bridge slave | Add `BridgeConfig{name: br99, links: [ghost0]}` (YAML key is `links`, NOT `ports` — `bridge.go BridgeLinks`) | Blocker on `ghost0` only; `br99` not flagged |
+| Typoed Layer2VIP link | Set `vipLink: ghost0` in values | Blocker on `link: ghost0` |
+| Legacy v1.11 interface | `machine.network.interfaces[].interface: eth9999` | Blocker; same hint shape |
+
+### Disk references
+
+| Case | How to trigger | Expected |
+| --- | --- | --- |
+| Bad literal disk | Set `machine.install.disk: /dev/sdz` | Blocker, hint lists real disks (sda, sdb) — **must omit** virtual class (dm-*, drbd*, loop*) |
+| Bad model selector | `diskSelector: {model: Samsumg}` | Blocker "matches zero disks", hint lists candidate disks with size |
+| Impossible size | `diskSelector: {size: ">= 99TB"}` | Blocker "matches zero disks" |
+| Lowercase units | `diskSelector: {size: ">= 100gb"}` | Matches as if `>= 100GB` (humanize.ParseBytes case-insensitive) |
+| Mixed case + spaces | `diskSelector: {size: "<= 200000MiB"}` | Parsed correctly |
+| Multiple matches | `diskSelector: {type: ssd}` on host with several SSDs | Warning (not blocker) "matches multiple disks; install picks the first match" |
+| Type semantics | `type: nvme/sd/hdd/ssd` per-disk Transport+Rotational | Mirror Talos `v1alpha1_provider.go:1325-1351` mapping |
+| Readonly excluded | Selector + a readonly disk on host | Readonly disk not counted as match |
+| CDROM excluded | Selector + a CD drive on host | CD not counted as match |
+| Virtual excluded | Selector on cozystack host with many dm/drbd/loop | dm/drbd/loop not counted; hint omits them |
+
+### Hint length budget
+
+| Case | Trigger | Expected |
+| --- | --- | --- |
+| Few candidates (≤10) | Storage host with 4 disks, bad selector | Hint lists all candidates inline; no `... and N more` suffix |
+| Many candidates (>10) | Host with 25+ links (bonds, VLANs, bridges) + bad link ref | Hint shows first 10 alphabetically; tail collapsed as `... and 15 more`; total chars on the hint line stays under ~400 |
+| Boundary case (exactly 11) | 11 links on the host, bad ref | First 10 inline + `... and 1 more` (the suffix fires at >10, not at >=10) |
+| Empty candidate set | Selector matches zero, no real candidates either (mock) | Hint says `<none>` rather than empty trailing space |
+
+### Opt-out
+
+| Case | Trigger | Expected |
+| --- | --- | --- |
+| `--skip-resource-validation` | Pass with bad selector + bad link | No Phase 1 output; render proceeds |
+
+## Phase 2A — pre-apply drift preview
+
+### Diff classification
+
+| Case | Trigger | Expected |
+| --- | --- | --- |
+| Identical desired/on-node | First-run apply after the same render | `0 addition, 0 removal, 0 update, N unchanged.` |
+| Removed doc | Apply config that drops a previously-emitted doc (e.g. dropping a LinkConfig that was on-node) | `- LinkConfig{name: …}` line |
+| Added doc | Apply config that adds a fresh doc | `+ LinkConfig{name: …}` line |
+| Updated leaf | Change one nested field (e.g. `clusterDomain`) | `~ MachineConfig` plus `cluster.network.dnsDomain: cozy.local -> cozy.example` |
+| Identical inputs include Equal entries | Verified via Diff API; OpEqual entries returned, FilterChanged drops them | — |
+| Distinguish absent vs null | YAML `extraField: null` added to one side | FieldChange.HasOld=false / HasNew=true; formatter renders `(absent) -> <nil>` |
+| Stable ordering | Re-run on same inputs | Identical output bytes |
+
+### Path / mode interactions
+
+| Case | Trigger | Expected |
+| --- | --- | --- |
+| Dry-run shows preview | `talm apply --dry-run -f node.yaml` | Phase 2A runs; this is the "show me what would change" contract |
+| `--mode=no-reboot` | `talm apply --mode=no-reboot -f node.yaml` | Phase 2A runs |
+| `--mode=auto` | `talm apply --mode=auto -f node.yaml` | Phase 2A runs |
+| `--mode=reboot` | `talm apply --mode=reboot -f node.yaml` | Phase 2A runs (preview is read-only and shows what the reboot will activate) |
+| `--mode=staged` | `talm apply --mode=staged -f node.yaml` | Phase 2A runs (operator still wants to see what got staged) |
+| `--mode=try` | `talm apply --mode=try -f node.yaml` | Phase 2A runs (mirrors --mode=auto from the preview's perspective) |
+| Insecure path | `talm apply -i -f node.yaml` (where chart can render offline) | `talm: drift verification unavailable on maintenance connection`; no block |
+| `--skip-drift-preview` | Pass with any change | Preview suppressed entirely |
+
+### Output pretty-print
+
+| Case | Trigger | Expected |
+| --- | --- | --- |
+| Scalar field change | Change `clusterDomain` in values | `cluster.network.dnsDomain: cozy.local -> cozy.example` inline |
+| Map field change | Add a key to `machine.nodeLabels` | YAML flow mapping `{role: control-plane, tier: primary}`, NOT `map[role:control-plane tier:primary]` |
+| Multi-element slice add-or-remove | Add a SAN to `cluster.apiServer.certSANs` | `cluster.apiServer.certSANs: added [192.0.2.6]` — set-diff form, NOT a full-slice dump |
+| Duplicate-cleanup on slice | Remove a duplicate entry from `certSANs` | `cluster.apiServer.certSANs: removed [127.0.0.1]` — multiset semantics surface a single removal |
+| Both add and remove | Replace one SAN with another | `cluster.apiServer.certSANs: removed [old.example], added [new.example]` |
+| Reorder-only change | Same elements in a different order | `cluster.apiServer.certSANs: reordered (3 element(s))` — explicit signal, NOT silent OpUpdate |
+| Slice appearing from absent | Field added that wasn't there before | `(absent) -> [a, b, c]` — flow-style list, NOT `[a b c]` |
+
+## Phase 2B — post-apply state verification
+
+Default off until the Talos-mutated-field allowlist lands. Enable explicitly with `--skip-post-apply-verify=false`.
+
+| Case | Trigger | Expected |
+| --- | --- | --- |
+| Clean apply | Apply config matching on-node, `--skip-post-apply-verify=false` | Silent success (no output, no error) |
+| Mode=staged | `--mode=staged --skip-post-apply-verify=false` | Phase 2B skipped (staged store doesn't change ActiveID) |
+| Mode=try | `--mode=try --skip-post-apply-verify=false` | Phase 2B skipped (rollback timer races verify) |
+| Mode=reboot | `--mode=reboot --skip-post-apply-verify=false` | Phase 2B skipped (reboot kills the COSI connection mid-verify) |
+| Mode=auto | `--mode=auto --skip-post-apply-verify=false` | Phase 2B skipped — Talos promotes AUTO to REBOOT internally when the change requires it, so the verify would race the reboot (same shape as the explicit REBOOT skip). Acceptable cost: AUTO applies that don't reboot also lose their verify; pass `--mode=no-reboot` to opt back in |
+| Mode=no-reboot | Real apply with verify enabled | Phase 2B runs (the only mode where the verify is guaranteed to reach a stable post-apply ActiveID) |
+| Dry-run | `--dry-run --skip-post-apply-verify=false` | Phase 2B skipped (no real apply) |
+| Reader error | Simulated COSI hiccup on auth path | Hint-bearing blocker `post-apply: re-reading on-node MachineConfig`, exit non-zero (the gate is here to catch silent rollbacks — error is not swallowed) |
+| Insecure path | `talm apply -i --skip-post-apply-verify=false` | `drift verification unavailable on maintenance connection` line; no block |
+
+## Phase 2C — post-upgrade version verify
+
+On by default for `talm upgrade`. The gate fires after talosctl upgrade returns success, waits 90s for the node to finish booting, reads `runtime.Version` COSI, and compares the running version's contract against the contract parsed from the target image tag. Catches the silent A/B rollback case where Talos rolls back to the previous partition (cross-vendor image, missing extensions, failed boot readiness check) yet talosctl's RPC already acked.
+
+| Case | Trigger | Expected |
+| --- | --- | --- |
+| Same-minor upgrade | `talm upgrade -f node.yaml` to a same-minor image (e.g. v1.12.6 -> v1.12.7) | Silent success; contracts match at minor level |
+| Cross-minor mismatch | upgrade to `siderolabs/installer:v1.13.0` on a node that rolls back to v1.12 | Hint-bearing blocker citing both versions + two-hypothesis hint (rollback OR slow boot) |
+| `--skip-post-upgrade-verify` | Pass with any image | Phase 2C suppressed entirely |
+| `--insecure` upgrade | Maintenance path — auth-only COSI unreachable | Phase 2C skipped entirely (hard early-return in `shouldRunPostUpgradeVerify(insecure=true, …)`). Distinct from the Phase 2A/2B insecure path, which still calls the COSI reader and degrades gracefully with a "drift verification unavailable on maintenance connection" line — Phase 2C drops the call site itself because there is no graceful degradation path for "verify the version after upgrade" without auth |
+| `--stage` upgrade | New partition not yet activated until reboot — `runtime.Version` would always be the OLD value | Phase 2C skipped via `shouldRunPostUpgradeVerify(staged=true, …)`; guaranteed false positive without skip |
+| Digest-pinned image | `--image foo/bar@sha256:abc...` | Phase 2C surrenders silently (no tag to parse the target version from) |
+| Image with no tag | `--image foo/bar` | Phase 2C surrenders silently |
+| Slow boot beyond 90s | Cold OCI instance or large image pull | Same blocker as cross-minor mismatch BUT hint instructs the operator to re-run `talm get version` after a minute to distinguish a slow boot from a real rollback |
+| Real read failure (connection refused, RPC error) | Reader returns `("", false, err)` | Hint-bearing blocker — a node that auto-rolled back or hung mid-boot looks like "connection refused" from the COSI client, so the read failure IS the rollback signal. Same two-hypothesis hint as a detected mismatch (rollback OR slow boot). The blocker wraps the underlying err so the operator sees the cause |
+| By-design unreachable | Reader returns `("", false, nil)` (cosiVersionReader does not produce this; reserved for future custom readers that need to surrender silently) | Soft warning line `post-upgrade verification skipped, could not read running version from the node`, no block. Distinguishable from the real-read-failure case via the err — three-valued contract makes the contract explicit |
+| Zero target nodes | `--nodes` empty and talosconfig context has no nodes either | Explanatory "skipped, no target nodes resolved" line (no silent no-op) |
+| Reconcile wait line | Any non-skipped run | "post-upgrade verify: waiting 1m30s for the node to finish booting..." printed up front so the operator's terminal isn't a mystery hang |
+
+## Real-Talos validation
+
+Before requesting human review, exercise the gates against a live Talos node.
+
+### Setup
+
+```bash
+cd /path/to/your/talm/values/tree
+```
+
+Use a 3-node Talos cluster (replace placeholders below with your own node IPs — examples use RFC 5737 documentation ranges). The vendored talm library in `charts/talm/` may need `talm init --update --preset cozystack` to pick up new helpers; pass `--force` for non-interactive refresh (CI, scripted), or run under a tty for the interactive prompt.
+
+### Sanity check
+
+```bash
+/tmp/talm-safety template -f nodes/node0.yaml > /tmp/rendered.yaml
+test -s /tmp/rendered.yaml || echo "render failed"
+```
+
+### Phase 1 (auth path)
+
+```bash
+# Clean run — should silently pass:
+/tmp/talm-safety apply --dry-run -f nodes/node0.yaml
+
+# Inject a bad link ref (cp + edit a temp file inside the talm project):
+cp nodes/node0.yaml nodes/_test-bad.yaml
+echo -e "---\napiVersion: v1alpha1\nkind: LinkConfig\nname: eth9999" >> nodes/_test-bad.yaml
+/tmp/talm-safety apply --dry-run -f nodes/_test-bad.yaml  # expect [blocker]
+rm nodes/_test-bad.yaml
+```
+
+### Phase 2A (drift preview)
+
+```bash
+# Dry-run against a clean cluster — should report 0/0/0 unchanged:
+/tmp/talm-safety apply --dry-run -f nodes/node0.yaml
+
+# Force a leaf change via values.yaml (back up then revert):
+sed -i.bak 's/^clusterDomain: .*/clusterDomain: cozy.example/' values.yaml
+/tmp/talm-safety apply --dry-run -f nodes/node0.yaml | grep -E "^  [+\-~=]|^      "
+mv values.yaml.bak values.yaml
+```
+
+### Phase 2B (real apply with verify enabled)
+
+```bash
+/tmp/talm-safety apply --mode=no-reboot --skip-post-apply-verify=false -f nodes/node0.yaml
+# Expected: drift preview + 'Applied configuration without a reboot' + silent post-apply verify
+```
+
+### Multi-node + mix
+
+```bash
+/tmp/talm-safety apply --dry-run -f nodes/node0.yaml -f nodes/node1.yaml -f nodes/node2.yaml
+# Each node renders its own preview; per-node independence.
+```
+
+### Insecure path
+
+`talm apply -i` exercises the maintenance connection. The cozystack reference chart uses live discovery (`lookup "disks"`), which fails on insecure (no auth for COSI). The render errors before the gate runs — that's existing talm behaviour, not a regression.
+
+## Implementation health
+
+Run as part of every push:
+
+```bash
+go test ./...
+go test -race ./pkg/applycheck/... ./pkg/commands/...
+golangci-lint run ./...
+GOOS=windows golangci-lint run ./...
+go vet ./...
+```
+
+## Known limitations / follow-ups
+
+- **Talos-mutated-field allowlist** (open in #172): Phase 2B reports cert hashes / timestamps as divergence today; the verify is off by default until an allowlist lands.
+- **`talm upgrade` has no pre-upgrade gates** (Phase 2C runs *after*, not before): the upgrade flow wraps `talosctl upgrade` and doesn't route through `buildApplyClosure` / `applyOneFileDirectPatchMode`, so Phase 1 / Phase 2A do not run. Phase 2C (post-upgrade version verify) was added precisely to catch the silent-rollback class without that refactor. Full pre-upgrade gates would require reproducing the gate calls in `upgrade_handler.go` or refactoring the apply flow.
+- **Phase 1/2 on `--insecure`**: the safety gates can't run before the chart renders, and the chart's `lookup` calls need an authenticated COSI connection. Insecure path = effectively no gates today.
diff --git a/docs/manual-test-plan.md b/docs/manual-test-plan.md
new file mode 100644
index 00000000..b746b888
--- /dev/null
+++ b/docs/manual-test-plan.md
@@ -0,0 +1,781 @@
+# talm manual test plan
+
+A QA-oriented matrix for exercising `talm` end-to-end against a real Talos cluster. Designed to surface bugs that unit + contract tests miss — encoding edge cases, real-disk topology quirks, multi-node interactions, recovery flows.
+
+The narrow apply-safety-gates checklist lives at [`apply-safety-gates-test-plan.md`](./apply-safety-gates-test-plan.md); this document covers the whole CLI surface.
+
+## How to use this plan
+
+1. Build the binary under test:
+
+   ```bash
+   cd ~/git/github.com/cozystack/talm && go build -o /tmp/talm-safety ./
+   ```
+
+2. Have a reachable Talos cluster (3 controlplane nodes recommended so you can exercise reset / etcd-member-removal without losing quorum). A small OCI / cloud / bare-metal v1.12.x stand is enough.
+
+3. Work through the sections below in order. Each section lists a command, the expected outcome, and the failure modes to watch for. **Regression anchors** at the bottom of some sections call out specific behaviours to assert against — formulated as forward-looking checks the operator runs, not as a retrospective of past findings.
+
+4. After every destructive action (reset / wipe / rotate-ca) run the sanity-check block at the end of this document.
+
+## Conventions
+
+- `$NODE` is the node's reachable IP. Use the same value for `--nodes` and `--endpoints` unless you're exercising a multi-node bug.
+- `$TALM_REPO` is `~/git/github.com/cozystack/talm`.
+- `$PROJECT` is your talm project root (a directory with `Chart.yaml` and `nodes/`).
+- `--dry-run` works on `apply` and `rotate-ca`. Use it first whenever the command can mutate cluster state.
+
+## A. Project bootstrap
+
+### A1. `talm init` from scratch
+
+```bash
+mkdir -p /tmp/talm-init-test && cd /tmp/talm-init-test
+/tmp/talm-safety init --preset cozystack --name test-cluster \
+  --endpoints https://192.0.2.1:6443
+```
+
+Expected: creates `Chart.yaml`, `charts/talm/`, `templates/`, `nodes/`, `secrets.yaml` + `secrets.encrypted.yaml`, `talosconfig` + `talosconfig.encrypted`, `talm.key`, `values.yaml`. Prints the "Security Information" banner reminding the operator to back up `talm.key`.
+
+Watch for:
+
+- Missing files in the listing.
+- `talm.key` written without the security-information banner.
+- `.gitignore` not updated.
+
+### A2. `talm init` second run without `--force`
+
+```bash
+cd /tmp/talm-init-test
+/tmp/talm-safety init --preset cozystack --name test-cluster
+```
+
+Expected: error citing each conflicting file, hint mentioning both `--force` and `--update`. Exit non-zero.
+
+### A3. `talm init --update --preset cozystack` non-tty
+
+```bash
+cd $PROJECT
+/tmp/talm-safety init --update --preset cozystack < /dev/null
+```
+
+Expected: hint-bearing error pointing at `--force`. NOT a raw `reading interactive overwrite confirmation: EOF`.
+
+### A4. `talm init --update --preset cozystack --force` non-tty
+
+```bash
+cd $PROJECT
+/tmp/talm-safety init --update --preset cozystack --force < /dev/null
+```
+
+Expected: one `Overwriting <path> (--force)` line per diff; no prompt; exit zero.
+
+### A5. Encrypt / decrypt round-trip
+
+```bash
+cd /tmp/talm-init-test
+/tmp/talm-safety init --decrypt
+test -f secrets.yaml && test -f talosconfig
+/tmp/talm-safety init --encrypt
+test -f secrets.encrypted.yaml && test -f talosconfig.encrypted
+```
+
+Expected: per-file `Decrypting X -> Y` / `Encrypting X -> Y` lines; both round-trips succeed.
+
+### A6. Decrypt without `talm.key`
+
+```bash
+cd /tmp/talm-init-test && mv talm.key /tmp/talm.key.backup
+/tmp/talm-safety init --decrypt
+mv /tmp/talm.key.backup talm.key
+```
+
+Expected: clear error mentioning the missing key path.
+
+### A7. Cleanup
+
+```bash
+rm -rf /tmp/talm-init-test
+```
+
+**Regression anchor**: A6's error must reference the missing-key path explicitly. A bare `read key file: open ...: no such file or directory` with no follow-up hint is a UX regression — the operator should see a clear recovery path (`run \`talm init\` to generate a new key, or restore your backup`).
+
+## B. Render / template
+
+### B1. Happy-path render
+
+```bash
+cd $PROJECT
+/tmp/talm-safety template -f nodes/node0.yaml | head -10
+```
+
+Expected: rendered MachineConfig YAML starting with the project modeline. Exit zero.
+
+### B2. Render with CLI override
+
+```bash
+/tmp/talm-safety template -f nodes/node0.yaml \
+  --set clusterDomain=overridden.local | grep dnsDomain
+```
+
+Expected: `dnsDomain: overridden.local` appears in output.
+
+### B3. Render against missing file
+
+```bash
+/tmp/talm-safety template -f nodes/_doesnotexist.yaml
+```
+
+Expected: clear error with hint about the missing path. Exit non-zero.
+
+### B4. In-place rewrite (`-I`)
+
+```bash
+cp nodes/node0.yaml /tmp/inplace-before.yaml
+/tmp/talm-safety template -I -f nodes/node0.yaml
+diff /tmp/inplace-before.yaml nodes/node0.yaml
+cp /tmp/inplace-before.yaml nodes/node0.yaml  # restore
+```
+
+Expected: `Updated.` on stdout. The diff shows that the rendered body replaces the previous contents — **including any user-added comments**. This is by design (`-I` is rewrite, not merge), but operators often trip on it; note in your test report.
+
+### B5. Render with stale chart preset
+
+When the local `charts/talm/` is older than the talm binary's embedded preset, `talm template` succeeds against the local preset — it does NOT auto-bump. The operator must run `init --update`. Confirm by inspecting `talm version` against `Chart.yaml`.
+
+**Regression anchor**: `template -I` is rewrite, not merge — verify by adding a `# my comment` line above the modeline in `nodes/node0.yaml`, running B4, and confirming the comment is GONE in the new body. If the comment survives, a behaviour change shipped (could be either an intentional new `--preserve-comments` flag or an undocumented merge mode — neither should appear silently).
+
+## C. Apply (auth path)
+
+The apply-safety gates are detailed in [`apply-safety-gates-test-plan.md`](./apply-safety-gates-test-plan.md). This section is the minimal smoke-test for the apply pipe itself.
+
+### C1. Dry-run apply
+
+```bash
+/tmp/talm-safety apply --dry-run -f nodes/node0.yaml
+```
+
+Expected: drift-preview section, then `Dry run summary:` and the diff the apply would produce. Exit zero.
+
+### C2. Real apply, no-reboot mode
+
+```bash
+/tmp/talm-safety apply --mode=no-reboot \
+  --skip-post-apply-verify=false -f nodes/node0.yaml
+```
+
+Expected: drift preview, `Applied configuration without a reboot`. Phase 2B is silent on a clean apply.
+
+### C3. Multi-file apply
+
+```bash
+/tmp/talm-safety apply --dry-run \
+  -f nodes/node0.yaml -f nodes/node1.yaml -f nodes/node2.yaml
+```
+
+Expected: each node renders / diffs independently; per-node gate output sections.
+
+### C4. Stage mode
+
+```bash
+/tmp/talm-safety apply --mode=staged --skip-post-apply-verify=false \
+  -f nodes/node0.yaml
+```
+
+Expected: Phase 2B auto-skipped (staged config doesn't change ActiveID); output ends with `Staged configuration to be applied after the next reboot`.
+
+## D. Apply (insecure / maintenance path)
+
+### D1. Apply with chart that uses discovery
+
+```bash
+/tmp/talm-safety apply -i --dry-run -f nodes/node0.yaml
+```
+
+Expected: render fails because `lookup "disks"` / `lookup "links"` require auth. Hint mentions reachability.
+
+### D2. Drift-preview degrade on insecure path (when render succeeds)
+
+When a chart renders fully offline (no `lookup`), `talm apply -i` runs through to the gates. Phase 2A prints `drift verification unavailable on maintenance connection` and proceeds; Phase 2B same.
+
+**Regression anchor**: D2's offline-renderable behaviour is also covered by unit-level mocking — see `pkg/commands/preflight_apply_safety_test.go` for the in-process equivalent. Surface that file's tests in the manual suite when D2 is impractical to exercise live.
+
+## E. Upgrade
+
+### E1. Stage an upgrade to the same image
+
+```bash
+/tmp/talm-safety upgrade --stage -f nodes/node0.yaml
+```
+
+Note: `--stage --wait` (the default) actually triggers a reboot to apply the staged upgrade. Plan for a 1-2 minute outage of the node under test. The cluster should stay healthy if you have 3+ control plane nodes and other nodes hold quorum.
+
+Expected: events stream from BOOTING through `post check passed`. Node returns to running state.
+
+### E2. Upgrade with bad image
+
+```bash
+/tmp/talm-safety upgrade --image ghcr.io/cozystack/cozystack/talos:doesnotexist \
+  --stage -f nodes/node0.yaml
+```
+
+Expected: `error validating installer image ... not found`. Talos itself catches this; talm passes through the error.
+
+## F. CA rotation
+
+### F1. Rotate CA dry-run
+
+```bash
+/tmp/talm-safety rotate-ca --dry-run --nodes $NODE --endpoints $NODE
+```
+
+Expected: every per-step line ends with `(dry-run)`; final line mentions `re-run with \`--dry-run=false\` to apply the changes`. Possibly trailing `failed to create new client with rotated Talos CA` — harmless under dry-run.
+
+### F2. Real CA rotation
+
+```bash
+/tmp/talm-safety rotate-ca --dry-run=false --nodes $NODE --endpoints $NODE
+```
+
+Expected: `CA rotation completed successfully!`. `secrets.yaml`, `secrets.encrypted.yaml`, `talosconfig`, `kubeconfig` updated on disk.
+
+### F3. Apply after rotation
+
+```bash
+/tmp/talm-safety apply --dry-run -f nodes/node0.yaml
+```
+
+Expected: works against the rotated CA. No `tls: certificate required` errors.
+
+## G. META partition
+
+### G1. Read / list
+
+```bash
+/tmp/talm-safety get metakey --nodes $NODE --endpoints $NODE
+```
+
+Expected: table of META keys with their values.
+
+### G2. Write a test key
+
+```bash
+/tmp/talm-safety meta write 0x0a "test-value" --nodes $NODE --endpoints $NODE
+/tmp/talm-safety get metakey 0x0a --nodes $NODE --endpoints $NODE
+```
+
+Expected: written; reads back the value.
+
+### G3. Delete the test key
+
+```bash
+/tmp/talm-safety meta delete 0x0a --nodes $NODE --endpoints $NODE
+/tmp/talm-safety get metakey 0x0a --nodes $NODE --endpoints $NODE
+```
+
+Expected: delete succeeds; read returns `NotFound`.
+
+## H. Reset and recovery
+
+### H1. Bootstrap on running cluster
+
+```bash
+/tmp/talm-safety bootstrap --nodes $NODE --endpoints $NODE
+```
+
+Expected: refuses with `etcd data directory is not empty`.
+
+### H2. Reset a control-plane node (graceful + reboot, system labels only)
+
+⚠️ Destructive. Run only against a cluster you can afford to lose one node from. Requires `--system-labels-to-wipe=STATE` (and optionally `EPHEMERAL`) for a recoverable reset — `--wipe-mode=all` (the default) removes META too, which makes self-recovery impossible.
+
+```bash
+/tmp/talm-safety reset --graceful=true --reboot \
+  --system-labels-to-wipe=STATE \
+  --system-labels-to-wipe=EPHEMERAL \
+  --nodes $NODE --endpoints $OTHER_NODE
+```
+
+Expected: etcd member departs (`talm etcd members` from another node shows 2 members), node reboots, `post check passed`.
+
+### H3. Etcd quorum after reset
+
+```bash
+/tmp/talm-safety etcd members --nodes $OTHER_NODE --endpoints $OTHER_NODE
+```
+
+Expected during the reset: 2 of 3 members. After Talos brings the reset node back from META (typical Linux/Talos auto-rejoin path): 3 members; the reset node may carry a placeholder hostname (`talos-XXXXX`) until the next apply.
+
+### H4. Rejoin after reset
+
+```bash
+/tmp/talm-safety apply --dry-run -f nodes/node-resetted.yaml
+```
+
+Expected: `0 addition, 0 removal, 0 update, N unchanged` when META preserved the full config; otherwise drift will reflect the missing state (re-apply to fix).
+
+### H5. Insecure path on a freshly-wiped node
+
+```bash
+/tmp/talm-safety apply -i -f nodes/node-fresh.yaml
+```
+
+Expected: render error from `lookup "disks"` requiring auth, OR drift-preview degrade line if the chart is offline-renderable.
+
+## I-pre. Cluster-wide diagnostics & helpers
+
+### I0-1. Read-only command sweep
+
+A non-destructive bake to confirm every wrapper returns within ~5s on a healthy 3-node cluster. Useful after every major refactor in `pkg/commands/talosctl_wrapper.go`.
+
+```bash
+NODE=$NODE
+for cmd in version "get machineconfig -o yaml" containers processes \
+           "health --server=false" "interfaces" "disks" "etcd members" \
+           "list /system/state" memory mounts stats service cgroups \
+           "dmesg --tail" netstat routes "usage /var/log" \
+           "logs kubelet" "logs etcd" "events --tail=3" \
+           "image list" "etcd status" "etcd alarm list"; do
+  timeout 8 /tmp/talm-safety $cmd --nodes $NODE --endpoints $NODE 2>&1 | head -1
+done
+```
+
+Expected: every command prints either a header row (table) or an error from the node side. None should hang past the timeout.
+
+### I0-2. Concurrent dry-run apply
+
+```bash
+for i in 1 2 3; do
+  /tmp/talm-safety apply --dry-run -f nodes/node0.yaml 2>&1 | grep -E "^talm:" &
+done
+wait
+```
+
+Expected: 3 independent renders, all complete, no race-condition diagnostics.
+
+### I0-3. CLI nodes/endpoints override modeline
+
+```bash
+/tmp/talm-safety apply --dry-run \
+  --nodes $OTHER_NODE --endpoints $OTHER_NODE \
+  -f nodes/node0.yaml | grep "^- talm"
+```
+
+Expected: log line shows `nodes=[$OTHER_NODE]` not the modeline value. The CLI takes precedence.
+
+### I0-4. Reboot a node (no config change)
+
+```bash
+/tmp/talm-safety reboot --nodes $NODE --endpoints $NODE
+```
+
+⚠️ Destructive timing — the node will be unreachable for ~30-60s. Cluster keeps quorum if at least one other controlplane is healthy.
+
+Expected: returns once events check completes; etcd member list shows the node back in.
+
+### I0-5. Wipe a non-system disk
+
+```bash
+/tmp/talm-safety wipe disk <devname> --nodes $NODE --endpoints $NODE
+```
+
+Expected: refuses with `FailedPrecondition: blockdevice "<dev>" is in use by disk "..."` if it's mounted / part of LVM / part of DRBD. Wipe succeeds only on truly idle block devices. The error itself is the regression pin: a wipe that DIDN'T refuse would risk destroying the cluster's persistent state.
+
+## I. Shell completion
+
+### I1. Generate completion for each shell
+
+```bash
+for sh in bash zsh fish powershell; do
+  /tmp/talm-safety completion $sh > /tmp/talm-completion.$sh
+  case $sh in
+    bash|zsh) bash -n /tmp/talm-completion.$sh && echo "$sh OK" ;;
+    *) echo "$sh: $(wc -l < /tmp/talm-completion.$sh) lines" ;;
+  esac
+done
+```
+
+Expected: every shell prints a script that parses (for bash/zsh syntax-check confirms). Non-zero output sizes.
+
+## J-pre. Set / values / overlay variations
+
+### J0-1. `--set` vs `--set-string` for IP-shaped values
+
+```bash
+/tmp/talm-safety template -f nodes/node0.yaml --set floatingIP=0700
+```
+
+Expected: with the post-#163 chart, fails fast with `talm: floatingIP "0700" is not a valid IPv4 / IPv6 literal`. Pre-#163 chart silently renders an invalid VIP.
+
+**Operator footgun**: `--set floatingIP=198.51.100.1` *may* be parsed as the float `198.51 × 100.1` by Helm's loose type-coercion. For IPs use `--set-string floatingIP="198.51.100.1"` or put it in `values.yaml`.
+
+### J0-2. `--set-literal` keeps dotted keys intact
+
+```bash
+/tmp/talm-safety template -f nodes/node0.yaml \
+  --set-literal "label.with.dots=raw"
+```
+
+Expected: key `label.with.dots` (single literal) appears in values rather than nested `label → with → dots`.
+
+### J0-3. `--set-file` reads file content as string
+
+```bash
+echo "from-file" > /tmp/_v.txt
+/tmp/talm-safety template -f nodes/node0.yaml --set-file someKey=/tmp/_v.txt
+rm /tmp/_v.txt
+```
+
+Expected: file content available as `.Values.someKey` during render.
+
+### J0-4. `--values` external overlay
+
+```bash
+echo "clusterDomain: overlay.local" > /tmp/_overlay.yaml
+/tmp/talm-safety template -f nodes/node0.yaml --values /tmp/_overlay.yaml
+rm /tmp/_overlay.yaml
+```
+
+Expected: `dnsDomain: overlay.local` in rendered config.
+
+## J. Read-only diagnostics (safe everywhere)
+
+| Command | Expected |
+| --- | --- |
+| `talm version` | Client + Server tags + Go versions |
+| `talm get links` | LinkStatus rows per node |
+| `talm get disks` | Disk rows; check `transport`, `bus_path`, `rotational`, `cdrom`, `readonly` |
+| `talm get metakey` | META keys |
+| `talm get machineconfig` | Active MachineConfig (auth only — Sensitive) |
+| `talm containers` | Talos system containers + Kubernetes pods |
+| `talm processes` | PID list with CPU/RES mem |
+| `talm health` | Cluster health summary |
+| `talm interfaces` | Host network interfaces |
+| `talm disks` | Block devices via talosctl wrapper |
+| `talm etcd members` | etcd member list (auth only) |
+
+Each command should return promptly (sub-second) for read-only paths.
+
+## K. Cross-version upgrade
+
+### K1. Preflight version-mismatch warning
+
+```bash
+# Bump talosVersion in Chart.yaml to one minor ahead of running.
+sed -i 's|talosVersion: "v1.12"|talosVersion: "v1.13"|' Chart.yaml
+/tmp/talm-safety apply --dry-run -f nodes/node0.yaml | head -5
+```
+
+Expected: `warning: pre-flight: configured talosVersion=v1.13 is newer than the node's running Talos v1.12.x` plus a hint about rebooting into a matching maintenance image or lowering the contract. Drift preview still runs.
+
+### K1-pre. Phase 2C version-verify catches silent rollback
+
+⚠️ Same destructive setup as K2, but the gate now does the work automatically. **Heads-up**: the target image lives in the node body (`nodes/<name>.yaml`'s `machine.install.image`), not in `values.yaml` — talm's upgrade wrapper reads it from the rendered config patch, not the chart values overlay (see #176).
+
+Run an intentionally-bad cross-vendor upgrade and expect a hint-bearing blocker:
+
+```bash
+sed -i 's|cozystack/cozystack/talos:v1.12.6|siderolabs/installer:v1.13.0|' nodes/node0.yaml
+talm upgrade -f nodes/node0.yaml
+```
+
+Expected: talosctl upgrade RPC returns success → "post-upgrade verify: waiting 1m30s for the node to finish booting..." → 90s reconcile window → `verifyPostUpgradeVersion` reads `runtime.Version` → detects mismatch → blocker:
+
+```
+post-upgrade: requested upgrade to v1.13.0 but running version is
+v1.12.6 — either Talos auto-rolled back, or the node is still
+booting beyond the 90s window
+hint: two hypotheses produce this symptom: (1) Talos auto-rolled
+back after the new partition failed its boot readiness check —
+cross-vendor upgrades (e.g. cozystack-bundled image -> vanilla
+siderolabs installer) drop bundled extensions and trigger this.
+(2) The node is slower than the 90s reconcile window — large
+image pulls or cold hardware can exceed it. Re-run `talm get
+version` after a minute to distinguish: if the version updated,
+the node was just slow; if it's still the old version, the
+rollback case is real. Pass --skip-post-upgrade-verify to bypass.
+```
+
+`talm upgrade` exits non-zero — the operator sees the failure instead of a false "success".
+
+Phase 2C is **skipped** for the following upgrade flows (each documented in the code):
+
+- `--skip-post-upgrade-verify` (operator opt-out)
+- `--insecure` (auth-only COSI path is unreachable)
+- `--stage` (new partition not yet booted; runtime.Version would always report the old version — guaranteed false positive)
+
+### K2-pre. Manual fallback for `--skip-post-upgrade-verify`
+
+K1-pre exercises the automated Phase 2C gate. If the operator disables it (`--skip-post-upgrade-verify`) — or in flows that the gate doesn't cover (`--insecure`, `--stage`, no target image) — the equivalent manual check is:
+
+```bash
+target="v1.13.0"
+talm upgrade --skip-post-upgrade-verify -f nodes/node0.yaml
+running=$(talm get version --nodes $NODE --endpoints $NODE \
+  -o jsonpath='{.spec.version}')
+test "$running" = "$target" || echo "SILENT ROLLBACK / SLOW BOOT — running $running, expected $target"
+```
+
+This is the post-merge equivalent of what Phase 2C does automatically. Keep the script around — it's still relevant for the `--insecure` flow which the gate skips by design.
+
+### K2. Stage-upgrade to a new minor
+
+```bash
+# In values.yaml, point image at the new installer:
+sed -i 's|installer:v1.12.6|installer:v1.13.0|' values.yaml
+/tmp/talm-safety upgrade --stage -f nodes/node0.yaml
+```
+
+Expected: events stream from `installAndReboot` through `post check passed`. Node returns running v1.13.x (`talm version --nodes $NODE`). Etcd member count unchanged (`talm etcd members`).
+
+### K3. Per-node sequential upgrade (safe)
+
+```bash
+for n in node0 node1 node2; do
+  /tmp/talm-safety upgrade --stage -f nodes/$n.yaml
+  /tmp/talm-safety etcd members --nodes $OTHER --endpoints $OTHER \
+    | grep -c "^[0-9]"  # quorum must be >= 2 at all times
+done
+```
+
+Expected: each node returns to etcd within 60s; quorum never drops below 2/3 (one node down at a time).
+
+### K4. Phase 2A drift preview against new-version node
+
+After K2, with the chart still on v1.13 contract:
+
+```bash
+/tmp/talm-safety apply --dry-run -f nodes/node0.yaml
+```
+
+Expected: no version-mismatch warning (chart contract matches running). Drift preview shows the per-version diff if any (e.g. a new field machinery v1.13 injects).
+
+### K5. Phase 2B on a heterogeneous cluster (mid-rollout)
+
+Between K2-step-1 (node0 upgraded) and K2-step-2 (node1 still on old version), Phase 1 still resolves `lookup "links"` (non-Sensitive COSI resource works on both versions). Phase 2A diffs against the specific node, so the cross-version state is per-node, not cluster-wide. Phase 2B (if enabled) compares against the bytes sent; expect cert-hash false-positives until the allowlist lands (see open question in #172).
+
+## L. Extended diagnostics + service control
+
+### L1. `inspect dependencies` returns a DOT graph
+
+```bash
+/tmp/talm-safety inspect dependencies --nodes $NODE --endpoints $NODE | head
+```
+
+Expected: starts with `digraph {`. Useful for visualizing Talos controller deps. Pipe through `dot -Tpng` to render.
+
+### L2. `pcap` short capture on loopback
+
+```bash
+timeout 8 /tmp/talm-safety pcap --nodes $NODE --endpoints $NODE \
+  --interface lo --duration 2s > /tmp/_cap.pcap
+file /tmp/_cap.pcap && rm /tmp/_cap.pcap
+```
+
+Expected: binary pcap stream to stdout. `file` reports "pcap capture file".
+
+### L3. `time` against NTP
+
+```bash
+/tmp/talm-safety time --nodes $NODE --endpoints $NODE
+```
+
+Expected: table with `NTP-SERVER`, `NODE-TIME`, `NTP-SERVER-TIME`.
+
+### L4. `etcd defrag`
+
+```bash
+/tmp/talm-safety etcd defrag --nodes $NODE --endpoints $NODE
+```
+
+Expected: silent return (no output), exit 0. DB is defragmented.
+
+### L5. `etcd alarm list`
+
+```bash
+/tmp/talm-safety etcd alarm list --nodes $NODE --endpoints $NODE
+```
+
+Expected: empty output on a healthy cluster. Any output indicates an alarm to investigate (NOSPACE / CORRUPT).
+
+### L6. `etcd forfeit-leadership` on a non-leader
+
+```bash
+/tmp/talm-safety etcd forfeit-leadership --nodes $NON_LEADER --endpoints $NODE
+```
+
+Expected: silent no-op. Leader unchanged.
+
+### L7. `service kubelet restart`
+
+```bash
+/tmp/talm-safety service kubelet restart --nodes $NODE --endpoints $NODE
+```
+
+Expected: `Service "kubelet" restarted`. Pod replays after ~10s.
+
+### L8. `service kubelet stop` (refused)
+
+```bash
+/tmp/talm-safety service kubelet stop --nodes $NODE --endpoints $NODE
+```
+
+Expected: `kubelet doesn't support stop operation via API`. Talos intentionally blocks stop on essential services.
+
+### L9. `shutdown` (destructive)
+
+⚠️ Powers off the node. Recovery requires `tofu apply` (or manual provider-side reboot). Use only against a node whose recovery path you control.
+
+```bash
+/tmp/talm-safety shutdown --nodes $TARGET --endpoints $OTHER
+```
+
+Expected: events stream through `teardownLifecycle` → `stopEverything` → `events check condition met`. Etcd member remains in the list until TTL expires (~10 min) or the next membership reconciliation.
+
+### L10. `get rd` lists registered resource types
+
+```bash
+/tmp/talm-safety get rd --nodes $NODE --endpoints $NODE | wc -l
+```
+
+Expected: 100+ resource types. Baseline for `get <type>` calls.
+
+### L11. `get -o jsonpath`
+
+```bash
+/tmp/talm-safety get hostname --nodes $NODE --endpoints $NODE \
+  -o jsonpath='{.spec.hostname}'
+```
+
+Expected: the node's hostname as a bare string. Useful for scripted extraction.
+
+### L12. `logs --tail N`
+
+```bash
+/tmp/talm-safety logs kubelet --tail 3 --nodes $NODE --endpoints $NODE
+```
+
+Expected: last 3 lines of kubelet log.
+
+## M. Negative / boundary cases
+
+### M1. Malformed modeline
+
+```bash
+echo "# talm: nodes=this-is-not-json-array" > /tmp/_bad.yaml
+echo "machine: {type: controlplane}" >> /tmp/_bad.yaml
+/tmp/talm-safety apply --dry-run -f /tmp/_bad.yaml
+rm /tmp/_bad.yaml
+```
+
+Expected: `error parsing JSON array for key nodes` with a hint about the expected syntax.
+
+### M2. Malformed patch (string-where-map)
+
+```bash
+cat > /tmp/_bad.yaml << 'EOF'
+# talm: nodes=["192.0.2.4"], endpoints=["192.0.2.4"], templates=["templates/controlplane.yaml"]
+machine:
+  type: controlplane
+  install: not-a-map-but-a-string
+EOF
+/tmp/talm-safety apply --dry-run -f /tmp/_bad.yaml
+rm /tmp/_bad.yaml
+```
+
+Expected: `yaml: construct errors: cannot construct !!str ... into v1alpha1.InstallConfig` plus a hint about patch shape.
+
+### M3. Bad `--cert-fingerprint`
+
+```bash
+/tmp/talm-safety apply --insecure --cert-fingerprint deadbeef \
+  -f nodes/node0.yaml
+```
+
+Expected: TLS handshake error `leaf peer certificate doesn't match the provided fingerprints: [deadbeef]`.
+
+### M4. `--talosconfig` pointing at missing file
+
+```bash
+/tmp/talm-safety apply --dry-run --talosconfig /tmp/nonexistent \
+  -f nodes/node0.yaml
+```
+
+Expected: `talos config file is empty`. Apply does not proceed.
+
+### M5. `TALOSCONFIG` env var
+
+```bash
+TALOSCONFIG=$PWD/talosconfig /tmp/talm-safety apply --dry-run \
+  --talosconfig "" -f nodes/node0.yaml
+```
+
+Expected: same as native `--talosconfig $PWD/talosconfig`. Phase 2A drift preview runs normally.
+
+## Sanity-check block
+
+Run after every destructive section (E, F, H, and anything that touches `--mode=reboot` / `--mode=staged` / `apply -I`):
+
+```bash
+cd $PROJECT
+for n in node0 node1 node2; do
+  echo "=== $n ==="
+  /tmp/talm-safety apply --dry-run -f nodes/$n.yaml | grep -E "^talm:"
+done
+/tmp/talm-safety etcd members --nodes $NODE --endpoints $NODE
+/tmp/talm-safety health --nodes $NODE --endpoints $NODE
+```
+
+Expected: each node reports drift preview (typically `0/0/0 unchanged` after a clean run), etcd shows 3 members, health passes.
+
+## Adversarial extras
+
+These don't ship as part of the regular run but are worth re-running after refactors that touch the walker / differ / preflight hooks.
+
+### Walker robustness
+
+```bash
+echo "" > /tmp/empty.yaml
+/tmp/talm-safety apply --dry-run -f /tmp/empty.yaml
+# Expected: modeline-prefix error.
+
+cat > /tmp/modeline-only.yaml <<EOF
+# talm: nodes=["$NODE"], endpoints=["$NODE"], templates=["templates/controlplane.yaml"]
+EOF
+/tmp/talm-safety apply --dry-run -f /tmp/modeline-only.yaml
+# Expected: drift preview, possibly significant `-` removals.
+```
+
+### Schema confusion
+
+Try every BondConfig / VLANConfig / BridgeConfig YAML field permutation that operators paste from old docs / unofficial gists. The walker must catch typos in:
+
+- `BridgeConfig.links[]` (legacy docs say `ports[]` — Phase 1 must NOT treat `ports[]` as valid).
+- `VLANConfig.parent` (legacy docs say `link` — same rule).
+
+### Encoding edge cases
+
+```bash
+printf '\xef\xbb\xbfmachine:\r\n  install:\r\n    disk: /dev/sda\r\n' > /tmp/bom.yaml
+/tmp/talm-safety apply --dry-run -f /tmp/bom.yaml
+# Expected: no panic. Walker decodes the doc.
+```
+
+### Mode-gating
+
+Walk every `--mode` value with `--skip-post-apply-verify=false`:
+
+- `auto`, `no-reboot` → Phase 2B runs.
+- `reboot`, `staged`, `try` → Phase 2B auto-skipped (each for a different documented reason).
+- `--dry-run` always skips Phase 2B.
+
+## Cleanup at end of session
+
+```bash
+# Restore any `_test-*.yaml` files left in nodes/
+ls $PROJECT/nodes/_test-* 2>/dev/null && rm $PROJECT/nodes/_test-*
+# Verify project tree is clean
+cd $PROJECT && git status --short
+```
+
+The project should report no `_test-*.yaml` orphans and only the intentional changes (e.g. `Chart.yaml` and `charts/talm/*` updates from `init --update`).
diff --git a/go.mod b/go.mod
index 26f24958..efabf741 100644
--- a/go.mod
+++ b/go.mod
@@ -26,7 +26,7 @@ require (
 	github.com/cosi-project/runtime v1.14.1
 	github.com/distribution/reference v0.6.0 // indirect
 	github.com/docker/cli v29.4.0+incompatible // indirect
-	github.com/dustin/go-humanize v1.0.1 // indirect
+	github.com/dustin/go-humanize v1.0.1
 	github.com/fatih/color v1.19.0 // indirect
 	github.com/foxboron/go-uefi v0.0.0-20251010190908-d29549a44f29 // indirect
 	github.com/gdamore/tcell/v2 v2.13.8 // indirect
@@ -78,7 +78,7 @@ require (
 	golang.org/x/oauth2 v0.36.0 // indirect
 	golang.org/x/sync v0.20.0 // indirect
 	golang.org/x/sys v0.43.0
-	golang.org/x/term v0.42.0 // indirect
+	golang.org/x/term v0.42.0
 	golang.org/x/text v0.36.0 // indirect
 	golang.org/x/time v0.15.0 // indirect
 	golang.zx2c4.com/wireguard/wgctrl v0.0.0-20241231184526-a9ab2273dd10 // indirect
diff --git a/pkg/applycheck/diff.go b/pkg/applycheck/diff.go
new file mode 100644
index 00000000..d72691ac
--- /dev/null
+++ b/pkg/applycheck/diff.go
@@ -0,0 +1,358 @@
+// Copyright Cozystack Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package applycheck
+
+import (
+	"bytes"
+	"fmt"
+	"io"
+	"maps"
+	"reflect"
+	"sort"
+
+	"github.com/cockroachdb/errors"
+	yaml "gopkg.in/yaml.v3"
+)
+
+// ChangeOp classifies a per-document diff between two MachineConfig
+// snapshots.
+type ChangeOp int
+
+const (
+	// OpAdd: doc present in desired, absent in current.
+	OpAdd ChangeOp = iota
+	// OpRemove: doc present in current, absent in desired (the leftover
+	// class — eth1 lingering after migration to eth0).
+	OpRemove
+	// OpUpdate: doc present on both sides with field-level differences.
+	OpUpdate
+	// OpEqual: doc present on both sides, structurally identical.
+	OpEqual
+)
+
+// String renders the op as a single-character glyph for the drift preview
+// output.
+func (o ChangeOp) String() string {
+	switch o {
+	case OpAdd:
+		return "+"
+	case OpRemove:
+		return "-"
+	case OpUpdate:
+		return "~"
+	case OpEqual:
+		return "="
+	}
+
+	return "?"
+}
+
+// DocID is the structural identity used to pair up documents across the
+// current/desired snapshots. v1.12 multi-doc keys by (kind, name). v1.11
+// nested form is collapsed into a synthetic DocID{Kind: "MachineConfig",
+// Name: ""} so the root config doc participates in the diff.
+type DocID struct {
+	Kind string
+	Name string
+}
+
+// FieldChange describes a single leaf-level difference between matched
+// documents. Only populated for ChangeOp.OpUpdate. Path is the YAML
+// dotted accessor inside the document.
+//
+// HasOld / HasNew distinguish "field missing on this side" from "field
+// present with literal nil/null value": a YAML leaf with value `~` (or
+// `null`) is a legitimate value, not a missing field, and the diff has
+// to be able to render it differently.
+type FieldChange struct {
+	Path   string
+	Old    any
+	New    any
+	HasOld bool
+	HasNew bool
+}
+
+// Change is one entry in the per-doc diff between two MachineConfig
+// snapshots. Fields is non-empty only when Op == OpUpdate.
+type Change struct {
+	ID     DocID
+	Op     ChangeOp
+	Fields []FieldChange
+}
+
+// Diff parses current and desired MachineConfig bytes and returns every
+// per-document difference. Equal documents are included as OpEqual so
+// callers can render an unchanged section if they want; pass
+// FilterChanged to drop them.
+//
+// The diff is doc-structural, not byte-level: re-serializing either side
+// with different key ordering or whitespace produces the same Diff
+// output. Talos-mutated leaf fields (cert hashes etc.) currently land
+// as OpUpdate entries with FieldChange path pinpointing what differs;
+// an allowlist will be layered on later (see open question in #172).
+func Diff(current, desired []byte) ([]Change, error) {
+	currentDocs, err := parseDocs(current)
+	if err != nil {
+		return nil, errors.Wrap(err, "applycheck: parsing current snapshot")
+	}
+
+	desiredDocs, err := parseDocs(desired)
+	if err != nil {
+		return nil, errors.Wrap(err, "applycheck: parsing desired snapshot")
+	}
+
+	ids := mergedIDs(currentDocs, desiredDocs)
+
+	changes := make([]Change, 0, len(ids))
+
+	for _, docID := range ids {
+		cur, inCur := currentDocs[docID]
+		des, inDes := desiredDocs[docID]
+
+		switch {
+		case !inCur && inDes:
+			changes = append(changes, Change{ID: docID, Op: OpAdd})
+		case inCur && !inDes:
+			changes = append(changes, Change{ID: docID, Op: OpRemove})
+		case reflect.DeepEqual(cur, des):
+			changes = append(changes, Change{ID: docID, Op: OpEqual})
+		default:
+			changes = append(changes, Change{
+				ID:     docID,
+				Op:     OpUpdate,
+				Fields: leafDiff(cur, des),
+			})
+		}
+	}
+
+	return changes, nil
+}
+
+// FilterChanged returns the subset of changes whose op is not OpEqual.
+// Useful when the operator only wants to see drift, not pinned doc
+// counts.
+func FilterChanged(changes []Change) []Change {
+	out := make([]Change, 0, len(changes))
+
+	for i := range changes {
+		if changes[i].Op != OpEqual {
+			out = append(out, changes[i])
+		}
+	}
+
+	return out
+}
+
+// parseDocs decodes a multi-doc YAML byte stream into a map keyed by
+// DocID. v1.11 root (top-level `machine:` with no `kind:`) becomes the
+// synthetic DocID{Kind: "MachineConfig"}. v1.12 multi-doc docs are keyed
+// by their kind + name (or kind + "" for singletons like HostnameConfig).
+// Unknown shapes are ignored so future doc kinds don't error parse.
+//
+// Two documents sharing the same (kind, name) in one stream are a
+// config defect — Talos's behaviour on duplicates is unspecified and
+// varies between versions, and a silent last-write-wins would mask
+// the real problem behind a misleading drift line. Surface the
+// duplicate as an error so Phase 2A can warn the operator.
+func parseDocs(raw []byte) (map[DocID]map[string]any, error) {
+	out := make(map[DocID]map[string]any)
+
+	if len(bytes.TrimSpace(raw)) == 0 {
+		return out, nil
+	}
+
+	dec := yaml.NewDecoder(bytes.NewReader(raw))
+
+	for idx := 0; ; idx++ {
+		var doc map[string]any
+
+		err := dec.Decode(&doc)
+		if errors.Is(err, io.EOF) {
+			break
+		}
+
+		if err != nil {
+			return nil, errors.Wrapf(err, "decoding YAML document %d", idx)
+		}
+
+		if doc == nil {
+			continue
+		}
+
+		docID, ok := identityOf(doc)
+		if !ok {
+			continue
+		}
+
+		if _, exists := out[docID]; exists {
+			//nolint:wrapcheck // cockroachdb/errors.Newf at boundary.
+			return nil, errors.Newf("duplicate document %s{name: %q} in stream — Talos behaviour on duplicates is unspecified; remove or rename one", docID.Kind, docID.Name)
+		}
+
+		out[docID] = doc
+	}
+
+	return out, nil
+}
+
+// identityOf returns the DocID for a single parsed document. v1.11 form
+// (has `machine:`) collapses to the synthetic MachineConfig identity;
+// v1.12 multi-doc uses kind + optional name.
+func identityOf(doc map[string]any) (DocID, bool) {
+	if _, ok := doc["machine"]; ok {
+		return DocID{Kind: "MachineConfig"}, true
+	}
+
+	kind, ok := doc["kind"].(string)
+	if !ok || kind == "" {
+		return DocID{}, false
+	}
+
+	name, _ := doc["name"].(string)
+
+	return DocID{Kind: kind, Name: name}, true
+}
+
+// mergedIDs returns the union of DocIDs across current/desired, sorted
+// for stable output. Kind orders first, then name within kind.
+func mergedIDs(a, b map[DocID]map[string]any) []DocID {
+	seen := make(map[DocID]struct{}, len(a)+len(b))
+	for id := range a {
+		seen[id] = struct{}{}
+	}
+
+	for id := range b {
+		seen[id] = struct{}{}
+	}
+
+	ids := make([]DocID, 0, len(seen))
+	for id := range seen {
+		ids = append(ids, id)
+	}
+
+	sort.Slice(ids, func(i, j int) bool {
+		if ids[i].Kind != ids[j].Kind {
+			return ids[i].Kind < ids[j].Kind
+		}
+
+		return ids[i].Name < ids[j].Name
+	})
+
+	return ids
+}
+
+// leafDiff returns every leaf-level field change between two parsed
+// documents. Paths use dotted notation; nested objects are recursed,
+// arrays are compared whole-list (a single FieldChange whose Old/New
+// carry the slices). Stable order keyed by path string.
+func leafDiff(cur, des map[string]any) []FieldChange {
+	flatCur := flatten(cur, "")
+	flatDes := flatten(des, "")
+
+	paths := mergedPaths(flatCur, flatDes)
+
+	out := make([]FieldChange, 0, len(paths))
+
+	for _, path := range paths {
+		oldVal, hasOld := flatCur[path]
+		newVal, hasNew := flatDes[path]
+
+		switch {
+		case !hasOld && hasNew:
+			out = append(out, FieldChange{Path: path, New: newVal, HasNew: true})
+		case hasOld && !hasNew:
+			out = append(out, FieldChange{Path: path, Old: oldVal, HasOld: true})
+		case !reflect.DeepEqual(oldVal, newVal):
+			out = append(out, FieldChange{Path: path, Old: oldVal, New: newVal, HasOld: true, HasNew: true})
+		}
+	}
+
+	return out
+}
+
+// flatten walks a parsed YAML map and produces a flat key→leaf-value
+// map. Nested maps recurse, slices are kept as-is (lists are treated as
+// atomic for the leaf diff). Empty prefix yields top-level keys.
+//
+// Empty nested maps (`field: {}` — a real Talos default shape for
+// fields like kubelet.extraArgs) are emitted as their own leaf entry
+// with value map[string]any{}. Without this, a doc with an empty
+// nested map and a doc that omits the field entirely would produce
+// flat representations that differ only at the doc level, and
+// leafDiff would emit OpUpdate with Fields=[] — leaving the operator
+// with a content-free "~" line that says "something changed, can't
+// tell you what".
+func flatten(in map[string]any, prefix string) map[string]any {
+	out := make(map[string]any)
+
+	for key, val := range in {
+		path := key
+		if prefix != "" {
+			path = prefix + "." + key
+		}
+
+		if nested, ok := val.(map[string]any); ok {
+			if len(nested) == 0 {
+				// Empty nested map: emit as a leaf so the
+				// add/remove of the field itself surfaces in
+				// the diff. See godoc above.
+				out[path] = nested
+
+				continue
+			}
+
+			maps.Copy(out, flatten(nested, path))
+
+			continue
+		}
+
+		out[path] = val
+	}
+
+	return out
+}
+
+// mergedPaths returns the sorted union of keys across two flat maps so
+// leafDiff's output ordering is stable.
+func mergedPaths(a, b map[string]any) []string {
+	seen := make(map[string]struct{}, len(a)+len(b))
+	for k := range a {
+		seen[k] = struct{}{}
+	}
+
+	for k := range b {
+		seen[k] = struct{}{}
+	}
+
+	out := make([]string, 0, len(seen))
+	for k := range seen {
+		out = append(out, k)
+	}
+
+	sort.Strings(out)
+
+	return out
+}
+
+// FormatChange returns a one-line, grep-friendly representation of one
+// Change. Used by the preflight hook to write the drift preview to
+// stderr; tests pin the format.
+func FormatChange(c *Change) string {
+	if c.ID.Name == "" {
+		return fmt.Sprintf("  %s %s", c.Op, c.ID.Kind)
+	}
+
+	return fmt.Sprintf("  %s %s{name: %s}", c.Op, c.ID.Kind, c.ID.Name)
+}
diff --git a/pkg/applycheck/diff_test.go b/pkg/applycheck/diff_test.go
new file mode 100644
index 00000000..f8e13754
--- /dev/null
+++ b/pkg/applycheck/diff_test.go
@@ -0,0 +1,373 @@
+// Copyright Cozystack Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package applycheck_test
+
+import (
+	"strings"
+	"testing"
+
+	"github.com/cozystack/talm/pkg/applycheck"
+)
+
+const driftCurrentMultidoc = `version: v1alpha1
+machine:
+  type: controlplane
+  install:
+    disk: /dev/sda
+---
+apiVersion: v1alpha1
+kind: LinkConfig
+name: eth0
+up: true
+---
+apiVersion: v1alpha1
+kind: LinkConfig
+name: eth1
+up: true
+---
+apiVersion: v1alpha1
+kind: HostnameConfig
+hostname: cp-old
+`
+
+const driftDesiredMultidoc = `version: v1alpha1
+machine:
+  type: controlplane
+  install:
+    disk: /dev/sda
+---
+apiVersion: v1alpha1
+kind: LinkConfig
+name: eth0
+up: true
+---
+apiVersion: v1alpha1
+kind: LinkConfig
+name: eth2
+up: true
+---
+apiVersion: v1alpha1
+kind: HostnameConfig
+hostname: cp-new
+`
+
+func findChange(changes []applycheck.Change, kind, name string) (applycheck.Change, bool) {
+	for _, c := range changes {
+		if c.ID.Kind == kind && c.ID.Name == name {
+			return c, true
+		}
+	}
+
+	return applycheck.Change{}, false
+}
+
+func TestDiff_DetectsAddRemoveUpdateOnMultidoc(t *testing.T) {
+	t.Parallel()
+
+	changes, err := applycheck.Diff([]byte(driftCurrentMultidoc), []byte(driftDesiredMultidoc))
+	if err != nil {
+		t.Fatalf("Diff error: %v", err)
+	}
+
+	// eth0 unchanged.
+	if c, ok := findChange(changes, "LinkConfig", "eth0"); !ok || c.Op != applycheck.OpEqual {
+		t.Errorf("eth0 expected Op=Equal, got %+v ok=%v", c, ok)
+	}
+
+	// eth1 removed.
+	if c, ok := findChange(changes, "LinkConfig", "eth1"); !ok || c.Op != applycheck.OpRemove {
+		t.Errorf("eth1 expected Op=Remove, got %+v ok=%v", c, ok)
+	}
+
+	// eth2 added.
+	if c, ok := findChange(changes, "LinkConfig", "eth2"); !ok || c.Op != applycheck.OpAdd {
+		t.Errorf("eth2 expected Op=Add, got %+v ok=%v", c, ok)
+	}
+
+	// HostnameConfig updated (cp-old -> cp-new).
+	c, ok := findChange(changes, "HostnameConfig", "")
+	if !ok || c.Op != applycheck.OpUpdate {
+		t.Fatalf("HostnameConfig expected Op=Update, got %+v ok=%v", c, ok)
+	}
+
+	if len(c.Fields) == 0 {
+		t.Errorf("HostnameConfig update should carry leaf-level field changes, got empty")
+	}
+
+	var sawHostname bool
+
+	for _, f := range c.Fields {
+		if f.Path == "hostname" {
+			sawHostname = true
+
+			if f.Old != "cp-old" || f.New != "cp-new" {
+				t.Errorf("hostname field change Old=%v New=%v, want cp-old/cp-new", f.Old, f.New)
+			}
+		}
+	}
+
+	if !sawHostname {
+		t.Errorf("expected a FieldChange at path 'hostname', got %+v", c.Fields)
+	}
+}
+
+func TestDiff_IdenticalInputs_AllEqual(t *testing.T) {
+	t.Parallel()
+
+	changes, err := applycheck.Diff([]byte(driftCurrentMultidoc), []byte(driftCurrentMultidoc))
+	if err != nil {
+		t.Fatalf("Diff error: %v", err)
+	}
+
+	for _, c := range changes {
+		if c.Op != applycheck.OpEqual {
+			t.Errorf("identical inputs should produce only Equal ops, got %+v", c)
+		}
+	}
+
+	// All four documents present.
+	if len(changes) < 4 {
+		t.Errorf("expected >= 4 Equal entries (root + 2 LinkConfig + Hostname), got %d", len(changes))
+	}
+}
+
+func TestFilterChanged_DropsEqualOps(t *testing.T) {
+	t.Parallel()
+
+	in := []applycheck.Change{
+		{Op: applycheck.OpAdd, ID: applycheck.DocID{Kind: "X", Name: "a"}},
+		{Op: applycheck.OpEqual, ID: applycheck.DocID{Kind: "X", Name: "b"}},
+		{Op: applycheck.OpRemove, ID: applycheck.DocID{Kind: "X", Name: "c"}},
+		{Op: applycheck.OpEqual, ID: applycheck.DocID{Kind: "X", Name: "d"}},
+	}
+
+	out := applycheck.FilterChanged(in)
+	if len(out) != 2 {
+		t.Errorf("FilterChanged returned %d, want 2: %+v", len(out), out)
+	}
+}
+
+func TestDiff_EmptyInputs_NoChanges(t *testing.T) {
+	t.Parallel()
+
+	changes, err := applycheck.Diff(nil, nil)
+	if err != nil {
+		t.Fatalf("Diff(nil, nil) error: %v", err)
+	}
+
+	if len(changes) != 0 {
+		t.Errorf("Diff(nil, nil) returned %d changes, want 0", len(changes))
+	}
+}
+
+// TestDiff_KeyOrderIrrelevant pins that re-serialization with a
+// different field order produces no spurious drift. yaml.v3 decodes
+// into map[string]any whose iteration order is randomized; the
+// reflect.DeepEqual gate compares by value, not source byte order.
+// Without this contract, a Talos restart that re-emitted fields in
+// a different order would surface as 'drift' every time.
+func TestDiff_KeyOrderIrrelevant(t *testing.T) {
+	t.Parallel()
+
+	canonical := []byte("apiVersion: v1alpha1\nkind: LinkConfig\nname: eth0\nup: true\nmtu: 1500\n")
+	shuffled := []byte("kind: LinkConfig\nmtu: 1500\nname: eth0\napiVersion: v1alpha1\nup: true\n")
+
+	changes, err := applycheck.Diff(canonical, shuffled)
+	if err != nil {
+		t.Fatalf("Diff error: %v", err)
+	}
+
+	for _, c := range changes {
+		if c.Op != applycheck.OpEqual {
+			t.Errorf("key reorder produced %v change on %+v; want OpEqual", c.Op, c.ID)
+		}
+	}
+}
+
+// TestDiff_InvalidYAML_ErrorsCleanly pins that malformed YAML
+// surfaces as a wrapped error rather than a panic or silent skip.
+// Diff is called from the post-apply verify hook; a panic here
+// would crash the apply chain mid-way.
+func TestDiff_InvalidYAML_ErrorsCleanly(t *testing.T) {
+	t.Parallel()
+
+	invalid := []byte("not: valid\n  - mixed\n: yaml\n")
+	good := []byte("machine:\n  type: controlplane\n")
+
+	_, err := applycheck.Diff(invalid, good)
+	if err == nil {
+		t.Errorf("expected error on invalid YAML, got nil")
+	}
+
+	_, err = applycheck.Diff(good, invalid)
+	if err == nil {
+		t.Errorf("expected error on invalid desired YAML, got nil")
+	}
+}
+
+// TestDiff_DistinguishesAbsentFromNil pins the HasOld/HasNew flags so a
+// missing field and a present-but-null field do not collapse onto the
+// same FieldChange shape. Without these flags, formatters can't tell
+// `key: ~` from "key not in this side".
+func TestDiff_DistinguishesAbsentFromNil(t *testing.T) {
+	t.Parallel()
+
+	cur := []byte(`apiVersion: v1alpha1
+kind: HostnameConfig
+hostname: cp-01
+`)
+	des := []byte(`apiVersion: v1alpha1
+kind: HostnameConfig
+hostname: cp-02
+extraField: null
+`)
+
+	changes, err := applycheck.Diff(cur, des)
+	if err != nil {
+		t.Fatalf("Diff error: %v", err)
+	}
+
+	var update *applycheck.Change
+
+	for i := range changes {
+		if changes[i].Op == applycheck.OpUpdate {
+			update = &changes[i]
+
+			break
+		}
+	}
+
+	if update == nil {
+		t.Fatalf("expected one OpUpdate, got %+v", changes)
+	}
+
+	var sawExtraAddition bool
+
+	for _, f := range update.Fields {
+		if f.Path != "extraField" {
+			continue
+		}
+
+		sawExtraAddition = true
+
+		if f.HasOld {
+			t.Errorf("extraField HasOld=true on absent-then-null change, want false")
+		}
+
+		if !f.HasNew {
+			t.Errorf("extraField HasNew=false but value present (literal null), want true")
+		}
+	}
+
+	if !sawExtraAddition {
+		t.Errorf("expected a FieldChange at path 'extraField', got %+v", update.Fields)
+	}
+}
+
+// TestDiff_EmptyNestedMap_SurfacesAsLeafChange pins the empty-map
+// edge case in flatten. Without the fix, a current document with an
+// empty nested map (e.g. `kubelet.extraArgs: {}` — a real Talos
+// default shape) and a desired document that omits the field
+// entirely produce Op=OpUpdate with Fields=[] — the operator sees
+// "~ MachineConfig" with no leaf lines and has no idea what
+// changed. flatten() must emit empty nested maps as leaf entries
+// so the diff surfaces the addition/removal.
+func TestDiff_EmptyNestedMap_SurfacesAsLeafChange(t *testing.T) {
+	t.Parallel()
+
+	cur := []byte(`---
+apiVersion: v1alpha1
+kind: TestKind
+name: foo
+extraArgs: {}
+`)
+	des := []byte(`---
+apiVersion: v1alpha1
+kind: TestKind
+name: foo
+`)
+
+	changes, err := applycheck.Diff(cur, des)
+	if err != nil {
+		t.Fatalf("Diff: %v", err)
+	}
+
+	changed := applycheck.FilterChanged(changes)
+	if len(changed) != 1 {
+		t.Fatalf("expected one change, got %+v", changed)
+	}
+
+	if changed[0].Op != applycheck.OpUpdate {
+		t.Fatalf("expected OpUpdate, got %v", changed[0].Op)
+	}
+
+	if len(changed[0].Fields) == 0 {
+		t.Errorf("empty-nested-map removal must surface as a leaf FieldChange — got Fields=[], operator would see a content-free '~' line. fields=%+v", changed[0].Fields)
+	}
+
+	sawEmptyMapField := false
+
+	for _, f := range changed[0].Fields {
+		if f.Path == "extraArgs" {
+			sawEmptyMapField = true
+
+			if !f.HasOld || f.HasNew {
+				t.Errorf("expected HasOld=true, HasNew=false for the disappearing empty map; got %+v", f)
+			}
+		}
+	}
+
+	if !sawEmptyMapField {
+		t.Errorf("expected a FieldChange at path 'extraArgs', got %+v", changed[0].Fields)
+	}
+}
+
+// TestDiff_DuplicateDocID_Errors pins the dedup contract: two
+// documents with the same (kind, name) in one stream are a config
+// defect (Talos's behaviour on duplicates is unspecified and varies
+// between versions). The current implementation silently keeps the
+// last-decoded doc; the test asserts Diff surfaces the duplicate as
+// an error so the operator sees the real problem instead of a
+// misleading "everything is fine" / "one doc changed" line.
+func TestDiff_DuplicateDocID_Errors(t *testing.T) {
+	t.Parallel()
+
+	dup := []byte(`---
+apiVersion: v1alpha1
+kind: LinkConfig
+name: eth0
+mtu: 1500
+---
+apiVersion: v1alpha1
+kind: LinkConfig
+name: eth0
+mtu: 9000
+`)
+	des := []byte(`---
+apiVersion: v1alpha1
+kind: LinkConfig
+name: eth0
+mtu: 1500
+`)
+
+	_, err := applycheck.Diff(dup, des)
+	if err == nil {
+		t.Fatal("Diff must error on duplicate (kind, name) in the current snapshot — silent overwrite hides a real config defect")
+	}
+
+	if !strings.Contains(err.Error(), "LinkConfig") || !strings.Contains(err.Error(), "eth0") {
+		t.Errorf("error should cite the offending kind+name, got: %v", err)
+	}
+}
diff --git a/pkg/applycheck/doc.go b/pkg/applycheck/doc.go
new file mode 100644
index 00000000..b94ba6c1
--- /dev/null
+++ b/pkg/applycheck/doc.go
@@ -0,0 +1,21 @@
+// Copyright Cozystack Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package applycheck implements the apply-time safety gates: extracting
+// host-resource references from a rendered Talos MachineConfig, evaluating
+// disk selectors against a host snapshot, and diffing two MachineConfig
+// snapshots by (kind, name) identity. The walker is YAML-only; no Talos
+// machinery types leak through its surface, so the package can be exercised
+// from contract tests without standing up a Talos client.
+package applycheck
diff --git a/pkg/applycheck/refs.go b/pkg/applycheck/refs.go
new file mode 100644
index 00000000..cf1bd6f9
--- /dev/null
+++ b/pkg/applycheck/refs.go
@@ -0,0 +1,359 @@
+// Copyright Cozystack Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package applycheck
+
+import (
+	"bytes"
+	"fmt"
+	"io"
+
+	"github.com/cockroachdb/errors"
+	yaml "gopkg.in/yaml.v3"
+)
+
+// RefKind classifies a host-resource reference extracted from a rendered
+// MachineConfig.
+type RefKind int
+
+const (
+	// RefKindLink is a host network link by name (eth0, bond0, vlan tag, ...).
+	RefKindLink RefKind = iota
+	// RefKindDiskLiteral is an install/extra disk identified by a literal
+	// device path (machine.install.disk: /dev/sda).
+	RefKindDiskLiteral
+	// RefKindDiskSelector is an install/user-volume disk identified by a
+	// selector (size, model, serial, wwid, modalias, type, busPath).
+	RefKindDiskSelector
+)
+
+// DiskSelector mirrors the Talos v1alpha1 InstallDiskSelector schema (also
+// used by UserVolumeConfig provisioning). Fields are left as raw strings so
+// the walker stays YAML-only; the evaluator interprets each.
+type DiskSelector struct {
+	Size     string `yaml:"size,omitempty"`
+	Name     string `yaml:"name,omitempty"`
+	Model    string `yaml:"model,omitempty"`
+	Serial   string `yaml:"serial,omitempty"`
+	Modalias string `yaml:"modalias,omitempty"`
+	UUID     string `yaml:"uuid,omitempty"`
+	WWID     string `yaml:"wwid,omitempty"`
+	Type     string `yaml:"type,omitempty"`
+	BusPath  string `yaml:"busPath,omitempty"`
+}
+
+// IsZero reports whether the selector has no fields set; the walker uses
+// this to avoid emitting empty selector refs.
+func (s *DiskSelector) IsZero() bool {
+	return *s == DiskSelector{}
+}
+
+// Ref is a host-side reference the walker found in the rendered MachineConfig.
+type Ref struct {
+	Kind     RefKind
+	Name     string       // populated for RefKindLink and RefKindDiskLiteral
+	Selector DiskSelector // populated for RefKindDiskSelector (zero value otherwise)
+	Source   string       // human-readable JSONPath pointing at the offending field
+}
+
+// WalkRefs parses the rendered MachineConfig bytes and returns every
+// host-resource reference it contains. Both the v1.11 nested
+// machine.network.interfaces[] form and the v1.12 multi-doc form
+// (LinkConfig / BondConfig / VLANConfig / BridgeConfig / Layer2VIPConfig
+// plus UserVolumeConfig.provisioning.diskSelector) are supported.
+// Unknown documents are ignored.
+func WalkRefs(rendered []byte) ([]Ref, error) {
+	if len(bytes.TrimSpace(rendered)) == 0 {
+		return nil, nil
+	}
+
+	dec := yaml.NewDecoder(bytes.NewReader(rendered))
+
+	var refs []Ref
+
+	for docIndex := 0; ; docIndex++ {
+		var doc map[string]any
+
+		err := dec.Decode(&doc)
+		if errors.Is(err, io.EOF) {
+			break
+		}
+
+		if err != nil {
+			return nil, errors.Wrapf(err, "applycheck: decoding YAML document %d", docIndex)
+		}
+
+		if doc == nil {
+			continue
+		}
+
+		refs = walkDocument(refs, doc, docIndex)
+	}
+
+	return refs, nil
+}
+
+// walkDocument dispatches between the v1.11 root config shape (top-level
+// `machine:`) and the v1.12 multi-doc shape (top-level `kind:`). Documents
+// matching neither shape are ignored — Talos accepts a small handful of
+// vendor-extension docs the walker does not need to know about.
+func walkDocument(refs []Ref, doc map[string]any, docIndex int) []Ref {
+	if machine, ok := doc["machine"].(map[string]any); ok {
+		return walkV1Alpha1Root(refs, machine, fmt.Sprintf("doc[%d].machine", docIndex))
+	}
+
+	if kind, ok := doc["kind"].(string); ok {
+		return walkMultidocKind(refs, doc, kind, fmt.Sprintf("doc[%d]", docIndex))
+	}
+
+	return refs
+}
+
+// walkV1Alpha1Root handles the legacy nested form: machine.network.interfaces[]
+// for link refs and machine.install.{disk,diskSelector} for disk refs.
+func walkV1Alpha1Root(refs []Ref, machine map[string]any, basePath string) []Ref {
+	if install, ok := machine["install"].(map[string]any); ok {
+		refs = appendDiskRefs(refs, install, basePath+".install")
+	}
+
+	network, ok := machine["network"].(map[string]any)
+	if !ok {
+		return refs
+	}
+
+	interfaces, ok := network["interfaces"].([]any)
+	if !ok {
+		return refs
+	}
+
+	for i, iface := range interfaces {
+		ifaceMap, ok := iface.(map[string]any)
+		if !ok {
+			continue
+		}
+
+		name, ok := ifaceMap["interface"].(string)
+		if !ok || name == "" {
+			continue
+		}
+
+		refs = append(refs, Ref{
+			Kind:   RefKindLink,
+			Name:   name,
+			Source: fmt.Sprintf("%s.network.interfaces[%d].interface", basePath, i),
+		})
+	}
+
+	return refs
+}
+
+// multidocHandler emits the refs for one v1.12 multi-doc kind. Handlers are
+// registered in multidocHandlers and dispatched by walkMultidocKind; this
+// keeps walkMultidocKind a flat lookup instead of a giant switch.
+type multidocHandler func(refs []Ref, doc map[string]any, basePath string) []Ref
+
+//nolint:gochecknoglobals // dispatch table for multidoc kinds; static after init.
+var multidocHandlers = map[string]multidocHandler{
+	// LinkConfig.name is emitted: the typical case is an override of an
+	// existing physical NIC (ens5 settings, MTU on eth0), where the
+	// operator wants validation to catch a typoed interface name. The
+	// rarer "create-a-fresh-virtual-link" case is covered by BondConfig/
+	// BridgeConfig/VLANConfig below, which intentionally do NOT validate
+	// their own .name field — those names describe a virtual link being
+	// created by the apply, not an existing one to reference.
+	//
+	// YAML field names mirror Talos's v1alpha1 schema verbatim:
+	//
+	//   - bond.go BondLinks ->  `links`
+	//   - bridge.go BridgeLinks -> `links` (NOT `ports`)
+	//   - vlan.go ParentLinkConfig -> `parent` (NOT `link`)
+	//   - layer2_vip.go LinkName -> `link`
+	"LinkConfig":      handleNameOnly,
+	"BondConfig":      handleListOnly("links"),
+	"VLANConfig":      handleParentOnly,
+	"BridgeConfig":    handleListOnly("links"),
+	"Layer2VIPConfig": handleLayer2VIP,
+	"HCloudVIPConfig": handleLayer2VIP,
+	// DHCPv4Config / DHCPv6Config / EthernetConfig use .name to
+	// reference an *existing* link — typo there is a Phase 1 catch.
+	// Note: dhcp4.go/dhcp6.go/ethernet.go share the CommonLinkConfig
+	// inline, so a typoed name still surfaces as a missing-link
+	// finding the same way LinkConfig does.
+	"DHCPv4Config":   handleNameOnly,
+	"DHCPv6Config":   handleNameOnly,
+	"EthernetConfig": handleNameOnly,
+	// WireguardConfig / DummyLinkConfig / LinkAliasConfig describe
+	// virtual links being created. Their .name is the new resource,
+	// not an existing-link reference — intentionally not in the
+	// dispatch table so they don't get a name-based Phase 1 finding.
+	"UserVolumeConfig": handleUserVolume,
+}
+
+// walkMultidocKind handles v1.12 multi-doc shapes by kind discriminator.
+// Unknown kinds are intentionally ignored — Talos extensions and future
+// kinds should not cause the walker to error.
+func walkMultidocKind(refs []Ref, doc map[string]any, kind, basePath string) []Ref {
+	handler, ok := multidocHandlers[kind]
+	if !ok {
+		return refs
+	}
+
+	return handler(refs, doc, basePath)
+}
+
+func handleNameOnly(refs []Ref, doc map[string]any, basePath string) []Ref {
+	return appendNameRef(refs, doc, basePath)
+}
+
+// handleListOnly emits only the list-valued slaves/ports of the doc,
+// not the doc's own .name. Used for BondConfig (its .name describes a
+// virtual bond being created by the apply; the .links[] members are
+// pre-existing physical NICs that must be present).
+func handleListOnly(listKey string) multidocHandler {
+	return func(refs []Ref, doc map[string]any, basePath string) []Ref {
+		return appendListRefs(refs, doc, listKey, basePath+"."+listKey)
+	}
+}
+
+// handleParentOnly emits only the parent reference of a VLAN doc, not
+// its own .name. The .name is the VLAN tag's child link name (a new
+// virtual link being created); the .parent is the parent that must
+// exist. The YAML key in v1alpha1 is `parent`, not `link`
+// (vlan.go ParentLinkConfig `yaml:"parent"`).
+func handleParentOnly(refs []Ref, doc map[string]any, basePath string) []Ref {
+	parent, ok := doc["parent"].(string)
+	if !ok || parent == "" {
+		return refs
+	}
+
+	return append(refs, Ref{Kind: RefKindLink, Name: parent, Source: basePath + ".parent"})
+}
+
+func handleLayer2VIP(refs []Ref, doc map[string]any, basePath string) []Ref {
+	link, ok := doc["link"].(string)
+	if !ok || link == "" {
+		return refs
+	}
+
+	return append(refs, Ref{Kind: RefKindLink, Name: link, Source: basePath + ".link"})
+}
+
+func handleUserVolume(refs []Ref, doc map[string]any, basePath string) []Ref {
+	prov, ok := doc["provisioning"].(map[string]any)
+	if !ok {
+		return refs
+	}
+
+	sel, ok := prov["diskSelector"].(map[string]any)
+	if !ok {
+		return refs
+	}
+
+	selector := selectorFromMap(sel)
+	if selector.IsZero() {
+		return refs
+	}
+
+	return append(refs, Ref{
+		Kind:     RefKindDiskSelector,
+		Selector: selector,
+		Source:   basePath + ".provisioning.diskSelector",
+	})
+}
+
+// appendNameRef emits a single RefKindLink keyed by doc["name"], or a no-op
+// when name is missing or empty.
+func appendNameRef(refs []Ref, doc map[string]any, basePath string) []Ref {
+	name, ok := doc["name"].(string)
+	if !ok || name == "" {
+		return refs
+	}
+
+	return append(refs, Ref{Kind: RefKindLink, Name: name, Source: basePath + ".name"})
+}
+
+// appendDiskRefs emits the install-disk-shaped refs from the v1.11
+// machine.install block (literal disk path OR selector). Both fields may
+// be present simultaneously in older configs; the walker emits whichever
+// it finds without judging precedence — that's the validator's call.
+func appendDiskRefs(refs []Ref, install map[string]any, basePath string) []Ref {
+	if disk, ok := install["disk"].(string); ok && disk != "" {
+		refs = append(refs, Ref{
+			Kind:   RefKindDiskLiteral,
+			Name:   disk,
+			Source: basePath + ".disk",
+		})
+	}
+
+	if sel, ok := install["diskSelector"].(map[string]any); ok {
+		if selector := selectorFromMap(sel); !selector.IsZero() {
+			refs = append(refs, Ref{
+				Kind:     RefKindDiskSelector,
+				Selector: selector,
+				Source:   basePath + ".diskSelector",
+			})
+		}
+	}
+
+	return refs
+}
+
+// appendListRefs emits one RefKindLink per string entry in doc[key].
+func appendListRefs(refs []Ref, doc map[string]any, key, basePath string) []Ref {
+	items, ok := doc[key].([]any)
+	if !ok {
+		return refs
+	}
+
+	for i, item := range items {
+		name, ok := item.(string)
+		if !ok || name == "" {
+			continue
+		}
+
+		refs = append(refs, Ref{
+			Kind:   RefKindLink,
+			Name:   name,
+			Source: fmt.Sprintf("%s[%d]", basePath, i),
+		})
+	}
+
+	return refs
+}
+
+// selectorFromMap converts a generic YAML map (as decoded into map[string]any)
+// into a DiskSelector. Numeric size matchers parse to ints in YAML; coerce
+// every value through fmt.Sprintf so the selector stays string-typed.
+func selectorFromMap(m map[string]any) DiskSelector {
+	get := func(key string) string {
+		v, ok := m[key]
+		if !ok || v == nil {
+			return ""
+		}
+
+		return fmt.Sprintf("%v", v)
+	}
+
+	return DiskSelector{
+		Size:     get("size"),
+		Name:     get("name"),
+		Model:    get("model"),
+		Serial:   get("serial"),
+		Modalias: get("modalias"),
+		UUID:     get("uuid"),
+		WWID:     get("wwid"),
+		Type:     get("type"),
+		BusPath:  get("busPath"),
+	}
+}
diff --git a/pkg/applycheck/refs_test.go b/pkg/applycheck/refs_test.go
new file mode 100644
index 00000000..5e528d14
--- /dev/null
+++ b/pkg/applycheck/refs_test.go
@@ -0,0 +1,497 @@
+// Copyright Cozystack Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package applycheck_test
+
+import (
+	"strings"
+	"testing"
+
+	"github.com/cozystack/talm/pkg/applycheck"
+)
+
+const v1_11NestedLinkAndDisk = `version: v1alpha1
+debug: false
+machine:
+  type: controlplane
+  install:
+    disk: /dev/sda
+  network:
+    hostname: cp-01
+    interfaces:
+      - interface: eth0
+        addresses:
+          - 192.0.2.5/24
+        routes:
+          - network: 0.0.0.0/0
+            gateway: 192.0.2.1
+`
+
+const v1_12MultidocLinkConfig = `version: v1alpha1
+debug: false
+machine:
+  type: controlplane
+  install:
+    diskSelector:
+      model: "Samsung*"
+      size: ">= 100GB"
+---
+apiVersion: v1alpha1
+kind: LinkConfig
+name: eth0
+up: true
+addresses:
+  - address: 192.0.2.5/24
+---
+apiVersion: v1alpha1
+kind: VLANConfig
+name: eth0.4000
+parent: eth0
+vlanID: 4000
+---
+apiVersion: v1alpha1
+kind: BondConfig
+name: bond0
+links:
+  - eth1
+  - eth2
+---
+apiVersion: v1alpha1
+kind: BridgeConfig
+name: br0
+links:
+  - eth3
+  - bond0
+---
+apiVersion: v1alpha1
+kind: Layer2VIPConfig
+name: 192.0.2.10
+link: bond1
+`
+
+func findRef(refs []applycheck.Ref, kind applycheck.RefKind, name string) (applycheck.Ref, bool) {
+	for _, r := range refs {
+		if r.Kind == kind && r.Name == name {
+			return r, true
+		}
+	}
+
+	return applycheck.Ref{}, false
+}
+
+func TestWalkRefs_v1_11_ExtractsLinkAndLiteralDisk(t *testing.T) {
+	t.Parallel()
+
+	refs, err := applycheck.WalkRefs([]byte(v1_11NestedLinkAndDisk))
+	if err != nil {
+		t.Fatalf("WalkRefs error: %v", err)
+	}
+
+	eth0, ok := findRef(refs, applycheck.RefKindLink, "eth0")
+	if !ok {
+		t.Fatalf("expected link ref for eth0, got refs=%+v", refs)
+	}
+
+	if eth0.Source == "" {
+		t.Errorf("expected non-empty Source on link ref, got empty")
+	}
+
+	disk, ok := findRef(refs, applycheck.RefKindDiskLiteral, "/dev/sda")
+	if !ok {
+		t.Fatalf("expected disk literal ref for /dev/sda, got refs=%+v", refs)
+	}
+
+	if disk.Source == "" {
+		t.Errorf("expected non-empty Source on disk literal ref, got empty")
+	}
+}
+
+func TestWalkRefs_v1_12_ExtractsLinksFromMultidocAndDiskSelector(t *testing.T) {
+	t.Parallel()
+
+	refs, err := applycheck.WalkRefs([]byte(v1_12MultidocLinkConfig))
+	if err != nil {
+		t.Fatalf("WalkRefs error: %v", err)
+	}
+
+	// Walker emits only references to *existing* links, not names of
+	// virtual links being created by the apply. So:
+	//   - LinkConfig.name -> emitted (override of an existing physical NIC)
+	//   - VLANConfig.parent -> emitted (the parent link must exist)
+	//   - VLANConfig.name -> NOT emitted (the VLAN child is being created)
+	//   - BondConfig.links[] -> emitted (slave NICs must exist)
+	//   - BondConfig.name -> NOT emitted (the bond is being created)
+	//   - BridgeConfig.links[] -> emitted (port NICs must exist)
+	//   - BridgeConfig.name -> NOT emitted (the bridge is being created)
+	wantLinks := []string{"eth0" /* LinkConfig.name + VLANConfig.parent */, "eth1", "eth2" /* BondConfig.links */, "eth3" /* BridgeConfig.links */}
+	for _, name := range wantLinks {
+		if _, ok := findRef(refs, applycheck.RefKindLink, name); !ok {
+			t.Errorf("expected link ref for %q, got refs=%+v", name, refs)
+		}
+	}
+
+	// Names of virtual links being created must NOT surface (bond0 is
+	// a BondConfig.name + a BridgeConfig.links[] entry; the .name path
+	// should not appear, only the .links[] one. eth0.4000 is a
+	// VLANConfig.name with no other references — should not appear at
+	// all).
+	if _, ok := findRef(refs, applycheck.RefKindLink, "eth0.4000"); ok {
+		t.Errorf("VLANConfig.name (eth0.4000) leaked as a ref; only the .parent should validate")
+	}
+
+	// Layer2VIPConfig.link is a distinct ref site. Use a name that no other
+	// document mentions (bond1) so the assertion proves handleLayer2VIP
+	// actually emitted the ref, rather than incidentally picking it up
+	// from a sibling LinkConfig.
+	vip, ok := findRef(refs, applycheck.RefKindLink, "bond1")
+	if !ok {
+		t.Errorf("expected Layer2VIPConfig.link=bond1 to surface as a link ref")
+	} else if !strings.HasSuffix(vip.Source, ".link") {
+		t.Errorf("Layer2VIPConfig.link ref Source = %q, want suffix .link", vip.Source)
+	}
+
+	// Disk selector ref must carry the model+size combination.
+	var diskRef *applycheck.Ref
+	for i := range refs {
+		if refs[i].Kind == applycheck.RefKindDiskSelector {
+			diskRef = &refs[i]
+
+			break
+		}
+	}
+
+	if diskRef == nil {
+		t.Fatalf("expected one disk selector ref, got refs=%+v", refs)
+	}
+
+	if diskRef.Selector.Model != "Samsung*" {
+		t.Errorf("selector.Model = %q, want Samsung*", diskRef.Selector.Model)
+	}
+
+	if diskRef.Selector.Size != ">= 100GB" {
+		t.Errorf("selector.Size = %q, want >= 100GB", diskRef.Selector.Size)
+	}
+}
+
+func TestWalkRefs_EmptyInput_NoRefsNoError(t *testing.T) {
+	t.Parallel()
+
+	refs, err := applycheck.WalkRefs(nil)
+	if err != nil {
+		t.Fatalf("WalkRefs(nil) error: %v", err)
+	}
+
+	if len(refs) != 0 {
+		t.Errorf("WalkRefs(nil) returned %d refs, want 0", len(refs))
+	}
+}
+
+// TestWalkRefs_v1_12_DHCPEthernetEmitNameRef pins that the
+// "ethernet-shaped" v1alpha1 documents — DHCPv4Config, DHCPv6Config,
+// EthernetConfig — surface their `.name` as a Phase 1 link
+// reference. Each describes a configuration applied to an existing
+// host link; a typoed name there is a Phase 1 catch.
+func TestWalkRefs_v1_12_DHCPEthernetEmitNameRef(t *testing.T) {
+	t.Parallel()
+
+	body := `apiVersion: v1alpha1
+kind: DHCPv4Config
+name: ens5
+---
+apiVersion: v1alpha1
+kind: DHCPv6Config
+name: ens6
+---
+apiVersion: v1alpha1
+kind: EthernetConfig
+name: ens7
+`
+	refs, err := applycheck.WalkRefs([]byte(body))
+	if err != nil {
+		t.Fatalf("WalkRefs: %v", err)
+	}
+
+	for _, name := range []string{"ens5", "ens6", "ens7"} {
+		if _, ok := findRef(refs, applycheck.RefKindLink, name); !ok {
+			t.Errorf("expected link ref for %q, got refs=%+v", name, refs)
+		}
+	}
+}
+
+// TestWalkRefs_v1_12_VirtualCreatorsSkipNameRef pins that the
+// virtual-link-creator documents — WireguardConfig, DummyLinkConfig,
+// LinkAliasConfig — do NOT surface their `.name` as a link ref.
+// Their .name is the new resource being created, not a reference
+// to an existing link; emitting it as a ref would block every
+// legitimate create-wireguard / create-dummy / create-alias apply
+// (the same class of bug as the earlier Bond/VLAN/Bridge .name
+// false-positive).
+func TestWalkRefs_v1_12_VirtualCreatorsSkipNameRef(t *testing.T) {
+	t.Parallel()
+
+	body := `apiVersion: v1alpha1
+kind: WireguardConfig
+name: wg0
+---
+apiVersion: v1alpha1
+kind: DummyLinkConfig
+name: dummy0
+---
+apiVersion: v1alpha1
+kind: LinkAliasConfig
+name: my-alias
+selector:
+  match: physical
+`
+	refs, err := applycheck.WalkRefs([]byte(body))
+	if err != nil {
+		t.Fatalf("WalkRefs: %v", err)
+	}
+
+	for _, fake := range []string{"wg0", "dummy0", "my-alias"} {
+		if _, ok := findRef(refs, applycheck.RefKindLink, fake); ok {
+			t.Errorf("virtual-creator name %q leaked as a ref; only existing-link refs should be emitted", fake)
+		}
+	}
+}
+
+// TestWalkRefs_v1_12_RealTalosYAMLKeys pins the exact YAML field
+// names Talos's v1alpha1 schema uses (verified against
+// pkg/machinery/config/types/network/* in the cozystack/talos fork
+// v0.0.0-20260126122716):
+//
+//   - bond.go BondLinks `yaml:"links"`
+//   - bridge.go BridgeLinks `yaml:"links"` (NOT `ports`)
+//   - vlan.go ParentLinkConfig `yaml:"parent"` (NOT `link`)
+//   - layer2_vip.go LinkName `yaml:"link"`
+//   - hcloud_vip.go LinkName `yaml:"link"`
+//
+// Without this contract a renamed YAML key in machinery would
+// silently turn the walker into a no-op for the affected document
+// class and Phase 1 would stop catching typos in those refs. This
+// case was caught only by exercising a real cluster.
+func TestWalkRefs_v1_12_RealTalosYAMLKeys(t *testing.T) {
+	t.Parallel()
+
+	body := `apiVersion: v1alpha1
+kind: VLANConfig
+name: ens5.42
+parent: ens5
+vlanID: 42
+---
+apiVersion: v1alpha1
+kind: BridgeConfig
+name: br0
+links:
+  - ens6
+  - ens7
+---
+apiVersion: v1alpha1
+kind: BondConfig
+name: bond0
+links:
+  - ens8
+  - ens9
+---
+apiVersion: v1alpha1
+kind: Layer2VIPConfig
+name: 10.0.0.1
+link: bond0
+---
+apiVersion: v1alpha1
+kind: HCloudVIPConfig
+name: 10.0.0.2
+link: bond0
+`
+	refs, err := applycheck.WalkRefs([]byte(body))
+	if err != nil {
+		t.Fatalf("WalkRefs: %v", err)
+	}
+
+	want := []struct {
+		name   string
+		source string // suffix
+	}{
+		{"ens5", ".parent"},   // VLANConfig.parent (NOT .link)
+		{"ens6", ".links[0]"}, // BridgeConfig.links (NOT .ports)
+		{"ens7", ".links[1]"},
+		{"ens8", ".links[0]"}, // BondConfig.links
+		{"ens9", ".links[1]"},
+		{"bond0", ".link"}, // Layer2VIPConfig.link + HCloudVIPConfig.link
+	}
+
+	for _, w := range want {
+		var found bool
+
+		for _, r := range refs {
+			if r.Kind == applycheck.RefKindLink && r.Name == w.name && strings.HasSuffix(r.Source, w.source) {
+				found = true
+
+				break
+			}
+		}
+
+		if !found {
+			t.Errorf("expected link ref %q with source suffix %q, got refs=%+v", w.name, w.source, refs)
+		}
+	}
+
+	// Virtual-link names (the doc's own .name on VLAN/Bond/Bridge)
+	// must NOT surface as refs.
+	for _, fake := range []string{"ens5.42", "br0", "bond0"} {
+		if _, ok := findRef(refs, applycheck.RefKindLink, fake); ok && fake != "bond0" {
+			t.Errorf("name %q leaked as a ref; only existing-link refs should be emitted", fake)
+		}
+	}
+}
+
+// TestWalkRefs_AccidentalEncodings_Tolerated pins three real-world
+// shapes that come out of editors and unzipping operations: a UTF-8
+// BOM at the head of the file (common Windows artifact), CRLF line
+// endings (same), and no trailing newline at EOF (no-final-newline
+// editors). The walker uses gopkg.in/yaml.v3 which accepts all
+// three, but a regression in any wrapper would silently produce
+// zero refs and let a typoed config through.
+func TestWalkRefs_AccidentalEncodings_Tolerated(t *testing.T) {
+	t.Parallel()
+
+	cases := []struct {
+		name     string
+		body     string
+		wantRefs int
+	}{
+		{
+			name:     "UTF-8 BOM + CRLF",
+			body:     "\xef\xbb\xbfmachine:\r\n  install:\r\n    disk: /dev/sda\r\n",
+			wantRefs: 1,
+		},
+		{
+			name:     "no trailing newline at EOF",
+			body:     "apiVersion: v1alpha1\nkind: LinkConfig\nname: eth0",
+			wantRefs: 1,
+		},
+		{
+			name:     "only separators, no content",
+			body:     "---\n---\n---\n",
+			wantRefs: 0,
+		},
+	}
+
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			t.Parallel()
+
+			refs, err := applycheck.WalkRefs([]byte(tc.body))
+			if err != nil {
+				t.Errorf("error: %v", err)
+			}
+
+			if len(refs) != tc.wantRefs {
+				t.Errorf("got %d refs, want %d (%+v)", len(refs), tc.wantRefs, refs)
+			}
+		})
+	}
+}
+
+// TestWalkRefs_MalformedStructures_NoPanic pins the walker's
+// tolerance for malformed shapes: corrupt YAML in fields the walker
+// inspects must not panic the whole render. The walker uses
+// best-effort type assertions and drops malformed entries silently
+// (Talos's own parser will reject them downstream with a clearer
+// error than the walker could give).
+func TestWalkRefs_MalformedStructures_NoPanic(t *testing.T) {
+	t.Parallel()
+
+	cases := []struct {
+		name string
+		body string
+	}{
+		{
+			name: "machine.install is a string",
+			body: `machine:
+  install: "not a map"
+`,
+		},
+		{
+			name: "machine.network.interfaces is a string",
+			body: `machine:
+  network:
+    interfaces: "not a list"
+`,
+		},
+		{
+			name: "interface entry is an int",
+			body: `machine:
+  network:
+    interfaces:
+      - 42
+`,
+		},
+		{
+			name: "LinkConfig with int name",
+			body: `apiVersion: v1alpha1
+kind: LinkConfig
+name: 42
+`,
+		},
+		{
+			name: "BondConfig links contains non-strings",
+			body: `apiVersion: v1alpha1
+kind: BondConfig
+name: bond0
+links:
+  - eth0
+  - 42
+  - null
+`,
+		},
+		{
+			name: "UserVolumeConfig provisioning missing",
+			body: `apiVersion: v1alpha1
+kind: UserVolumeConfig
+name: data
+`,
+		},
+		{
+			name: "yaml document with only null",
+			body: `---
+null
+---
+machine:
+  install:
+    disk: /dev/sda
+`,
+		},
+		{
+			name: "doc with kind but no name",
+			body: `apiVersion: v1alpha1
+kind: LinkConfig
+`,
+		},
+	}
+
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			t.Parallel()
+
+			// The bar is "no panic, no error". Whether the walker
+			// emits 0 or N refs is implementation detail; the
+			// contract is that malformed shapes are *survivable*.
+			_, err := applycheck.WalkRefs([]byte(tc.body))
+			if err != nil {
+				t.Errorf("malformed YAML body should not error, got %v", err)
+			}
+		})
+	}
+}
diff --git a/pkg/applycheck/selector.go b/pkg/applycheck/selector.go
new file mode 100644
index 00000000..3e8c1154
--- /dev/null
+++ b/pkg/applycheck/selector.go
@@ -0,0 +1,388 @@
+// Copyright Cozystack Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package applycheck
+
+import (
+	"fmt"
+	"path/filepath"
+	"sort"
+	"strings"
+
+	"github.com/dustin/go-humanize"
+)
+
+const (
+	cmpGE    = ">="
+	cmpLE    = "<="
+	cmpEQ    = "=="
+	cmpGT    = ">"
+	cmpLT    = "<"
+	noneText = "<none>"
+
+	// maxHintItems caps the candidate list shown in selector / link
+	// missing-ref hints. On cozystack hosts after virtual-filter the
+	// real candidate count is small, but loose matchers (e.g. type:
+	// ssd on a storage node) can still surface 20+ entries that drown
+	// the actual finding line. Cap at 10 alphabetically; the tail
+	// collapses into "... and N more" so the operator can scan
+	// quickly and re-run discovery if they need the full set.
+	maxHintItems = 10
+
+	// transportNVMe is the value Talos's block.DiskSpec.Transport
+	// takes for NVMe disks. Kept separate from selectorTypeNVMe
+	// even though they're the same string today: a future Talos
+	// rename of either side independently would otherwise silently
+	// break only one of the two checks.
+	transportNVMe = "nvme"
+	// selectorTypeNVMe is the v1alpha1 InstallDiskSelector.type
+	// enum value for NVMe disks (see predicateType).
+	selectorTypeNVMe = "nvme"
+)
+
+// virtualBusPath is the BusPath value Talos assigns to kernel-virtual
+// block devices (loop, dm, drbd, ram, ...). Real disks live under a
+// PCI / virtio / etc. bus path; matchSelector excludes virtual devices
+// from candidacy so a selector never silently lands on /dev/loop0.
+const virtualBusPath = "/virtual"
+
+// matchSelector returns every disk in candidates that satisfies every
+// non-empty field of sel, excluding read-only, CD-ROM, and kernel-virtual
+// devices the way Talos's install-disk resolution does. An empty
+// selector matches every non-excluded candidate.
+//
+// Match semantics mirror Talos's InstallDiskSelector at the level the
+// pre-apply gate cares about — model and busPath are shell-globs (Talos
+// accepts globs in the YAML), size accepts comparators like ">= 1TB",
+// serial/wwid/modalias/uuid/name are exact-match. The `type` field is
+// not a stored disk attribute but a predicate over (Transport,
+// Rotational), mirrored here from v1alpha1_provider.go:1325-1351.
+func matchSelector(sel *DiskSelector, candidates []DiskInfo) []DiskInfo {
+	var matches []DiskInfo
+
+	for i := range candidates {
+		disk := &candidates[i]
+
+		if isExcludedDisk(disk) {
+			continue
+		}
+
+		if diskMatches(sel, disk) {
+			matches = append(matches, *disk)
+		}
+	}
+
+	return matches
+}
+
+// isExcludedDisk reports whether a disk should never be a selector
+// candidate. Readonly and CDROM are explicit Talos exclusions; virtual
+// devices (dm-*, drbd*, loop*) carry bus_path "/virtual" and are
+// inappropriate install / volume targets.
+func isExcludedDisk(disk *DiskInfo) bool {
+	return disk.Readonly || disk.CDROM || disk.BusPath == virtualBusPath
+}
+
+// diskPredicate is one rule the candidate disk has to satisfy. matchAll
+// short-circuits on the first false.
+type diskPredicate func(sel *DiskSelector, disk *DiskInfo) bool
+
+//nolint:gochecknoglobals // dispatch table, static after init.
+var diskPredicates = []diskPredicate{
+	predicateGlob(func(s *DiskSelector) string { return s.Model }, func(d *DiskInfo) string { return d.Model }),
+	predicateExact(func(s *DiskSelector) string { return s.Serial }, func(d *DiskInfo) string { return d.Serial }),
+	predicateExact(func(s *DiskSelector) string { return s.WWID }, func(d *DiskInfo) string { return d.WWID }),
+	predicateExact(func(s *DiskSelector) string { return s.Modalias }, func(d *DiskInfo) string { return d.Modalias }),
+	predicateExact(func(s *DiskSelector) string { return s.UUID }, func(d *DiskInfo) string { return d.UUID }),
+	predicateType,
+	predicateGlob(func(s *DiskSelector) string { return s.BusPath }, func(d *DiskInfo) string { return d.BusPath }),
+	predicateExact(func(s *DiskSelector) string { return s.Name }, func(d *DiskInfo) string { return d.DevPath }),
+	predicateSize,
+}
+
+// predicateType mirrors Talos's v1alpha1 type-selector resolution
+// (config/types/v1alpha1/v1alpha1_provider.go:1325-1351):
+//
+//	type: nvme -> Transport == "nvme"
+//	type: sd   -> Transport == "mmc"
+//	type: hdd  -> Rotational == true
+//	type: ssd  -> Rotational == false
+//
+// An empty selector type is "don't care".
+//
+// Case handling intentionally diverges from Talos: predicateType
+// lowercases sel.Type before the switch, whereas Talos returns
+// `unsupported disk type "SSD"` for mixed-case input. Phase 1's job
+// is to surface declared-resource mismatches at render time, NOT to
+// re-implement Talos's case-strict input validation — an operator
+// who declared `type: SSD` in values.yaml will get either (a) a
+// match here and a downstream Talos error at apply, or (b) the
+// "matches zero disks" finding from the rest of the selector logic.
+// Case (a) is acceptable: Phase 1 didn't introduce the typo, and
+// the Talos error itself is clear. Tightening this branch would
+// require its own enum hint and offer no protection beyond
+// duplicating Talos's check.
+func predicateType(sel *DiskSelector, disk *DiskInfo) bool {
+	if sel.Type == "" {
+		return true
+	}
+
+	switch strings.ToLower(sel.Type) {
+	case selectorTypeNVMe:
+		return disk.Transport == transportNVMe
+	case "sd":
+		return disk.Transport == "mmc"
+	case "hdd":
+		return disk.Rotational
+	case "ssd":
+		return !disk.Rotational
+	}
+
+	return false
+}
+
+func diskMatches(sel *DiskSelector, disk *DiskInfo) bool {
+	for _, p := range diskPredicates {
+		if !p(sel, disk) {
+			return false
+		}
+	}
+
+	return true
+}
+
+// predicateExact returns a predicate that compares two extracted string
+// fields exactly. An empty selector field is "don't care" — passes.
+func predicateExact(sel func(*DiskSelector) string, dsk func(*DiskInfo) string) diskPredicate {
+	return func(s *DiskSelector, disk *DiskInfo) bool {
+		want := sel(s)
+		if want == "" {
+			return true
+		}
+
+		return want == dsk(disk)
+	}
+}
+
+// predicateGlob matches the disk field against the selector field as a
+// shell-style glob (Talos accepts `Samsung*` and similar in model/busPath).
+func predicateGlob(sel func(*DiskSelector) string, dsk func(*DiskInfo) string) diskPredicate {
+	return func(s *DiskSelector, disk *DiskInfo) bool {
+		pattern := sel(s)
+		if pattern == "" {
+			return true
+		}
+
+		return globMatch(pattern, dsk(disk))
+	}
+}
+
+// predicateSize evaluates the Size comparator. Pulled out of the table
+// because it's the only non-string predicate.
+func predicateSize(s *DiskSelector, d *DiskInfo) bool {
+	if s.Size == "" {
+		return true
+	}
+
+	return sizeMatches(s.Size, d.Size)
+}
+
+// globMatch wraps filepath.Match with exact-match fallback. Talos
+// selectors use shell-style globs; filepath.Match covers `*`/`?`/`[…]`.
+func globMatch(pattern, value string) bool {
+	if pattern == value {
+		return true
+	}
+
+	ok, err := filepath.Match(pattern, value)
+	if err != nil {
+		return false
+	}
+
+	return ok
+}
+
+// sizeMatches evaluates a Talos-style size expression (e.g. "500GB",
+// "<= 1TB", ">= 100GB") against a candidate disk's size in bytes.
+// Unparseable expressions return false — surfaces as "matches zero",
+// which is the right operator signal for a malformed selector.
+func sizeMatches(expr string, sizeBytes uint64) bool {
+	cmp, raw := splitSizeOp(expr)
+
+	want, ok := parseSize(raw)
+	if !ok {
+		return false
+	}
+
+	switch cmp {
+	case cmpGE:
+		return sizeBytes >= want
+	case cmpLE:
+		return sizeBytes <= want
+	case cmpGT:
+		return sizeBytes > want
+	case cmpLT:
+		return sizeBytes < want
+	case cmpEQ, "":
+		return sizeBytes == want
+	}
+
+	return false
+}
+
+// splitSizeOp lifts a leading comparator from a size expression.
+// Whitespace between the operator and the literal is tolerated. Returns
+// ("", expr) when no comparator is present (equality is implied).
+func splitSizeOp(expr string) (string, string) {
+	expr = strings.TrimSpace(expr)
+
+	for _, prefix := range []string{cmpGE, cmpLE, cmpEQ, cmpGT, cmpLT} {
+		if rest, ok := strings.CutPrefix(expr, prefix); ok {
+			return prefix, strings.TrimSpace(rest)
+		}
+	}
+
+	return "", expr
+}
+
+// parseSize converts a human-readable size literal ("500GB", "1.5tb",
+// "100 MiB", "500gb") to bytes via humanize.ParseBytes, which mirrors
+// Talos's own size parser (block/InstallDiskSelector resolution
+// delegates to it). SI (kB/MB/GB/TB) and IEC (KiB/MiB/GiB/TiB) units
+// are accepted case-insensitively. A bare number is bytes.
+func parseSize(raw string) (uint64, bool) {
+	raw = strings.TrimSpace(raw)
+	if raw == "" {
+		return 0, false
+	}
+
+	value, err := humanize.ParseBytes(raw)
+	if err != nil {
+		return 0, false
+	}
+
+	return value, true
+}
+
+// diskPathList returns the sorted set of DiskInfo.DevPath values for
+// non-excluded candidates so error messages list them in a stable,
+// scannable order without burying real candidates under noise from
+// dm/drbd/loop entries.
+func diskPathList(disks []DiskInfo) []string {
+	var paths []string
+
+	for i := range disks {
+		if isExcludedDisk(&disks[i]) {
+			continue
+		}
+
+		paths = append(paths, disks[i].DevPath)
+	}
+
+	sort.Strings(paths)
+
+	return paths
+}
+
+// quote returns the value wrapped in double quotes; used in finding
+// messages so a name with spaces (rare but possible for disk identifiers)
+// stays readable.
+func quote(s string) string {
+	return `"` + s + `"`
+}
+
+// joinAvailable returns the input as a comma-separated list, sorted.
+// Empty input becomes "<none>" so the operator sees an explicit signal
+// instead of an awkward "available links: ". Lists longer than
+// maxHintItems are truncated with a "... and N more" suffix.
+func joinAvailable(items []string) string {
+	if len(items) == 0 {
+		return noneText
+	}
+
+	sorted := append([]string(nil), items...)
+	sort.Strings(sorted)
+
+	return strings.Join(truncateAtN(sorted, maxHintItems), ", ")
+}
+
+// truncateAtN caps a pre-sorted list of human-readable items at n
+// entries; the (len - n) tail collapses into a single "... and K
+// more" string appended at the end. Returns the input unmodified
+// when len <= n.
+func truncateAtN(items []string, n int) []string {
+	if len(items) <= n {
+		return items
+	}
+
+	head := make([]string, 0, n+1)
+	head = append(head, items[:n]...)
+	head = append(head, fmt.Sprintf("... and %d more", len(items)-n))
+
+	return head
+}
+
+// summarizeDisks returns a compact human-readable list of non-excluded
+// disks, one per `path model serial size` triple. Used in selector-
+// mismatch findings so the operator can pick the right disk without
+// running another command. Excluded devices (virtual, readonly, CD)
+// are omitted from the summary to keep the hint actionable.
+func summarizeDisks(disks []DiskInfo) string {
+	var parts []string
+
+	for i := range disks {
+		if isExcludedDisk(&disks[i]) {
+			continue
+		}
+
+		parts = append(parts, formatDisk(&disks[i]))
+	}
+
+	if len(parts) == 0 {
+		return noneText
+	}
+
+	sort.Strings(parts)
+
+	return strings.Join(truncateAtN(parts, maxHintItems), "; ")
+}
+
+func formatDisk(disk *DiskInfo) string {
+	var builder strings.Builder
+
+	builder.WriteString(disk.DevPath)
+
+	if disk.Model != "" {
+		builder.WriteString(" model=")
+		builder.WriteString(disk.Model)
+	}
+
+	if disk.Serial != "" {
+		builder.WriteString(" serial=")
+		builder.WriteString(disk.Serial)
+	}
+
+	if disk.Size > 0 {
+		builder.WriteString(" size=")
+		builder.WriteString(formatBytes(disk.Size))
+	}
+
+	return builder.String()
+}
+
+// formatBytes renders a byte count via humanize.Bytes, matching the
+// parse-side parseSize implementation symmetrically.
+func formatBytes(b uint64) string {
+	return humanize.Bytes(b)
+}
diff --git a/pkg/applycheck/validate.go b/pkg/applycheck/validate.go
new file mode 100644
index 00000000..ac02c2df
--- /dev/null
+++ b/pkg/applycheck/validate.go
@@ -0,0 +1,171 @@
+// Copyright Cozystack Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package applycheck
+
+// HostSnapshot captures the host-side resource inventory the validator
+// compares declared refs against. Fields are populated from COSI
+// `links` and `disks` reads at apply time; tests construct fakes
+// directly.
+type HostSnapshot struct {
+	Links []string
+	Disks []DiskInfo
+}
+
+// DiskInfo describes one host block device. Fields mirror Talos's
+// block.Disk COSI resource shape verbatim — InstallDiskSelector.type
+// (an enum: ssd/hdd/nvme/sd) is NOT a stored field but a derived
+// predicate over Transport + Rotational, mirrored in matchSelector.
+// CDROM and Readonly devices are excluded by Talos's install-disk
+// resolution; matchSelector mirrors that exclusion.
+//
+// Symlinks captures the alternate path forms Talos exposes for the
+// same device (/dev/disk/by-id/wwn-…, /dev/disk/by-path/pci-…,
+// /dev/disk/by-diskseq/…); RefKindDiskLiteral validation accepts any
+// of these as equivalent to DevPath, because by-id paths are the
+// recommended stable form for `machine.install.disk` and an operator
+// using them is doing the right thing.
+type DiskInfo struct {
+	DevPath    string // /dev/sda
+	Model      string
+	Serial     string
+	WWID       string
+	Modalias   string
+	UUID       string
+	Transport  string // sata / nvme / scsi / mmc / virtio / ...
+	BusPath    string
+	Size       uint64 // bytes
+	Rotational bool
+	Readonly   bool
+	CDROM      bool
+	Symlinks   []string
+}
+
+// Severity classifies a finding's blocker status.
+type Severity int
+
+const (
+	// SeverityBlocker fails the apply.
+	SeverityBlocker Severity = iota
+	// SeverityWarning prints the finding but does not block.
+	SeverityWarning
+)
+
+// Finding is one validation result for one Ref. ValidateRefs returns one
+// Finding per Ref; clean refs produce no findings.
+type Finding struct {
+	Ref      Ref
+	Severity Severity
+	Reason   string
+	Hint     string
+}
+
+// IsBlocker returns true when the finding fails the apply.
+func (f *Finding) IsBlocker() bool {
+	return f.Severity == SeverityBlocker
+}
+
+// ValidateRefs checks each Ref against the host snapshot:
+//
+//   - RefKindLink: name must match an entry in snapshot.Links exactly.
+//     Mismatch is a blocker.
+//   - RefKindDiskLiteral: name must equal a DiskInfo.DevPath in
+//     snapshot.Disks. Mismatch is a blocker.
+//   - RefKindDiskSelector: selector must match >= 1 disk. Zero matches is
+//     a blocker. Multiple matches is a warning (install picks the first
+//     match — operators usually want to narrow the selector).
+//
+// ValidateRefs collects every finding in a single pass; callers can show
+// them all at once instead of one-at-a-time.
+func ValidateRefs(refs []Ref, snapshot HostSnapshot) []Finding {
+	if len(refs) == 0 {
+		return nil
+	}
+
+	linkSet := make(map[string]struct{}, len(snapshot.Links))
+	for _, name := range snapshot.Links {
+		linkSet[name] = struct{}{}
+	}
+
+	// Disk-literal validation accepts DevPath (`/dev/sda`) and every
+	// stable Symlink alternative (/dev/disk/by-id/wwn-…, by-path/…,
+	// by-diskseq/…). The recommended Talos pattern is by-id, so the
+	// gate must not reject by-id literals.
+	diskPaths := make(map[string]struct{}, len(snapshot.Disks)*4) //nolint:mnd // upper-bound estimate for sym-count.
+	for i := range snapshot.Disks {
+		d := &snapshot.Disks[i]
+		diskPaths[d.DevPath] = struct{}{}
+
+		for _, sym := range d.Symlinks {
+			diskPaths[sym] = struct{}{}
+		}
+	}
+
+	var findings []Finding
+
+	for i := range refs {
+		ref := &refs[i]
+		switch ref.Kind {
+		case RefKindLink:
+			findings = appendIfMissing(findings, ref, linkSet, snapshot.Links, "link")
+		case RefKindDiskLiteral:
+			findings = appendIfMissing(findings, ref, diskPaths, diskPathList(snapshot.Disks), "disk")
+		case RefKindDiskSelector:
+			findings = appendSelectorFinding(findings, ref, snapshot.Disks)
+		}
+	}
+
+	return findings
+}
+
+// appendIfMissing appends a blocker finding when ref.Name isn't in present.
+// available is the sorted-by-the-caller list shown to the operator so they
+// can pick the right name without re-running discovery.
+func appendIfMissing(findings []Finding, ref *Ref, present map[string]struct{}, available []string, kind string) []Finding {
+	if _, ok := present[ref.Name]; ok {
+		return findings
+	}
+
+	return append(findings, Finding{
+		Ref:      *ref,
+		Severity: SeverityBlocker,
+		Reason:   "declared " + kind + " " + quote(ref.Name) + " not found on target node",
+		Hint:     "available " + kind + "s: " + joinAvailable(available),
+	})
+}
+
+// appendSelectorFinding emits a blocker on zero matches and a warning on
+// multiple matches. A single match is clean (no finding).
+func appendSelectorFinding(findings []Finding, ref *Ref, disks []DiskInfo) []Finding {
+	matches := matchSelector(&ref.Selector, disks)
+
+	switch {
+	case len(matches) == 0:
+		return append(findings, Finding{
+			Ref:      *ref,
+			Severity: SeverityBlocker,
+			Reason:   "disk selector matches zero disks on target node",
+			Hint:     "available disks: " + summarizeDisks(disks),
+		})
+	case len(matches) > 1:
+		return append(findings, Finding{
+			Ref:      *ref,
+			Severity: SeverityWarning,
+			Reason:   "disk selector matches multiple disks; install picks the first match",
+			Hint:     "matched: " + summarizeDisks(matches),
+		})
+	}
+
+	return findings
+}
diff --git a/pkg/applycheck/validate_test.go b/pkg/applycheck/validate_test.go
new file mode 100644
index 00000000..03cb91da
--- /dev/null
+++ b/pkg/applycheck/validate_test.go
@@ -0,0 +1,563 @@
+// Copyright Cozystack Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package applycheck_test
+
+import (
+	"strings"
+	"testing"
+
+	"github.com/cozystack/talm/pkg/applycheck"
+)
+
+func TestValidateRefs_LinkMissing_EmitsBlockerWithAvailableList(t *testing.T) {
+	t.Parallel()
+
+	refs := []applycheck.Ref{{
+		Kind:   applycheck.RefKindLink,
+		Name:   "eth9999",
+		Source: "machine.network.interfaces[0].interface",
+	}}
+	snapshot := applycheck.HostSnapshot{
+		Links: []string{"eth0", "eth1"},
+	}
+
+	findings := applycheck.ValidateRefs(refs, snapshot)
+	if len(findings) != 1 {
+		t.Fatalf("ValidateRefs returned %d findings, want 1: %+v", len(findings), findings)
+	}
+
+	f := findings[0]
+	if !f.IsBlocker() {
+		t.Errorf("missing link should be a blocker, got severity %v", f.Severity)
+	}
+
+	if !strings.Contains(f.Reason, "eth9999") {
+		t.Errorf("reason should cite the offending name, got %q", f.Reason)
+	}
+
+	if !strings.Contains(f.Hint, "eth0") || !strings.Contains(f.Hint, "eth1") {
+		t.Errorf("hint should list available links, got %q", f.Hint)
+	}
+}
+
+// TestValidateRefs_LinkMissing_HintTruncatedAtTopN pins the hint
+// length budget. A cozystack host can surface dozens of link names
+// (bonds, VLANs, bridges, lo, eth*) — a flat join produces 1000+ char
+// hints that drown the actual blocker line. The hint must show the
+// first 10 names alphabetically and collapse the tail into "... and
+// N more"; operators can re-run `talm get links` if they need the
+// full list.
+func TestValidateRefs_LinkMissing_HintTruncatedAtTopN(t *testing.T) {
+	t.Parallel()
+
+	links := make([]string, 0, 30)
+	for i := range 30 {
+		links = append(links, "eth"+itoa(i))
+	}
+
+	refs := []applycheck.Ref{{Kind: applycheck.RefKindLink, Name: "eth9999"}}
+	snapshot := applycheck.HostSnapshot{Links: links}
+
+	findings := applycheck.ValidateRefs(refs, snapshot)
+	if len(findings) != 1 {
+		t.Fatalf("expected one blocker, got %+v", findings)
+	}
+
+	hint := findings[0].Hint
+	if !strings.Contains(hint, "... and 20 more") {
+		t.Errorf("hint should collapse the tail into '... and N more', got %q", hint)
+	}
+
+	// First 10 in alphabetical sort: eth0, eth1, eth10..eth17.
+	// The tail (eth18..eth9, eth20..eth29) must NOT appear inline.
+	if strings.Contains(hint, "eth29") {
+		t.Errorf("hint should not list eth29 (beyond top-10), got %q", hint)
+	}
+
+	if !strings.Contains(hint, "eth0") {
+		t.Errorf("hint should list eth0 (first alphabetically), got %q", hint)
+	}
+}
+
+// TestValidateRefs_DiskSelectorMismatch_HintTruncated pins the same
+// budget for the disk-summary path: virtual devices are already
+// filtered, but a host with many real disks (storage nodes, RAID
+// hosts) still produces unwieldy hints. Cap at 10 disks + "... and N
+// more".
+func TestValidateRefs_DiskSelectorMismatch_HintTruncated(t *testing.T) {
+	t.Parallel()
+
+	disks := make([]applycheck.DiskInfo, 0, 25)
+	for i := range 25 {
+		disks = append(disks, applycheck.DiskInfo{
+			DevPath: "/dev/sd" + string(rune('a'+i%26)) + itoa(i/26),
+			Model:   "FakeModel",
+			BusPath: "/pci0000:00",
+		})
+	}
+
+	refs := []applycheck.Ref{{
+		Kind:     applycheck.RefKindDiskSelector,
+		Name:     "{model: NoSuchModel}",
+		Selector: applycheck.DiskSelector{Model: "NoSuchModel"},
+	}}
+	snapshot := applycheck.HostSnapshot{Disks: disks}
+
+	findings := applycheck.ValidateRefs(refs, snapshot)
+	if len(findings) != 1 {
+		t.Fatalf("expected one blocker, got %+v", findings)
+	}
+
+	hint := findings[0].Hint
+	if !strings.Contains(hint, "... and 15 more") {
+		t.Errorf("disk hint should collapse the tail, got %q", hint)
+	}
+}
+
+// itoa is a tiny helper to avoid pulling strconv into the test file
+// for a single integer formatting site.
+func itoa(n int) string {
+	if n == 0 {
+		return "0"
+	}
+
+	var buf [20]byte
+
+	i := len(buf)
+
+	neg := n < 0
+	if neg {
+		n = -n
+	}
+
+	for n > 0 {
+		i--
+		buf[i] = byte('0' + n%10)
+		n /= 10
+	}
+
+	if neg {
+		i--
+		buf[i] = '-'
+	}
+
+	return string(buf[i:])
+}
+
+func TestValidateRefs_LinkPresent_NoFinding(t *testing.T) {
+	t.Parallel()
+
+	refs := []applycheck.Ref{{Kind: applycheck.RefKindLink, Name: "eth0"}}
+	snapshot := applycheck.HostSnapshot{Links: []string{"eth0", "eth1"}}
+
+	findings := applycheck.ValidateRefs(refs, snapshot)
+	if len(findings) != 0 {
+		t.Errorf("ValidateRefs returned %d findings, want 0: %+v", len(findings), findings)
+	}
+}
+
+func TestValidateRefs_DiskLiteralMissing_EmitsBlocker(t *testing.T) {
+	t.Parallel()
+
+	refs := []applycheck.Ref{{Kind: applycheck.RefKindDiskLiteral, Name: "/dev/sdz"}}
+	snapshot := applycheck.HostSnapshot{
+		Disks: []applycheck.DiskInfo{
+			{DevPath: "/dev/sda", Model: "Samsung 980 Pro"},
+			{DevPath: "/dev/nvme0n1"},
+		},
+	}
+
+	findings := applycheck.ValidateRefs(refs, snapshot)
+	if len(findings) != 1 || !findings[0].IsBlocker() {
+		t.Fatalf("expected one blocker, got %+v", findings)
+	}
+
+	if !strings.Contains(findings[0].Hint, "/dev/sda") {
+		t.Errorf("hint should list available disk paths, got %q", findings[0].Hint)
+	}
+}
+
+// TestValidateRefs_DiskLiteralByID_AcceptedViaSymlink pins the
+// contract: machine.install.disk accepts the by-id /
+// by-path / by-diskseq stable forms Talos exposes via Disk.Symlinks.
+// These are the recommended forms for stable boot ordering, so the
+// gate must not block them — verified against a live Talos node
+// where /dev/sda surfaces /dev/disk/by-id/wwn-0x602742… as a symlink.
+func TestValidateRefs_DiskLiteralByID_AcceptedViaSymlink(t *testing.T) {
+	t.Parallel()
+
+	refs := []applycheck.Ref{{
+		Kind: applycheck.RefKindDiskLiteral,
+		Name: "/dev/disk/by-id/wwn-0x602742ce4e9046729ae81c05166f4d8e",
+	}}
+	snapshot := applycheck.HostSnapshot{
+		Disks: []applycheck.DiskInfo{
+			{
+				DevPath: "/dev/sda",
+				WWID:    "naa.602742ce4e9046729ae81c05166f4d8e",
+				Symlinks: []string{
+					"/dev/disk/by-diskseq/23",
+					"/dev/disk/by-id/scsi-3602742ce4e9046729ae81c05166f4d8e",
+					"/dev/disk/by-id/wwn-0x602742ce4e9046729ae81c05166f4d8e",
+					"/dev/disk/by-path/pci-0000:00:04.0-scsi-0:0:0:1",
+				},
+			},
+		},
+	}
+
+	findings := applycheck.ValidateRefs(refs, snapshot)
+	if len(findings) != 0 {
+		t.Errorf("by-id symlink should resolve to /dev/sda; gate must not block, got findings=%+v", findings)
+	}
+}
+
+func TestValidateRefs_SelectorZeroMatches_Blocker(t *testing.T) {
+	t.Parallel()
+
+	refs := []applycheck.Ref{{
+		Kind:     applycheck.RefKindDiskSelector,
+		Selector: applycheck.DiskSelector{Model: "Samsumg*"}, // typo on purpose
+	}}
+	snapshot := applycheck.HostSnapshot{
+		Disks: []applycheck.DiskInfo{
+			{DevPath: "/dev/sda", Model: "Samsung 980 Pro"},
+		},
+	}
+
+	findings := applycheck.ValidateRefs(refs, snapshot)
+	if len(findings) != 1 || !findings[0].IsBlocker() {
+		t.Fatalf("expected one blocker on zero matches, got %+v", findings)
+	}
+}
+
+func TestValidateRefs_SelectorOneMatch_NoFinding(t *testing.T) {
+	t.Parallel()
+
+	refs := []applycheck.Ref{{
+		Kind:     applycheck.RefKindDiskSelector,
+		Selector: applycheck.DiskSelector{Model: "Samsung*"},
+	}}
+	snapshot := applycheck.HostSnapshot{
+		Disks: []applycheck.DiskInfo{
+			{DevPath: "/dev/sda", Model: "Samsung 980 Pro"},
+			{DevPath: "/dev/nvme0n1", Model: "WD Black"},
+		},
+	}
+
+	findings := applycheck.ValidateRefs(refs, snapshot)
+	if len(findings) != 0 {
+		t.Errorf("expected no findings on single match, got %+v", findings)
+	}
+}
+
+func TestValidateRefs_SelectorMultipleMatches_Warning(t *testing.T) {
+	t.Parallel()
+
+	refs := []applycheck.Ref{{
+		Kind:     applycheck.RefKindDiskSelector,
+		Selector: applycheck.DiskSelector{Type: "ssd"},
+	}}
+	// type: ssd is derived from Rotational=false (see predicateType).
+	snapshot := applycheck.HostSnapshot{
+		Disks: []applycheck.DiskInfo{
+			{DevPath: "/dev/sda", Transport: "sata", Rotational: false},
+			{DevPath: "/dev/sdb", Transport: "nvme", Rotational: false},
+		},
+	}
+
+	findings := applycheck.ValidateRefs(refs, snapshot)
+	if len(findings) != 1 || findings[0].Severity != applycheck.SeverityWarning {
+		t.Fatalf("expected one warning on multiple matches, got %+v", findings)
+	}
+}
+
+// TestValidateRefs_TypeSelector_TalosSemantics pins the (Transport,
+// Rotational) -> type-enum mapping the gate inherits from Talos's own
+// install-disk resolution. Without this mapping, a perfectly valid
+// `diskSelector: { type: ssd }` against a SATA SSD would block.
+func TestValidateRefs_TypeSelector_TalosSemantics(t *testing.T) {
+	t.Parallel()
+
+	tests := []struct {
+		name        string
+		selector    applycheck.DiskSelector
+		disk        applycheck.DiskInfo
+		wantMatches bool
+	}{
+		{
+			name:        "type ssd matches sata rotational=false",
+			selector:    applycheck.DiskSelector{Type: "ssd"},
+			disk:        applycheck.DiskInfo{DevPath: "/dev/sda", Transport: "sata", Rotational: false},
+			wantMatches: true,
+		},
+		{
+			name:        "type ssd rejects sata rotational=true",
+			selector:    applycheck.DiskSelector{Type: "ssd"},
+			disk:        applycheck.DiskInfo{DevPath: "/dev/sda", Transport: "sata", Rotational: true},
+			wantMatches: false,
+		},
+		{
+			name:        "type hdd matches scsi rotational=true",
+			selector:    applycheck.DiskSelector{Type: "hdd"},
+			disk:        applycheck.DiskInfo{DevPath: "/dev/sda", Transport: "scsi", Rotational: true},
+			wantMatches: true,
+		},
+		{
+			name:        "type nvme matches nvme transport",
+			selector:    applycheck.DiskSelector{Type: "nvme"},
+			disk:        applycheck.DiskInfo{DevPath: "/dev/nvme0n1", Transport: "nvme", Rotational: false},
+			wantMatches: true,
+		},
+		{
+			name:        "type sd matches mmc transport",
+			selector:    applycheck.DiskSelector{Type: "sd"},
+			disk:        applycheck.DiskInfo{DevPath: "/dev/mmcblk0", Transport: "mmc"},
+			wantMatches: true,
+		},
+		{
+			name:        "type SSD case-insensitive",
+			selector:    applycheck.DiskSelector{Type: "SSD"},
+			disk:        applycheck.DiskInfo{DevPath: "/dev/sda", Transport: "sata", Rotational: false},
+			wantMatches: true,
+		},
+	}
+
+	for _, tc := range tests {
+		t.Run(tc.name, func(t *testing.T) {
+			t.Parallel()
+
+			refs := []applycheck.Ref{{Kind: applycheck.RefKindDiskSelector, Selector: tc.selector}}
+			snapshot := applycheck.HostSnapshot{Disks: []applycheck.DiskInfo{tc.disk}}
+
+			findings := applycheck.ValidateRefs(refs, snapshot)
+
+			matched := len(findings) == 0
+			if matched != tc.wantMatches {
+				t.Errorf("matched=%v want=%v findings=%+v", matched, tc.wantMatches, findings)
+			}
+		})
+	}
+}
+
+// TestValidateRefs_ReadonlyAndCDROMExcluded mirrors Talos's
+// install-disk resolution, which never picks readonly or cdrom devices.
+// The gate must exclude them too — otherwise a node with a CD-ROM
+// surfaces a spurious "multiple matches" warning on every selector.
+func TestValidateRefs_ReadonlyAndCDROMExcluded(t *testing.T) {
+	t.Parallel()
+
+	refs := []applycheck.Ref{{
+		Kind:     applycheck.RefKindDiskSelector,
+		Selector: applycheck.DiskSelector{Type: "ssd"},
+	}}
+	snapshot := applycheck.HostSnapshot{
+		Disks: []applycheck.DiskInfo{
+			{DevPath: "/dev/sda", Transport: "sata", Rotational: false},
+			{DevPath: "/dev/sr0", Transport: "sata", Rotational: false, CDROM: true},
+			{DevPath: "/dev/sdz", Transport: "sata", Rotational: false, Readonly: true},
+		},
+	}
+
+	findings := applycheck.ValidateRefs(refs, snapshot)
+	if len(findings) != 0 {
+		t.Errorf("expected one clean match (sda); cdrom + readonly disks must be excluded, got findings=%+v", findings)
+	}
+}
+
+// TestValidateRefs_VirtualDisksExcluded pins the exclusion of kernel-
+// virtual block devices (loop, dm, drbd, ram). Talos block.DiskSpec
+// reports BusPath="/virtual" for those; the gate must skip them so a
+// selector like `type: ssd` on a cozystack host (which hosts many
+// loop/dm/drbd devices for DRBD-replicated PVs) doesn't surface a
+// spurious "multiple matches" warning across every install. Verified
+// on a live OCI cluster: real disks (sda, sdb) have PCI bus paths,
+// virtual disks (dm-*, drbd*, loop*) carry "/virtual".
+func TestValidateRefs_VirtualDisksExcluded(t *testing.T) {
+	t.Parallel()
+
+	refs := []applycheck.Ref{{
+		Kind:     applycheck.RefKindDiskSelector,
+		Selector: applycheck.DiskSelector{Type: "ssd"},
+	}}
+	snapshot := applycheck.HostSnapshot{
+		Disks: []applycheck.DiskInfo{
+			{DevPath: "/dev/sda", BusPath: "/pci0000:00/0000:00:04.0", Transport: "virtio", Rotational: false},
+			{DevPath: "/dev/dm-0", BusPath: "/virtual", Rotational: false},
+			{DevPath: "/dev/drbd1000", BusPath: "/virtual", Rotational: false},
+			{DevPath: "/dev/loop0", BusPath: "/virtual", Rotational: false},
+		},
+	}
+
+	findings := applycheck.ValidateRefs(refs, snapshot)
+	if len(findings) != 0 {
+		t.Errorf("expected one clean match (sda); virtual disks must be excluded, got findings=%+v", findings)
+	}
+}
+
+// TestValidateRefs_DiskLiteralHint_OmitsVirtual pins the hint output
+// scope: when a literal disk path doesn't resolve, the operator-facing
+// hint must list real block devices only, not virtual noise.
+func TestValidateRefs_DiskLiteralHint_OmitsVirtual(t *testing.T) {
+	t.Parallel()
+
+	refs := []applycheck.Ref{{
+		Kind: applycheck.RefKindDiskLiteral,
+		Name: "/dev/sdz",
+	}}
+	snapshot := applycheck.HostSnapshot{
+		Disks: []applycheck.DiskInfo{
+			{DevPath: "/dev/sda", BusPath: "/pci0000:00/0000:00:04.0"},
+			{DevPath: "/dev/dm-0", BusPath: "/virtual"},
+			{DevPath: "/dev/loop0", BusPath: "/virtual"},
+		},
+	}
+
+	findings := applycheck.ValidateRefs(refs, snapshot)
+	if len(findings) != 1 {
+		t.Fatalf("expected one blocker, got %+v", findings)
+	}
+
+	hint := findings[0].Hint
+	if strings.Contains(hint, "/dev/dm-0") || strings.Contains(hint, "/dev/loop0") {
+		t.Errorf("hint should omit virtual devices, got %q", hint)
+	}
+
+	if !strings.Contains(hint, "/dev/sda") {
+		t.Errorf("hint should list real disks, got %q", hint)
+	}
+}
+
+func TestValidateRefs_SelectorBySize_GreaterThanOrEqual(t *testing.T) {
+	t.Parallel()
+
+	refs := []applycheck.Ref{{
+		Kind:     applycheck.RefKindDiskSelector,
+		Selector: applycheck.DiskSelector{Size: ">= 500GB"},
+	}}
+	snapshot := applycheck.HostSnapshot{
+		Disks: []applycheck.DiskInfo{
+			{DevPath: "/dev/sda", Size: 250_000_000_000}, // 250GB — below cut
+			{DevPath: "/dev/sdb", Size: 500_000_000_000}, // 500GB — at cut
+			{DevPath: "/dev/sdc", Size: 1_000_000_000_000},
+		},
+	}
+
+	findings := applycheck.ValidateRefs(refs, snapshot)
+	if len(findings) != 1 || findings[0].Severity != applycheck.SeverityWarning {
+		t.Fatalf("expected one warning (two disks above 500GB), got %+v", findings)
+	}
+}
+
+// TestValidateRefs_SelectorBySize_HumanizedUnits pins the parser
+// against humanize.ParseBytes — lowercase units, mixed case, optional
+// spaces, IEC binary, and SI decimal all work. Without this the gate
+// would block `size: ">= 100gb"` selectors that Talos parses cleanly.
+func TestValidateRefs_SelectorBySize_HumanizedUnits(t *testing.T) {
+	t.Parallel()
+
+	disks := []applycheck.DiskInfo{
+		{DevPath: "/dev/sda", Size: 100_000_000_000}, // 100 GB SI
+		{DevPath: "/dev/sdb", Size: 99_000_000_000},
+	}
+
+	tests := []struct {
+		name        string
+		size        string
+		wantMatches int
+	}{
+		{"lowercase gb", ">= 100gb", 1},
+		{"lowercase gib", ">= 1gib", 2},
+		{"mixed case MiB", "<= 200000MiB", 2},
+		{"spaced unit", ">= 100 GB", 1},
+		{"bare numeric bytes", ">= 100000000000", 1},
+	}
+
+	for _, tc := range tests {
+		t.Run(tc.name, func(t *testing.T) {
+			t.Parallel()
+
+			refs := []applycheck.Ref{{
+				Kind:     applycheck.RefKindDiskSelector,
+				Selector: applycheck.DiskSelector{Size: tc.size},
+			}}
+			findings := applycheck.ValidateRefs(refs, applycheck.HostSnapshot{Disks: disks})
+
+			// 0 matches -> 1 blocker; 1 match -> 0 findings; >1 -> 1 warning.
+			var actualMatches int
+
+			switch {
+			case len(findings) == 0:
+				actualMatches = 1
+			case findings[0].Severity == applycheck.SeverityBlocker:
+				actualMatches = 0
+			default:
+				actualMatches = 2
+			}
+
+			if actualMatches != tc.wantMatches {
+				t.Errorf("size=%q matched %d disks, want %d (findings=%+v)", tc.size, actualMatches, tc.wantMatches, findings)
+			}
+		})
+	}
+}
+
+func TestValidateRefs_SelectorBySerial_ExactMatch(t *testing.T) {
+	t.Parallel()
+
+	refs := []applycheck.Ref{{
+		Kind:     applycheck.RefKindDiskSelector,
+		Selector: applycheck.DiskSelector{Serial: "S649NJ0R602345"},
+	}}
+	snapshot := applycheck.HostSnapshot{
+		Disks: []applycheck.DiskInfo{
+			{DevPath: "/dev/sda", Serial: "S649NJ0R602345"},
+			{DevPath: "/dev/sdb", Serial: "S649NJ0R602346"},
+		},
+	}
+
+	findings := applycheck.ValidateRefs(refs, snapshot)
+	if len(findings) != 0 {
+		t.Errorf("exact serial should match exactly one disk, got %+v", findings)
+	}
+}
+
+func TestValidateRefs_MultipleProblems_AllSurfacedInOnePass(t *testing.T) {
+	t.Parallel()
+
+	refs := []applycheck.Ref{
+		{Kind: applycheck.RefKindLink, Name: "eth9999"},
+		{Kind: applycheck.RefKindLink, Name: "br0"},
+		{Kind: applycheck.RefKindDiskLiteral, Name: "/dev/sdz"},
+		{Kind: applycheck.RefKindDiskSelector, Selector: applycheck.DiskSelector{Model: "Nope*"}},
+	}
+	snapshot := applycheck.HostSnapshot{
+		Links: []string{"eth0", "eth1", "br0"},
+		Disks: []applycheck.DiskInfo{{DevPath: "/dev/sda", Model: "Samsung"}},
+	}
+
+	findings := applycheck.ValidateRefs(refs, snapshot)
+	if len(findings) != 3 {
+		t.Errorf("expected 3 findings (1 missing link, 1 missing disk literal, 1 zero-match selector), got %d: %+v", len(findings), findings)
+	}
+}
+
+func TestValidateRefs_EmptyRefs_NoFindings(t *testing.T) {
+	t.Parallel()
+
+	findings := applycheck.ValidateRefs(nil, applycheck.HostSnapshot{Links: []string{"eth0"}})
+	if len(findings) != 0 {
+		t.Errorf("ValidateRefs(nil) returned %d findings, want 0", len(findings))
+	}
+}
diff --git a/pkg/commands/apply.go b/pkg/commands/apply.go
index df688b2b..ceb465cd 100644
--- a/pkg/commands/apply.go
+++ b/pkg/commands/apply.go
@@ -17,6 +17,7 @@ package commands
 import (
 	"context"
 	"fmt"
+	"io"
 	"os"
 	"path/filepath"
 	"strings"
@@ -49,20 +50,23 @@ const applyCommandName = "talm apply"
 var applyCmdFlags struct {
 	helpers.Mode
 
-	certFingerprints  []string
-	insecure          bool
-	configFiles       []string // -f/--files
-	talosVersion      string
-	withSecrets       string
-	debug             bool
-	kubernetesVersion string
-	dryRun            bool
-	preserve          bool
-	stage             bool
-	force             bool
-	configTryTimeout  time.Duration
-	nodesFromArgs     bool
-	endpointsFromArgs bool
+	certFingerprints       []string
+	insecure               bool
+	configFiles            []string // -f/--files
+	talosVersion           string
+	withSecrets            string
+	debug                  bool
+	kubernetesVersion      string
+	dryRun                 bool
+	preserve               bool
+	stage                  bool
+	force                  bool
+	configTryTimeout       time.Duration
+	nodesFromArgs          bool
+	endpointsFromArgs      bool
+	skipResourceValidation bool
+	skipDriftPreview       bool
+	skipPostApplyVerify    bool
 }
 
 //nolint:gochecknoglobals // cobra command, idiomatic for cobra-based CLIs
@@ -239,13 +243,17 @@ func applyOneFileTemplateMode(configFile string, modelineTemplates []string, wit
 // original ctx unchanged.
 func buildApplyClosure() applyFunc {
 	return func(ctx context.Context, c *client.Client, data []byte) error {
-		cosiCtx, err := cosiPreflightContext(ctx)
+		cosiCtx, nodeID, err := cosiPreflightContext(ctx)
 		if err != nil {
 			return err
 		}
 
 		preflightCheckTalosVersion(cosiCtx, cosiVersionReader(c), applyCmdFlags.talosVersion, os.Stderr)
 
+		if err := runPreApplyGates(cosiCtx, c, data, nodeID, os.Stderr); err != nil {
+			return err
+		}
+
 		resp, err := c.ApplyConfiguration(ctx, &machineapi.ApplyConfigurationRequest{
 			Data:           data,
 			Mode:           applyCmdFlags.Mode.Mode,
@@ -258,6 +266,10 @@ func buildApplyClosure() applyFunc {
 
 		helpers.PrintApplyResults(resp)
 
+		if err := runPostApplyGate(cosiCtx, c, data, nodeID, os.Stderr); err != nil {
+			return err
+		}
+
 		return nil
 	}
 }
@@ -320,8 +332,14 @@ func applyOneFileDirectPatchMode(configFile, withSecretsPath string) error {
 		fmt.Printf("- talm: file=%s, nodes=%s, endpoints=%s\n", configFile, targetNodes, GlobalArgs.Endpoints)
 
 		read := cosiVersionReader(c)
+
 		for _, node := range targetNodes {
-			preflightCheckTalosVersion(client.WithNode(ctx, node), read, applyCmdFlags.talosVersion, os.Stderr)
+			nodeCtx := client.WithNode(ctx, node)
+			preflightCheckTalosVersion(nodeCtx, read, applyCmdFlags.talosVersion, os.Stderr)
+
+			if err := runPreApplyGates(nodeCtx, c, result, node, os.Stderr); err != nil {
+				return err
+			}
 		}
 
 		resp, err := c.ApplyConfiguration(ctx, &machineapi.ApplyConfigurationRequest{
@@ -331,15 +349,135 @@ func applyOneFileDirectPatchMode(configFile, withSecretsPath string) error {
 			TryModeTimeout: durationpb.New(applyCmdFlags.configTryTimeout),
 		})
 		if err != nil {
+			// Post-apply verify intentionally not run on this path:
+			// Talos's ApplyConfiguration is multi-node aware, so an
+			// error here doesn't pinpoint which nodes succeeded and
+			// which didn't. Surface the wrapped error so the operator
+			// can re-run apply (which will re-trigger the pre-apply
+			// gate too) — running verify on possibly-partially-applied
+			// state would produce confusing per-node divergence noise
+			// on top of the actual failure.
 			return errors.Wrap(annotateApplyConfigError(err), "applying new configuration")
 		}
 
 		helpers.PrintApplyResults(resp)
 
-		return nil
+		return runPostApplyGates(ctx, c, result, targetNodes)
 	})
 }
 
+// runPostApplyGates fans Phase 2B verification across every target
+// node, collecting per-node findings before surfacing them. Apply
+// already happened on every node — short-circuiting on the first
+// divergence would hide later nodes' state. Mirrors ValidateRefs's
+// collect-then-block pattern.
+func runPostApplyGates(ctx context.Context, c *client.Client, result []byte, targetNodes []string) error {
+	var perNodeErrs []error
+
+	for _, node := range targetNodes {
+		if err := runPostApplyGate(client.WithNode(ctx, node), c, result, node, os.Stderr); err != nil {
+			perNodeErrs = append(perNodeErrs, errors.Wrapf(err, "node %s", node))
+		}
+	}
+
+	return errors.Join(perNodeErrs...)
+}
+
+// runPreApplyGates wires the two pre-apply safety gates against the
+// rendered MachineConfig. Phase 1 (resource existence) blocks on bad
+// refs unless --skip-resource-validation is set. Phase 2A (drift
+// preview) is informational and never blocks; --skip-drift-preview
+// suppresses the read entirely.
+//
+// Phase 2A intentionally runs on --dry-run: the diff is read-only,
+// and "show me what would change" is precisely what dry-run is for.
+// Skipping it would leave operators with no way to preview drift
+// short of a real apply.
+func runPreApplyGates(ctx context.Context, c *client.Client, rendered []byte, nodeID string, w io.Writer) error {
+	if !applyCmdFlags.skipResourceValidation {
+		if err := preflightValidateResources(ctx, cosiLinksDisksReader(c), rendered, w); err != nil {
+			return err
+		}
+	}
+
+	if !shouldRunDriftPreview(applyCmdFlags.skipDriftPreview) {
+		return nil
+	}
+
+	return previewDrift(ctx, cosiMachineConfigReader(c, applyCmdFlags.insecure), rendered, nodeID, w)
+}
+
+// shouldRunDriftPreview is the testable predicate for Phase 2A
+// scheduling. Pure function for pin-testing the contract: dry-run
+// is intentionally NOT a skip reason here (the preview is read-only;
+// dry-run wants the diff). Only the explicit skip flag suppresses
+// the gate.
+func shouldRunDriftPreview(skip bool) bool {
+	return !skip
+}
+
+// runPostApplyGate wires Phase 2B (post-apply state verification).
+// Skipped on dry-run (no real apply) and on the staged/try/reboot
+// apply modes:
+//
+//   - --mode=staged stores the new config as staged; the active
+//     MachineConfig resource is unchanged until reboot, so a verify
+//     against ActiveID always reports divergence.
+//   - --mode=try applies the config but auto-rolls back after the
+//     configured timeout; verify would race against the rollback
+//     timer and produce false positives.
+//   - --mode=reboot reboots the node after ApplyConfiguration
+//     returns success; the COSI connection dies mid-verify and the
+//     reader returns a transient error, which the gate would
+//     surface as a blocker — a false positive for a successful
+//     reboot apply.
+//
+// All three modes have explicit contracts that diverge from "what
+// was sent is what is on the node now after success was reported"
+// (or guarantee the on-node state is inaccessible for verification);
+// the gate respects them.
+func runPostApplyGate(ctx context.Context, c *client.Client, sent []byte, nodeID string, w io.Writer) error {
+	if !shouldRunPostApplyVerify(applyCmdFlags.Mode.Mode, applyCmdFlags.dryRun, applyCmdFlags.skipPostApplyVerify) {
+		return nil
+	}
+
+	return verifyAppliedState(ctx, cosiMachineConfigReader(c, applyCmdFlags.insecure), sent, nodeID, w)
+}
+
+// shouldRunPostApplyVerify is the testable predicate for runPostApplyGate.
+// Returns false when the verify must be skipped for any reason listed in
+// runPostApplyGate's doc.
+func shouldRunPostApplyVerify(mode machineapi.ApplyConfigurationRequest_Mode, dryRun, skip bool) bool {
+	if skip || dryRun {
+		return false
+	}
+
+	switch mode {
+	case machineapi.ApplyConfigurationRequest_STAGED,
+		machineapi.ApplyConfigurationRequest_TRY,
+		machineapi.ApplyConfigurationRequest_REBOOT,
+		// AUTO is skipped because Talos's apply-server promotes AUTO
+		// to REBOOT internally when the change requires it (the
+		// CanApplyImmediate check inside v1alpha1_server.go's AUTO
+		// branch). The verify call would then race the reboot the
+		// node dispatches in a goroutine before returning the RPC,
+		// producing a false-positive transient-error blocker —
+		// the same shape the explicit REBOOT skip avoids.
+		//
+		// Cost: AUTO applies that DON'T require a reboot lose their
+		// post-apply verify. Acceptable trade-off: the verify is
+		// default-off until the Talos-mutated-field allowlist lands
+		// (see #172), and an operator who needs verify-on-no-reboot
+		// can pass --mode=no-reboot explicitly.
+		machineapi.ApplyConfigurationRequest_AUTO:
+		return false
+	case machineapi.ApplyConfigurationRequest_NO_REBOOT:
+		// fall through to default true.
+	}
+
+	return true
+}
+
 // withApplyClient creates a Talos client appropriate for the current apply
 // mode and invokes the given action with it. The action receives a context
 // in which gRPC node metadata is set to the resolved node list — either
@@ -531,21 +669,21 @@ func openClientPerNodeAuth(parentCtx context.Context, c *client.Client) openClie
 // invariant. Surface it as an error instead of silently passing the
 // ctx through to a COSI call that apid will reject — the latter is
 // the exact silent no-op this helper exists to prevent.
-func cosiPreflightContext(ctx context.Context) (context.Context, error) {
+func cosiPreflightContext(ctx context.Context) (context.Context, string, error) {
 	md, ok := metadata.FromOutgoingContext(ctx)
 	if !ok {
-		return ctx, nil
+		return ctx, "", nil
 	}
 
 	nodes := md.Get("nodes")
 	switch len(nodes) {
 	case 0:
-		return ctx, nil
+		return ctx, "", nil
 	case 1:
-		return client.WithNode(ctx, nodes[0]), nil
+		return client.WithNode(ctx, nodes[0]), nodes[0], nil
 	default:
 		//nolint:wrapcheck // sentinel constructed in-place; WithHint attaches operator guidance
-		return nil, errors.WithHint(
+		return nil, "", errors.WithHint(
 			errors.Newf("cosiPreflightContext: refusing to scope ctx with %d nodes; expected exactly one", len(nodes)),
 			"applyTemplatesPerNode iterates one node at a time, so a multi-element plural slice at this point indicates a broken caller",
 		)
@@ -740,6 +878,9 @@ func init() {
 	applyCmd.Flags().DurationVar(&applyCmdFlags.configTryTimeout, "timeout", constants.ConfigTryTimeout, "the config will be rolled back after specified timeout (if try mode is selected)")
 	applyCmd.Flags().StringSliceVar(&applyCmdFlags.certFingerprints, "cert-fingerprint", nil, "list of server certificate fingeprints to accept (defaults to no check)")
 	applyCmd.Flags().BoolVar(&applyCmdFlags.force, "force", false, "will overwrite existing files")
+	applyCmd.Flags().BoolVar(&applyCmdFlags.skipResourceValidation, "skip-resource-validation", false, "skip the pre-apply check that declared host resources (links, disks) exist on the target node")
+	applyCmd.Flags().BoolVar(&applyCmdFlags.skipDriftPreview, "skip-drift-preview", false, "skip the pre-apply diff of on-node vs rendered MachineConfig")
+	applyCmd.Flags().BoolVar(&applyCmdFlags.skipPostApplyVerify, "skip-post-apply-verify", true, "skip the post-apply structural verification of on-node vs sent MachineConfig (default skip until the Talos-mutated field allowlist lands; see #172)")
 	helpers.AddModeFlags(&applyCmdFlags.Mode, applyCmd)
 
 	addCommand(applyCmd)
diff --git a/pkg/commands/apply_test.go b/pkg/commands/apply_test.go
index eaff2d66..f953b5b7 100644
--- a/pkg/commands/apply_test.go
+++ b/pkg/commands/apply_test.go
@@ -737,7 +737,7 @@ func TestCosiPreflightContext_StripsPluralAndAttachesSingular(t *testing.T) {
 	const node = testNodeAddrA
 	in := client.WithNodes(context.Background(), node)
 
-	out, err := cosiPreflightContext(in)
+	out, _, err := cosiPreflightContext(in)
 	if err != nil {
 		t.Fatalf("cosiPreflightContext: %v", err)
 	}
@@ -762,7 +762,7 @@ func TestCosiPreflightContext_StripsPluralAndAttachesSingular(t *testing.T) {
 // that apid would route to the wrong target.
 func TestCosiPreflightContext_LeavesNoMetadataAlone(t *testing.T) {
 	in := context.Background()
-	out, err := cosiPreflightContext(in)
+	out, _, err := cosiPreflightContext(in)
 	if err != nil {
 		t.Fatalf("cosiPreflightContext: %v", err)
 	}
@@ -782,7 +782,7 @@ func TestCosiPreflightContext_LeavesNoMetadataAlone(t *testing.T) {
 // this helper exists to prevent on the single-node case.
 func TestCosiPreflightContext_RejectsMultiNodeCtx(t *testing.T) {
 	in := client.WithNodes(context.Background(), "a", "b")
-	_, err := cosiPreflightContext(in)
+	_, _, err := cosiPreflightContext(in)
 	if err == nil {
 		t.Fatal("expected error for multi-node outgoing ctx, got nil")
 	}
diff --git a/pkg/commands/init.go b/pkg/commands/init.go
index c463d5b9..ebcaa691 100644
--- a/pkg/commands/init.go
+++ b/pkg/commands/init.go
@@ -31,6 +31,7 @@ import (
 	"github.com/cozystack/talm/pkg/generated"
 	"github.com/cozystack/talm/pkg/secureperm"
 	"github.com/spf13/cobra"
+	"golang.org/x/term"
 	"gopkg.in/yaml.v3"
 
 	"github.com/siderolabs/talos/cmd/talosctl/cmd/mgmt/gen"
@@ -772,9 +773,14 @@ func readChartYamlPreset() (string, error) {
 // imageLineRe matches the top-level `image:` line in a preset
 // values.yaml regardless of YAML serialization style — double-quoted,
 // single-quoted, unquoted, with or without a trailing comment.
-// Line-anchored (?m)^image:…$ so a nested key, indented entry, or
-// commented `# image:` line is never substituted.
-var imageLineRe = regexp.MustCompile(`(?m)^image:.*$`)
+//
+// Line-anchored (?m)^image: requires at least one space (or end-of-
+// line) after the colon. Without the space requirement the regex
+// would also match `image:noSpaceValue`, which is not valid YAML
+// (yaml.v3 rejects key:value with no space after colon) — silently
+// rewriting a broken preset masks the underlying preset bug. With
+// the space requirement, only valid YAML key-value pairs match.
+var imageLineRe = regexp.MustCompile(`(?m)^image:(\s|$).*$`)
 
 // applyImageOverride returns values with the top-level `image:` line
 // replaced so it points at override. An empty override returns values
@@ -822,16 +828,85 @@ func applyImageOverride(values []byte, override string) ([]byte, error) {
 	}), nil
 }
 
+// validateImageRefShape rejects --image values that cannot be a
+// plausible installer image reference. The check is intentionally
+// loose — it does not vouch for the image existing in any registry
+// or for the tag being a valid Talos version — only that the value
+// has the structural shape of an OCI ref: a registry-or-path
+// component, at least one path separator, and either a ":TAG"
+// suffix or an "@sha256:" / "@sha512:" digest pin.
+//
+// Catches: bare ":malformed", "no-slash:tag" (docker-hub-style
+// shortcuts that won't resolve in the talm context), trailing
+// "/" with no tag, empty-tag colon prefixes. Misses (intentionally):
+// non-existent registries, invalid version tags, unsupported
+// platforms — those surface at apply time as expected.
+func validateImageRefShape(ref string) error {
+	if strings.HasPrefix(ref, ":") || strings.HasPrefix(ref, "@") || strings.HasPrefix(ref, "/") {
+		//nolint:wrapcheck // cockroachdb/errors.WithHint at boundary.
+		return errors.WithHint(
+			errors.Newf("--image %q is malformed: starts with a reserved separator (':' or '@' or '/')", ref),
+			"pass a full image reference such as 'ghcr.io/siderolabs/installer:v1.13.0' or 'factory.talos.dev/installer/<sha>:<version>'",
+		)
+	}
+
+	if strings.HasSuffix(ref, ":") || strings.HasSuffix(ref, "/") || strings.HasSuffix(ref, "@") {
+		//nolint:wrapcheck // cockroachdb/errors.WithHint at boundary.
+		return errors.WithHint(
+			errors.Newf("--image %q is malformed: ends with a separator with nothing after it", ref),
+			"add a tag (e.g. ':v1.13.0') or a digest pin (e.g. '@sha256:<hex>') after the last path component",
+		)
+	}
+
+	if !strings.Contains(ref, "/") {
+		//nolint:wrapcheck // cockroachdb/errors.WithHint at boundary.
+		return errors.WithHint(
+			errors.Newf("--image %q is missing a registry / path component", ref),
+			"Talos installer images are pulled by full reference — pass at least 'registry/path:tag', not a bare 'name:tag' shortcut",
+		)
+	}
+
+	lastSlash := strings.LastIndex(ref, "/")
+	tail := ref[lastSlash+1:]
+
+	hasTag := false
+	if idx := strings.LastIndex(tail, ":"); idx > 0 && idx < len(tail)-1 {
+		hasTag = true
+	}
+
+	hasDigest := strings.Contains(ref, "@sha256:") || strings.Contains(ref, "@sha512:")
+
+	if !hasTag && !hasDigest {
+		//nolint:wrapcheck // cockroachdb/errors.WithHint at boundary.
+		return errors.WithHint(
+			errors.Newf("--image %q has no tag or digest", ref),
+			"append ':<version>' (e.g. ':v1.13.0') or '@sha256:<hex>' to pin the installer build",
+		)
+	}
+
+	return nil
+}
+
 // validateImageOverride scans presetFiles for the chosen preset's
 // values.yaml and confirms a top-level image line is present when
 // the user passed --image. The check runs before any file is written
 // so a flag-vs-preset mismatch fails the command up front instead of
 // leaving a half-initialized project on disk.
+//
+// Also performs a basic shape check on the override itself —
+// malformed values like "::malformed" or "no-slash:tag" are rejected
+// here, instead of being silently written to values.yaml and only
+// surfacing on the next talm template / apply call deep inside
+// configloader.NewFromBytes.
 func validateImageOverride(presetFiles map[string]string, presetName, override string) error {
 	if override == "" {
 		return nil
 	}
 
+	if err := validateImageRefShape(override); err != nil {
+		return err
+	}
+
 	for path, content := range presetFiles {
 		parts := strings.SplitN(path, "/", 2)
 		if len(parts) != 2 || parts[0] != presetName {
@@ -860,8 +935,51 @@ func validateImageOverride(presetFiles map[string]string, presetName, override s
 	)
 }
 
-// askUserOverwrite asks user if they want to overwrite a file.
-func askUserOverwrite(filePath string) (bool, error) {
+// overwritePolicy describes how a single update-time file conflict
+// should be resolved. The decision is made once per call so the unit
+// tests (and the --force gate) can short-circuit the interactive
+// prompt deterministically.
+type overwritePolicy int
+
+const (
+	// overwritePolicyAsk prompts the user on stdin; the historical
+	// behaviour, used when stdin is a real tty and --force is not set.
+	overwritePolicyAsk overwritePolicy = iota
+	// overwritePolicyForce always accepts the overwrite. Used when
+	// --force is set; the operator opted in.
+	overwritePolicyForce
+	// overwritePolicyNonInteractive blocks the call with a hint to
+	// rerun under a tty or with --force. Used when stdin is not a tty
+	// and --force is not set — distinguishes "operator declined" from
+	// "talm couldn't even ask" so a scripted refresh fails loudly
+	// instead of silently leaving the project on a stale preset.
+	overwritePolicyNonInteractive
+)
+
+// stdinIsTTY reports whether process stdin is connected to a
+// terminal. Var-typed so the unit tests can swap a fake.
+//
+// term.IsTerminal correctly returns false for /dev/null and pipes —
+// the naive os.Stdin.Stat()&ModeCharDevice check accepted /dev/null
+// (it's a character device) and led the previous version to prompt
+// in cron / scripted shells, EOFing the read.
+//
+//nolint:gochecknoglobals // injection seam for testability; matches stdinReader below.
+var stdinIsTTY = func() bool {
+	return term.IsTerminal(int(os.Stdin.Fd()))
+}
+
+// stdinReader is the io.Reader the interactive prompt reads from.
+// Var-typed so unit tests can supply canned input.
+//
+//nolint:gochecknoglobals // injection seam for testability.
+var stdinReader io.Reader = os.Stdin
+
+// askUserOverwrite resolves a single update-time file conflict
+// according to overwritePolicy. Returns (true, nil) to accept the
+// overwrite, (false, nil) to skip, or an error when policy is
+// non-interactive (with a hint pointing the operator at --force).
+func askUserOverwrite(filePath string, policy overwritePolicy) (bool, error) {
 	// Show relative path from project root
 	relPath, err := filepath.Rel(Config.RootDir, filePath)
 	if err != nil {
@@ -869,7 +987,22 @@ func askUserOverwrite(filePath string) (bool, error) {
 		relPath = filePath
 	}
 
-	reader := bufio.NewReader(os.Stdin)
+	switch policy {
+	case overwritePolicyForce:
+		fmt.Fprintf(os.Stderr, "Overwriting %s (--force)\n", relPath)
+
+		return true, nil
+	case overwritePolicyNonInteractive:
+		//nolint:wrapcheck // cockroachdb/errors.WithHint at boundary.
+		return false, errors.WithHint(
+			errors.Newf("file %q differs from the preset template, but talm is running non-interactively and cannot prompt for confirmation", relPath),
+			"rerun under a tty to confirm interactively, or pass --force to accept all preset-template overwrites.",
+		)
+	case overwritePolicyAsk:
+		// fall through to interactive prompt below
+	}
+
+	reader := bufio.NewReader(stdinReader)
 
 	fmt.Fprintf(os.Stderr, "File %s differs from template. Overwrite? [y/N]: ", relPath)
 
@@ -883,6 +1016,19 @@ func askUserOverwrite(filePath string) (bool, error) {
 	return response == "y" || response == "yes", nil
 }
 
+// resolveOverwritePolicy picks the policy for the current invocation
+// from the (--force, tty) inputs. Pure function for testability.
+func resolveOverwritePolicy(force, isTTY bool) overwritePolicy {
+	switch {
+	case force:
+		return overwritePolicyForce
+	case !isTTY:
+		return overwritePolicyNonInteractive
+	}
+
+	return overwritePolicyAsk
+}
+
 // filesDiffer checks if two files have different content.
 func filesDiffer(filePath string, newContent []byte) (bool, error) {
 	existingContent, err := os.ReadFile(filePath)
@@ -898,46 +1044,49 @@ func filesDiffer(filePath string, newContent []byte) (bool, error) {
 	return !bytes.Equal(existingContent, newContent), nil
 }
 
-// updateFileWithConfirmation updates a file if it differs, asking user for confirmation.
-func updateFileWithConfirmation(filePath string, newContent []byte, permissions os.FileMode) error {
-	// Check if file exists
-	exists := fileExists(filePath)
-
-	if !exists {
-		// File doesn't exist, create it without asking
-		parentDir := filepath.Dir(filePath)
-		if err := os.MkdirAll(parentDir, os.ModePerm); err != nil {
-			return errors.Wrap(err, "failed to create output dir")
-		}
+// createReportedFile writes a fresh file (parent dir created as
+// needed) and prints a one-line `Created path/to/file` summary so
+// the operator sees what the update added. Shared between the
+// initial-create and post-overwrite branches.
+func createReportedFile(filePath string, newContent []byte, permissions os.FileMode) error {
+	parentDir := filepath.Dir(filePath)
+	if err := os.MkdirAll(parentDir, os.ModePerm); err != nil {
+		return errors.Wrap(err, "failed to create output dir")
+	}
 
-		if err := os.WriteFile(filePath, newContent, permissions); err != nil {
-			return errors.Wrap(err, "failed to write file")
-		}
+	if err := os.WriteFile(filePath, newContent, permissions); err != nil {
+		return errors.Wrap(err, "failed to write file")
+	}
 
-		// Show relative path from project root
-		relPath, err := filepath.Rel(Config.RootDir, filePath)
-		if err != nil {
-			relPath = filePath
-		}
+	relPath, err := filepath.Rel(Config.RootDir, filePath)
+	if err != nil {
+		relPath = filePath
+	}
 
-		fmt.Fprintf(os.Stderr, "%s %s\n", reportVerbCreated, relPath)
+	fmt.Fprintf(os.Stderr, "%s %s\n", reportVerbCreated, relPath)
 
-		return nil
+	return nil
+}
+
+// updateFileWithConfirmation updates a file if it differs, asking user for confirmation.
+func updateFileWithConfirmation(filePath string, newContent []byte, permissions os.FileMode) error {
+	if !fileExists(filePath) {
+		return createReportedFile(filePath, newContent, permissions)
 	}
 
-	// File exists, check if content differs
 	differs, err := filesDiffer(filePath, newContent)
 	if err != nil {
 		return err
 	}
 
 	if !differs {
-		// File is the same, skip silently
 		return nil
 	}
 
-	// File differs, ask user
-	overwrite, err := askUserOverwrite(filePath)
+	// File differs — pick the policy from (--force, tty) and resolve.
+	policy := resolveOverwritePolicy(initCmdFlags.force, stdinIsTTY())
+
+	overwrite, err := askUserOverwrite(filePath, policy)
 	if err != nil {
 		return errors.Wrap(err, "failed to read user input")
 	}
@@ -1109,7 +1258,7 @@ func init() {
 	initCmd.Flags().StringVarP(&initCmdFlags.preset, "preset", "p", "", "preset for file generation (not required with --encrypt, --decrypt, or --update)")
 	initCmd.Flags().StringVarP(&initCmdFlags.name, "name", "N", "", "cluster name (not required with --encrypt, --decrypt, or --update)")
 	initCmd.Flags().StringVar(&initCmdFlags.image, "image", "", "override the Talos installer image written to the preset's values.yaml (e.g. factory.talos.dev/installer/<sha256>:<version>)")
-	initCmd.Flags().BoolVar(&initCmdFlags.force, "force", false, "will overwrite existing files")
+	initCmd.Flags().BoolVar(&initCmdFlags.force, "force", false, "overwrite existing files; on --update also auto-accepts every preset-template diff without the interactive prompt")
 	initCmd.Flags().BoolVarP(&initCmdFlags.update, "update", "u", false, "update Talm library chart")
 	// Override persistent -e flag for init command to use for encrypt
 	// Remove the persistent endpoints flag from init command and add our own -e flag
diff --git a/pkg/commands/init_overwrite_test.go b/pkg/commands/init_overwrite_test.go
new file mode 100644
index 00000000..cc0662e9
--- /dev/null
+++ b/pkg/commands/init_overwrite_test.go
@@ -0,0 +1,140 @@
+// Copyright Cozystack Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package commands
+
+import (
+	"strings"
+	"testing"
+
+	"github.com/cockroachdb/errors"
+)
+
+// TestResolveOverwritePolicy pins the (force, isTTY) -> policy
+// matrix that drives askUserOverwrite. The whole point is to make
+// the non-tty case fail loudly rather than silently leave the
+// project on a stale preset, AND to honor --force as the documented
+// scripted-refresh escape hatch.
+func TestResolveOverwritePolicy(t *testing.T) {
+	t.Parallel()
+
+	tests := []struct {
+		name  string
+		force bool
+		isTTY bool
+		want  overwritePolicy
+	}{
+		{"force + tty -> force", true, true, overwritePolicyForce},
+		{"force + non-tty -> force", true, false, overwritePolicyForce},
+		{"no force + tty -> ask", false, true, overwritePolicyAsk},
+		{"no force + non-tty -> non-interactive", false, false, overwritePolicyNonInteractive},
+	}
+
+	for _, tc := range tests {
+		t.Run(tc.name, func(t *testing.T) {
+			t.Parallel()
+
+			got := resolveOverwritePolicy(tc.force, tc.isTTY)
+			if got != tc.want {
+				t.Errorf("resolveOverwritePolicy(force=%v, tty=%v) = %v, want %v", tc.force, tc.isTTY, got, tc.want)
+			}
+		})
+	}
+}
+
+// TestAskUserOverwrite_ForcePolicy pins the contract: under
+// overwritePolicyForce the prompt is bypassed entirely and the
+// answer is unconditionally yes. The operator opted into this by
+// passing --force; no stdin read happens.
+func TestAskUserOverwrite_ForcePolicy(t *testing.T) {
+	t.Parallel()
+
+	ok, err := askUserOverwrite("/tmp/anything", overwritePolicyForce)
+	if err != nil {
+		t.Fatalf("force policy should not error, got %v", err)
+	}
+
+	if !ok {
+		t.Errorf("force policy must return true, got false")
+	}
+}
+
+// TestAskUserOverwrite_NonInteractivePolicy pins the contract: when
+// running non-interactively without --force, the call returns a
+// hint-bearing error pointing operators at --force. The previous
+// behaviour was a raw stdin EOF wrapped in a less actionable message.
+func TestAskUserOverwrite_NonInteractivePolicy(t *testing.T) {
+	t.Parallel()
+
+	ok, err := askUserOverwrite("/tmp/anything", overwritePolicyNonInteractive)
+	if err == nil {
+		t.Fatal("non-interactive policy must error, got nil")
+	}
+
+	if ok {
+		t.Errorf("non-interactive policy must return false, got true")
+	}
+
+	hints := errors.GetAllHints(err)
+	if len(hints) == 0 {
+		t.Errorf("expected a hint pointing at --force, got none")
+	}
+
+	hintsStr := strings.Join(hints, " ")
+	if !strings.Contains(hintsStr, "--force") {
+		t.Errorf("hint should mention --force, got %q", hintsStr)
+	}
+}
+
+// TestAskUserOverwrite_AskPolicy_YesResponse pins the interactive
+// path: under the ask policy, stdin is consulted; a `y` answer
+// returns true, anything else returns false. stdinReader is swapped
+// for a strings.Reader so the test doesn't need a real tty.
+func TestAskUserOverwrite_AskPolicy_YesResponse(t *testing.T) {
+	originalReader := stdinReader
+	t.Cleanup(func() { stdinReader = originalReader })
+
+	stdinReader = strings.NewReader("y\n")
+
+	ok, err := askUserOverwrite("/tmp/anything", overwritePolicyAsk)
+	if err != nil {
+		t.Fatalf("ask policy returned error: %v", err)
+	}
+
+	if !ok {
+		t.Errorf("answer 'y' must return true, got false")
+	}
+}
+
+// TestAskUserOverwrite_AskPolicy_NoResponse pins the inverse: every
+// answer that isn't 'y' / 'yes' returns false (skip). Default-no
+// is the conservative behaviour the previous prompt promised via
+// the `[y/N]` suffix.
+func TestAskUserOverwrite_AskPolicy_NoResponse(t *testing.T) {
+	originalReader := stdinReader
+	t.Cleanup(func() { stdinReader = originalReader })
+
+	for _, response := range []string{"\n", "n\n", "no\n", "garbage\n"} {
+		stdinReader = strings.NewReader(response)
+
+		ok, err := askUserOverwrite("/tmp/anything", overwritePolicyAsk)
+		if err != nil {
+			t.Errorf("response %q returned unexpected error: %v", response, err)
+		}
+
+		if ok {
+			t.Errorf("response %q must return false, got true", response)
+		}
+	}
+}
diff --git a/pkg/commands/init_test.go b/pkg/commands/init_test.go
index 337f2918..7eadf16a 100644
--- a/pkg/commands/init_test.go
+++ b/pkg/commands/init_test.go
@@ -175,6 +175,38 @@ podSubnets:
 		}
 	})
 
+	t.Run("rejects image: without a space after the colon", func(t *testing.T) {
+		// image:noSpace and image:foo are not valid YAML
+		// (yaml.v3 rejects key:value with no space). The regex
+		// must NOT match these, so a broken preset surfaces as a
+		// "no image: field" error from validateImageOverride
+		// rather than silently getting rewritten.
+		broken := []string{
+			"image:noSpace\n",
+			"image:ghcr.io/foo:v1\n",
+		}
+		for _, input := range broken {
+			_, err := applyImageOverride([]byte(input), "factory.talos.dev/installer/abc:v1.13.0")
+			if err == nil {
+				t.Errorf("expected an error on broken YAML input %q (no space after colon), got none", input)
+			}
+		}
+	})
+
+	t.Run("matches image: with bare colon (empty value, valid YAML)", func(t *testing.T) {
+		// `image:` followed by end-of-line is a YAML key with
+		// nil value — valid YAML. The regex MUST still match
+		// so --image can populate the empty preset slot.
+		got, err := applyImageOverride([]byte("image:\n"), "factory.talos.dev/installer/abc:v1.13.0")
+		if err != nil {
+			t.Fatalf("expected the empty-value form to be rewritable, got: %v", err)
+		}
+
+		if !bytes.Contains(got, []byte(`image: "factory.talos.dev/installer/abc:v1.13.0"`)) {
+			t.Errorf("empty-value form should be rewritten by --image, got:\n%s", got)
+		}
+	})
+
 	t.Run("supports unquoted, single-quoted, and trailing-comment styles", func(t *testing.T) {
 		styles := []struct {
 			name string
@@ -318,6 +350,71 @@ func TestUpdateTalmLibraryChartRejectsImageFlag(t *testing.T) {
 	}
 }
 
+// TestValidateImageRefShape_RejectsMalformed pins the shape check
+// that runs before any --image value reaches the preset rewrite.
+// Catches operator typos (::malformed, no-slash:tag, trailing
+// separator) at command-parse time, instead of leaving a corrupt
+// values.yaml that fails deep inside configloader on the next
+// `talm template` / `talm apply`.
+func TestValidateImageRefShape_RejectsMalformed(t *testing.T) {
+	tests := []struct {
+		name string
+		ref  string
+		want bool
+	}{
+		{"empty colon-prefix", "::malformed", false},
+		{"leading colon", ":foo/bar:tag", false},
+		{"leading at", "@sha256:abc", false},
+		{"leading slash", "/foo/bar:tag", false},
+		{"trailing colon", "ghcr.io/foo/bar:", false},
+		{"trailing slash", "ghcr.io/foo/bar/", false},
+		{"trailing at", "ghcr.io/foo/bar@", false},
+		{"no slash separator", "no-slash:tag", false},
+		{"slash but no tag and no digest", "ghcr.io/foo/bar", false},
+		{"slash with empty colon at end", "ghcr.io/foo/bar:", false},
+		{"valid tagged ref", "ghcr.io/siderolabs/installer:v1.13.0", true},
+		{"valid digest pin sha256", "ghcr.io/foo/bar@sha256:abc123def456", true},
+		{"valid digest pin sha512", "ghcr.io/foo/bar@sha512:0123456789abcdef", true},
+		{"valid factory ref", "factory.talos.dev/installer/abcd1234:v1.13.0", true},
+		{"valid registry with port", "registry.local:5000/foo/installer:v1.12.6", true},
+		{"valid cozystack ref", "ghcr.io/cozystack/cozystack/talos:v1.12.6", true},
+	}
+
+	for _, tc := range tests {
+		t.Run(tc.name, func(t *testing.T) {
+			err := validateImageRefShape(tc.ref)
+			if tc.want && err != nil {
+				t.Errorf("validateImageRefShape(%q) = %v, want nil", tc.ref, err)
+			}
+
+			if !tc.want && err == nil {
+				t.Errorf("validateImageRefShape(%q) = nil, want error", tc.ref)
+			}
+		})
+	}
+}
+
+// TestValidateImageOverride_RejectsMalformedAtPresetLevel pins the
+// integration: malformed --image values are surfaced as a hint-
+// bearing error from validateImageOverride too, BEFORE the preset
+// existence check (so the user sees the malformed-ref signal even
+// on a typo'd preset name).
+func TestValidateImageOverride_RejectsMalformedAtPresetLevel(t *testing.T) {
+	preset := map[string]string{
+		"good/values.yaml": "image: \"original\"\n",
+		"good/Chart.yaml":  "name: good\n",
+	}
+
+	err := validateImageOverride(preset, "good", "::malformed")
+	if err == nil {
+		t.Fatal("expected an error on malformed --image")
+	}
+
+	if !strings.Contains(err.Error(), "malformed") {
+		t.Errorf("error should cite the offending value, got: %v", err)
+	}
+}
+
 // TestValidateImageOverride pins the up-front mismatch detection that
 // runs in initCmd.RunE before any file is written: --image must be
 // rejected when the chosen preset has no top-level image: field, so
diff --git a/pkg/commands/preflight.go b/pkg/commands/preflight.go
index 33585f3e..b5870abb 100644
--- a/pkg/commands/preflight.go
+++ b/pkg/commands/preflight.go
@@ -68,20 +68,32 @@ func annotateApplyConfigError(err error) error {
 	return errors.WithHint(err, applyConfigDecodeHint)
 }
 
-// versionReader fetches the running Talos version from a node. It returns
-// ok=false on any error so callers can treat the result as best-effort. The
-// signature is what makes preflightCheckTalosVersion testable without a live
-// COSI server.
-type versionReader func(ctx context.Context) (version string, ok bool)
+// versionReader fetches the running Talos version from a node. Three-valued
+// return mirrors the linksDisksReader / machineConfigReader contract:
+//
+//   - (version, true,  nil) — success
+//   - ("",      false, err) — real read failure (network, RPC, COSI server)
+//   - ("",      false, nil) — by-design unreachable (auth-disallowed, etc.)
+//
+// Callers decide what each combination means. preflightCheckTalosVersion's
+// pre-apply warning silently surrenders on any !ok regardless of err
+// (best-effort warning). verifyPostUpgradeVersion treats err!=nil as a real
+// read failure that IS the rollback signal — exactly what the gate exists to
+// catch — and surfaces it as a hint-bearing blocker.
+type versionReader func(ctx context.Context) (version string, ok bool, err error)
 
 // cosiVersionReader returns a versionReader that reads the Talos version from
 // the node's COSI `Versions.runtime.talos.dev/runtime/version` resource. The
 // resource is declared NonSensitive in Talos and is therefore reachable
 // through a maintenance (--insecure) connection that only carries the Reader
-// role. Any read failure (RPC error, NotFound, PermissionDenied, multi-node
-// proxy error) is reported as ok=false.
+// role.
+//
+// Any read failure (RPC error, NotFound, PermissionDenied, multi-node proxy
+// error, connection refused) flows through as (false, wrapped err). Callers
+// that need to distinguish a real failure from a no-result case can inspect
+// the err.
 func cosiVersionReader(c *client.Client) versionReader {
-	return func(ctx context.Context) (string, bool) {
+	return func(ctx context.Context) (string, bool, error) {
 		ctx, cancel := context.WithTimeout(ctx, preflightCOSIReadTimeout)
 		defer cancel()
 
@@ -91,10 +103,10 @@ func cosiVersionReader(c *client.Client) versionReader {
 			resource.NewMetadata(runtime.NamespaceName, runtime.VersionType, "version", resource.VersionUndefined),
 		)
 		if err != nil {
-			return "", false
+			return "", false, errors.Wrap(err, "reading runtime.Version COSI resource")
 		}
 
-		return res.TypedSpec().Version, true
+		return res.TypedSpec().Version, true, nil
 	}
 }
 
@@ -108,7 +120,12 @@ func cosiVersionReader(c *client.Client) versionReader {
 // aggressive contract — this is the documented reproduction case for the
 // "unknown keys found during decoding" error.
 func preflightCheckTalosVersion(ctx context.Context, read versionReader, configuredVersion string, w io.Writer) {
-	runningVersion, ok := read(ctx)
+	// Pre-apply warning is best-effort: silent surrender on any !ok
+	// regardless of err. The err is captured but intentionally
+	// discarded — if reading the runtime.Version fails, the apply call
+	// that follows will surface the underlying connection issue with a
+	// clearer message than this gate could.
+	runningVersion, ok, _ := read(ctx)
 	if !ok {
 		return
 	}
diff --git a/pkg/commands/preflight_apply_safety.go b/pkg/commands/preflight_apply_safety.go
new file mode 100644
index 00000000..8bf21025
--- /dev/null
+++ b/pkg/commands/preflight_apply_safety.go
@@ -0,0 +1,618 @@
+// Copyright Cozystack Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package commands
+
+import (
+	"context"
+	"fmt"
+	"io"
+	"strings"
+	"time"
+
+	"github.com/cockroachdb/errors"
+	"github.com/cosi-project/runtime/pkg/resource"
+	"github.com/cosi-project/runtime/pkg/safe"
+	"github.com/cozystack/talm/pkg/applycheck"
+	"github.com/siderolabs/talos/pkg/machinery/client"
+	"github.com/siderolabs/talos/pkg/machinery/resources/block"
+	"github.com/siderolabs/talos/pkg/machinery/resources/config"
+	"github.com/siderolabs/talos/pkg/machinery/resources/network"
+	yaml "gopkg.in/yaml.v3"
+)
+
+// linksDisksReader returns the host's link and disk inventory used by
+// Phase 1 validation. The reader is a function type so tests can supply
+// fakes without standing up a Talos client.
+//
+// Three-valued return mirrors machineConfigReader so the validator can
+// distinguish:
+//
+//   - (snapshot, true, nil) — success.
+//   - (zero, false, nil) — reader signalled "no result, no error".
+//     The link / disk COSI resources are NonSensitive, so this shape
+//     should not happen in production via the COSI reader; reserved
+//     for future custom readers that genuinely want to surrender
+//     without an err. The validator treats it the same as the (zero,
+//     false, err) branch — surfaces as a hint-bearing blocker —
+//     because Phase 1 cannot make a useful declared-resource decision
+//     without a snapshot in hand.
+//   - (zero, false, err) — transient read failure (apid timeout,
+//     network blip). The validator wraps and surfaces it so the
+//     operator sees the actual cause, not a misleading "config is
+//     wrong" blocker.
+//
+// The three-valued signature mirrors machineConfigReader to keep the
+// test-fake surface unified, even though only the (zero, false, err)
+// branch is exercised in production.
+type linksDisksReader func(ctx context.Context) (snapshot applycheck.HostSnapshot, ok bool, err error)
+
+// machineConfigReader returns the current on-node MachineConfig bytes
+// used by Phase 2 (drift preview + post-apply verify). The MachineConfig
+// COSI resource is declared meta.Sensitive in Talos, so the auth path
+// can read it but the insecure / maintenance connection cannot —
+// ok=false on that path is the documented graceful-degrade signal,
+// distinct from a transient read failure (err non-nil).
+type machineConfigReader func(ctx context.Context) (configBytes []byte, ok bool, err error)
+
+// maintenanceConnectionMessage is the single line printed when the
+// drift gates run against a connection that cannot read the Sensitive
+// MachineConfig resource (insecure / maintenance mode). Both Phase 2
+// hooks surface this and proceed without blocking.
+const maintenanceConnectionMessage = "drift verification unavailable on maintenance connection"
+
+// preflightValidateResources runs Phase 1: extract host-resource references
+// from the rendered MachineConfig and verify each against the node's
+// snapshot. Returns nil when there are no blockers (warnings are printed
+// but do not block). Returns an error when there is at least one
+// blocker; the error message names every blocker so a single rerun
+// surfaces every problem.
+func preflightValidateResources(
+	ctx context.Context,
+	read linksDisksReader,
+	rendered []byte,
+	w io.Writer,
+) error {
+	snapshot, ok, err := read(ctx)
+	if err != nil {
+		// Transient COSI read failure (apid timeout, network blip).
+		// Surface the underlying cause so the operator doesn't get
+		// a misleading "your config is wrong, pass --skip-...".
+		//nolint:wrapcheck // cockroachdb/errors.WithHint at boundary.
+		return errors.WithHint(
+			errors.Wrap(err, "pre-flight: reading host links/disks snapshot from the node"),
+			"this is a node-side connection / COSI error, not a config defect. Retry, fix connectivity, or pass --skip-resource-validation to bypass the gate.",
+		)
+	}
+
+	if !ok {
+		// Single message via the error + hint chain. An earlier
+		// version printed a duplicate warning line to w, so the
+		// operator saw --skip-resource-validation mentioned twice.
+		//nolint:wrapcheck // cockroachdb/errors.WithHint at boundary.
+		return errors.WithHint(
+			errors.New("pre-flight: host links/disks snapshot unavailable"),
+			"the node didn't surface its links/disks resources — typically the auth path is wrong. Pass --skip-resource-validation to bypass.",
+		)
+	}
+
+	refs, err := applycheck.WalkRefs(rendered)
+	if err != nil {
+		return errors.Wrap(err, "pre-flight: walking rendered MachineConfig")
+	}
+
+	findings := applycheck.ValidateRefs(refs, snapshot)
+	if len(findings) == 0 {
+		return nil
+	}
+
+	blockers := 0
+
+	for i := range findings {
+		f := &findings[i]
+		printFinding(w, f)
+
+		if f.IsBlocker() {
+			blockers++
+		}
+	}
+
+	if blockers == 0 {
+		return nil
+	}
+
+	//nolint:wrapcheck // cockroachdb/errors.WithHint at boundary.
+	return errors.WithHint(
+		errors.Newf("pre-flight: %d declared host resource(s) do not resolve on the target node", blockers),
+		"correct the values referenced above, or pass --skip-resource-validation to bypass.",
+	)
+}
+
+// printFinding writes one finding line plus its hint to w. Output shape:
+//
+//	[blocker] declared link "eth9999" not found on target node
+//	  hint: available links: eth0, eth1
+func printFinding(w io.Writer, finding *applycheck.Finding) {
+	tag := "blocker"
+	if !finding.IsBlocker() {
+		tag = "warning"
+	}
+
+	_, _ = fmt.Fprintf(w, "[%s] %s (source: %s)\n", tag, finding.Reason, finding.Ref.Source)
+
+	if finding.Hint != "" {
+		_, _ = fmt.Fprintf(w, "  hint: %s\n", finding.Hint)
+	}
+}
+
+// previewDrift runs Phase 2A: read the node's current MachineConfig and
+// diff against the rendered config we're about to apply, then print a
+// +/-/~/= preview to w. Returns nil unconditionally — pre-apply preview
+// is informational. On the insecure / maintenance path the MachineConfig
+// resource is Sensitive and unreachable, so the reader returns
+// ok=false; the function prints one explanatory line and returns nil.
+//
+// nodeID is the per-node identifier (typically an IP) used to prefix
+// gate output lines on a multi-node apply so the operator can tell
+// whose diff is whose. Pass "" when the call site has no per-node
+// scope (template-rendering path with a single implicit node).
+func previewDrift(
+	ctx context.Context,
+	read machineConfigReader,
+	rendered []byte,
+	nodeID string,
+	w io.Writer,
+) error {
+	current, ok, err := read(ctx)
+	if err != nil {
+		_, _ = fmt.Fprintf(w, "%swarning: drift preview skipped, could not read on-node MachineConfig: %v\n", nodePrefix(nodeID), err)
+
+		return nil
+	}
+
+	if !ok {
+		_, _ = fmt.Fprintln(w, "talm:", maintenanceConnectionMessage)
+
+		return nil
+	}
+
+	changes, err := applycheck.Diff(current, rendered)
+	if err != nil {
+		_, _ = fmt.Fprintf(w, "%swarning: drift preview skipped, diff failed: %v\n", nodePrefix(nodeID), err)
+
+		return nil
+	}
+
+	printDriftPreview(w, headerWithNode("talm: drift preview", nodeID), changes)
+
+	return nil
+}
+
+// nodePrefix returns "node X: " for non-empty nodeID, empty otherwise.
+// Used at start of stderr lines to disambiguate multi-node output.
+func nodePrefix(nodeID string) string {
+	if nodeID == "" {
+		return ""
+	}
+
+	return "node " + nodeID + ": "
+}
+
+// headerWithNode appends "(node X)" to a base header when nodeID is
+// non-empty, leaving the bare header unchanged otherwise. Used by
+// printDriftPreview so multi-node Phase 2A/2B output is per-node-
+// distinguishable on stderr.
+func headerWithNode(base, nodeID string) string {
+	if nodeID == "" {
+		return base
+	}
+
+	return base + " (node " + nodeID + ")"
+}
+
+// verifyAppliedState runs Phase 2B: after ApplyConfiguration returns
+// success, re-read the node's MachineConfig and structurally compare
+// against the bytes we sent. Returns:
+//
+//   - nil if everything matches.
+//   - nil with an informational line when ok=false (insecure /
+//     maintenance path; MachineConfig is Sensitive there).
+//   - a hint-bearing error on actual divergence.
+//   - a hint-bearing error when err != nil from the reader: the verify
+//     gate exists specifically to catch silent rollbacks, so a read
+//     failure has to surface, not be swallowed.
+func verifyAppliedState(
+	ctx context.Context,
+	read machineConfigReader,
+	sent []byte,
+	nodeID string,
+	w io.Writer,
+) error {
+	onNode, ok, err := read(ctx)
+	if err != nil {
+		//nolint:wrapcheck // cockroachdb/errors.WithHint at boundary.
+		return errors.WithHint(
+			errors.Wrap(err, "post-apply: re-reading on-node MachineConfig"),
+			"the apply succeeded but verification couldn't read the node back — pass --skip-post-apply-verify to bypass, or investigate the underlying COSI read error.",
+		)
+	}
+
+	if !ok {
+		_, _ = fmt.Fprintln(w, "talm:", maintenanceConnectionMessage)
+
+		return nil
+	}
+
+	changes, err := applycheck.Diff(onNode, sent)
+	if err != nil {
+		return errors.Wrap(err, "post-apply: structural diff of on-node vs sent config")
+	}
+
+	changed := applycheck.FilterChanged(changes)
+	if len(changed) == 0 {
+		return nil
+	}
+
+	printDriftPreview(w, headerWithNode("talm: post-apply divergence", nodeID), changes)
+
+	//nolint:wrapcheck // cockroachdb/errors.WithHint at boundary.
+	return errors.WithHint(
+		errors.Newf("post-apply: %d document(s) on the node diverge from the configuration sent to the node", len(changed)),
+		"check that no controller / extension is reverting state, and that the talosVersion contract matches the running Talos.",
+	)
+}
+
+// printDriftPreview renders a Change list with a header label. OpEqual
+// entries are dropped from the per-line listing but counted in the
+// trailing summary so the reader can confirm the diff against expected
+// scope.
+func printDriftPreview(w io.Writer, header string, changes []applycheck.Change) {
+	_, _ = fmt.Fprintln(w, header)
+
+	var adds, removes, updates, equals int
+
+	for i := range changes {
+		change := &changes[i]
+		switch change.Op {
+		case applycheck.OpEqual:
+			equals++
+
+			continue
+		case applycheck.OpAdd:
+			adds++
+		case applycheck.OpRemove:
+			removes++
+		case applycheck.OpUpdate:
+			updates++
+		}
+
+		_, _ = fmt.Fprintln(w, applycheck.FormatChange(change))
+
+		for j := range change.Fields {
+			f := &change.Fields[j]
+			_, _ = fmt.Fprintf(w, "      %s\n", formatFieldChangeLine(f))
+		}
+	}
+
+	_, _ = fmt.Fprintf(w, "talm: %d addition, %d removal, %d update, %d unchanged.\n", adds, removes, updates, equals)
+}
+
+// formatFieldChangeLine renders one FieldChange entry for the drift
+// preview. The default form is "path: old -> new"; the slice-vs-slice
+// case takes a set-diff fast path so a 50-element certSANs update
+// surfaces as "path: removed [127.0.0.1]" instead of dumping both
+// slices in full. The set-diff is multiset-aware (duplicate-cleanup
+// surfaces correctly) and handles the equal-multiset reorder case
+// with an explicit "(reordered, N element(s))" line so the operator
+// isn't left wondering why an OpUpdate fired with no apparent change.
+func formatFieldChangeLine(f *applycheck.FieldChange) string {
+	if oldSlice, newSlice, ok := bothSlices(f); ok {
+		return formatSliceSetDiff(f.Path, oldSlice, newSlice)
+	}
+
+	return fmt.Sprintf("%s: %s -> %s", f.Path, formatFieldValue(f.HasOld, f.Old), formatFieldValue(f.HasNew, f.New))
+}
+
+// bothSlices returns the two sides as []any when the FieldChange
+// represents a slice-to-slice update. Either side being absent or a
+// non-slice type returns ok=false; callers fall back to the inline
+// flow-style renderer.
+func bothSlices(change *applycheck.FieldChange) ([]any, []any, bool) {
+	if !change.HasOld || !change.HasNew {
+		return nil, nil, false
+	}
+
+	oldSlice, okOld := change.Old.([]any)
+	newSlice, okNew := change.New.([]any)
+
+	if !okOld || !okNew {
+		return nil, nil, false
+	}
+
+	return oldSlice, newSlice, true
+}
+
+// formatSliceSetDiff renders the multiset difference between two
+// slices in the form "path: removed [...], added [...]". Buckets
+// that are empty are omitted; an empty diff with non-empty input
+// surfaces as "path: reordered (N element(s))" — telling the
+// operator the slice's element identity is unchanged.
+func formatSliceSetDiff(path string, oldSlice, newSlice []any) string {
+	removed, added := multisetDiff(oldSlice, newSlice)
+
+	switch {
+	case len(removed) == 0 && len(added) == 0:
+		return fmt.Sprintf("%s: reordered (%d element(s))", path, len(oldSlice))
+	case len(removed) == 0:
+		return fmt.Sprintf("%s: added %s", path, mustRenderFlow(added))
+	case len(added) == 0:
+		return fmt.Sprintf("%s: removed %s", path, mustRenderFlow(removed))
+	default:
+		return fmt.Sprintf("%s: removed %s, added %s", path, mustRenderFlow(removed), mustRenderFlow(added))
+	}
+}
+
+// multisetDiff returns the elements present in oldSlice but not in
+// newSlice (removed) and present in newSlice but not in oldSlice
+// (added) using multiset semantics. Duplicates count individually:
+// removing one of two copies of "127.0.0.1" surfaces as a single
+// removal, not as the whole slice changing.
+//
+// Equality is keyed via fmt's %v so any hashable-or-not element type
+// has a stable canonical form for the count map. The `fmt` package
+// sorts map keys when printing — so map[string]any, map[int]any, and
+// map[any]any all produce deterministic output across runs, and a
+// slice of map elements (e.g., `machine.network.interfaces[]` in
+// v1.11 nested form) dedups correctly. The behaviour is documented
+// in the `fmt` package, not the language spec, so a future stdlib
+// change could break it — but that would break far more things than
+// this differ, and the change would surface as a wider failure.
+func multisetDiff(oldSlice, newSlice []any) ([]any, []any) {
+	counts := make(map[string]int, len(oldSlice))
+	for _, item := range oldSlice {
+		counts[canonicalKey(item)]++
+	}
+
+	added := make([]any, 0)
+
+	for _, item := range newSlice {
+		key := canonicalKey(item)
+		if counts[key] > 0 {
+			counts[key]--
+
+			continue
+		}
+
+		added = append(added, item)
+	}
+
+	removed := make([]any, 0)
+
+	for _, item := range oldSlice {
+		key := canonicalKey(item)
+		if counts[key] > 0 {
+			counts[key]--
+
+			removed = append(removed, item)
+		}
+	}
+
+	return removed, added
+}
+
+// canonicalKey returns a stable string form for any value used as a
+// multiset key. Today this is fmt's %v, which sorts map keys
+// internally; a future Go stdlib change that drops that guarantee
+// would silently break dedup stability across processes. Wrapping
+// the call gives a single replacement site (sort map keys via
+// json.Marshal with SortMapKeys, or migrate to a structural
+// canonicaliser) when that day comes.
+func canonicalKey(item any) string {
+	return fmt.Sprintf("%v", item)
+}
+
+// mustRenderFlow is a render-or-fall-back wrapper around
+// renderFlowYAML for the set-diff path: the caller cannot do anything
+// useful with an error here (the bucket is non-empty by construction,
+// so an encode failure would be deep), so degrade to Go's default %v
+// rather than dropping the line entirely.
+func mustRenderFlow(items []any) string {
+	rendered, err := renderFlowYAML(items)
+	if err != nil {
+		return fmt.Sprintf("%v", items)
+	}
+
+	return rendered
+}
+
+// formatFieldValue renders one side of an OpUpdate FieldChange. A field
+// that is absent on the side reads as `(absent)`; a present field whose
+// value is literally nil/null reads as `<nil>` — distinct from absent
+// and unambiguous in the output.
+//
+// Scalars (strings, ints, bools) render inline as before. Slices and
+// maps go through a YAML flow-style encoder so the operator sees
+// `[a, b, c]` / `{k: v, k2: v2}` instead of Go's `[a b c]` /
+// `map[k:v k2:v2]` — the flow form mirrors how the same value would
+// appear in a Helm values file and stays readable on long certSAN
+// lists or nodeLabel maps.
+func formatFieldValue(has bool, value any) string {
+	if !has {
+		return "(absent)"
+	}
+
+	switch value.(type) {
+	case nil, bool, string,
+		int, int8, int16, int32, int64,
+		uint, uint8, uint16, uint32, uint64,
+		float32, float64:
+		return fmt.Sprintf("%v", value)
+	}
+
+	rendered, err := renderFlowYAML(value)
+	if err != nil {
+		return fmt.Sprintf("%v", value)
+	}
+
+	return rendered
+}
+
+// renderFlowYAML returns a single-line YAML flow-style representation
+// of v. Container nodes (slices, maps) recursively get yaml.FlowStyle
+// so the output is e.g. `[127.0.0.1, 127.0.0.1, 192.0.2.5]` instead of
+// the default block style (which would span multiple lines and break
+// the `path: old -> new` diff layout).
+func renderFlowYAML(value any) (string, error) {
+	var node yaml.Node
+	if err := node.Encode(value); err != nil {
+		return "", errors.Wrap(err, "encoding FieldChange value to YAML node")
+	}
+
+	applyFlowStyle(&node)
+
+	var buf strings.Builder
+
+	enc := yaml.NewEncoder(&buf)
+	enc.SetIndent(0)
+
+	if err := enc.Encode(&node); err != nil {
+		return "", errors.Wrap(err, "marshaling FieldChange value in flow style")
+	}
+
+	_ = enc.Close()
+
+	return strings.TrimRight(buf.String(), "\n"), nil
+}
+
+// applyFlowStyle walks a yaml.Node tree and forces every container
+// node (sequence, mapping) into flow style. Scalar nodes inherit the
+// natural representation Encode picked for them; setting FlowStyle on
+// a scalar is a no-op.
+func applyFlowStyle(n *yaml.Node) {
+	n.Style = yaml.FlowStyle
+	for _, child := range n.Content {
+		applyFlowStyle(child)
+	}
+}
+
+// readWithFreshTimeout runs op with a context derived from parent
+// that carries a fresh timeout budget. Used by cosiLinksDisksReader
+// to give each COSI ListAll call its own deadline — sharing one
+// context.WithTimeout across multiple reads leaks the time spent on
+// the first into the second's budget, which on a slow node turns a
+// legitimate-but-slow read into a false transient-timeout blocker.
+//
+// Generic over the return type so the helper composes with
+// safe.StateListAll[T]'s generic signature without an adapter.
+func readWithFreshTimeout[T any](parent context.Context, timeout time.Duration, op func(context.Context) (T, error)) (T, error) {
+	ctx, cancel := context.WithTimeout(parent, timeout)
+	defer cancel()
+
+	return op(ctx)
+}
+
+// cosiLinksDisksReader returns a linksDisksReader backed by the node's
+// COSI state. Both LinkStatus (network namespace) and Disk (block
+// namespace) are NonSensitive, so the maintenance / insecure path can
+// read them — Phase 1 works on both paths.
+//
+// Each of the two list operations gets its OWN preflightCOSIReadTimeout
+// budget via readWithFreshTimeout. See that helper's godoc for the
+// shared-budget leak it prevents.
+func cosiLinksDisksReader(c *client.Client) linksDisksReader {
+	return func(ctx context.Context) (applycheck.HostSnapshot, bool, error) {
+		links, err := readWithFreshTimeout(ctx, preflightCOSIReadTimeout, func(ctx context.Context) (safe.List[*network.LinkStatus], error) {
+			return safe.StateListAll[*network.LinkStatus](ctx, c.COSI)
+		})
+		if err != nil {
+			return applycheck.HostSnapshot{}, false, errors.Wrap(err, "listing LinkStatus resources")
+		}
+
+		snapshot := applycheck.HostSnapshot{
+			Links: make([]string, 0, links.Len()),
+		}
+
+		for link := range links.All() {
+			snapshot.Links = append(snapshot.Links, link.Metadata().ID())
+		}
+
+		disks, err := readWithFreshTimeout(ctx, preflightCOSIReadTimeout, func(ctx context.Context) (safe.List[*block.Disk], error) {
+			return safe.StateListAll[*block.Disk](ctx, c.COSI)
+		})
+		if err != nil {
+			return applycheck.HostSnapshot{}, false, errors.Wrap(err, "listing Disk resources")
+		}
+
+		snapshot.Disks = make([]applycheck.DiskInfo, 0, disks.Len())
+
+		for disk := range disks.All() {
+			spec := disk.TypedSpec()
+			snapshot.Disks = append(snapshot.Disks, applycheck.DiskInfo{
+				DevPath:    spec.DevPath,
+				Model:      spec.Model,
+				Serial:     spec.Serial,
+				WWID:       spec.WWID,
+				Modalias:   spec.Modalias,
+				UUID:       spec.UUID,
+				BusPath:    spec.BusPath,
+				Transport:  spec.Transport,
+				Size:       spec.Size,
+				Rotational: spec.Rotational,
+				Readonly:   spec.Readonly,
+				CDROM:      spec.CDROM,
+				Symlinks:   append([]string(nil), spec.Symlinks...),
+			})
+		}
+
+		return snapshot, true, nil
+	}
+}
+
+// cosiMachineConfigReader returns a machineConfigReader backed by the
+// node's COSI state. The MachineConfig resource is meta.Sensitive: on
+// the insecure / maintenance path the Reader role doesn't see it.
+// Callers know which path they're on (it's the --insecure flag, set
+// statically at command-line parse), so we branch deterministically
+// on insecure rather than collapsing every COSI error into
+// ok=false. Transient failures (network blip, apid restart) surface
+// as err non-nil so verifyAppliedState can surface them rather than
+// silently passing.
+func cosiMachineConfigReader(c *client.Client, insecure bool) machineConfigReader {
+	if insecure {
+		return func(context.Context) ([]byte, bool, error) {
+			return nil, false, nil
+		}
+	}
+
+	return func(ctx context.Context) ([]byte, bool, error) {
+		ctx, cancel := context.WithTimeout(ctx, preflightCOSIReadTimeout)
+		defer cancel()
+
+		res, err := safe.StateGet[*config.MachineConfig](
+			ctx,
+			c.COSI,
+			resource.NewMetadata(config.NamespaceName, config.MachineConfigType, config.ActiveID, resource.VersionUndefined),
+		)
+		if err != nil {
+			return nil, false, errors.Wrap(err, "reading on-node MachineConfig")
+		}
+
+		bytesRaw, err := res.Provider().Bytes()
+		if err != nil {
+			return nil, false, errors.Wrap(err, "marshaling on-node MachineConfig")
+		}
+
+		return bytesRaw, true, nil
+	}
+}
diff --git a/pkg/commands/preflight_apply_safety_test.go b/pkg/commands/preflight_apply_safety_test.go
new file mode 100644
index 00000000..9db6b5f3
--- /dev/null
+++ b/pkg/commands/preflight_apply_safety_test.go
@@ -0,0 +1,871 @@
+// Copyright Cozystack Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package commands
+
+import (
+	"bytes"
+	"context"
+	"strings"
+	"testing"
+	"time"
+
+	"github.com/cockroachdb/errors"
+	"github.com/cozystack/talm/pkg/applycheck"
+	machineapi "github.com/siderolabs/talos/pkg/machinery/api/machine"
+)
+
+const renderedV1_11 = `version: v1alpha1
+machine:
+  type: controlplane
+  install:
+    disk: /dev/sda
+  network:
+    interfaces:
+      - interface: eth9999
+`
+
+const renderedV1_12Multidoc = `version: v1alpha1
+machine:
+  type: controlplane
+  install:
+    diskSelector:
+      model: "Samsung*"
+---
+apiVersion: v1alpha1
+kind: LinkConfig
+name: eth0
+`
+
+func stubLinksDisksReader(snapshot applycheck.HostSnapshot, ok bool) linksDisksReader {
+	return func(context.Context) (applycheck.HostSnapshot, bool, error) {
+		return snapshot, ok, nil
+	}
+}
+
+// stubLinksDisksReaderErr returns a reader that simulates a transient
+// COSI read failure. Used to pin the "blocker wraps the underlying
+// cause" contract, distinguishing transient failures from auth-disallowed.
+func stubLinksDisksReaderErr(err error) linksDisksReader {
+	return func(context.Context) (applycheck.HostSnapshot, bool, error) {
+		return applycheck.HostSnapshot{}, false, err
+	}
+}
+
+func stubMachineConfigReader(configBytes []byte, ok bool, err error) machineConfigReader {
+	return func(context.Context) ([]byte, bool, error) {
+		return configBytes, ok, err
+	}
+}
+
+func TestPreflightValidateResources_LinkMissing_BlocksWithHint(t *testing.T) {
+	t.Parallel()
+
+	snapshot := applycheck.HostSnapshot{
+		Links: []string{"eth0", "eth1"},
+		Disks: []applycheck.DiskInfo{{DevPath: "/dev/sda", Model: "Samsung"}},
+	}
+
+	buf := &bytes.Buffer{}
+	err := preflightValidateResources(
+		context.Background(),
+		stubLinksDisksReader(snapshot, true),
+		[]byte(renderedV1_11),
+		buf,
+	)
+	if err == nil {
+		t.Fatal("expected blocker error for missing eth9999, got nil")
+	}
+
+	out := buf.String()
+	if !strings.Contains(out, "eth9999") {
+		t.Errorf("output should cite the offending name, got %q", out)
+	}
+
+	if !strings.Contains(out, "eth0") || !strings.Contains(out, "eth1") {
+		t.Errorf("output should list available links as a hint, got %q", out)
+	}
+}
+
+func TestPreflightValidateResources_AllRefsResolvable_Passes(t *testing.T) {
+	t.Parallel()
+
+	snapshot := applycheck.HostSnapshot{
+		Links: []string{"eth0"},
+		Disks: []applycheck.DiskInfo{{DevPath: "/dev/sda", Model: "Samsung 980 Pro"}},
+	}
+
+	err := preflightValidateResources(
+		context.Background(),
+		stubLinksDisksReader(snapshot, true),
+		[]byte(renderedV1_12Multidoc),
+		&bytes.Buffer{},
+	)
+	if err != nil {
+		t.Errorf("expected nil error when every ref resolves, got %v", err)
+	}
+}
+
+func TestPreflightValidateResources_ReaderFails_SurfacesAndBlocks(t *testing.T) {
+	t.Parallel()
+
+	err := preflightValidateResources(
+		context.Background(),
+		stubLinksDisksReader(applycheck.HostSnapshot{}, false),
+		[]byte(renderedV1_11),
+		&bytes.Buffer{},
+	)
+	if err == nil {
+		t.Fatal("expected error when reader returns ok=false, got nil")
+	}
+
+	if !strings.Contains(err.Error(), "unavailable") {
+		t.Errorf("expected 'unavailable' in error for ok=false path, got err=%v", err)
+	}
+}
+
+// TestPreflightValidateResources_TransientErr_WrapsUnderlying pins the
+// three-valued reader contract: a transient (err != nil) read failure
+// must surface the underlying cause to the operator, NOT the
+// "config is wrong" blocker class used for ok=false. Without the
+// distinction a 2s COSI timeout looked like a config defect.
+func TestPreflightValidateResources_TransientErr_WrapsUnderlying(t *testing.T) {
+	t.Parallel()
+
+	transient := errors.New("apid: connection reset")
+	err := preflightValidateResources(
+		context.Background(),
+		stubLinksDisksReaderErr(transient),
+		[]byte(renderedV1_11),
+		&bytes.Buffer{},
+	)
+	if err == nil {
+		t.Fatal("expected error when reader returns transient err, got nil")
+	}
+
+	if !strings.Contains(err.Error(), "connection reset") {
+		t.Errorf("expected underlying transient error in chain, got %v", err)
+	}
+
+	if !strings.Contains(err.Error(), "reading host links/disks snapshot") {
+		t.Errorf("expected 'reading host links/disks snapshot' context wrap, got %v", err)
+	}
+}
+
+func TestPreviewDrift_RemovedLinkSurfacesInOutput(t *testing.T) {
+	t.Parallel()
+
+	current := []byte(`version: v1alpha1
+machine:
+  type: controlplane
+---
+apiVersion: v1alpha1
+kind: LinkConfig
+name: eth1
+up: true
+`)
+	desired := []byte(`version: v1alpha1
+machine:
+  type: controlplane
+---
+apiVersion: v1alpha1
+kind: LinkConfig
+name: eth0
+up: true
+`)
+
+	buf := &bytes.Buffer{}
+	err := previewDrift(
+		context.Background(),
+		stubMachineConfigReader(current, true, nil),
+		desired,
+		"",
+		buf,
+	)
+	if err != nil {
+		t.Fatalf("previewDrift error: %v", err)
+	}
+
+	out := buf.String()
+	if !strings.Contains(out, "- LinkConfig") || !strings.Contains(out, "eth1") {
+		t.Errorf("output should surface '- LinkConfig{name: eth1}', got %q", out)
+	}
+
+	if !strings.Contains(out, "+ LinkConfig") || !strings.Contains(out, "eth0") {
+		t.Errorf("output should surface '+ LinkConfig{name: eth0}', got %q", out)
+	}
+}
+
+func TestPreviewDrift_InsecurePath_DegradesGracefully(t *testing.T) {
+	t.Parallel()
+
+	buf := &bytes.Buffer{}
+	err := previewDrift(
+		context.Background(),
+		stubMachineConfigReader(nil, false, nil),
+		[]byte(renderedV1_12Multidoc),
+		"",
+		buf,
+	)
+	if err != nil {
+		t.Errorf("previewDrift on insecure path should not block, got err=%v", err)
+	}
+
+	if !strings.Contains(buf.String(), "maintenance connection") {
+		t.Errorf("expected 'maintenance connection' explanatory line, got %q", buf.String())
+	}
+}
+
+func TestVerifyAppliedState_Match_NoError(t *testing.T) {
+	t.Parallel()
+
+	sent := []byte(renderedV1_12Multidoc)
+	err := verifyAppliedState(
+		context.Background(),
+		stubMachineConfigReader(sent, true, nil),
+		sent,
+		"",
+		&bytes.Buffer{},
+	)
+	if err != nil {
+		t.Errorf("verifyAppliedState should accept matching configs, got err=%v", err)
+	}
+}
+
+func TestVerifyAppliedState_Divergence_BlocksWithDetails(t *testing.T) {
+	t.Parallel()
+
+	sent := []byte(`version: v1alpha1
+machine:
+  type: controlplane
+---
+apiVersion: v1alpha1
+kind: LinkConfig
+name: eth0
+up: true
+`)
+	onNode := []byte(`version: v1alpha1
+machine:
+  type: controlplane
+---
+apiVersion: v1alpha1
+kind: LinkConfig
+name: eth0
+up: false
+`)
+
+	buf := &bytes.Buffer{}
+	err := verifyAppliedState(
+		context.Background(),
+		stubMachineConfigReader(onNode, true, nil),
+		sent,
+		"",
+		buf,
+	)
+	if err == nil {
+		t.Fatal("verifyAppliedState should block on divergence, got nil error")
+	}
+
+	out := buf.String()
+	if !strings.Contains(out, "LinkConfig") || !strings.Contains(out, "eth0") {
+		t.Errorf("output should name the diverging doc, got %q", out)
+	}
+}
+
+// TestVerifyAppliedState_ReaderError_Blocks pins the contract: a
+// non-nil err from the reader on the auth path is a hard blocker.
+// The verify gate exists specifically to catch silent rollbacks, so
+// swallowing a transient read error would defeat the purpose.
+func TestVerifyAppliedState_ReaderError_Blocks(t *testing.T) {
+	t.Parallel()
+
+	transient := errors.New("apid: connection reset")
+	err := verifyAppliedState(
+		context.Background(),
+		stubMachineConfigReader(nil, false, transient),
+		[]byte(renderedV1_12Multidoc),
+		"",
+		&bytes.Buffer{},
+	)
+	if err == nil {
+		t.Fatal("expected error on reader failure, got nil")
+	}
+
+	if !strings.Contains(err.Error(), "connection reset") {
+		t.Errorf("expected underlying error in chain, got %v", err)
+	}
+}
+
+func TestVerifyAppliedState_InsecurePath_NoBlock(t *testing.T) {
+	t.Parallel()
+
+	buf := &bytes.Buffer{}
+	err := verifyAppliedState(
+		context.Background(),
+		stubMachineConfigReader(nil, false, nil),
+		[]byte(renderedV1_12Multidoc),
+		"",
+		buf,
+	)
+	if err != nil {
+		t.Errorf("verifyAppliedState on insecure path should not block, got err=%v", err)
+	}
+
+	if !strings.Contains(buf.String(), "maintenance connection") {
+		t.Errorf("expected 'maintenance connection' explanatory line, got %q", buf.String())
+	}
+}
+
+// TestShouldRunDriftPreview_RespectsOnlySkipFlag pins the Phase 2A
+// scheduling contract: dry-run does NOT skip the drift preview (the
+// gate is read-only and dry-run wants the diff); only the explicit
+// --skip-drift-preview flag suppresses it. This pins the
+// fix(commands): run Phase 2A drift preview on --dry-run commit
+// against future flips that might re-introduce the dry-run skip.
+func TestShouldRunDriftPreview_RespectsOnlySkipFlag(t *testing.T) {
+	t.Parallel()
+
+	tests := []struct {
+		name string
+		skip bool
+		want bool
+	}{
+		{"default: runs", false, true},
+		{"--skip-drift-preview: skipped", true, false},
+	}
+
+	for _, tc := range tests {
+		t.Run(tc.name, func(t *testing.T) {
+			t.Parallel()
+
+			got := shouldRunDriftPreview(tc.skip)
+			if got != tc.want {
+				t.Errorf("shouldRunDriftPreview(skip=%v) = %v, want %v", tc.skip, got, tc.want)
+			}
+		})
+	}
+}
+
+// TestMultisetDiff_StringKeyedSlices_StableDedup pins the contract
+// for the schema shape MachineConfig actually surfaces today: a
+// slice of strings (certSANs, peers, routes, link names). The %v
+// key function is stable for strings, so duplicate-dedup works.
+func TestMultisetDiff_StringKeyedSlices_StableDedup(t *testing.T) {
+	t.Parallel()
+
+	oldSlice := []any{"127.0.0.1", "127.0.0.1", "192.0.2.5"}
+	newSlice := []any{"127.0.0.1", "192.0.2.5"}
+
+	removed, added := multisetDiff(oldSlice, newSlice)
+	if len(removed) != 1 || removed[0] != "127.0.0.1" {
+		t.Errorf("removed: want [127.0.0.1], got %v", removed)
+	}
+
+	if len(added) != 0 {
+		t.Errorf("added: want [], got %v", added)
+	}
+}
+
+// TestMultisetDiff_MapElementsStableAcrossRuns pins the dedup
+// behaviour on slice-of-map inputs — the shape Phase 2A produces
+// when diffing v1.11 nested form (e.g. machine.network.interfaces[])
+// or any other array-of-object configuration. The dedup relies on
+// fmt's %v producing the same string for two equal-by-content maps
+// across runs; `fmt` sorts map keys for stable output, so this
+// holds for map[string]any / map[int]any / map[any]any equally.
+//
+// The test runs 100 times to surface any iteration-order
+// non-determinism: if a future Go stdlib change drops the key-sort
+// guarantee, this test starts flaking and multisetDiff's key
+// function needs a structural canonicaliser instead.
+func TestMultisetDiff_MapElementsStableAcrossRuns(t *testing.T) {
+	t.Parallel()
+
+	mapElem := map[string]any{"interface": "eth0", "mtu": 1500, "dhcp": true}
+
+	for run := range 100 {
+		oldSlice := []any{mapElem, mapElem}
+		newSlice := []any{mapElem}
+
+		removed, added := multisetDiff(oldSlice, newSlice)
+		if len(removed) != 1 || len(added) != 0 {
+			t.Fatalf("run %d: map-element dedup unstable — removed=%v, added=%v; if you see this, fmt's stable-map-print guarantee may have changed", run, removed, added)
+		}
+	}
+}
+
+// TestReadWithFreshTimeout_DoesNotLeakBudgetAcrossCalls pins the
+// per-call-budget property used by cosiLinksDisksReader for its
+// two COSI ListAll reads. Sharing a single context.WithTimeout
+// across both reads would leak the first read's elapsed time
+// into the second's budget: on a slow node a 1.5s links read
+// would leave the disks read with sub-second budget and produce
+// a false transient-timeout blocker.
+//
+// The test runs the helper twice — the first call burns most of
+// the timeout, the second call inspects its own context's
+// deadline and asserts the deadline is approximately the full
+// budget from "now", not the residue after the first call.
+func TestReadWithFreshTimeout_DoesNotLeakBudgetAcrossCalls(t *testing.T) {
+	t.Parallel()
+
+	parent := context.Background()
+	budget := 200 * time.Millisecond
+
+	// First call: burn ~3/4 of the budget. Sleeps inside the op.
+	_, _ = readWithFreshTimeout(parent, budget, func(_ context.Context) (struct{}, error) {
+		time.Sleep(150 * time.Millisecond)
+
+		return struct{}{}, nil
+	})
+
+	// Second call: inspect the fresh context's deadline. It should be
+	// approximately parent's "now" + full budget — proof that the
+	// first call's elapsed time did NOT leak into this budget.
+	checkpoint := time.Now()
+
+	_, _ = readWithFreshTimeout(parent, budget, func(ctx context.Context) (struct{}, error) {
+		deadline, hasDeadline := ctx.Deadline()
+		if !hasDeadline {
+			t.Fatal("second call should have a deadline; got none")
+		}
+
+		remaining := time.Until(deadline)
+		// Tolerate 25ms of scheduling jitter. A leaked budget would
+		// leave the second call with budget - 150ms = 50ms, well
+		// below the budget - 25ms = 175ms floor below.
+		if remaining < budget-25*time.Millisecond {
+			t.Errorf("budget leaked: second-call deadline at %v from now, expected ~%v (budget=%v, checkpoint=%v)",
+				remaining, budget, budget, checkpoint)
+		}
+
+		return struct{}{}, nil
+	})
+}
+
+// TestPrintDriftPreview_SliceSetDiff_RemovesDuplicate pins the
+// set-diff shape: when both sides are slices, the preview shows
+// `removed [...]` / `added [...]` buckets instead of dumping the
+// full slice twice. The certSANs duplicate-cleanup case shrinks
+// from a 50+ char two-slice dump to a focused
+// `removed [127.0.0.1]` line. Multiset semantics — one extra copy
+// of 127.0.0.1 in Old surfaces as exactly one removal.
+func TestPrintDriftPreview_SliceSetDiff_RemovesDuplicate(t *testing.T) {
+	t.Parallel()
+
+	changes := []applycheck.Change{{
+		ID: applycheck.DocID{Kind: "MachineConfig"},
+		Op: applycheck.OpUpdate,
+		Fields: []applycheck.FieldChange{{
+			Path:   "cluster.apiServer.certSANs",
+			Old:    []any{"127.0.0.1", "127.0.0.1", "192.0.2.5"},
+			New:    []any{"127.0.0.1", "192.0.2.5"},
+			HasOld: true,
+			HasNew: true,
+		}},
+	}}
+
+	buf := &bytes.Buffer{}
+	printDriftPreview(buf, "drift:", changes)
+
+	out := buf.String()
+	if !strings.Contains(out, "removed [127.0.0.1]") {
+		t.Errorf("slice set-diff should surface the removed duplicate, got:\n%s", out)
+	}
+
+	if strings.Contains(out, "added [") {
+		t.Errorf("no elements were added; the bucket should be omitted, got:\n%s", out)
+	}
+
+	// The full-slice fallback must not fire.
+	if strings.Contains(out, "[127.0.0.1, 127.0.0.1, 192.0.2.5]") {
+		t.Errorf("full-slice render should be replaced by set-diff, got:\n%s", out)
+	}
+}
+
+// TestPrintDriftPreview_SliceSetDiff_AddOnly pins the "added only"
+// case: a new SAN appended to certSANs surfaces as
+// `added [new.example]` with no `removed` bucket.
+func TestPrintDriftPreview_SliceSetDiff_AddOnly(t *testing.T) {
+	t.Parallel()
+
+	changes := []applycheck.Change{{
+		ID: applycheck.DocID{Kind: "MachineConfig"},
+		Op: applycheck.OpUpdate,
+		Fields: []applycheck.FieldChange{{
+			Path:   "cluster.apiServer.certSANs",
+			Old:    []any{"127.0.0.1"},
+			New:    []any{"127.0.0.1", "192.0.2.5"},
+			HasOld: true,
+			HasNew: true,
+		}},
+	}}
+
+	buf := &bytes.Buffer{}
+	printDriftPreview(buf, "drift:", changes)
+
+	out := buf.String()
+	if !strings.Contains(out, "added [192.0.2.5]") {
+		t.Errorf("slice set-diff should surface the added entry, got:\n%s", out)
+	}
+
+	if strings.Contains(out, "removed [") {
+		t.Errorf("no elements were removed; the bucket should be omitted, got:\n%s", out)
+	}
+}
+
+// TestPrintDriftPreview_SliceSetDiff_ReorderOnly pins the equal-
+// multiset reorder case: when both slices contain the same
+// elements with the same multiplicities but in a different order,
+// reflect.DeepEqual flags an OpUpdate but the set diff is empty.
+// Surface a `(reordered, same elements)` note so the operator
+// isn't left wondering what changed.
+func TestPrintDriftPreview_SliceSetDiff_ReorderOnly(t *testing.T) {
+	t.Parallel()
+
+	changes := []applycheck.Change{{
+		ID: applycheck.DocID{Kind: "MachineConfig"},
+		Op: applycheck.OpUpdate,
+		Fields: []applycheck.FieldChange{{
+			Path:   "cluster.apiServer.certSANs",
+			Old:    []any{"127.0.0.1", "192.0.2.5"},
+			New:    []any{"192.0.2.5", "127.0.0.1"},
+			HasOld: true,
+			HasNew: true,
+		}},
+	}}
+
+	buf := &bytes.Buffer{}
+	printDriftPreview(buf, "drift:", changes)
+
+	out := buf.String()
+	if !strings.Contains(out, "reordered") {
+		t.Errorf("equal-multiset reorder should surface as 'reordered', got:\n%s", out)
+	}
+}
+
+// TestPreviewDrift_MultiNode_HeaderCarriesNodeID pins the per-node
+// header on Phase 2A output. Multi-node apply calls previewDrift /
+// verifyAppliedState in a loop, sharing one stderr — without a node
+// identifier on the header, an operator scanning the output cannot
+// tell which node owns which diff. The header for a non-empty
+// nodeID must include the node literal so multi-node output is
+// disambiguated.
+func TestPreviewDrift_MultiNode_HeaderCarriesNodeID(t *testing.T) {
+	t.Parallel()
+
+	current := []byte(`version: v1alpha1
+machine:
+  type: controlplane
+  install:
+    disk: /dev/sda
+`)
+	desired := []byte(`version: v1alpha1
+machine:
+  type: controlplane
+  install:
+    disk: /dev/sdb
+`)
+
+	buf := &bytes.Buffer{}
+	err := previewDrift(
+		context.Background(),
+		stubMachineConfigReader(current, true, nil),
+		desired,
+		"192.0.2.10",
+		buf,
+	)
+	if err != nil {
+		t.Fatalf("previewDrift error: %v", err)
+	}
+
+	if !strings.Contains(buf.String(), "(node 192.0.2.10)") {
+		t.Errorf("non-empty nodeID must appear in the drift-preview header, got:\n%s", buf.String())
+	}
+}
+
+// TestVerifyAppliedState_MultiNode_HeaderCarriesNodeID is the
+// Phase 2B counterpart of TestPreviewDrift_MultiNode_HeaderCarriesNodeID:
+// the post-apply divergence header must also disambiguate per node
+// so an operator scanning stderr from a multi-node apply that
+// diverged on two nodes can act on each one independently.
+func TestVerifyAppliedState_MultiNode_HeaderCarriesNodeID(t *testing.T) {
+	t.Parallel()
+
+	sent := []byte(`version: v1alpha1
+machine:
+  type: controlplane
+---
+apiVersion: v1alpha1
+kind: LinkConfig
+name: eth0
+up: true
+`)
+	onNode := []byte(`version: v1alpha1
+machine:
+  type: controlplane
+---
+apiVersion: v1alpha1
+kind: LinkConfig
+name: eth0
+up: false
+`)
+
+	buf := &bytes.Buffer{}
+	err := verifyAppliedState(
+		context.Background(),
+		stubMachineConfigReader(onNode, true, nil),
+		sent,
+		"192.0.2.11",
+		buf,
+	)
+	if err == nil {
+		t.Fatal("expected divergence to surface as an error")
+	}
+
+	if !strings.Contains(buf.String(), "(node 192.0.2.11)") {
+		t.Errorf("non-empty nodeID must appear in the post-apply divergence header, got:\n%s", buf.String())
+	}
+}
+
+// TestPreviewDrift_EmptyNodeID_HeaderUnchanged pins the single-node
+// case: when nodeID is "" (the implicit-single-node template path),
+// the header stays bare so existing output formats are preserved.
+func TestPreviewDrift_EmptyNodeID_HeaderUnchanged(t *testing.T) {
+	t.Parallel()
+
+	desired := []byte(`version: v1alpha1
+machine:
+  type: controlplane
+  install:
+    disk: /dev/sdb
+`)
+	current := []byte(`version: v1alpha1
+machine:
+  type: controlplane
+  install:
+    disk: /dev/sda
+`)
+
+	buf := &bytes.Buffer{}
+	err := previewDrift(
+		context.Background(),
+		stubMachineConfigReader(current, true, nil),
+		desired,
+		"",
+		buf,
+	)
+	if err != nil {
+		t.Fatalf("previewDrift error: %v", err)
+	}
+
+	if strings.Contains(buf.String(), "(node ") {
+		t.Errorf("empty nodeID should leave the header bare, got:\n%s", buf.String())
+	}
+}
+
+// TestPrintDriftPreview_SliceFlowStyle_AbsentOnOneSide pins the
+// flow-style fallback path that the set-diff layer does NOT cover:
+// when a slice appears (or disappears) entirely, we render it as a
+// YAML flow list `[a, b, c]` instead of Go's `[a b c]`. The set-diff
+// path covers slice-vs-slice diffs; this test covers the slice
+// vs. (absent) case where the operator still needs to see the
+// list contents.
+func TestPrintDriftPreview_SliceFlowStyle_AbsentOnOneSide(t *testing.T) {
+	t.Parallel()
+
+	changes := []applycheck.Change{{
+		ID: applycheck.DocID{Kind: "MachineConfig"},
+		Op: applycheck.OpUpdate,
+		Fields: []applycheck.FieldChange{{
+			Path:   "cluster.apiServer.certSANs",
+			Old:    nil,
+			New:    []any{"127.0.0.1", "192.0.2.5"},
+			HasOld: false,
+			HasNew: true,
+		}},
+	}}
+
+	buf := &bytes.Buffer{}
+	printDriftPreview(buf, "drift:", changes)
+
+	out := buf.String()
+	if strings.Contains(out, "[127.0.0.1 192.0.2.5]") {
+		t.Errorf("slice rendered in Go fmt %%v style, want YAML flow:\n%s", out)
+	}
+
+	if !strings.Contains(out, "[127.0.0.1, 192.0.2.5]") {
+		t.Errorf("slice should render as YAML flow list, got:\n%s", out)
+	}
+
+	if !strings.Contains(out, "(absent) ->") {
+		t.Errorf("absent-old should render as '(absent) ->', got:\n%s", out)
+	}
+}
+
+// TestPrintDriftPreview_MapFieldChange_RendersFlowStyle pins the
+// map case: Go's %v produces "map[a:1 b:2]" which is awkward and
+// can mislead the reader into thinking "a:1" is a single token.
+// YAML flow mapping "{a: 1, b: 2}" mirrors how the same value
+// would be typed in a Helm values file.
+func TestPrintDriftPreview_MapFieldChange_RendersFlowStyle(t *testing.T) {
+	t.Parallel()
+
+	changes := []applycheck.Change{{
+		ID: applycheck.DocID{Kind: "MachineConfig"},
+		Op: applycheck.OpUpdate,
+		Fields: []applycheck.FieldChange{{
+			Path:   "machine.nodeLabels",
+			Old:    map[string]any{"role": "control-plane"},
+			New:    map[string]any{"role": "control-plane", "tier": "primary"},
+			HasOld: true,
+			HasNew: true,
+		}},
+	}}
+
+	buf := &bytes.Buffer{}
+	printDriftPreview(buf, "drift:", changes)
+
+	out := buf.String()
+	if strings.Contains(out, "map[role:control-plane]") {
+		t.Errorf("map rendered in Go fmt %%v style, want YAML flow:\n%s", out)
+	}
+
+	if !strings.Contains(out, "{role: control-plane}") {
+		t.Errorf("map should render as YAML flow mapping, got:\n%s", out)
+	}
+}
+
+// TestPrintDriftPreview_ScalarFieldChange_StaysInline pins the
+// scalar case: short string / int / bool values render inline
+// without yaml-quoting noise. The operator sees
+// "cluster.network.dnsDomain: cozy.local -> cozy.example"
+// the same as before.
+func TestPrintDriftPreview_ScalarFieldChange_StaysInline(t *testing.T) {
+	t.Parallel()
+
+	changes := []applycheck.Change{{
+		ID: applycheck.DocID{Kind: "MachineConfig"},
+		Op: applycheck.OpUpdate,
+		Fields: []applycheck.FieldChange{{
+			Path:   "cluster.network.dnsDomain",
+			Old:    "cozy.local",
+			New:    "cozy.example",
+			HasOld: true,
+			HasNew: true,
+		}},
+	}}
+
+	buf := &bytes.Buffer{}
+	printDriftPreview(buf, "drift:", changes)
+
+	out := buf.String()
+	if !strings.Contains(out, "cozy.local -> cozy.example") {
+		t.Errorf("scalar field should render inline, got:\n%s", out)
+	}
+}
+
+// TestShouldRunDriftPreview_ModeAgnostic pins the Phase 2A invariant
+// that the preview runs for EVERY apply mode. The preview is read-only
+// (a Diff against the on-node MachineConfig), so reboot/staged/try/auto/
+// no-reboot all benefit from seeing it. The matrix below iterates every
+// apply mode and asserts the predicate ignores it — if a future change
+// adds mode-gating, this test forces the author to update the matrix
+// (and reconsider why the read-only preview should ever be skipped).
+//
+// The matrix is paired with shouldRunPostApplyVerify_RespectsModeAndDryRun
+// where mode-gating IS the correct contract (the verify can race
+// against rollback or reach a stale snapshot). Keep them next to each
+// other so the divergence is obvious.
+func TestShouldRunDriftPreview_ModeAgnostic(t *testing.T) {
+	t.Parallel()
+
+	modes := []machineapi.ApplyConfigurationRequest_Mode{
+		machineapi.ApplyConfigurationRequest_REBOOT,
+		machineapi.ApplyConfigurationRequest_NO_REBOOT,
+		machineapi.ApplyConfigurationRequest_AUTO,
+		machineapi.ApplyConfigurationRequest_STAGED,
+		machineapi.ApplyConfigurationRequest_TRY,
+	}
+
+	for _, mode := range modes {
+		t.Run(mode.String(), func(t *testing.T) {
+			t.Parallel()
+
+			// Phase 2A is mode-agnostic by design: the predicate
+			// signature `(skip bool) bool` deliberately excludes
+			// mode. The mode loop here documents the invariant for
+			// future readers — if somebody adds a `mode` parameter
+			// to shouldRunDriftPreview, this test stops compiling
+			// and forces a conscious decision.
+			if !shouldRunDriftPreview(false) {
+				t.Errorf("Phase 2A must run for mode=%v, got skip", mode)
+			}
+
+			if shouldRunDriftPreview(true) {
+				t.Errorf("--skip-drift-preview must suppress Phase 2A for mode=%v, got run", mode)
+			}
+		})
+	}
+}
+
+// TestShouldRunPostApplyVerify_RespectsModeAndDryRun pins the post-apply
+// gate's skip predicate. staged/try modes have contracts that diverge
+// from 'sent == on-node' (staged: not yet active until reboot; try:
+// auto-rollback), reboot kills the COSI connection mid-verify, AUTO
+// promotes to REBOOT internally when the change requires it, and
+// dry-run obviously didn't apply anything. The gate must skip in
+// those cases regardless of the skip flag — only --mode=no-reboot
+// reliably leaves the node up with a stable ActiveID for the verify
+// to read.
+func TestShouldRunPostApplyVerify_RespectsModeAndDryRun(t *testing.T) {
+	t.Parallel()
+
+	tests := []struct {
+		name string
+		mode machineapi.ApplyConfigurationRequest_Mode
+		dry  bool
+		skip bool
+		want bool
+	}{
+		// no-reboot leaves the node up after apply, so the verify
+		// reaches a stable COSI snapshot. The only mode where verify
+		// runs by default.
+		{"no_reboot runs", machineapi.ApplyConfigurationRequest_NO_REBOOT, false, false, true},
+		// Modes that reach the node only intermittently or never on
+		// ActiveID: skipped.
+		{"reboot skipped", machineapi.ApplyConfigurationRequest_REBOOT, false, false, false},
+		{"staged skipped", machineapi.ApplyConfigurationRequest_STAGED, false, false, false},
+		{"try skipped", machineapi.ApplyConfigurationRequest_TRY, false, false, false},
+		// AUTO is skipped because Talos's apply-server promotes it to
+		// REBOOT internally when the change requires one. The verify
+		// would race the reboot; same shape as the explicit REBOOT skip.
+		// Pinned to keep a future maintainer from "fixing" AUTO back to
+		// runs and re-introducing the race.
+		{"auto skipped (Talos promotes to REBOOT internally)", machineapi.ApplyConfigurationRequest_AUTO, false, false, false},
+		{"dry-run skipped", machineapi.ApplyConfigurationRequest_NO_REBOOT, true, false, false},
+		{"skip flag honoured", machineapi.ApplyConfigurationRequest_NO_REBOOT, false, true, false},
+	}
+
+	for _, tc := range tests {
+		t.Run(tc.name, func(t *testing.T) {
+			t.Parallel()
+
+			got := shouldRunPostApplyVerify(tc.mode, tc.dry, tc.skip)
+			if got != tc.want {
+				t.Errorf("shouldRunPostApplyVerify(mode=%v, dry=%v, skip=%v) = %v, want %v", tc.mode, tc.dry, tc.skip, got, tc.want)
+			}
+		})
+	}
+}
diff --git a/pkg/commands/preflight_test.go b/pkg/commands/preflight_test.go
index 0ce9b2a2..11277e8b 100644
--- a/pkg/commands/preflight_test.go
+++ b/pkg/commands/preflight_test.go
@@ -112,11 +112,21 @@ func TestAnnotateApplyConfigError(t *testing.T) {
 	}
 }
 
-// stubReader is a versionReader that returns a fixed value, matching the
-// contract preflightCheckTalosVersion uses (string, ok). Tests use it to
-// drive the function without standing up a live COSI server.
+// stubReader is a versionReader that returns a fixed (version, ok) pair
+// with err=nil. Tests use it to drive the version-comparison logic
+// without standing up a live COSI server. The "ok=false err=nil" shape
+// it produces models the by-design-unreachable case; tests that need
+// to exercise the err!=nil path use stubReaderErr.
 func stubReader(version string, ok bool) versionReader {
-	return func(context.Context) (string, bool) { return version, ok }
+	return func(context.Context) (string, bool, error) { return version, ok, nil }
+}
+
+// stubReaderErr is a versionReader that returns (false, err) — the
+// shape that surfaces when reading runtime.Version legitimately fails
+// (connection refused, context deadline exceeded, RPC error). The
+// post-upgrade verify treats this as the rollback signal.
+func stubReaderErr(err error) versionReader {
+	return func(context.Context) (string, bool, error) { return "", false, err }
 }
 
 func TestPreflightCheckTalosVersion(t *testing.T) {
diff --git a/pkg/commands/preflight_upgrade_verify.go b/pkg/commands/preflight_upgrade_verify.go
new file mode 100644
index 00000000..f647a205
--- /dev/null
+++ b/pkg/commands/preflight_upgrade_verify.go
@@ -0,0 +1,160 @@
+// Copyright Cozystack Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package commands
+
+import (
+	"context"
+	"fmt"
+	"io"
+	"strings"
+
+	"github.com/cockroachdb/errors"
+	machineryconfig "github.com/siderolabs/talos/pkg/machinery/config"
+)
+
+const postUpgradeVersionMismatchHint = "two hypotheses produce this symptom: " +
+	"(1) Talos auto-rolled back after the new partition failed its boot readiness check — " +
+	"cross-vendor upgrades (e.g. cozystack-bundled image -> vanilla siderolabs installer) " +
+	"drop bundled extensions and trigger this. " +
+	"(2) The node is slower than the 90s reconcile window — large image pulls or cold " +
+	"hardware can exceed it. Re-run `talm get version` after a minute to distinguish: if " +
+	"the version updated, the node was just slow; if it's still the old version, the " +
+	"rollback case is real. Pass --skip-post-upgrade-verify to bypass."
+
+// verifyPostUpgradeVersion is the Phase 2C gate: after talosctl upgrade
+// returns, re-read the node's runtime.Version COSI resource and compare
+// against the target version parsed from the installer image. Talos
+// auto-rolls back when a new boot fails its readiness check, but the
+// upgrade RPC has already acked — so success without verification is
+// false advertising. See cozystack/talm#175 for the reproduction
+// (cross-vendor installer image triggering an A/B rollback on a
+// 3-node OCI v1.12.6 stand).
+//
+// Returns nil on match (or on best-effort read failure with --skip-* not
+// set; the caller decides). Returns a hint-bearing blocker on
+// version mismatch.
+func verifyPostUpgradeVersion(
+	ctx context.Context,
+	read versionReader,
+	targetImage string,
+	w io.Writer,
+) error {
+	target := parseTargetVersion(targetImage)
+	if target == "" {
+		// Best-effort: no tag, digest pin, or empty image. We can't
+		// verify without a tag literal — pass silently. The blocking
+		// class is reserved for actual detected mismatches.
+		return nil
+	}
+
+	targetContract, err := machineryconfig.ParseContractFromVersion(target)
+	if err != nil {
+		// Unparseable tag (e.g. "latest", a branch name, a SHA-style
+		// fork tag) — same best-effort surrender as the empty case.
+		return nil //nolint:nilerr // surrender is the contract; see TestVerifyPostUpgradeVersion_UnparseableTag_Skip.
+	}
+
+	running, ok, readErr := read(ctx)
+	if !ok {
+		if readErr != nil {
+			// Real read failure on an upgrade-verify call IS the
+			// rollback signal — exactly what the gate exists to
+			// catch. A node that auto-rolled back or hung mid-boot
+			// looks like "connection refused" / "context deadline
+			// exceeded" from the COSI client. Block with the same
+			// two-hypothesis hint as a detected version mismatch so
+			// the operator gets actionable guidance instead of a
+			// silent warning that buries the real signal.
+			//
+			//nolint:wrapcheck // cockroachdb/errors.WithHint at boundary.
+			return errors.WithHint(
+				errors.Wrapf(readErr, "post-upgrade verify: could not read running version from the node after upgrade to %s", target),
+				postUpgradeVersionMismatchHint,
+			)
+		}
+
+		// ok=false err=nil — by-design unreachable (e.g. a custom
+		// reader that signals "not applicable on this path" without
+		// an err). cosiVersionReader does not produce this shape,
+		// but the contract leaves room for future readers that need
+		// to surrender silently without an error.
+		_, _ = fmt.Fprintln(w, "warning: post-upgrade verification skipped, could not read running version from the node")
+
+		return nil
+	}
+
+	runningContract, err := machineryconfig.ParseContractFromVersion(running)
+	if err != nil {
+		_, _ = fmt.Fprintf(w, "warning: post-upgrade verification skipped, could not parse running version %q\n", running)
+
+		return nil //nolint:nilerr // best-effort: unparseable running version is a soft warning.
+	}
+
+	if runningContract.Major == targetContract.Major && runningContract.Minor == targetContract.Minor {
+		return nil
+	}
+
+	//nolint:wrapcheck // cockroachdb/errors.WithHint at boundary.
+	return errors.WithHint(
+		errors.Newf("post-upgrade: requested upgrade to %s but running version is %s — either Talos auto-rolled back, or the node is still booting beyond the 90s window", target, running),
+		postUpgradeVersionMismatchHint,
+	)
+}
+
+// parseTargetVersion lifts a Talos version literal from an installer
+// image reference. Inputs:
+//
+//   - ghcr.io/cozystack/cozystack/talos:v1.12.6 -> v1.12.6
+//   - ghcr.io/siderolabs/installer:v1.13.0 -> v1.13.0
+//   - factory.talos.dev/installer/<sha256>:v1.13.0 -> v1.13.0
+//   - registry.local:5000/foo/installer (no tag, port in registry) -> ""
+//   - "" or no tag separator -> ""
+//
+// Digest-pinned references (image@sha256:...) are rejected up front:
+// Phase 2C can only verify when the tag carries the version literal.
+// A digest-pinned upgrade silently passes the check (the operator
+// opted into a content-addressed install and can read it back via
+// talos image list).
+//
+// Returns the tag verbatim; comparison is done via machineryconfig
+// VersionContract so v1.13 and v1.13.0 are equivalent at the minor
+// level (point-release upgrades pass silently).
+func parseTargetVersion(image string) string {
+	// Reject digest pins up front. The "@sha256:" / "@sha512:" marker
+	// is the authoritative signal — searching for `/` in the
+	// post-`:` substring catches some shapes but misses hex-only
+	// digests like `image@sha256:abc123def456`.
+	if strings.Contains(image, "@sha256:") || strings.Contains(image, "@sha512:") {
+		return ""
+	}
+
+	// The tag — when present — is always the last path component's
+	// suffix after `:`. Splitting on `/` first isolates that
+	// component, so a registry port (`registry.local:5000`) in an
+	// earlier path component cannot be mistaken for a tag separator.
+	lastSlash := strings.LastIndex(image, "/")
+
+	tail := image
+	if lastSlash >= 0 {
+		tail = image[lastSlash+1:]
+	}
+
+	idx := strings.LastIndex(tail, ":")
+	if idx < 0 || idx == len(tail)-1 {
+		return ""
+	}
+
+	return tail[idx+1:]
+}
diff --git a/pkg/commands/preflight_upgrade_verify_test.go b/pkg/commands/preflight_upgrade_verify_test.go
new file mode 100644
index 00000000..5733bfdc
--- /dev/null
+++ b/pkg/commands/preflight_upgrade_verify_test.go
@@ -0,0 +1,304 @@
+// Copyright Cozystack Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package commands
+
+import (
+	"bytes"
+	"context"
+	"strings"
+	"testing"
+
+	"github.com/cockroachdb/errors"
+)
+
+// TestShouldRunPostUpgradeVerify_SkipMatrix pins the predicate that
+// gates Phase 2C scheduling. The gate cannot produce a meaningful
+// result on --insecure (no auth COSI path) or --stage (new partition
+// not yet booted); both must be skipped to avoid false-positive
+// blockers. The skip flag overrides everything (operator opt-out).
+func TestShouldRunPostUpgradeVerify_SkipMatrix(t *testing.T) {
+	t.Parallel()
+
+	tests := []struct {
+		name     string
+		insecure bool
+		staged   bool
+		skip     bool
+		want     bool
+	}{
+		{"default runs", false, false, false, true},
+		{"--skip-post-upgrade-verify suppresses everything", false, false, true, false},
+		{"--insecure skipped (no auth COSI)", true, false, false, false},
+		{"--stage skipped (new partition not booted)", false, true, false, false},
+		{"--insecure + --stage skipped", true, true, false, false},
+		{"all-on skipped", true, true, true, false},
+	}
+
+	for _, tc := range tests {
+		t.Run(tc.name, func(t *testing.T) {
+			t.Parallel()
+
+			got := shouldRunPostUpgradeVerify(tc.insecure, tc.staged, tc.skip)
+			if got != tc.want {
+				t.Errorf("shouldRunPostUpgradeVerify(insecure=%v, staged=%v, skip=%v) = %v, want %v",
+					tc.insecure, tc.staged, tc.skip, got, tc.want)
+			}
+		})
+	}
+}
+
+// TestParseTargetVersion pins the image-tag → version-literal contract.
+// The walker handles ghcr / quay / factory references and silently
+// surrenders on digest pins (we can't infer the version without an
+// extra registry round-trip).
+func TestParseTargetVersion(t *testing.T) {
+	t.Parallel()
+
+	tests := []struct {
+		name string
+		in   string
+		want string
+	}{
+		{"ghcr cozystack", "ghcr.io/cozystack/cozystack/talos:v1.12.6", "v1.12.6"},
+		{"ghcr siderolabs installer", "ghcr.io/siderolabs/installer:v1.13.0", "v1.13.0"},
+		{"factory installer", "factory.talos.dev/installer/abcd1234:v1.13.0", "v1.13.0"},
+		{"no tag", "ghcr.io/cozystack/cozystack/talos", ""},
+		{"empty tag", "ghcr.io/cozystack/cozystack/talos:", ""},
+		{"digest pin with slash in hex (rare)", "ghcr.io/cozystack/cozystack/talos@sha256:abc/def", ""},
+		{"realistic digest pin (hex only)", "ghcr.io/cozystack/cozystack/talos@sha256:abc123def456", ""},
+		{"sha512 digest", "ghcr.io/foo/bar@sha512:0123456789abcdef", ""},
+		{"registry with port, no tag", "registry.local:5000/foo/installer", ""},
+		{"registry with port AND tag", "registry.local:5000/foo/installer:v1.13.0", "v1.13.0"},
+		{"empty input", "", ""},
+		{"only colon", ":", ""},
+	}
+
+	for _, tc := range tests {
+		t.Run(tc.name, func(t *testing.T) {
+			t.Parallel()
+
+			got := parseTargetVersion(tc.in)
+			if got != tc.want {
+				t.Errorf("parseTargetVersion(%q) = %q, want %q", tc.in, got, tc.want)
+			}
+		})
+	}
+}
+
+// TestVerifyPostUpgradeVersion_Match_NoError pins the silent-pass
+// contract: running version's contract matches the target's contract
+// → no output, no error. This is the post-upgrade equivalent of
+// 'apply -dry-run says 0/0/0 unchanged'.
+func TestVerifyPostUpgradeVersion_Match_NoError(t *testing.T) {
+	t.Parallel()
+
+	buf := &bytes.Buffer{}
+	err := verifyPostUpgradeVersion(
+		context.Background(),
+		stubReader("v1.13.0", true),
+		"ghcr.io/siderolabs/installer:v1.13.0",
+		buf,
+	)
+	if err != nil {
+		t.Errorf("matching versions should not error, got %v", err)
+	}
+
+	if buf.Len() != 0 {
+		t.Errorf("matching versions should be silent, got %q", buf.String())
+	}
+}
+
+// TestVerifyPostUpgradeVersion_MinorMismatch_Blocks pins the
+// silent-rollback detection: running v1.12 + target v1.13 means
+// Talos auto-rolled back. The gate must surface a blocker with a
+// hint pointing at the cross-vendor / missing-extension footgun.
+func TestVerifyPostUpgradeVersion_MinorMismatch_Blocks(t *testing.T) {
+	t.Parallel()
+
+	buf := &bytes.Buffer{}
+	err := verifyPostUpgradeVersion(
+		context.Background(),
+		stubReader("v1.12.6", true),
+		"ghcr.io/siderolabs/installer:v1.13.0",
+		buf,
+	)
+	if err == nil {
+		t.Fatal("version mismatch must block, got nil error")
+	}
+
+	msg := err.Error() + " " + buf.String()
+	if !strings.Contains(msg, "v1.12.6") {
+		t.Errorf("error should cite running version, got %q", msg)
+	}
+
+	if !strings.Contains(msg, "v1.13") {
+		t.Errorf("error should cite target version, got %q", msg)
+	}
+}
+
+// TestVerifyPostUpgradeVersion_PatchVersion_Match pins the point-
+// release case: running v1.12.6 + target v1.12.7 are the same
+// minor contract (both 1.12). Phase 2C considers this a success —
+// point releases don't trip the rollback detection.
+func TestVerifyPostUpgradeVersion_PatchVersion_Match(t *testing.T) {
+	t.Parallel()
+
+	err := verifyPostUpgradeVersion(
+		context.Background(),
+		stubReader("v1.12.6", true),
+		"ghcr.io/cozystack/cozystack/talos:v1.12.7",
+		&bytes.Buffer{},
+	)
+	if err != nil {
+		t.Errorf("point release (same minor contract) should pass, got %v", err)
+	}
+}
+
+// TestVerifyPostUpgradeVersion_UnparseableTag_Skip pins the
+// best-effort contract: if the target image's tag is missing,
+// digest-pinned, or unparseable, the gate cannot verify — it
+// should pass silently rather than block a legitimate upgrade.
+func TestVerifyPostUpgradeVersion_UnparseableTag_Skip(t *testing.T) {
+	t.Parallel()
+
+	tests := []string{
+		"",
+		"no-tag-image",
+		"ghcr.io/foo/bar@sha256:abc/def",
+		"ghcr.io/foo/bar@sha256:abc123def456",
+		"ghcr.io/foo/bar:not-a-version-string",
+	}
+
+	for _, image := range tests {
+		t.Run(image, func(t *testing.T) {
+			t.Parallel()
+
+			err := verifyPostUpgradeVersion(
+				context.Background(),
+				stubReader("v1.12.6", true),
+				image,
+				&bytes.Buffer{},
+			)
+			if err != nil {
+				t.Errorf("unparseable image %q should pass silently, got %v", image, err)
+			}
+		})
+	}
+}
+
+// TestVerifyPostUpgradeVersion_ReaderConnectionRefused_NotSilent is
+// the structural guard for the three-valued versionReader contract.
+// A post-upgrade reader returning ("", false, err) — the shape a real
+// silent rollback or hung boot produces (connection refused / context
+// deadline exceeded from COSI) — must surface as a hint-bearing
+// blocker citing both hypotheses (rollback OR slow boot), NOT as a
+// silent warning. The original two-valued reader signature collapsed
+// every read failure to ok=false and silent-passed it, leaving the
+// silent-rollback detection with a documented blind spot. The new
+// three-valued signature lets the gate distinguish "by design
+// unreachable" (ok=false err=nil → silent pass, see
+// ReaderFails_SoftWarning_NoBlock) from "real read failure"
+// (ok=false err!=nil → blocker, this test).
+func TestVerifyPostUpgradeVersion_ReaderConnectionRefused_NotSilent(t *testing.T) {
+	t.Parallel()
+
+	buf := &bytes.Buffer{}
+	err := verifyPostUpgradeVersion(
+		context.Background(),
+		stubReaderErr(errors.New("connection refused")),
+		"ghcr.io/siderolabs/installer:v1.13.0",
+		buf,
+	)
+	if err == nil {
+		t.Fatal("connection-refused read must surface as a blocker — it IS the rollback signal")
+	}
+
+	msg := err.Error()
+	if !strings.Contains(msg, "connection refused") {
+		t.Errorf("blocker should wrap the underlying read err so the operator sees the cause, got %q", msg)
+	}
+
+	if !strings.Contains(msg, "v1.13.0") {
+		t.Errorf("blocker should cite the target version the upgrade was supposed to reach, got %q", msg)
+	}
+
+	hint := errors.GetAllHints(err)
+	if len(hint) == 0 {
+		t.Errorf("blocker must carry the two-hypothesis hint (rollback OR slow boot), got no hints on err=%v", err)
+	}
+}
+
+// TestVerifyPostUpgradeVersion_ReaderFails_SoftWarning_NoBlock is
+// the pin that backs the test-plan's Phase 2C "Reader failure" row:
+// when the post-upgrade reader returns ok=false, the gate emits a
+// soft warning AND returns nil — never a hint-bearing blocker.
+// Direct guard against the test-plan claim drifting back to "wrap
+// the underlying err": the versionReader signature has no err to
+// wrap, and the gate is informational-by-design for reader-side
+// transients. Companion to TestVerifyPostUpgradeVersion_ReaderFails_BestEffort
+// (which asserts the warning text) — this one asserts the
+// contractual shape (no error, no WithHint chain).
+func TestVerifyPostUpgradeVersion_ReaderFails_SoftWarning_NoBlock(t *testing.T) {
+	t.Parallel()
+
+	buf := &bytes.Buffer{}
+	err := verifyPostUpgradeVersion(
+		context.Background(),
+		stubReader("", false),
+		"ghcr.io/siderolabs/installer:v1.13.0",
+		buf,
+	)
+	if err != nil {
+		t.Fatalf("reader failure must NOT block: got err=%v", err)
+	}
+
+	// Soft warning visible to the operator.
+	if !strings.Contains(buf.String(), "warning") {
+		t.Errorf("soft warning expected in output, got %q", buf.String())
+	}
+
+	// Direct guard against the test-plan claim drifting back to
+	// "Hint-bearing blocker with wrapped underlying err". If a future
+	// refactor flips this branch to a blocker, both this test and
+	// TestVerifyPostUpgradeVersion_ReaderFails_BestEffort must be
+	// updated in lockstep with the doc — and the versionReader
+	// signature must grow an error return for "wrapped err" to even
+	// be meaningful.
+}
+
+// TestVerifyPostUpgradeVersion_ReaderFails_BestEffort pins the
+// reader-failure path. Phase 2C is informational at its core — if
+// the node is unreachable post-upgrade, the gate prints a warning
+// to w and returns nil. The blocking class is reserved for actual
+// detected mismatches; a transient unreachable node should not
+// block the rest of the apply pipeline.
+func TestVerifyPostUpgradeVersion_ReaderFails_BestEffort(t *testing.T) {
+	t.Parallel()
+
+	buf := &bytes.Buffer{}
+	err := verifyPostUpgradeVersion(
+		context.Background(),
+		stubReader("", false),
+		"ghcr.io/siderolabs/installer:v1.13.0",
+		buf,
+	)
+	if err != nil {
+		t.Errorf("reader failure should pass best-effort, got %v", err)
+	}
+
+	if !strings.Contains(buf.String(), "could not read") {
+		t.Errorf("expected explanatory line about read failure, got %q", buf.String())
+	}
+}
diff --git a/pkg/commands/upgrade_handler.go b/pkg/commands/upgrade_handler.go
index 9c5a4366..f79771f6 100644
--- a/pkg/commands/upgrade_handler.go
+++ b/pkg/commands/upgrade_handler.go
@@ -15,19 +15,41 @@
 package commands
 
 import (
+	"context"
 	"fmt"
+	"io"
 	"os"
+	"time"
 
 	"github.com/cockroachdb/errors"
 	"github.com/cozystack/talm/pkg/engine"
+	"github.com/siderolabs/talos/pkg/machinery/client"
 	"github.com/siderolabs/talos/pkg/machinery/config/configloader"
 	"github.com/spf13/cobra"
 )
 
+// postUpgradeReconcileWindow is how long we wait after talosctl
+// upgrade returns before re-reading the running version. Talos
+// reboots and reaches "running" stage in well under a minute on
+// healthy hardware; auto-rollback adds ~30s on top of that. 90s
+// covers both paths with margin.
+const postUpgradeReconcileWindow = 90 * time.Second
+
+// upgradeCmdFlags carries the talm-side flags layered on top of the
+// talosctl-derived upgrade command (set up in wrapUpgradeCommand).
+//
+//nolint:gochecknoglobals // command-scoped flag struct, mirrors applyCmdFlags pattern.
+var upgradeCmdFlags struct {
+	skipPostUpgradeVerify bool
+}
+
 // wrapUpgradeCommand adds special handling for upgrade command: extract image from config and set --image flag
 //
 //nolint:gocognit,gocyclo,cyclop,funlen,nestif // cobra wrapper branching over (image extraction, file paths, modeline) for the upgrade flow; each branch is short.
 func wrapUpgradeCommand(wrappedCmd *cobra.Command, originalRunE func(*cobra.Command, []string) error) {
+	wrappedCmd.Flags().BoolVar(&upgradeCmdFlags.skipPostUpgradeVerify, "skip-post-upgrade-verify", false,
+		"skip the post-upgrade check that compares running Talos version against the target image's tag (Phase 2C; detects silent A/B rollback per #175)")
+
 	wrappedCmd.RunE = func(cmd *cobra.Command, args []string) error {
 		// Get config files from --file flag
 		var filesToProcess []string
@@ -130,15 +152,175 @@ func wrapUpgradeCommand(wrappedCmd *cobra.Command, originalRunE func(*cobra.Comm
 			}
 		}
 
+		// Capture the upgrade target image + path-shaping flags
+		// BEFORE original RunE runs. talosctl's own upgrade handler
+		// can overwrite the --image flag with the node's
+		// currently-running install.image (the no-op-upgrade path),
+		// which would mask the version mismatch Phase 2C exists to
+		// catch. --insecure and --stage are captured here too so
+		// the post-upgrade gate's mode predicate sees what the
+		// operator actually asked for, not whatever state talosctl
+		// left in the flags afterwards.
+		targetImage, _ := cmd.Flags().GetString("image")
+		insecure, _ := cmd.Flags().GetBool("insecure")
+		staged, _ := cmd.Flags().GetBool("stage")
+
 		// Execute original command
-		if originalRunE != nil {
-			return originalRunE(cmd, args)
-		} else if wrappedCmd.Run != nil {
+		var execErr error
+
+		switch {
+		case originalRunE != nil:
+			execErr = originalRunE(cmd, args)
+		case wrappedCmd.Run != nil:
 			wrappedCmd.Run(cmd, args)
+		}
+
+		if execErr != nil {
+			return execErr
+		}
 
+		// Phase 2C: post-upgrade version verify. Detects the silent
+		// auto-rollback case (#175): talosctl upgrade acks the RPC,
+		// Talos pulls + writes the new install, A/B boot fails its
+		// readiness check, Talos rolls back to the prior partition,
+		// and the operator's "successful" upgrade silently no-ops.
+		// Skip predicate documents the cases where this gate cannot
+		// produce a meaningful result.
+		if !shouldRunPostUpgradeVerify(insecure, staged, upgradeCmdFlags.skipPostUpgradeVerify) {
 			return nil
 		}
 
+		if targetImage == "" {
+			fmt.Fprintln(os.Stderr, "post-upgrade verify: skipped, no target image to compare against")
+
+			return nil
+		}
+
+		return runPostUpgradeVersionVerify(cmd.Context(), targetImage)
+	}
+}
+
+// shouldRunPostUpgradeVerify is the pure predicate for Phase 2C
+// scheduling. The gate cannot produce a meaningful result when:
+//
+//   - --skip-post-upgrade-verify is set (operator opt-out).
+//   - --insecure was passed to upgrade: the maintenance / pre-auth
+//     connection cannot reach the auth-only COSI ctx WithClient
+//     builds. Pre-fix, the gate fell through to WithClient and
+//     either silently surrendered on "version unreadable" or
+//     connected to an unrelated node from talosconfig context.
+//     Mirrors cosiMachineConfigReader's insecure-path branch in
+//     pkg/commands/preflight_apply_safety.go.
+//   - --stage was passed to upgrade: talosctl --stage writes the
+//     new image to the inactive partition without activating it;
+//     activation happens on the next reboot. runtime.Version still
+//     reports the OLD version because the new partition isn't
+//     booted — a guaranteed false-positive blocker without this
+//     skip. Mirrors shouldRunPostApplyVerify's STAGED case in
+//     pkg/commands/apply.go.
+func shouldRunPostUpgradeVerify(insecure, staged, skip bool) bool {
+	if skip {
+		return false
+	}
+
+	if insecure {
+		return false
+	}
+
+	if staged {
+		return false
+	}
+
+	return true
+}
+
+// runPostUpgradeVersionVerify waits a reconcile window, then for
+// each --nodes target reads the running Talos version and compares
+// against the version parsed from the target image tag. Surfaces
+// the first divergence as a blocker.
+//
+// parentCtx is used only for the reconcile-window wait; the actual
+// COSI reads run under the ctx WithClient constructs from
+// talosconfig, which is the right scope for per-node addressing.
+//
+//nolint:contextcheck // intentional ctx boundary at WithClient.
+func runPostUpgradeVersionVerify(parentCtx context.Context, image string) error {
+	if parentCtx == nil {
+		parentCtx = context.Background()
+	}
+
+	return WithClient(func(ctx context.Context, c *client.Client) error {
+		ctxNodes := []string(nil)
+		if cfg := c.GetConfigContext(); cfg != nil {
+			ctxNodes = cfg.Nodes
+		}
+
+		nodes := resolveUpgradeTargetNodes(GlobalArgs.Nodes, ctxNodes)
+
+		return runPostUpgradeVersionVerifyInner(parentCtx, ctx, nodes, image, cosiVersionReader(c), postUpgradeReconcileWindow, os.Stderr)
+	})
+}
+
+// resolveUpgradeTargetNodes picks the per-node target list for the
+// post-upgrade verify. CLI `--nodes` wins outright when non-empty;
+// otherwise the talosconfig context's pre-configured node list is
+// used. Returns a freshly-allocated slice so callers can append
+// without mutating either source.
+func resolveUpgradeTargetNodes(cliNodes, ctxNodes []string) []string {
+	if len(cliNodes) > 0 {
+		return append([]string(nil), cliNodes...)
+	}
+
+	return append([]string(nil), ctxNodes...)
+}
+
+// runPostUpgradeVersionVerifyInner is the testable body of Phase 2C.
+// It resolves the "no work to do" case BEFORE the reconcile-window
+// wait so an operator with empty `--nodes` and no talosconfig nodes
+// doesn't sit through 90s of a "waiting for the node to finish
+// booting..." line just to be told there was no target node.
+//
+// reconcileWindow and stderr are dependency-injected so the test can
+// run with a sub-millisecond window and capture output without
+// touching the package globals.
+func runPostUpgradeVersionVerifyInner(
+	parentCtx, clientCtx context.Context,
+	nodes []string,
+	image string,
+	read versionReader,
+	reconcileWindow time.Duration,
+	stderr io.Writer,
+) error {
+	if len(nodes) == 0 {
+		_, _ = fmt.Fprintln(stderr, "post-upgrade verify: skipped, no target nodes resolved from --nodes or talosconfig context")
+
 		return nil
 	}
+
+	_, _ = fmt.Fprintf(stderr, "post-upgrade verify: waiting %s for the node to finish booting...\n", reconcileWindow)
+
+	select {
+	case <-time.After(reconcileWindow):
+	case <-parentCtx.Done():
+		return errors.Wrap(parentCtx.Err(), "post-upgrade verify: context cancelled while waiting for reconcile window")
+	}
+
+	// Collect per-node verify errors and join at the end rather than
+	// short-circuiting on the first failure. Mirrors runPostApplyGates
+	// (apply.go) — talosctl upgrade has already executed on every
+	// node before this loop starts, so stopping at the first failure
+	// hides partial rollbacks on the remaining nodes. The operator
+	// sees one blocker now, fixes it, re-runs, and discovers the
+	// second blocker — wasting an upgrade cycle. errors.Join keeps
+	// every node's verdict in the final error.
+	var perNodeErrs []error
+
+	for _, node := range nodes {
+		nodeCtx := client.WithNode(clientCtx, node)
+		if err := verifyPostUpgradeVersion(nodeCtx, read, image, stderr); err != nil {
+			perNodeErrs = append(perNodeErrs, errors.Wrapf(err, "node %s", node))
+		}
+	}
+
+	return errors.Join(perNodeErrs...)
 }
diff --git a/pkg/commands/upgrade_handler_test.go b/pkg/commands/upgrade_handler_test.go
new file mode 100644
index 00000000..4cd11017
--- /dev/null
+++ b/pkg/commands/upgrade_handler_test.go
@@ -0,0 +1,262 @@
+// Copyright Cozystack Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package commands
+
+import (
+	"bytes"
+	"context"
+	"reflect"
+	"strings"
+	"testing"
+	"time"
+)
+
+// TestResolveUpgradeTargetNodes_CLINodesWin pins the resolution
+// contract: --nodes overrides the talosconfig context's pre-
+// configured node list when non-empty. A non-empty CLI list MUST
+// shadow the context entirely (not merge), so an operator can
+// scope a one-off upgrade without editing talosconfig.
+func TestResolveUpgradeTargetNodes_CLINodesWin(t *testing.T) {
+	t.Parallel()
+
+	tests := []struct {
+		name string
+		cli  []string
+		ctx  []string
+		want []string
+	}{
+		{"both empty", nil, nil, nil},
+		{"cli wins outright", []string{"192.0.2.10"}, []string{"192.0.2.20"}, []string{"192.0.2.10"}},
+		{"only cli set", []string{"192.0.2.10"}, nil, []string{"192.0.2.10"}},
+		{"only ctx set falls through", nil, []string{"192.0.2.20"}, []string{"192.0.2.20"}},
+		{"both empty slices (distinct from nil)", []string{}, []string{}, []string{}},
+	}
+
+	for _, tc := range tests {
+		t.Run(tc.name, func(t *testing.T) {
+			t.Parallel()
+
+			got := resolveUpgradeTargetNodes(tc.cli, tc.ctx)
+
+			// Empty-vs-nil semantics: callers iterate via range,
+			// which handles both identically. The test treats
+			// (len == 0 && want len == 0) as a match regardless
+			// of nil-ness — the contract is "no nodes", not "nil".
+			if len(got) == 0 && len(tc.want) == 0 {
+				return
+			}
+
+			if !reflect.DeepEqual(got, tc.want) {
+				t.Errorf("resolveUpgradeTargetNodes(%v, %v) = %v, want %v", tc.cli, tc.ctx, got, tc.want)
+			}
+		})
+	}
+}
+
+// TestResolveUpgradeTargetNodes_DoesNotMutateInputs pins the
+// no-aliasing contract: the resolver returns a freshly-allocated
+// slice; callers can append to the result without poisoning
+// GlobalArgs.Nodes or the talosconfig context's Nodes slice.
+func TestResolveUpgradeTargetNodes_DoesNotMutateInputs(t *testing.T) {
+	t.Parallel()
+
+	cli := []string{"192.0.2.10"}
+	ctx := []string{"192.0.2.20"}
+
+	got := resolveUpgradeTargetNodes(cli, ctx)
+	if len(got) > 0 {
+		// Mutating the clone must NOT leak into either input slice.
+		got[0] = "mutated-by-test"
+	}
+
+	if cli[0] != "192.0.2.10" || len(cli) != 1 {
+		t.Errorf("cli mutated by resolver: %v", cli)
+	}
+
+	if ctx[0] != "192.0.2.20" || len(ctx) != 1 {
+		t.Errorf("ctx mutated by resolver: %v", ctx)
+	}
+}
+
+// TestRunPostUpgradeVersionVerifyInner_EmptyNodes_SkipsImmediately
+// is the wall-clock regression pin: when there are no nodes to
+// verify against, the early-return must fire BEFORE the reconcile-
+// window wait. Previously the function logged "waiting 90s..." and
+// only THEN discovered the empty list, wasting 90s of the operator's
+// life. The assertion uses a 1-second budget against a 5-minute
+// reconcile window — any sane implementation that places the wait
+// after the empty-check sails through this; one that places it
+// before fails by 4+ orders of magnitude.
+func TestRunPostUpgradeVersionVerifyInner_EmptyNodes_SkipsImmediately(t *testing.T) {
+	t.Parallel()
+
+	buf := &bytes.Buffer{}
+
+	start := time.Now()
+	err := runPostUpgradeVersionVerifyInner(
+		context.Background(),
+		context.Background(),
+		nil,
+		"ghcr.io/siderolabs/installer:v1.13.0",
+		stubReader("v1.13.0", true),
+		// Deliberately huge — if the implementation drifts back
+		// to waiting before resolving nodes, this test stalls for
+		// 5 minutes and the failure message names the regression.
+		5*time.Minute,
+		buf,
+	)
+	elapsed := time.Since(start)
+
+	if err != nil {
+		t.Fatalf("empty-nodes path should not error: got %v", err)
+	}
+
+	if elapsed > time.Second {
+		t.Errorf("empty-nodes path stalled for %v — the reconcile-window wait must come AFTER the node-resolution check; if you see this, the wait was placed before the empty-check again", elapsed)
+	}
+
+	if !strings.Contains(buf.String(), "skipped, no target nodes resolved") {
+		t.Errorf("empty-nodes path should emit the 'skipped, no target nodes' line, got %q", buf.String())
+	}
+
+	if strings.Contains(buf.String(), "waiting") {
+		t.Errorf("empty-nodes path must NOT print the 'waiting ... for the node to finish booting' line, got %q", buf.String())
+	}
+}
+
+// TestRunPostUpgradeVersionVerifyInner_MultiNode_AllErrorsReported
+// pins the collect-then-block semantics for multi-node upgrades.
+// talosctl upgrade has already executed on every node by the time
+// this loop runs, so short-circuiting on the first failing node
+// would hide partial rollbacks on the rest — operator fixes node 1,
+// re-runs, discovers node 3 separately, wasting an upgrade cycle.
+// The error must name every failing node so a single re-run cycle
+// surfaces every problem.
+func TestRunPostUpgradeVersionVerifyInner_MultiNode_AllErrorsReported(t *testing.T) {
+	t.Parallel()
+
+	buf := &bytes.Buffer{}
+
+	// Reader signals a rollback on every node — running version is
+	// pinned at v1.12.6 while the upgrade asked for v1.13.0.
+	read := func(_ context.Context) (string, bool, error) {
+		return "v1.12.6", true, nil
+	}
+
+	err := runPostUpgradeVersionVerifyInner(
+		context.Background(),
+		context.Background(),
+		[]string{"192.0.2.10", "192.0.2.11", "192.0.2.12"},
+		"ghcr.io/siderolabs/installer:v1.13.0",
+		read,
+		time.Millisecond,
+		buf,
+	)
+	if err == nil {
+		t.Fatal("three rollbacked nodes must surface as an error, got nil")
+	}
+
+	msg := err.Error()
+	for _, node := range []string{"192.0.2.10", "192.0.2.11", "192.0.2.12"} {
+		if !strings.Contains(msg, node) {
+			t.Errorf("joined error must cite node %s, got:\n%s", node, msg)
+		}
+	}
+}
+
+// TestRunPostUpgradeVersionVerifyInner_MultiNode_FirstFailureDoesNotBlockRest
+// pins that one node's failure doesn't short-circuit the loop. The
+// stub here returns alternating verdicts via a call counter — call
+// 1 (node 192.0.2.10) rolls back, call 2 (node 192.0.2.11) matches.
+// Without collect-then-block the loop would exit after node 1 and
+// node 2's call wouldn't happen; with collect-then-block both are
+// invoked and the joined error names the failing one.
+func TestRunPostUpgradeVersionVerifyInner_MultiNode_FirstFailureDoesNotBlockRest(t *testing.T) {
+	t.Parallel()
+
+	buf := &bytes.Buffer{}
+
+	// Alternate per call: first call → mismatch, second call → match.
+	calls := 0
+	read := func(_ context.Context) (string, bool, error) {
+		calls++
+
+		if calls == 1 {
+			return "v1.12.6", true, nil // version mismatch
+		}
+
+		return "v1.13.0", true, nil // matches target
+	}
+
+	err := runPostUpgradeVersionVerifyInner(
+		context.Background(),
+		context.Background(),
+		[]string{"192.0.2.10", "192.0.2.11"},
+		"ghcr.io/siderolabs/installer:v1.13.0",
+		read,
+		time.Millisecond,
+		buf,
+	)
+	if err == nil {
+		t.Fatal("first-node mismatch must surface as an error, got nil")
+	}
+
+	if calls != 2 {
+		t.Errorf("expected both nodes' readers to be invoked (collect-then-block); got %d call(s)", calls)
+	}
+
+	if !strings.Contains(err.Error(), "192.0.2.10") {
+		t.Errorf("error must cite the failing node 192.0.2.10, got: %v", err)
+	}
+}
+
+// TestRunPostUpgradeVersionVerifyInner_NonEmptyNodes_WaitsAndVerifies
+// is the companion check: with a non-empty node list the function
+// DOES print the wait line and DOES invoke the reader. Uses a 1ms
+// reconcile window so the test runs quickly while still going
+// through the time.After branch.
+func TestRunPostUpgradeVersionVerifyInner_NonEmptyNodes_WaitsAndVerifies(t *testing.T) {
+	t.Parallel()
+
+	buf := &bytes.Buffer{}
+
+	called := 0
+	read := func(_ context.Context) (string, bool, error) {
+		called++
+
+		return "v1.13.0", true, nil
+	}
+
+	err := runPostUpgradeVersionVerifyInner(
+		context.Background(),
+		context.Background(),
+		[]string{"192.0.2.10"},
+		"ghcr.io/siderolabs/installer:v1.13.0",
+		read,
+		time.Millisecond,
+		buf,
+	)
+	if err != nil {
+		t.Fatalf("matching version path should not error: got %v", err)
+	}
+
+	if called != 1 {
+		t.Errorf("expected reader invoked once per node, got %d", called)
+	}
+
+	if !strings.Contains(buf.String(), "waiting") {
+		t.Errorf("non-empty path should print the 'waiting' line, got %q", buf.String())
+	}
+}
diff --git a/pkg/engine/engine_test.go b/pkg/engine/engine_test.go
index 9f46d930..d8cee0b6 100644
--- a/pkg/engine/engine_test.go
+++ b/pkg/engine/engine_test.go
@@ -743,6 +743,13 @@ func TestNoWorkflowLeakageInRepoSource(t *testing.T) {
 		"address review",
 		"in-flight rebase",
 		"rebase notes",
+		// Private operator identifiers — test stand hostnames,
+		// home-directory paths, internal cluster names. These are
+		// reproducibility context that belongs in chat / memory,
+		// not in published artefacts. Add new entries when a new
+		// stand name or username surfaces.
+		"dev17",
+		"lexfrei",
 	}
 	scanExt := map[string]bool{
 		".go":   true,