Skip to content

Launchers - Kubernetes - Jobs - Sometimes, logs cannot be retrieved when a job has been suspended (Kueue) and then resumed #197

@Ark-kun

Description

@Ark-kun
% kubectl --kubeconfig nebius.kubeconfig.yaml --namespace oasis-jobs logs tangle-ce-019d4eb66c44f3abd9b7-1-wrlwf   Defaulted container "main" out of: main, gcsfuse-init (init)
unable to retrieve container logs for containerd://6a263159bca0bd9287a2dccec50230532ec0dd1a4b7cf9688d77bb17d18c10ea

We see that many pods are stuck in the Terminating phase.

% kubectl --kubeconfig nebius.kubeconfig.yaml --namespace oasis-jobs get pods --selector job-name=tangle-ce-019d4eb66c44f3abd9b7
NAME                                     READY   STATUS        RESTARTS   AGE
tangle-ce-019d4eb66c44f3abd9b7-0-5ljwq   2/2     Running       0          8h
tangle-ce-019d4eb66c44f3abd9b7-0-69vlf   0/2     Terminating   0          9h
tangle-ce-019d4eb66c44f3abd9b7-0-c6j4t   0/2     Terminating   0          12h
tangle-ce-019d4eb66c44f3abd9b7-0-dk49w   0/2     Terminating   0          14h
tangle-ce-019d4eb66c44f3abd9b7-1-7jxpf   0/2     Terminating   0          12h
tangle-ce-019d4eb66c44f3abd9b7-1-hz4lm   0/2     Terminating   0          14h
tangle-ce-019d4eb66c44f3abd9b7-1-s5rsm   0/2     Terminating   0          12h
tangle-ce-019d4eb66c44f3abd9b7-1-t2g7h   0/2     Terminating   0          12h
tangle-ce-019d4eb66c44f3abd9b7-1-wc2f6   2/2     Running       0          8h
tangle-ce-019d4eb66c44f3abd9b7-1-wrlwf   0/2     Terminating   0          9h
% kubectl --kubeconfig nebius.kubeconfig.yaml --namespace oasis-jobs get pod tangle-ce-019d4eb66c44f3abd9b7-1-wrlwf -o yaml

apiVersion: v1
kind: Pod
metadata:
  annotations:
    batch.kubernetes.io/job-completion-index: "1"
    batch.kubernetes.io/job-index-failure-count: "0"
    kueue.x-k8s.io/workload: job-tangle-ce-019d4eb66c44f3abd9b7-1a728
  creationTimestamp: "2026-04-02T20:21:02Z"
  deletionGracePeriodSeconds: 30
  deletionTimestamp: "2026-04-02T21:01:12Z"
  generateName: tangle-ce-019d4eb66c44f3abd9b7-1-
  labels:
    batch.kubernetes.io/controller-uid: db09a903-e264-4891-8d3a-d4f0682a8a8f
    batch.kubernetes.io/job-completion-index: "1"
    batch.kubernetes.io/job-name: tangle-ce-019d4eb66c44f3abd9b7
    controller-uid: db09a903-e264-4891-8d3a-d4f0682a8a8f
    job-name: tangle-ce-019d4eb66c44f3abd9b7
    kueue.x-k8s.io/podset: main
    kueue.x-k8s.io/priority-class: default
    kueue.x-k8s.io/queue-name: sidekick-data
    kueue.x-k8s.io/tas: "true"
  name: tangle-ce-019d4eb66c44f3abd9b7-1-wrlwf
  namespace: oasis-jobs
  ownerReferences:
  - apiVersion: batch/v1
    blockOwnerDeletion: true
    controller: true
    kind: Job
    name: tangle-ce-019d4eb66c44f3abd9b7
    uid: db09a903-e264-4891-8d3a-d4f0682a8a8f
  resourceVersion: "570693812"
  uid: caa6d2a3-293c-4e31-a1b7-b8c402628d8b
spec:
  containers:
  - command: xxx
    env:
    - name: NVIDIA_VISIBLE_DEVICES
      value: none
    - name: _TANGLE_MULTI_NODE_NODE_INDEX
      valueFrom:
        fieldRef:
          apiVersion: v1
          fieldPath: metadata.annotations['batch.kubernetes.io/job-completion-index']
    - name: JOB_COMPLETION_INDEX
      valueFrom:
        fieldRef:
          apiVersion: v1
          fieldPath: metadata.labels['batch.kubernetes.io/job-completion-index']
    image: xxx
    imagePullPolicy: IfNotPresent
    name: main
    resources:
      limits:
        memory: 1200Gi
        nvidia.com/gpu: "8"
      requests:
        cpu: "16"
        memory: 1200Gi
        nvidia.com/gpu: "8"
    securityContext:
      capabilities:
        add:
        - IPC_LOCK
    terminationMessagePath: /dev/termination-log
    terminationMessagePolicy: File
    volumeMounts:
    - mountPath: /tmp/inputs/model_config_data
      name: gcsfuse-xxx
      readOnly: true
      subPath: artifacts/by_execution/019d4eb66c44f3abd9b7/inputs/model_config_data
    - mountPath: /tmp/outputs/model_output_dir
      name: gcsfuse-xxx
      subPath: artifacts/by_execution/019d4eb66c44f3abd9b7/outputs/model_output_dir
    - mountPath: /dev/shm
      name: shared-memory
    - mountPath: /var/run/service-account
      name: token
      readOnly: true
    - mountPath: /etc/workload-identity
      name: credential-configuration
      readOnly: true
    - mountPath: /var/run/secrets/kubernetes.io/serviceaccount
      name: kube-api-access-mlsct
      readOnly: true
  dnsPolicy: ClusterFirst
  enableServiceLinks: true
  hostname: tangle-ce-019d4eb66c44f3abd9b7-1
  imagePullSecrets:
  - name: gcp-gar-secret
  initContainers:
  - command:
    - bash
    - -c
    - xxx; sleep infinity
    env:
    - name: GOOGLE_APPLICATION_CREDENTIALS
      value: /etc/workload-identity/credential-configuration.json
    - name: JOB_COMPLETION_INDEX
      valueFrom:
        fieldRef:
          apiVersion: v1
          fieldPath: metadata.labels['batch.kubernetes.io/job-completion-index']
    image: gcsfuse
    imagePullPolicy: IfNotPresent
    lifecycle:
      preStop:
        exec:
          command:
          - bash
          - -c
          - echo '[gcsfuse-prestop] unmounting FUSE mounts...' > /proc/1/fd/1 2>/dev/null
            || true; for mp in /gcs_buckets/*/; do if [ -d "$mp" ]; then echo "[gcsfuse-prestop]
            unmounting $mp" > /proc/1/fd/1 2>/dev/null || true; fusermount -u "$mp"
            2>/dev/null || true; fi; done; echo '[gcsfuse-prestop] done' > /proc/1/fd/1
            2>/dev/null || true
    name: gcsfuse-init
    resources: {}
    restartPolicy: Always
    startupProbe:  # NOTE(review): probe key was lost when pasting — the four fields below were orphaned under restartPolicy (invalid YAML); reconstructed as startupProbe, confirm against the live pod spec
      failureThreshold: 3
      periodSeconds: 10
      successThreshold: 1
      timeoutSeconds: 1
    terminationMessagePath: /dev/termination-log
    terminationMessagePolicy: File
    volumeMounts:
    - mountPath: /var/run/service-account
      name: token
      readOnly: true
    - mountPath: /etc/workload-identity
      name: credential-configuration
      readOnly: true
    - mountPath: /gcs_buckets/prd-oasis-tmp
      mountPropagation: Bidirectional
      name: gcsfuse-prd-oasis-tmp
    - mountPath: /var/run/secrets/kubernetes.io/serviceaccount
      name: kube-api-access-mlsct
      readOnly: true
  nodeName: xxx
  nodeSelector:
    kubernetes.io/hostname: xxx
    nvidia.com/gpu.product: NVIDIA-H200
  preemptionPolicy: PreemptLowerPriority
  priority: 300
  priorityClassName: default
  restartPolicy: Never
  schedulerName: default-scheduler
  securityContext: {}
  serviceAccount: xxx
  serviceAccountName: xxxx
  subdomain: tangle-ce-019d4eb66c44f3abd9b7
  terminationGracePeriodSeconds: 30
  tolerations:
  - effect: NoSchedule
    key: nvidia.com/gpu
    operator: Exists
  - effect: NoExecute
    key: node.kubernetes.io/not-ready
    operator: Exists
    tolerationSeconds: 300
  - effect: NoExecute
    key: node.kubernetes.io/unreachable
    operator: Exists
    tolerationSeconds: 300
  volumes:
  - emptyDir: {}
    name: gcsfuse-prd-oasis-tmp
  - emptyDir:
      medium: Memory
      sizeLimit: 40Gi
    name: shared-memory
  - configMap:
      defaultMode: 420
      name: workload-service-account-credential-configuration
    name: credential-configuration
  - name: token
    projected:
      defaultMode: 420
      sources:
      - serviceAccountToken:
          audience: xxx
          expirationSeconds: 3600
          path: token
  - name: kube-api-access-mlsct
    projected:
      defaultMode: 420
      sources:
      - serviceAccountToken:
          expirationSeconds: 3607
          path: token
      - configMap:
          items:
          - key: ca.crt
            path: ca.crt
          name: kube-root-ca.crt
      - downwardAPI:
          items:
          - fieldRef:
              apiVersion: v1
              fieldPath: metadata.namespace
            path: namespace
status:
  conditions:
  - lastProbeTime: null
    lastTransitionTime: "2026-04-02T21:01:31Z"
    status: "False"
    type: PodReadyToStartContainers
  - lastProbeTime: null
    lastTransitionTime: "2026-04-02T20:21:17Z"
    status: "True"
    type: Initialized
  - lastProbeTime: null
    lastTransitionTime: "2026-04-02T21:01:31Z"
    reason: PodFailed
    status: "False"
    type: Ready
  - lastProbeTime: null
    lastTransitionTime: "2026-04-02T21:01:31Z"
    reason: PodFailed
    status: "False"
    type: ContainersReady
  - lastProbeTime: null
    lastTransitionTime: "2026-04-02T20:21:03Z"
    status: "True"
    type: PodScheduled
  containerStatuses:
  - containerID: containerd://6a263159bca0bd9287a2dccec50230532ec0dd1a4b7cf9688d77bb17d18c10ea
    image: sha256:b122b1a561464c6770cb583f7b6e39536e7583d8060997478f27e611eb0c2806
    imageID: xxx
    lastState: {}
    name: main
    ready: false
    restartCount: 0
    started: false
    state:
      terminated:
        containerID: containerd://6a263159bca0bd9287a2dccec50230532ec0dd1a4b7cf9688d77bb17d18c10ea
        exitCode: 137
        finishedAt: "2026-04-02T21:01:29Z"
        reason: Error
        startedAt: "2026-04-02T20:21:21Z"
  hostIP: 10.97.188.45
  hostIPs:
  - ip: 10.97.188.45
  initContainerStatuses:
  - containerID: containerd://a3db01d0ee933dd830e1b36964c2e2af1be27e953f84f9954c64dfcfa8b7ca2e
    image: gcsfuse
    imageID: gcsfuse
    lastState: {}
    name: gcsfuse-init
    ready: false
    restartCount: 0
    started: false
    state:
      terminated:
        containerID: containerd://a3db01d0ee933dd830e1b36964c2e2af1be27e953f84f9954c64dfcfa8b7ca2e
        exitCode: 137
        finishedAt: "2026-04-02T21:01:14Z"
        reason: Error
        startedAt: "2026-04-02T20:21:08Z"
  phase: Failed
  podIP: 10.100.184.233
  podIPs:
  - ip: 10.100.184.233
  qosClass: Burstable
  startTime: "2026-04-02T20:21:06Z"

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions