Skip to content

Launchers - Kubernetes - Jobs - Sometimes, logs cannot be retrieved when a job has been suspended (Kueue) and then resumed #197

@Ark-kun

Description

@Ark-kun
% kubectl --kubeconfig nebius.kubeconfig.yaml --namespace oasis-jobs logs tangle-ce-019d4eb66c44f3abd9b7-1-wrlwf   Defaulted container "main" out of: main, gcsfuse-init (init)
unable to retrieve container logs for containerd://6a263159bca0bd9287a2dccec50230532ec0dd1a4b7cf9688d77bb17d18c10ea

We see that many pods are stuck in the Terminating phase.

% kubectl --kubeconfig nebius.kubeconfig.yaml --namespace oasis-jobs get pods --selector job-name=tangle-ce-019d4eb66c44f3abd9b7
NAME                                     READY   STATUS        RESTARTS   AGE
tangle-ce-019d4eb66c44f3abd9b7-0-5ljwq   2/2     Running       0          8h
tangle-ce-019d4eb66c44f3abd9b7-0-69vlf   0/2     Terminating   0          9h
tangle-ce-019d4eb66c44f3abd9b7-0-c6j4t   0/2     Terminating   0          12h
tangle-ce-019d4eb66c44f3abd9b7-0-dk49w   0/2     Terminating   0          14h
tangle-ce-019d4eb66c44f3abd9b7-1-7jxpf   0/2     Terminating   0          12h
tangle-ce-019d4eb66c44f3abd9b7-1-hz4lm   0/2     Terminating   0          14h
tangle-ce-019d4eb66c44f3abd9b7-1-s5rsm   0/2     Terminating   0          12h
tangle-ce-019d4eb66c44f3abd9b7-1-t2g7h   0/2     Terminating   0          12h
tangle-ce-019d4eb66c44f3abd9b7-1-wc2f6   2/2     Running       0          8h
tangle-ce-019d4eb66c44f3abd9b7-1-wrlwf   0/2     Terminating   0          9h
% kubectl --kubeconfig nebius.kubeconfig.yaml --namespace oasis-jobs get pod tangle-ce-019d4eb66c44f3abd9b7-1-wrlwf -o yaml

apiVersion: v1
kind: Pod
metadata:
  annotations:
    batch.kubernetes.io/job-completion-index: "1"
    batch.kubernetes.io/job-index-failure-count: "0"
    kueue.x-k8s.io/workload: job-tangle-ce-019d4eb66c44f3abd9b7-1a728
  creationTimestamp: "2026-04-02T20:21:02Z"
  deletionGracePeriodSeconds: 30
  deletionTimestamp: "2026-04-02T21:01:12Z"
  generateName: tangle-ce-019d4eb66c44f3abd9b7-1-
  labels:
    batch.kubernetes.io/controller-uid: db09a903-e264-4891-8d3a-d4f0682a8a8f
    batch.kubernetes.io/job-completion-index: "1"
    batch.kubernetes.io/job-name: tangle-ce-019d4eb66c44f3abd9b7
    controller-uid: db09a903-e264-4891-8d3a-d4f0682a8a8f
    job-name: tangle-ce-019d4eb66c44f3abd9b7
    kueue.x-k8s.io/podset: main
    kueue.x-k8s.io/priority-class: default
    kueue.x-k8s.io/queue-name: sidekick-data
    kueue.x-k8s.io/tas: "true"
  name: tangle-ce-019d4eb66c44f3abd9b7-1-wrlwf
  namespace: oasis-jobs
  ownerReferences:
  - apiVersion: batch/v1
    blockOwnerDeletion: true
    controller: true
    kind: Job
    name: tangle-ce-019d4eb66c44f3abd9b7
    uid: db09a903-e264-4891-8d3a-d4f0682a8a8f
  resourceVersion: "570693812"
  uid: caa6d2a3-293c-4e31-a1b7-b8c402628d8b
spec:
  containers:
  - command: xxx
    env:
    - name: NVIDIA_VISIBLE_DEVICES
      value: none
    - name: _TANGLE_MULTI_NODE_NODE_INDEX
      valueFrom:
        fieldRef:
          apiVersion: v1
          fieldPath: metadata.annotations['batch.kubernetes.io/job-completion-index']
    - name: JOB_COMPLETION_INDEX
      valueFrom:
        fieldRef:
          apiVersion: v1
          fieldPath: metadata.labels['batch.kubernetes.io/job-completion-index']
    image: xxx
    imagePullPolicy: IfNotPresent
    name: main
    resources:
      limits:
        memory: 1200Gi
        nvidia.com/gpu: "8"
      requests:
        cpu: "16"
        memory: 1200Gi
        nvidia.com/gpu: "8"
    securityContext:
      capabilities:
        add:
        - IPC_LOCK
    terminationMessagePath: /dev/termination-log
    terminationMessagePolicy: File
    volumeMounts:
    - mountPath: /tmp/inputs/model_config_data
      name: gcsfuse-xxx
      readOnly: true
      subPath: artifacts/by_execution/019d4eb66c44f3abd9b7/inputs/model_config_data
    - mountPath: /tmp/outputs/model_output_dir
      name: gcsfuse-xxx
      subPath: artifacts/by_execution/019d4eb66c44f3abd9b7/outputs/model_output_dir
    - mountPath: /dev/shm
      name: shared-memory
    - mountPath: /var/run/service-account
      name: token
      readOnly: true
    - mountPath: /etc/workload-identity
      name: credential-configuration
      readOnly: true
    - mountPath: /var/run/secrets/kubernetes.io/serviceaccount
      name: kube-api-access-mlsct
      readOnly: true
  dnsPolicy: ClusterFirst
  enableServiceLinks: true
  hostname: tangle-ce-019d4eb66c44f3abd9b7-1
  imagePullSecrets:
  - name: gcp-gar-secret
  initContainers:
  - command:
    - bash
    - -c
    - xxx; sleep infinity
    env:
    - name: GOOGLE_APPLICATION_CREDENTIALS
      value: /etc/workload-identity/credential-configuration.json
    - name: JOB_COMPLETION_INDEX
      valueFrom:
        fieldRef:
          apiVersion: v1
          fieldPath: metadata.labels['batch.kubernetes.io/job-completion-index']
    image: gcsfuse
    imagePullPolicy: IfNotPresent
    lifecycle:
      preStop:
        exec:
          command:
          - bash
          - -c
          - echo '[gcsfuse-prestop] unmounting FUSE mounts...' > /proc/1/fd/1 2>/dev/null
            || true; for mp in /gcs_buckets/*/; do if [ -d "$mp" ]; then echo "[gcsfuse-prestop]
            unmounting $mp" > /proc/1/fd/1 2>/dev/null || true; fusermount -u "$mp"
            2>/dev/null || true; fi; done; echo '[gcsfuse-prestop] done' > /proc/1/fd/1
            2>/dev/null || true
    name: gcsfuse-init
    resources: {}
    restartPolicy: Always
    startupProbe:  # NOTE(review): probe key was lost when pasting — the four fields below were orphaned under restartPolicy (invalid YAML); reconstructed as startupProbe, confirm against the live pod spec
      failureThreshold: 3
      periodSeconds: 10
      successThreshold: 1
      timeoutSeconds: 1
    terminationMessagePath: /dev/termination-log
    terminationMessagePolicy: File
    volumeMounts:
    - mountPath: /var/run/service-account
      name: token
      readOnly: true
    - mountPath: /etc/workload-identity
      name: credential-configuration
      readOnly: true
    - mountPath: /gcs_buckets/prd-oasis-tmp
      mountPropagation: Bidirectional
      name: gcsfuse-prd-oasis-tmp
    - mountPath: /var/run/secrets/kubernetes.io/serviceaccount
      name: kube-api-access-mlsct
      readOnly: true
  nodeName: xxx
  nodeSelector:
    kubernetes.io/hostname: xxx
    nvidia.com/gpu.product: NVIDIA-H200
  preemptionPolicy: PreemptLowerPriority
  priority: 300
  priorityClassName: default
  restartPolicy: Never
  schedulerName: default-scheduler
  securityContext: {}
  serviceAccount: xxx
  serviceAccountName: xxxx
  subdomain: tangle-ce-019d4eb66c44f3abd9b7
  terminationGracePeriodSeconds: 30
  tolerations:
  - effect: NoSchedule
    key: nvidia.com/gpu
    operator: Exists
  - effect: NoExecute
    key: node.kubernetes.io/not-ready
    operator: Exists
    tolerationSeconds: 300
  - effect: NoExecute
    key: node.kubernetes.io/unreachable
    operator: Exists
    tolerationSeconds: 300
  volumes:
  - emptyDir: {}
    name: gcsfuse-prd-oasis-tmp
  - emptyDir:
      medium: Memory
      sizeLimit: 40Gi
    name: shared-memory
  - configMap:
      defaultMode: 420
      name: workload-service-account-credential-configuration
    name: credential-configuration
  - name: token
    projected:
      defaultMode: 420
      sources:
      - serviceAccountToken:
          audience: xxx
          expirationSeconds: 3600
          path: token
  - name: kube-api-access-mlsct
    projected:
      defaultMode: 420
      sources:
      - serviceAccountToken:
          expirationSeconds: 3607
          path: token
      - configMap:
          items:
          - key: ca.crt
            path: ca.crt
          name: kube-root-ca.crt
      - downwardAPI:
          items:
          - fieldRef:
              apiVersion: v1
              fieldPath: metadata.namespace
            path: namespace
status:
  conditions:
  - lastProbeTime: null
    lastTransitionTime: "2026-04-02T21:01:31Z"
    status: "False"
    type: PodReadyToStartContainers
  - lastProbeTime: null
    lastTransitionTime: "2026-04-02T20:21:17Z"
    status: "True"
    type: Initialized
  - lastProbeTime: null
    lastTransitionTime: "2026-04-02T21:01:31Z"
    reason: PodFailed
    status: "False"
    type: Ready
  - lastProbeTime: null
    lastTransitionTime: "2026-04-02T21:01:31Z"
    reason: PodFailed
    status: "False"
    type: ContainersReady
  - lastProbeTime: null
    lastTransitionTime: "2026-04-02T20:21:03Z"
    status: "True"
    type: PodScheduled
  containerStatuses:
  - containerID: containerd://6a263159bca0bd9287a2dccec50230532ec0dd1a4b7cf9688d77bb17d18c10ea
    image: sha256:b122b1a561464c6770cb583f7b6e39536e7583d8060997478f27e611eb0c2806
    imageID: xxx
    lastState: {}
    name: main
    ready: false
    restartCount: 0
    started: false
    state:
      terminated:
        containerID: containerd://6a263159bca0bd9287a2dccec50230532ec0dd1a4b7cf9688d77bb17d18c10ea
        exitCode: 137
        finishedAt: "2026-04-02T21:01:29Z"
        reason: Error
        startedAt: "2026-04-02T20:21:21Z"
  hostIP: 10.97.188.45
  hostIPs:
  - ip: 10.97.188.45
  initContainerStatuses:
  - containerID: containerd://a3db01d0ee933dd830e1b36964c2e2af1be27e953f84f9954c64dfcfa8b7ca2e
    image: gcsfuse
    imageID: gcsfuse
    lastState: {}
    name: gcsfuse-init
    ready: false
    restartCount: 0
    started: false
    state:
      terminated:
        containerID: containerd://a3db01d0ee933dd830e1b36964c2e2af1be27e953f84f9954c64dfcfa8b7ca2e
        exitCode: 137
        finishedAt: "2026-04-02T21:01:14Z"
        reason: Error
        startedAt: "2026-04-02T20:21:08Z"
  phase: Failed
  podIP: 10.100.184.233
  podIPs:
  - ip: 10.100.184.233
  qosClass: Burstable
  startTime: "2026-04-02T20:21:06Z"

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions