-
Notifications
You must be signed in to change notification settings - Fork 14
Launchers - Kubernetes - Jobs - Sometimes, logs cannot be retrieved when a job has been suspended (by Kueue) and then resumed #197
Copy link
Copy link
Open
Description
% kubectl --kubeconfig nebius.kubeconfig.yaml --namespace oasis-jobs logs tangle-ce-019d4eb66c44f3abd9b7-1-wrlwf Defaulted container "main" out of: main, gcsfuse-init (init)
unable to retrieve container logs for containerd://6a263159bca0bd9287a2dccec50230532ec0dd1a4b7cf9688d77bb17d18c10ea
We see that many pods are stuck in the Terminating phase.
% kubectl --kubeconfig nebius.kubeconfig.yaml --namespace oasis-jobs get pods --selector job-name=tangle-ce-019d4eb66c44f3abd9b7
NAME READY STATUS RESTARTS AGE
tangle-ce-019d4eb66c44f3abd9b7-0-5ljwq 2/2 Running 0 8h
tangle-ce-019d4eb66c44f3abd9b7-0-69vlf 0/2 Terminating 0 9h
tangle-ce-019d4eb66c44f3abd9b7-0-c6j4t 0/2 Terminating 0 12h
tangle-ce-019d4eb66c44f3abd9b7-0-dk49w 0/2 Terminating 0 14h
tangle-ce-019d4eb66c44f3abd9b7-1-7jxpf 0/2 Terminating 0 12h
tangle-ce-019d4eb66c44f3abd9b7-1-hz4lm 0/2 Terminating 0 14h
tangle-ce-019d4eb66c44f3abd9b7-1-s5rsm 0/2 Terminating 0 12h
tangle-ce-019d4eb66c44f3abd9b7-1-t2g7h 0/2 Terminating 0 12h
tangle-ce-019d4eb66c44f3abd9b7-1-wc2f6 2/2 Running 0 8h
tangle-ce-019d4eb66c44f3abd9b7-1-wrlwf 0/2 Terminating 0 9h
% kubectl --kubeconfig nebius.kubeconfig.yaml --namespace oasis-jobs get pod tangle-ce-019d4eb66c44f3abd9b7-1-wrlwf -o yaml
apiVersion: v1
kind: Pod
metadata:
annotations:
batch.kubernetes.io/job-completion-index: "1"
batch.kubernetes.io/job-index-failure-count: "0"
kueue.x-k8s.io/workload: job-tangle-ce-019d4eb66c44f3abd9b7-1a728
creationTimestamp: "2026-04-02T20:21:02Z"
deletionGracePeriodSeconds: 30
deletionTimestamp: "2026-04-02T21:01:12Z"
generateName: tangle-ce-019d4eb66c44f3abd9b7-1-
labels:
batch.kubernetes.io/controller-uid: db09a903-e264-4891-8d3a-d4f0682a8a8f
batch.kubernetes.io/job-completion-index: "1"
batch.kubernetes.io/job-name: tangle-ce-019d4eb66c44f3abd9b7
controller-uid: db09a903-e264-4891-8d3a-d4f0682a8a8f
job-name: tangle-ce-019d4eb66c44f3abd9b7
kueue.x-k8s.io/podset: main
kueue.x-k8s.io/priority-class: default
kueue.x-k8s.io/queue-name: sidekick-data
kueue.x-k8s.io/tas: "true"
name: tangle-ce-019d4eb66c44f3abd9b7-1-wrlwf
namespace: oasis-jobs
ownerReferences:
- apiVersion: batch/v1
blockOwnerDeletion: true
controller: true
kind: Job
name: tangle-ce-019d4eb66c44f3abd9b7
uid: db09a903-e264-4891-8d3a-d4f0682a8a8f
resourceVersion: "570693812"
uid: caa6d2a3-293c-4e31-a1b7-b8c402628d8b
spec:
containers:
- command: xxx
env:
- name: NVIDIA_VISIBLE_DEVICES
value: none
- name: _TANGLE_MULTI_NODE_NODE_INDEX
valueFrom:
fieldRef:
apiVersion: v1
fieldPath: metadata.annotations['batch.kubernetes.io/job-completion-index']
- name: JOB_COMPLETION_INDEX
valueFrom:
fieldRef:
apiVersion: v1
fieldPath: metadata.labels['batch.kubernetes.io/job-completion-index']
image: xxx
imagePullPolicy: IfNotPresent
name: main
resources:
limits:
memory: 1200Gi
nvidia.com/gpu: "8"
requests:
cpu: "16"
memory: 1200Gi
nvidia.com/gpu: "8"
securityContext:
capabilities:
add:
- IPC_LOCK
terminationMessagePath: /dev/termination-log
terminationMessagePolicy: File
volumeMounts:
- mountPath: /tmp/inputs/model_config_data
name: gcsfuse-xxx
readOnly: true
subPath: artifacts/by_execution/019d4eb66c44f3abd9b7/inputs/model_config_data
- mountPath: /tmp/outputs/model_output_dir
name: gcsfuse-xxx
subPath: artifacts/by_execution/019d4eb66c44f3abd9b7/outputs/model_output_dir
- mountPath: /dev/shm
name: shared-memory
- mountPath: /var/run/service-account
name: token
readOnly: true
- mountPath: /etc/workload-identity
name: credential-configuration
readOnly: true
- mountPath: /var/run/secrets/kubernetes.io/serviceaccount
name: kube-api-access-mlsct
readOnly: true
dnsPolicy: ClusterFirst
enableServiceLinks: true
hostname: tangle-ce-019d4eb66c44f3abd9b7-1
imagePullSecrets:
- name: gcp-gar-secret
initContainers:
- command:
- bash
- -c
- xxx; sleep infinity
env:
- name: GOOGLE_APPLICATION_CREDENTIALS
value: /etc/workload-identity/credential-configuration.json
- name: JOB_COMPLETION_INDEX
valueFrom:
fieldRef:
apiVersion: v1
fieldPath: metadata.labels['batch.kubernetes.io/job-completion-index']
image: gcsfuse
imagePullPolicy: IfNotPresent
lifecycle:
preStop:
exec:
command:
- bash
- -c
- echo '[gcsfuse-prestop] unmounting FUSE mounts...' > /proc/1/fd/1 2>/dev/null
|| true; for mp in /gcs_buckets/*/; do if [ -d "$mp" ]; then echo "[gcsfuse-prestop]
unmounting $mp" > /proc/1/fd/1 2>/dev/null || true; fusermount -u "$mp"
2>/dev/null || true; fi; done; echo '[gcsfuse-prestop] done' > /proc/1/fd/1
2>/dev/null || true
name: gcsfuse-init
resources: {}
restartPolicy: Always
failureThreshold: 3
periodSeconds: 10
successThreshold: 1
timeoutSeconds: 1
terminationMessagePath: /dev/termination-log
terminationMessagePolicy: File
volumeMounts:
- mountPath: /var/run/service-account
name: token
readOnly: true
- mountPath: /etc/workload-identity
name: credential-configuration
readOnly: true
- mountPath: /gcs_buckets/prd-oasis-tmp
mountPropagation: Bidirectional
name: gcsfuse-prd-oasis-tmp
- mountPath: /var/run/secrets/kubernetes.io/serviceaccount
name: kube-api-access-mlsct
readOnly: true
nodeName: xxx
nodeSelector:
kubernetes.io/hostname: xxx
nvidia.com/gpu.product: NVIDIA-H200
preemptionPolicy: PreemptLowerPriority
priority: 300
priorityClassName: default
restartPolicy: Never
schedulerName: default-scheduler
securityContext: {}
serviceAccount: xxx
serviceAccountName: xxxx
subdomain: tangle-ce-019d4eb66c44f3abd9b7
terminationGracePeriodSeconds: 30
tolerations:
- effect: NoSchedule
key: nvidia.com/gpu
operator: Exists
- effect: NoExecute
key: node.kubernetes.io/not-ready
operator: Exists
tolerationSeconds: 300
- effect: NoExecute
key: node.kubernetes.io/unreachable
operator: Exists
tolerationSeconds: 300
volumes:
- emptyDir: {}
name: gcsfuse-prd-oasis-tmp
- emptyDir:
medium: Memory
sizeLimit: 40Gi
name: shared-memory
- configMap:
defaultMode: 420
name: workload-service-account-credential-configuration
name: credential-configuration
- name: token
projected:
defaultMode: 420
sources:
- serviceAccountToken:
audience: xxx
expirationSeconds: 3600
path: token
- name: kube-api-access-mlsct
projected:
defaultMode: 420
sources:
- serviceAccountToken:
expirationSeconds: 3607
path: token
- configMap:
items:
- key: ca.crt
path: ca.crt
name: kube-root-ca.crt
- downwardAPI:
items:
- fieldRef:
apiVersion: v1
fieldPath: metadata.namespace
path: namespace
status:
conditions:
- lastProbeTime: null
lastTransitionTime: "2026-04-02T21:01:31Z"
status: "False"
type: PodReadyToStartContainers
- lastProbeTime: null
lastTransitionTime: "2026-04-02T20:21:17Z"
status: "True"
type: Initialized
- lastProbeTime: null
lastTransitionTime: "2026-04-02T21:01:31Z"
reason: PodFailed
status: "False"
type: Ready
- lastProbeTime: null
lastTransitionTime: "2026-04-02T21:01:31Z"
reason: PodFailed
status: "False"
type: ContainersReady
- lastProbeTime: null
lastTransitionTime: "2026-04-02T20:21:03Z"
status: "True"
type: PodScheduled
containerStatuses:
- containerID: containerd://6a263159bca0bd9287a2dccec50230532ec0dd1a4b7cf9688d77bb17d18c10ea
image: sha256:b122b1a561464c6770cb583f7b6e39536e7583d8060997478f27e611eb0c2806
imageID: xxx
lastState: {}
name: main
ready: false
restartCount: 0
started: false
state:
terminated:
containerID: containerd://6a263159bca0bd9287a2dccec50230532ec0dd1a4b7cf9688d77bb17d18c10ea
exitCode: 137
finishedAt: "2026-04-02T21:01:29Z"
reason: Error
startedAt: "2026-04-02T20:21:21Z"
hostIP: 10.97.188.45
hostIPs:
- ip: 10.97.188.45
initContainerStatuses:
- containerID: containerd://a3db01d0ee933dd830e1b36964c2e2af1be27e953f84f9954c64dfcfa8b7ca2e
image: gcsfuse
imageID: gcsfuse
lastState: {}
name: gcsfuse-init
ready: false
restartCount: 0
started: false
state:
terminated:
containerID: containerd://a3db01d0ee933dd830e1b36964c2e2af1be27e953f84f9954c64dfcfa8b7ca2e
exitCode: 137
finishedAt: "2026-04-02T21:01:14Z"
reason: Error
startedAt: "2026-04-02T20:21:08Z"
phase: Failed
podIP: 10.100.184.233
podIPs:
- ip: 10.100.184.233
qosClass: Burstable
startTime: "2026-04-02T20:21:06Z"
Reactions are currently unavailable
Metadata
Metadata
Assignees
Labels
No labels