# deployment.yaml
# Forked from DARPA-CRITICALMAAS/sri-ta3.
---
# Kubernetes Job template for a single GPU-backed container.
#
# Every ${VAR} token is a placeholder that must be substituted before the
# manifest is applied (presumably with envsubst or a similar tool -- confirm
# against the launch script). Templated scalars are quoted so that an empty,
# boolean-looking, or all-digit substitution is still parsed as a string.
apiVersion: batch/v1
kind: Job
metadata:
  namespace: "${NAMESPACE}"
  name: "${PROJECT_NAME}${JOB_TAG}"
  labels:
    app: "${PROJECT_NAME}${JOB_TAG}"
    type: "${MODE}"
spec:
  template:
    metadata:
      labels:
        app: "${PROJECT_NAME}${JOB_TAG}"
    spec:
      # Hard scheduling constraint: only nodes whose gpuType label matches.
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
              - matchExpressions:
                  - key: gpuType  # A5000 or 2080Ti
                    operator: In
                    values:
                      - "${GPU_TYPE}"
                  # Optional: additionally pin the pod to one specific node.
                  # - key: kubernetes.io/hostname
                  #   operator: In
                  #   values:
                  #     - cse-k8s-cvt-011.k8s.sri.com
      # Containers are never restarted in place when they exit.
      restartPolicy: Never
      volumes:
        # RAM-backed scratch volume, mounted below at /dev/shm.
        - name: dshm
          emptyDir:
            medium: Memory
        - name: rw-vol
          persistentVolumeClaim:
            claimName: "${RW_VOLUME}"
        - name: ro-vol
          persistentVolumeClaim:
            claimName: "${RO_VOLUME}"
      imagePullSecrets:
        - name: "${SECRET}"
      containers:
        - name: "${PROJECT_NAME}"
          image: "${REPO_HOST}/${REPO}:${PROJECT_NAME}-${DUSER}-v${VERSION}"
          workingDir: /workspace
          # Requests and limits are intentionally identical.
          resources:
            requests:
              memory: "${TOTAL_MEM}Gi"
              cpu: "${TOTAL_CPU}"
              nvidia.com/gpu: "${NGPU}"
            limits:
              memory: "${TOTAL_MEM}Gi"
              cpu: "${TOTAL_CPU}"
              nvidia.com/gpu: "${NGPU}"
          # Ports exposed from the container (can be forwarded to a local
          # machine, e.g. using k9s).
          ports:
            - containerPort: 8888
              name: notebook-port
          volumeMounts:
            # Working directory for source code.
            - name: rw-vol
              subPath: "${DUSER}/code/${K8_SRC_PATH}"
              mountPath: "/workspace/${SRC_PATH}"
            # Shared directory for storing data.
            # NOTE(review): the volume is named ro-vol but the mount does not
            # set readOnly: true -- confirm whether writes should be blocked.
            - name: ro-vol
              subPath: data
              mountPath: /workspace/data
            # Working directory for checkpoints, output, etc.
            - name: rw-vol
              subPath: "${DUSER}/logs"
              mountPath: /workspace/logs
            # Shared memory: the in-memory emptyDir mounted over /dev/shm.
            - name: dshm
              mountPath: /dev/shm
          env:
            # Pod name, injected through the downward API.
            - name: POD_NAME
              valueFrom:
                fieldRef:
                  fieldPath: metadata.name
            # NOTE(review): the API key is rendered into the manifest in plain
            # text and is visible to anyone who can read the Job/Pod spec.
            # Consider sourcing it from a Secret via valueFrom.secretKeyRef.
            - name: WANDB_API_KEY
              value: "${WANDB_API_KEY}"