Skip to content

Commit

Permalink
Example with Jobs and with Workloads
Browse files Browse the repository at this point in the history
  • Loading branch information
akram committed Nov 5, 2024
1 parent 83f04f8 commit ae71465
Show file tree
Hide file tree
Showing 8 changed files with 304 additions and 0 deletions.
51 changes: 51 additions & 0 deletions examples/kueue-usage/kueue-with-jobs/00-common.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
---
# Generic flavor with an empty spec; the cpu/memory ClusterQueues in
# 01-cluster-queues.yaml draw their quota from it.
apiVersion: kueue.x-k8s.io/v1beta1
kind: ResourceFlavor
metadata:
  name: default-flavor
spec: {}

---
# Flavor named "gpu", tied to nodes labelled instance-type=spot.
apiVersion: kueue.x-k8s.io/v1beta1
kind: ResourceFlavor
metadata:
  name: gpu
spec:
  # In v1beta1 `nodeLabels` sits directly under `spec`; the former `labels:`
  # wrapper key is not part of the ResourceFlavor schema and was dropped.
  nodeLabels:
    instance-type: spot


---
# Highest of the three workload priority tiers defined in this file.
#
# A Kueue WorkloadPriorityClass only carries `value` and `description`.
# `preemptionPolicy` and `globalDefault` belong to the core
# scheduling.k8s.io PriorityClass, not to this kind, so they were removed:
# the API server either rejects them (strict field validation) or drops them.
apiVersion: kueue.x-k8s.io/v1beta1
kind: WorkloadPriorityClass
metadata:
  name: human-critical
value: 1000000  # higher value means higher priority
description: "Use for critical human critical workloads like research on disease or natural disaster avoidance"

---
# Middle of the three workload priority tiers defined in this file.
#
# A Kueue WorkloadPriorityClass only carries `value` and `description`.
# `globalDefault` (and the commented-out `preemptionPolicy`) belong to the
# core scheduling.k8s.io PriorityClass, not to this kind, so they were
# removed: the API server either rejects or drops unknown fields.
apiVersion: kueue.x-k8s.io/v1beta1
kind: WorkloadPriorityClass
metadata:
  name: business-impacting
value: 1000  # higher value means higher priority
description: "Use for business critical impacting workloads"


---
# Lowest of the three workload priority tiers defined in this file.
#
# A Kueue WorkloadPriorityClass only carries `value` and `description`.
# `globalDefault` (and the commented-out `preemptionPolicy`) belong to the
# core scheduling.k8s.io PriorityClass, not to this kind, so they were
# removed: the API server either rejects or drops unknown fields.
apiVersion: kueue.x-k8s.io/v1beta1
kind: WorkloadPriorityClass
metadata:
  name: long-term-research
value: 1  # higher value means higher priority
description: "Use for long term research processes like extraterrestiral research"



80 changes: 80 additions & 0 deletions examples/kueue-usage/kueue-with-jobs/01-cluster-queues.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
---
# ClusterQueue for emergency training jobs. It is a member of the
# "ai-for-humanity" cohort and may both borrow from and preempt within it.
apiVersion: kueue.x-k8s.io/v1beta1
kind: ClusterQueue
metadata:
  name: emergency-cluster-queue
  annotations:
    # ClusterQueueSpec has no `description` field, so the text lives in the
    # standard description annotation instead of `spec.description`.
    kubernetes.io/description: "Cluster queue for the emergency training jobs (Climate Change, Alzheimer, Cancer)."
spec:
  cohort: ai-for-humanity
  namespaceSelector: {}  # match all.
  flavorFungibility:
    whenCanBorrow: Borrow
    whenCanPreempt: Preempt
  preemption:
    reclaimWithinCohort: Any
    borrowWithinCohort:
      policy: LowerPriority
    withinClusterQueue: LowerPriority
  resourceGroups:
    - coveredResources:
        - cpu
        - memory
      flavors:
        - name: default-flavor
          resources:
            - name: cpu
              nominalQuota: 1
            - name: memory
              nominalQuota: 2000Mi
              borrowingLimit: 500Mi

---
# ClusterQueue for LLM workloads; shares the "ai-for-humanity" cohort with
# emergency-cluster-queue.
apiVersion: kueue.x-k8s.io/v1beta1
kind: ClusterQueue
metadata:
  name: llm-cluster-queue
  annotations:
    # ClusterQueueSpec has no `description` field, so the text lives in the
    # standard description annotation instead of `spec.description`.
    kubernetes.io/description: "Cluster queue for LLM model workloads"
spec:
  cohort: ai-for-humanity
  # Declared exactly once: the original repeated this key a second time after
  # the `preemption` settings, which is a duplicate (or misplaced) key.
  namespaceSelector: {}  # match all.
  flavorFungibility:
    whenCanBorrow: Borrow
    whenCanPreempt: TryNextFlavor
  preemption:
    # only preempt Workloads in the cohort that have lower priority than the
    # pending Workload.
    reclaimWithinCohort: LowerPriority
  resourceGroups:
    - coveredResources:
        - cpu
        - memory
      flavors:
        - name: default-flavor
          resources:
            - name: cpu
              nominalQuota: 500m
            - name: memory
              nominalQuota: 500Mi
              borrowingLimit: 500Mi

---
# ClusterQueue for GAI workloads. It sits in its own cohort
# ("ai-against-humanity"), separate from the other two ClusterQueues.
apiVersion: kueue.x-k8s.io/v1beta1
kind: ClusterQueue
metadata:
  name: gai-cluster-queue
  annotations:
    # ClusterQueueSpec has no `description` field, so the text lives in the
    # standard description annotation instead of `spec.description`.
    kubernetes.io/description: "Cluster queue for GAI model workloads"
spec:
  cohort: ai-against-humanity
  namespaceSelector: {}
  preemption:
    reclaimWithinCohort: Never  # do not preempt Workloads in the cohort.
  flavorFungibility:
    whenCanBorrow: Borrow  # this is the default but I'm making it explicit here
    # NOTE(review): the original comment said this "ensures that accelerators
    # aren't hit with compute workloads" - confirm that is what Preempt does.
    whenCanPreempt: Preempt
  resourceGroups:
    # NOTE(review): "gpu" is not a standard resource name (device plugins
    # usually expose e.g. nvidia.com/gpu) and 48Gi is a byte quantity rather
    # than a device count - confirm both against the intended workloads.
    - coveredResources:
        - gpu
      flavors:
        - name: gpu
          resources:
            - name: gpu
              nominalQuota: 48Gi


29 changes: 29 additions & 0 deletions examples/kueue-usage/kueue-with-jobs/02-local-queues.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
---
# Namespaced entry point that forwards workloads to emergency-cluster-queue.
apiVersion: kueue.x-k8s.io/v1beta1
kind: LocalQueue
metadata:
  name: emergency-queue
  annotations:
    # LocalQueueSpec has no `description` field, so the text lives in the
    # standard description annotation instead of `spec.description`.
    kubernetes.io/description: "Queue for the emergency training jobs (Climate Change, Alzheimer, Cancer)."
spec:
  clusterQueue: emergency-cluster-queue

---
# Namespaced entry point that forwards workloads to llm-cluster-queue.
apiVersion: kueue.x-k8s.io/v1beta1
kind: LocalQueue
metadata:
  name: llm-queue
  annotations:
    # LocalQueueSpec has no `description` field, so the text lives in the
    # standard description annotation instead of `spec.description`.
    kubernetes.io/description: "Queue for the LLM model's training jobs."
spec:
  clusterQueue: llm-cluster-queue


---
# Namespaced entry point that forwards workloads to gai-cluster-queue.
apiVersion: kueue.x-k8s.io/v1beta1
kind: LocalQueue
metadata:
  name: gai-queue
  annotations:
    # LocalQueueSpec has no `description` field, so the text lives in the
    # standard description annotation instead of `spec.description`.
    kubernetes.io/description: "Queue for the GAI (General Artificial Intelligence) model's training jobs."
spec:
  clusterQueue: gai-cluster-queue


26 changes: 26 additions & 0 deletions examples/kueue-usage/kueue-with-jobs/03-gai-job.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
---
# Three-pod sleep Job submitted to gai-queue at the long-term-research
# priority.
#
# NOTE(review): the pods request cpu/memory, while gai-cluster-queue
# (01-cluster-queues.yaml) only covers a resource named "gpu" - confirm this
# Job can actually be admitted through that queue.
apiVersion: batch/v1
kind: Job
metadata:
  generateName: gai-
  labels:
    kueue.x-k8s.io/queue-name: gai-queue
    kueue.x-k8s.io/priority-class: long-term-research
spec:
  # Created suspended so no pods start until the Job is unsuspended
  # (Kueue does that on admission).
  suspend: true
  completions: 3
  parallelism: 3
  template:
    spec:
      restartPolicy: Never
      containers:
        - name: gai-training-brain
          image: gcr.io/k8s-staging-perf-tests/sleep:v0.1.0
          args:
            - 300s
          resources:
            requests:
              cpu: 1
              memory: 1Gi



26 changes: 26 additions & 0 deletions examples/kueue-usage/kueue-with-jobs/04-llm-job.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
---
# Three-pod sleep Job submitted to llm-queue at the business-impacting
# priority.
apiVersion: batch/v1
kind: Job
metadata:
  generateName: llm-
  labels:
    kueue.x-k8s.io/queue-name: llm-queue
    kueue.x-k8s.io/priority-class: business-impacting
spec:
  # Created suspended so no pods start until the Job is unsuspended
  # (Kueue does that on admission).
  suspend: true
  completions: 3
  parallelism: 3
  template:
    spec:
      restartPolicy: Never
      containers:
        - name: dummy-job
          image: gcr.io/k8s-staging-perf-tests/sleep:v0.1.0
          args:
            - 300s
          resources:
            requests:
              cpu: 50m
              memory: 50Mi



26 changes: 26 additions & 0 deletions examples/kueue-usage/kueue-with-jobs/05-cure-cancer-job.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
---
# Three-pod sleep Job submitted to emergency-queue at the human-critical
# priority.
apiVersion: batch/v1
kind: Job
metadata:
  generateName: cure-cancer-
  labels:
    kueue.x-k8s.io/queue-name: emergency-queue
    kueue.x-k8s.io/priority-class: human-critical
spec:
  # Created suspended so no pods start until the Job is unsuspended
  # (Kueue does that on admission).
  suspend: true
  completions: 3
  parallelism: 3
  template:
    spec:
      restartPolicy: Never
      containers:
        - name: cancer-treatment-model-update
          image: gcr.io/k8s-staging-perf-tests/sleep:v0.1.0
          args:
            - 60s
          resources:
            requests:
              cpu: 50m
              memory: 50Mi



Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
---
# Highest of the three workload priority tiers defined in this file.
#
# A Kueue WorkloadPriorityClass only carries `value` and `description`.
# `preemptionPolicy` and `globalDefault` belong to the core
# scheduling.k8s.io PriorityClass, not to this kind, so they were removed:
# the API server either rejects them (strict field validation) or drops them.
apiVersion: kueue.x-k8s.io/v1beta1
kind: WorkloadPriorityClass
metadata:
  name: human-critical
value: 1000000  # higher value means higher priority
description: "Use for critical human critical workloads like research on disease or natural disaster avoidance"

---
# Middle of the three workload priority tiers defined in this file.
#
# A Kueue WorkloadPriorityClass only carries `value` and `description`.
# `globalDefault` (and the commented-out `preemptionPolicy`) belong to the
# core scheduling.k8s.io PriorityClass, not to this kind, so they were
# removed: the API server either rejects or drops unknown fields.
apiVersion: kueue.x-k8s.io/v1beta1
kind: WorkloadPriorityClass
metadata:
  name: business-impacting
value: 1000  # higher value means higher priority
description: "Use for business critical impacting workloads"


---
# Lowest of the three workload priority tiers defined in this file.
#
# A Kueue WorkloadPriorityClass only carries `value` and `description`.
# `globalDefault` (and the commented-out `preemptionPolicy`) belong to the
# core scheduling.k8s.io PriorityClass, not to this kind, so they were
# removed: the API server either rejects or drops unknown fields.
apiVersion: kueue.x-k8s.io/v1beta1
kind: WorkloadPriorityClass
metadata:
  name: long-term-research
value: 1  # higher value means higher priority
description: "Use for long term research processes like extraterrestiral research"



33 changes: 33 additions & 0 deletions examples/kueue-usage/kueue-with-jobs/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# Kueue Jobs Example

Here is a set of resources that can be used to test `kueue` features like preemption.

## Install
Install the base resources
```
for i in 00-common.yaml 01-cluster-queues.yaml 02-local-queues.yaml; do
oc create -f $i
done
```

## Run
Run the workloads from jobs

```
for i in $(seq 1 7); do oc create -f 04-llm-job.yaml; done
for i in $(seq 5); do oc create -f 05-cure-cancer-job.yaml; done
```

## Observe

Observe that `workloads` get created:
```
oc get wl
```
Observe that `workloads` get preempted using `oc describe wl <some not admitted workload>`






0 comments on commit ae71465

Please sign in to comment.