From ae71465bfa8b90515bc8a7a9a00b67e5c2b620aa Mon Sep 17 00:00:00 2001
From: Akram Ben Aissi
Date: Mon, 21 Oct 2024 18:35:29 +0100
Subject: [PATCH] Example with Jobs and with Workloads

---
Review note (text in this position is ignored when the patch is applied):
every file keeps its original number of added lines, so the diffstat and
the hunk headers below are still accurate. The "index" blob ids are the
ones from the original submission and do not describe the edited contents.

 .../kueue-with-jobs/00-common.yaml            | 51 ++++++++++++
 .../kueue-with-jobs/01-cluster-queues.yaml    | 80 +++++++++++++++++++
 .../kueue-with-jobs/02-local-queues.yaml      | 29 +++++++
 .../kueue-with-jobs/03-gai-job.yaml           | 26 ++++++
 .../kueue-with-jobs/04-llm-job.yaml           | 26 ++++++
 .../kueue-with-jobs/05-cure-cancer-job.yaml   | 26 ++++++
 .../07-workload-priority-classes.yaml         | 33 ++++++++
 .../kueue-usage/kueue-with-jobs/README.md     | 33 ++++++++
 8 files changed, 304 insertions(+)
 create mode 100644 examples/kueue-usage/kueue-with-jobs/00-common.yaml
 create mode 100644 examples/kueue-usage/kueue-with-jobs/01-cluster-queues.yaml
 create mode 100644 examples/kueue-usage/kueue-with-jobs/02-local-queues.yaml
 create mode 100644 examples/kueue-usage/kueue-with-jobs/03-gai-job.yaml
 create mode 100644 examples/kueue-usage/kueue-with-jobs/04-llm-job.yaml
 create mode 100644 examples/kueue-usage/kueue-with-jobs/05-cure-cancer-job.yaml
 create mode 100644 examples/kueue-usage/kueue-with-jobs/07-workload-priority-classes.yaml
 create mode 100644 examples/kueue-usage/kueue-with-jobs/README.md

diff --git a/examples/kueue-usage/kueue-with-jobs/00-common.yaml b/examples/kueue-usage/kueue-with-jobs/00-common.yaml
new file mode 100644
index 00000000..1599195e
--- /dev/null
+++ b/examples/kueue-usage/kueue-with-jobs/00-common.yaml
@@ -0,0 +1,51 @@
+---
+apiVersion: kueue.x-k8s.io/v1beta1
+kind: ResourceFlavor
+metadata:
+  name: "default-flavor"
+spec: {}
+
+---
+apiVersion: kueue.x-k8s.io/v1beta1
+kind: ResourceFlavor
+metadata:
+  name: "gpu"
+spec:
+  # nodeLabels sits directly under spec; ResourceFlavorSpec has no `labels` wrapper.
+  nodeLabels:
+    instance-type: spot
+
+
+---
+apiVersion: kueue.x-k8s.io/v1beta1
+kind: WorkloadPriorityClass
+metadata:
+  name: human-critical
+description: "Use for critical human critical workloads like research on disease or natural disaster avoidance"
+# Only `value` and `description` are set: preemptionPolicy and globalDefault are PriorityClass fields, not WorkloadPriorityClass ones.
+value: 1000000  # highest of the three classes; higher value = higher priority
+# Preemption behaviour is configured per ClusterQueue (see 01-cluster-queues.yaml).
+
+---
+apiVersion: kueue.x-k8s.io/v1beta1
+kind: WorkloadPriorityClass
+metadata:
+  name: business-impacting
+description: "Use for business critical impacting workloads"
+# Sits between long-term-research (1) and human-critical (1000000).
+value: 1000  # higher value = higher priority
+# No preemptionPolicy/globalDefault: WorkloadPriorityClass does not define them.
+
+
+---
+apiVersion: kueue.x-k8s.io/v1beta1
+kind: WorkloadPriorityClass
+metadata:
+  name: long-term-research
+description: "Use for long term research processes like extraterrestrial research"
+# Lowest of the three classes.
+value: 1  # higher value = higher priority
+# No preemptionPolicy/globalDefault: WorkloadPriorityClass does not define them.
+
+
+
diff --git a/examples/kueue-usage/kueue-with-jobs/01-cluster-queues.yaml b/examples/kueue-usage/kueue-with-jobs/01-cluster-queues.yaml
new file mode 100644
index 00000000..6386e5ba
--- /dev/null
+++ b/examples/kueue-usage/kueue-with-jobs/01-cluster-queues.yaml
@@ -0,0 +1,80 @@
+---
+apiVersion: kueue.x-k8s.io/v1beta1
+kind: ClusterQueue
+metadata:
+  name: "emergency-cluster-queue"
+spec:
+  # Cluster queue for the emergency training jobs (Climate Change, Alzheimer, Cancer). Kept as a comment: `description` is not a ClusterQueueSpec field.
+  cohort: "ai-for-humanity"
+  namespaceSelector: {}  # match all.
+  flavorFungibility:
+    whenCanBorrow: Borrow
+    whenCanPreempt: Preempt
+  preemption:
+    reclaimWithinCohort: Any
+    borrowWithinCohort:
+      policy: LowerPriority
+    withinClusterQueue: LowerPriority
+  resourceGroups:
+    - coveredResources: ["cpu", "memory"]
+      flavors:
+        - name: "default-flavor"
+          resources:
+            - name: "cpu"
+              nominalQuota: 1
+            - name: "memory"
+              nominalQuota: 2000Mi
+              borrowingLimit: 500Mi
+
+---
+apiVersion: kueue.x-k8s.io/v1beta1
+kind: ClusterQueue
+metadata:
+  name: llm-cluster-queue
+spec:
+  # Cluster queue for LLM model workloads
+  cohort: ai-for-humanity
+  namespaceSelector: {}  # match all.
+  flavorFungibility:
+    whenCanBorrow: Borrow
+    whenCanPreempt: TryNextFlavor
+  preemption:
+    reclaimWithinCohort: LowerPriority  # only preempt Workloads in the cohort that have lower priority than the pending Workload.
+  # namespaceSelector is declared once, above; the second copy that stood here was a duplicate key.
+  resourceGroups:
+    - coveredResources:
+        - "cpu"
+        - "memory"
+      flavors:
+        - name: "default-flavor"
+          resources:
+            - name: "cpu"
+              nominalQuota: 500m
+            - name: "memory"
+              nominalQuota: 500Mi
+              borrowingLimit: 500Mi
+
+---
+apiVersion: kueue.x-k8s.io/v1beta1
+kind: ClusterQueue
+metadata:
+  name: gai-cluster-queue
+spec:
+  # Cluster queue for GAI model workloads
+  cohort: ai-against-humanity
+  namespaceSelector: {}
+  preemption:
+    reclaimWithinCohort: Never  # do not preempt Workloads in the cohort.
+  flavorFungibility:
+    whenCanBorrow: Borrow  # this is the default but I'm making it explicit here
+    whenCanPreempt: Preempt  # ensures that accelerators aren't hit with compute workloads
+  resourceGroups:
+    - coveredResources:
+        - "gpu"
+      flavors:
+        - name: "gpu"
+          resources:
+            - name: "gpu"
+              nominalQuota: 48Gi
+# NOTE(review): this queue covers only a resource named "gpu" (with a byte quantity), while 03-gai-job.yaml requests cpu and memory - confirm the job can be admitted.
+
diff --git a/examples/kueue-usage/kueue-with-jobs/02-local-queues.yaml b/examples/kueue-usage/kueue-with-jobs/02-local-queues.yaml
new file mode 100644
index 00000000..c8e90ff4
--- /dev/null
+++ b/examples/kueue-usage/kueue-with-jobs/02-local-queues.yaml
@@ -0,0 +1,29 @@
+---
+apiVersion: kueue.x-k8s.io/v1beta1
+kind: LocalQueue
+metadata:
+  name: emergency-queue
+spec:
+  # Queue for the emergency training jobs (Climate Change, Alzheimer, Cancer). Kept as a comment: `description` is not a LocalQueueSpec field.
+  clusterQueue: emergency-cluster-queue
+
+---
+apiVersion: kueue.x-k8s.io/v1beta1
+kind: LocalQueue
+metadata:
+  name: llm-queue
+spec:
+  # Queue for the LLM model's training jobs.
+  clusterQueue: llm-cluster-queue
+
+
+---
+apiVersion: kueue.x-k8s.io/v1beta1
+kind: LocalQueue
+metadata:
+  name: gai-queue
+spec:
+  # Queue for the GAI (General Artificial Intelligence) model's training jobs.
+  clusterQueue: gai-cluster-queue
+
+
diff --git a/examples/kueue-usage/kueue-with-jobs/03-gai-job.yaml b/examples/kueue-usage/kueue-with-jobs/03-gai-job.yaml
new file mode 100644
index 00000000..3eb23da9
--- /dev/null
+++ b/examples/kueue-usage/kueue-with-jobs/03-gai-job.yaml
@@ -0,0 +1,26 @@
+---
+apiVersion: batch/v1
+kind: Job
+metadata:
+  generateName: gai-
+  labels:
+    kueue.x-k8s.io/priority-class: long-term-research
+    kueue.x-k8s.io/queue-name: gai-queue
+spec:
+  parallelism: 3
+  completions: 3
+  suspend: true
+  template:
+    spec:
+      containers:
+        - name: gai-training-brain
+          image: gcr.io/k8s-staging-perf-tests/sleep:v0.1.0
+          args: ["300s"]
+          resources:
+            requests:
+              cpu: 1
+              memory: "1Gi"
+      restartPolicy: Never
+
+
+
diff --git a/examples/kueue-usage/kueue-with-jobs/04-llm-job.yaml b/examples/kueue-usage/kueue-with-jobs/04-llm-job.yaml
new file mode 100644
index 00000000..f2b119bf
--- /dev/null
+++ b/examples/kueue-usage/kueue-with-jobs/04-llm-job.yaml
@@ -0,0 +1,26 @@
+---
+apiVersion: batch/v1
+kind: Job
+metadata:
+  generateName: llm-
+  labels:
+    kueue.x-k8s.io/priority-class: business-impacting
+    kueue.x-k8s.io/queue-name: llm-queue
+spec:
+  parallelism: 3
+  completions: 3
+  suspend: true
+  template:
+    spec:
+      containers:
+        - name: dummy-job
+          image: gcr.io/k8s-staging-perf-tests/sleep:v0.1.0
+          args: ["300s"]
+          resources:
+            requests:
+              cpu: 50m
+              memory: "50Mi"
+      restartPolicy: Never
+
+
+
diff --git a/examples/kueue-usage/kueue-with-jobs/05-cure-cancer-job.yaml b/examples/kueue-usage/kueue-with-jobs/05-cure-cancer-job.yaml
new file mode 100644
index 00000000..b7003944
--- /dev/null
+++ b/examples/kueue-usage/kueue-with-jobs/05-cure-cancer-job.yaml
@@ -0,0 +1,26 @@
+---
+apiVersion: batch/v1
+kind: Job
+metadata:
+  generateName: cure-cancer-
+  labels:
+    kueue.x-k8s.io/priority-class: human-critical
+    kueue.x-k8s.io/queue-name: emergency-queue
+spec:
+  parallelism: 3
+  completions: 3
+  suspend: true
+  template:
+    spec:
+      containers:
+        - name: cancer-treatment-model-update
+          image: gcr.io/k8s-staging-perf-tests/sleep:v0.1.0
+          args: ["60s"]
+          resources:
+            requests:
+              cpu: 50m
+              memory: "50Mi"
+      restartPolicy: Never
+
+
+
diff --git a/examples/kueue-usage/kueue-with-jobs/07-workload-priority-classes.yaml b/examples/kueue-usage/kueue-with-jobs/07-workload-priority-classes.yaml
new file mode 100644
index 00000000..53222ffc
--- /dev/null
+++ b/examples/kueue-usage/kueue-with-jobs/07-workload-priority-classes.yaml
@@ -0,0 +1,33 @@
+---
+apiVersion: kueue.x-k8s.io/v1beta1
+kind: WorkloadPriorityClass
+metadata:
+  name: human-critical
+description: "Use for critical human critical workloads like research on disease or natural disaster avoidance"
+# NOTE(review): the three classes in this file have the same names as those in 00-common.yaml - create them from one file only.
+value: 1000000  # highest of the three classes; higher value = higher priority
+# Only `value` and `description` are set: preemptionPolicy and globalDefault are PriorityClass fields, not WorkloadPriorityClass ones.
+
+---
+apiVersion: kueue.x-k8s.io/v1beta1
+kind: WorkloadPriorityClass
+metadata:
+  name: business-impacting
+description: "Use for business critical impacting workloads"
+# Sits between long-term-research (1) and human-critical (1000000).
+value: 1000  # higher value = higher priority
+# No preemptionPolicy/globalDefault: WorkloadPriorityClass does not define them.
+
+
+---
+apiVersion: kueue.x-k8s.io/v1beta1
+kind: WorkloadPriorityClass
+metadata:
+  name: long-term-research
+description: "Use for long term research processes like extraterrestrial research"
+# Lowest of the three classes.
+value: 1  # higher value = higher priority
+# No preemptionPolicy/globalDefault: WorkloadPriorityClass does not define them.
+
+
+
diff --git a/examples/kueue-usage/kueue-with-jobs/README.md b/examples/kueue-usage/kueue-with-jobs/README.md
new file mode 100644
index 00000000..991f1348
--- /dev/null
+++ b/examples/kueue-usage/kueue-with-jobs/README.md
@@ -0,0 +1,33 @@
+# Kueue Jobs Example
+
+Here is a set of resources that can be used to test `kueue` features like preemption.
+
+## Install
+Install the base resources (`00-common.yaml` already contains the workload priority classes):
+```
+for i in 00-common.yaml 01-cluster-queues.yaml 02-local-queues.yaml; do
+  oc create -f "$i"
+done
+```
+
+## Run
+Create the workloads by submitting the jobs:
+
+```
+for i in $(seq 1 7); do oc create -f 04-llm-job.yaml; done
+for i in $(seq 5); do oc create -f 05-cure-cancer-job.yaml; done
+```
+
+## Observe
+
+Observe that `workloads` get created:
+```
+oc get wl
+```
+Observe that `workloads` get preempted using `oc describe wl <workload-name>`.
+
+
+
+
+
+