[v0.1 API Review] Documentation improvement #204

Open · wants to merge 3 commits into main
42 changes: 27 additions & 15 deletions api/v1alpha1/inferencemodel_types.go
@@ -21,9 +21,18 @@ import (
)

// InferenceModel is the Schema for the InferenceModels API.
// The InferenceModel is intended to represent a model workload (also referred to as a model use case) within Kubernetes.
// The InferenceModel does not manage the model server itself. Instead, the
// focus of the InferenceModel is to provide the tools needed to effectively manage multiple models
// that share the same base model (currently the focus is LoRA adapters). Fields such as TargetModel
// are intended to simplify A/B testing and version rollout of adapters, while Criticality assists with
// governance when multiplexing many use cases over shared hardware.
//
// +kubebuilder:object:root=true
// +kubebuilder:subresource:status
// +kubebuilder:printcolumn:name="ModelName",type=string,JSONPath=`.spec.modelName`
// +kubebuilder:printcolumn:name="Accepted",type=string,JSONPath=`.status.conditions[?(@.type=="Accepted")].status`
// +kubebuilder:printcolumn:name="Age",type=date,JSONPath=`.metadata.creationTimestamp`
// +genclient
type InferenceModel struct {
metav1.TypeMeta `json:",inline"`
@@ -42,27 +51,20 @@ type InferenceModelList struct {
Items []InferenceModel `json:"items"`
}

// InferenceModelSpec represents the desired state of a specific model use case. This resource is
// InferenceModelSpec represents the desired state of an InferenceModel. This resource is
// managed by the "Inference Workload Owner" persona.
//
// The Inference Workload Owner persona is someone that trains, verifies, and
// leverages a large language model from a model frontend, drives the lifecycle
// and rollout of new versions of those models, and defines the specific
// leverages a large language model, focusing on model fidelity and performance, and
// less on inference performance (which is managed by the Inference Platform Admin).
// They also drive the lifecycle and rollout of new versions of those models, and define the specific
// performance and latency goals for the model. These workloads are
// expected to operate within an InferencePool sharing compute capacity with other
// InferenceModels, defined by the Inference Platform Admin.
//
// InferenceModel's modelName (not the ObjectMeta name) is unique for a given InferencePool,
// if the name is reused, an error will be shown on the status of a
// InferenceModel that attempted to reuse. The oldest InferenceModel, based on
// creation timestamp, will be selected to remain valid. In the event of a race
// condition, one will be selected at random.
// InferenceModels, with specific governance defined by the Inference Platform Admin.
type InferenceModelSpec struct {
// ModelName is the name of the model, as users set it in the "model" parameter of their requests.
// The name should be unique among the workloads that reference the same backend pool.
// This is the parameter that will be used to match the request with. In the future, we may
// allow to match on other request parameters. The other approach to support matching
// on other request parameters is to use a different ModelName per HTTPFilter.
// This is the parameter that will be used to match the request with.
// Names can be reserved without implementing an actual model in the pool.
// This can be done by specifying a target model and setting the weight to zero;
// an error will be returned specifying that no valid target model is found.
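As a concrete illustration of reserving a name, a minimal manifest sketch (resource names are hypothetical, and the pool reference is an assumption since its definition is not part of this diff):

```yaml
apiVersion: inference.networking.x-k8s.io/v1alpha1
kind: InferenceModel
metadata:
  name: reserved-fine-tune
spec:
  modelName: upcoming-fine-tune      # reserves this "model" parameter value within the pool
  poolRef:
    name: sample-pool                # assumed: reference to the serving InferencePool (not shown in this diff)
  targetModels:
  - name: upcoming-fine-tune
    weight: 0                        # zero weight: requests receive an error until a real target model exists
```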
@@ -78,9 +80,19 @@ type InferenceModelSpec struct {
Criticality *Criticality `json:"criticality,omitempty"`

// TargetModels allow multiple versions of a model for traffic splitting.
// If not specified, the target model name is defaulted to the modelName parameter.
// Traffic splitting is handled via weights. The targetModel field is optional; however,
// if not specified, the target model name defaults to the modelName parameter.
// The modelName often refers to a LoRA adapter.
//
// Examples:
// - A model server serving `llama2-7b` may be represented by:
// - setting the modelName to `llama2-7b` and setting no targetModels
// - setting the modelName to `hello-world` and setting a single targetModel to `llama2-7b`, and setting no weights
// - setting modelName to 'my-fine-tune', setting 2 targetModels 'fine-tune-v1' & 'fine-tune-v2', and setting no weights.
// This has the effect of weighting the two models equally
// - setting modelName to 'my-fine-tune', setting 2 targetModels 'fine-tune-v1' w/weight: 10 & 'fine-tune-v2' w/weight: 1.
// This has the effect of fine-tune-v1 being selected 10x as often as fine-tune-v2
//
// +optional
// +kubebuilder:validation:MaxItems=10
TargetModels []TargetModel `json:"targetModels,omitempty"`
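To make the weighted example in the list above concrete, a sketch of the 10:1 split (a hypothetical manifest; the criticality value and the pool reference are assumptions, as their definitions are not part of this diff):

```yaml
apiVersion: inference.networking.x-k8s.io/v1alpha1
kind: InferenceModel
metadata:
  name: my-fine-tune
spec:
  modelName: my-fine-tune          # matched against the "model" parameter of incoming requests
  criticality: Critical            # assumed enum value; "Default" is the documented default
  poolRef:
    name: sample-pool              # assumed: reference to the serving InferencePool (not shown in this diff)
  targetModels:
  - name: fine-tune-v1
    weight: 10                     # selected roughly 10x as often as fine-tune-v2
  - name: fine-tune-v2
    weight: 1
```

Dropping both weights would split traffic evenly between the two adapters, per the documentation above.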
@@ -144,7 +156,7 @@ const (
// to exist at request time, the error is processed by the Inference Gateway
// and emitted on the appropriate InferenceModel object.
type TargetModel struct {
// Name is the name of the adapter or base model, as expected by the ModelServer.
// Name is the name of the LoRA adapter or base model, as expected by the ModelServer.
//
// +kubebuilder:validation:MaxLength=253
// +kubebuilder:validation:Required
40 changes: 27 additions & 13 deletions config/crd/bases/inference.networking.x-k8s.io_inferencemodels.yaml
@@ -14,10 +14,22 @@ spec:
singular: inferencemodel
scope: Namespaced
versions:
- name: v1alpha1
- additionalPrinterColumns:
- jsonPath: .spec.modelName
name: ModelName
type: string
- jsonPath: .status.conditions[?(@.type=="Accepted")].status
name: Accepted
type: string
- jsonPath: .metadata.creationTimestamp
name: Age
type: date
name: v1alpha1
schema:
openAPIV3Schema:
description: InferenceModel is the Schema for the InferenceModels API.
description: |-
InferenceModel is the Schema for the InferenceModels API.
The InferenceModel is intended to represent a model workload within Kubernetes.
properties:
apiVersion:
description: |-
@@ -47,12 +59,6 @@ spec:
performance and latency goals for the model. These workloads are
expected to operate within an InferencePool sharing compute capacity with other
InferenceModels, defined by the Inference Platform Admin.

InferenceModel's modelName (not the ObjectMeta name) is unique for a given InferencePool,
if the name is reused, an error will be shown on the status of a
InferenceModel that attempted to reuse. The oldest InferenceModel, based on
creation timestamp, will be selected to remain valid. In the event of a race
condition, one will be selected at random.
properties:
criticality:
default: Default
@@ -67,9 +73,7 @@
description: |-
ModelName is the name of the model, as users set it in the "model" parameter of their requests.
The name should be unique among the workloads that reference the same backend pool.
This is the parameter that will be used to match the request with. In the future, we may
allow to match on other request parameters. The other approach to support matching
on other request parameters is to use a different ModelName per HTTPFilter.
This is the parameter that will be used to match the request with.
Names can be reserved without implementing an actual model in the pool.
This can be done by specifying a target model and setting the weight to zero;
an error will be returned specifying that no valid target model is found.
@@ -103,8 +107,18 @@
targetModels:
description: |-
TargetModels allow multiple versions of a model for traffic splitting.
If not specified, the target model name is defaulted to the modelName parameter.
Traffic splitting is handled via weights. The targetModel field is optional; however,
if not specified, the target model name defaults to the modelName parameter.
The modelName often refers to a LoRA adapter.

Examples:
- A model server serving `llama2-7b` may be represented by:
- setting the modelName to `llama2-7b` and setting no targetModels
- setting the modelName to `hello-world` and setting a single targetModel to `llama2-7b`, and setting no weights
- setting modelName to 'my-fine-tune', setting 2 targetModels 'fine-tune-v1' & 'fine-tune-v2', and setting no weights.
This has the effect of weighting the two models equally
- setting modelName to 'my-fine-tune', setting 2 targetModels 'fine-tune-v1' w/weight: 10 & 'fine-tune-v2' w/weight: 1.
This has the effect of fine-tune-v1 being selected 10x as often as fine-tune-v2
items:
description: |-
TargetModel represents a deployed model or a LoRA adapter. The
@@ -116,7 +130,7 @@
and emitted on the appropriate InferenceModel object.
properties:
name:
description: Name is the name of the adapter or base model,
description: Name is the name of the LoRA adapter or base model,
as expected by the ModelServer.
maxLength: 253
type: string
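For completeness, the simplest documented case, where no targetModels are set and the target defaults to the modelName (again hypothetical; the pool reference is assumed, as it is not part of this diff):

```yaml
apiVersion: inference.networking.x-k8s.io/v1alpha1
kind: InferenceModel
metadata:
  name: llama2-7b
spec:
  modelName: llama2-7b     # requests whose "model" parameter is "llama2-7b" match this object
  poolRef:
    name: sample-pool      # assumed: the InferencePool serving the base model
  # no targetModels: the target model name defaults to modelName
```

Once such an object is applied, the additionalPrinterColumns added in this change would surface ModelName, Accepted, and Age in `kubectl get inferencemodels` output.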