diff --git a/Makefile b/Makefile
index 6afb675..c6c0cf6 100644
--- a/Makefile
+++ b/Makefile
@@ -1,3 +1,5 @@
+include Makefile-deps.mk
+
 # ENVTEST_K8S_VERSION refers to the version of kubebuilder assets to be downloaded by envtest binary.
 ENVTEST_K8S_VERSION = 1.28.3
 ENVTEST_LWS_VERSION = v0.4.0
@@ -86,6 +88,10 @@ generate: controller-gen code-generator ## Generate code containing DeepCopy, De
 	$(CONTROLLER_GEN) object:headerFile="hack/boilerplate.go.txt" paths="./..."
 	./hack/update-codegen.sh go $(PROJECT_DIR)/bin
 
+.PHONY: generate-apiref
+generate-apiref: genref ## Generate the API reference docs into docs/reference with genref.
+	cd $(PROJECT_DIR)/hack/genref/ && $(GENREF) -o $(PROJECT_DIR)/docs/reference
+
 # Use same code-generator version as k8s.io/api
 CODEGEN_VERSION := $(shell go list -m -f '{{.Version}}' k8s.io/api)
 CODEGEN = $(shell pwd)/bin/code-generator
diff --git a/Makefile-deps.mk b/Makefile-deps.mk
new file mode 100644
index 0000000..f9a6f38
--- /dev/null
+++ b/Makefile-deps.mk
@@ -0,0 +1,14 @@
+PROJECT_DIR := $(shell dirname $(abspath $(lastword $(MAKEFILE_LIST))))
+
+# Go toolchain to invoke; GOBIN falls back to GOPATH/bin when `go env GOBIN` is empty.
+GO_CMD ?= go
+ifeq (,$(shell $(GO_CMD) env GOBIN))
+GOBIN := $(shell $(GO_CMD) env GOPATH)/bin
+else
+GOBIN := $(shell $(GO_CMD) env GOBIN)
+endif
+
+GENREF = $(PROJECT_DIR)/bin/genref
+.PHONY: genref
+genref: ## Download genref locally if necessary.
+	@GOBIN=$(PROJECT_DIR)/bin $(GO_CMD) install github.com/kubernetes-sigs/reference-docs/genref@v0.28.0
diff --git a/README.md b/README.md
index 8e7d0c1..f55700d 100644
--- a/README.md
+++ b/README.md
@@ -86,7 +86,7 @@ spec:
     modelName: opt-125m
 ```
 
-### Test
+### Verify
 
 #### Expose the service
 
diff --git a/docs/develop.md b/docs/develop.md
index 70f9508..f04b125 100644
--- a/docs/develop.md
+++ b/docs/develop.md
@@ -6,6 +6,10 @@ A develop guidance for people who want to learn more about this project.
 
 ```structure
 llmaz # root
+├── bin # where the binaries are located, such as kustomize, ginkgo, etc.
+├── chart # where the helm chart is located
+├── cmd # where the main entry point is located
+├── docs # where all the documents are located, such as examples, installation guidance, etc.
 ├── llmaz # where the model loader logic locates
 ├── pkg # where the main logic for Kubernetes controllers locates
 ```
@@ -14,10 +18,8 @@ llmaz # root
 
 ### Core APIs
 
-**OpenModel**: `OpenModel` is mostly like to store the open sourced models as a cluster-scope object. We may need namespaced models in the future for tenant isolation. Usually, the cloud provider or model provider should set this object because they know models well, like the accelerators or the scaling primitives.
+See the [API Reference](./reference/core.v1alpha1.md) for more details.
 
 ### Inference APIs
 
-**Playground**: `Playground` is for easy usage, people who has little knowledge about cloud can quick deploy a large language model with minimal configurations. `Playground` is integrated with the SOTA inference engines already, like vLLM.
-
-**Service**: `Service` is the real inference workload, people has advanced configuration requirements can deploy with `Service` directly if `Playground` can not meet their demands like they have a customized inference engine, which hasn't been integrated with llmaz yet. Or they have different topology requirements to align with the Pods.
+See the [API Reference](./reference/inference.v1alpha1.md) for more details.
\ No newline at end of file
diff --git a/docs/reference/core.v1alpha1.md b/docs/reference/core.v1alpha1.md
new file mode 100644
index 0000000..dea462c
--- /dev/null
+++ b/docs/reference/core.v1alpha1.md
@@ -0,0 +1,406 @@
+---
+title: llmaz core API
+content_type: tool-reference
+package: llmaz.io/v1alpha1
+auto_generated: true
+description: Generated API reference documentation for llmaz.io/v1alpha1.
+---
+
+
+## Resource Types
+
+
+- [OpenModel](#llmaz-io-v1alpha1-OpenModel)
+  
+
+## `OpenModel`     {#llmaz-io-v1alpha1-OpenModel}
+
+
+**Appears in:**
+
+
+
+<p>OpenModel is the Schema for the open models API</p>
+
+
+<table class="table">
+<thead><tr><th width="30%">Field</th><th>Description</th></tr></thead>
+<tbody>
+    
+<tr><td><code>apiVersion</code><br/>string</td><td><code>llmaz.io/v1alpha1</code></td></tr>
+<tr><td><code>kind</code><br/>string</td><td><code>OpenModel</code></td></tr>
+    
+  
+<tr><td><code>spec</code> <B>[Required]</B><br/>
+<a href="#llmaz-io-v1alpha1-ModelSpec"><code>ModelSpec</code></a>
+</td>
+<td>
+   <span class="text-muted">No description provided.</span></td>
+</tr>
+<tr><td><code>status</code> <B>[Required]</B><br/>
+<a href="#llmaz-io-v1alpha1-ModelStatus"><code>ModelStatus</code></a>
+</td>
+<td>
+   <span class="text-muted">No description provided.</span></td>
+</tr>
+</tbody>
+</table>
+
+## `Flavor`     {#llmaz-io-v1alpha1-Flavor}
+
+
+**Appears in:**
+
+- [InferenceConfig](#llmaz-io-v1alpha1-InferenceConfig)
+
+
+<p>Flavor defines the accelerator requirements for a model and the necessary parameters
+in autoscaling. Right now, it will be used in two places:</p>
+<ul>
+<li>Pod scheduling with node selectors specified.</li>
+<li>Cluster autoscaling with essential parameters provided.</li>
+</ul>
+
+
+<table class="table">
+<thead><tr><th width="30%">Field</th><th>Description</th></tr></thead>
+<tbody>
+    
+  
+<tr><td><code>name</code> <B>[Required]</B><br/>
+<a href="#llmaz-io-v1alpha1-FlavorName"><code>FlavorName</code></a>
+</td>
+<td>
+   <p>Name represents the flavor name, which will be used in model claim.</p>
+</td>
+</tr>
+<tr><td><code>requests</code><br/>
+<a href="https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#resourcelist-v1-core"><code>k8s.io/api/core/v1.ResourceList</code></a>
+</td>
+<td>
+   <p>Requests defines the required accelerators to serve the model for each replica,
+like &lt;nvidia.com/gpu: 8&gt;. For multi-hosts cases, the requests here indicates
+the resource requirements for each replica, usually equal to the TP size.
+Not recommended to set the cpu and memory usage here:</p>
+<ul>
+<li>if using playground, you can define the cpu/mem usage at backendConfig.</li>
+<li>if using inference service, you can define the cpu/mem at the container resources.
+However, if you define the same accelerator requests at playground/service as well,
+the requests will be overwritten by the flavor requests.</li>
+</ul>
+</td>
+</tr>
+<tr><td><code>nodeSelector</code><br/>
+<code>map[string]string</code>
+</td>
+<td>
+   <p>NodeSelector represents the node candidates for Pod placements, if a node doesn't
+meet the nodeSelector, it will be filtered out in the resourceFungibility scheduler plugin.
+If nodeSelector is empty, it means every node is a candidate.</p>
+</td>
+</tr>
+<tr><td><code>params</code><br/>
+<code>map[string]string</code>
+</td>
+<td>
+   <p>Params stores other useful parameters and will be consumed by cluster-autoscaler / Karpenter
+for autoscaling or be defined as model parallelism parameters like TP or PP size.
+E.g. with autoscaling, when scaling up nodes with 8x Nvidia A100, the parameter can be injected
+with &lt;INSTANCE-TYPE: p4d.24xlarge&gt; for AWS.
+Preset parameters: TP, PP, INSTANCE-TYPE.</p>
+</td>
+</tr>
+</tbody>
+</table>
+
+## `FlavorName`     {#llmaz-io-v1alpha1-FlavorName}
+
+(Alias of `string`)
+
+**Appears in:**
+
+- [Flavor](#llmaz-io-v1alpha1-Flavor)
+
+
+
+
+
+
+
+## `InferenceConfig`     {#llmaz-io-v1alpha1-InferenceConfig}
+
+
+**Appears in:**
+
+- [ModelSpec](#llmaz-io-v1alpha1-ModelSpec)
+
+
+<p>InferenceConfig represents the inference configurations for the model.</p>
+
+
+<table class="table">
+<thead><tr><th width="30%">Field</th><th>Description</th></tr></thead>
+<tbody>
+    
+  
+<tr><td><code>flavors</code><br/>
+<a href="#llmaz-io-v1alpha1-Flavor"><code>[]Flavor</code></a>
+</td>
+<td>
+   <p>Flavors represents the accelerator requirements to serve the model.
+Flavors are fungible following the priority represented by the slice order.</p>
+</td>
+</tr>
+</tbody>
+</table>
+
+## `ModelHub`     {#llmaz-io-v1alpha1-ModelHub}
+
+
+**Appears in:**
+
+- [ModelSource](#llmaz-io-v1alpha1-ModelSource)
+
+
+<p>ModelHub represents the model registry for model downloads.</p>
+
+
+<table class="table">
+<thead><tr><th width="30%">Field</th><th>Description</th></tr></thead>
+<tbody>
+    
+  
+<tr><td><code>name</code><br/>
+<code>string</code>
+</td>
+<td>
+   <p>Name refers to the model registry, such as huggingface.</p>
+</td>
+</tr>
+<tr><td><code>modelID</code> <B>[Required]</B><br/>
+<code>string</code>
+</td>
+<td>
+   <p>ModelID refers to the model identifier on model hub,
+such as meta-llama/Meta-Llama-3-8B.</p>
+</td>
+</tr>
+<tr><td><code>filename</code> <B>[Required]</B><br/>
+<code>string</code>
+</td>
+<td>
+   <p>Filename refers to a specified model file rather than the whole repo.
+This is helpful to download a specified GGUF model rather than downloading
+the whole repo which includes all kinds of quantized models.
+TODO: this is only supported with Huggingface, add support for ModelScope
+in the near future.
+Note: once filename is set, allowPatterns and ignorePatterns should be left unset.</p>
+</td>
+</tr>
+<tr><td><code>revision</code><br/>
+<code>string</code>
+</td>
+<td>
+   <p>Revision refers to a Git revision id which can be a branch name, a tag, or a commit hash.</p>
+</td>
+</tr>
+<tr><td><code>allowPatterns</code><br/>
+<code>[]string</code>
+</td>
+<td>
+   <p>AllowPatterns refers to files matched with at least one pattern will be downloaded.</p>
+</td>
+</tr>
+<tr><td><code>ignorePatterns</code><br/>
+<code>[]string</code>
+</td>
+<td>
+   <p>IgnorePatterns refers to files matched with any of the patterns will not be downloaded.</p>
+</td>
+</tr>
+</tbody>
+</table>
+
+## `ModelName`     {#llmaz-io-v1alpha1-ModelName}
+
+(Alias of `string`)
+
+**Appears in:**
+
+
+- [ModelRef](#llmaz-io-v1alpha1-ModelRef)
+
+- [ModelSpec](#llmaz-io-v1alpha1-ModelSpec)
+
+
+
+
+
+## `ModelRef`     {#llmaz-io-v1alpha1-ModelRef}
+
+
+**Appears in:**
+
+
+
+<p>ModelRef refers to a created Model with its role.</p>
+
+
+<table class="table">
+<thead><tr><th width="30%">Field</th><th>Description</th></tr></thead>
+<tbody>
+    
+  
+<tr><td><code>name</code> <B>[Required]</B><br/>
+<a href="#llmaz-io-v1alpha1-ModelName"><code>ModelName</code></a>
+</td>
+<td>
+   <p>Name represents the model name.</p>
+</td>
+</tr>
+<tr><td><code>role</code><br/>
+<a href="#llmaz-io-v1alpha1-ModelRole"><code>ModelRole</code></a>
+</td>
+<td>
+   <p>Role represents the model role once more than one model is required.
+Such as a draft role, which means running with SpeculativeDecoding,
+and default arguments for backend will be searched in backendRuntime
+with the name of speculative-decoding.</p>
+</td>
+</tr>
+</tbody>
+</table>
+
+## `ModelRole`     {#llmaz-io-v1alpha1-ModelRole}
+
+(Alias of `string`)
+
+**Appears in:**
+
+- [ModelRef](#llmaz-io-v1alpha1-ModelRef)
+
+
+
+
+
+## `ModelSource`     {#llmaz-io-v1alpha1-ModelSource}
+
+
+**Appears in:**
+
+- [ModelSpec](#llmaz-io-v1alpha1-ModelSpec)
+
+
+<p>ModelSource represents the source of the model.
+Only one model source will be used.</p>
+
+
+<table class="table">
+<thead><tr><th width="30%">Field</th><th>Description</th></tr></thead>
+<tbody>
+    
+  
+<tr><td><code>modelHub</code><br/>
+<a href="#llmaz-io-v1alpha1-ModelHub"><code>ModelHub</code></a>
+</td>
+<td>
+   <p>ModelHub represents the model registry for model downloads.</p>
+</td>
+</tr>
+<tr><td><code>uri</code><br/>
+<a href="#llmaz-io-v1alpha1-URIProtocol"><code>URIProtocol</code></a>
+</td>
+<td>
+   <p>URI represents various kinds of model sources following the URI protocol, protocol://&lt;address&gt;, e.g.</p>
+<ul>
+<li>oss://&lt;bucket&gt;.&lt;endpoint&gt;/&lt;path-to-your-model&gt;</li>
+<li>ollama://llama3.3</li>
+<li>host://&lt;path&gt;</li>
+</ul>
+</td>
+</tr>
+</tbody>
+</table>
+
+## `ModelSpec`     {#llmaz-io-v1alpha1-ModelSpec}
+
+
+**Appears in:**
+
+- [OpenModel](#llmaz-io-v1alpha1-OpenModel)
+
+
+<p>ModelSpec defines the desired state of Model</p>
+
+
+<table class="table">
+<thead><tr><th width="30%">Field</th><th>Description</th></tr></thead>
+<tbody>
+    
+  
+<tr><td><code>familyName</code> <B>[Required]</B><br/>
+<a href="#llmaz-io-v1alpha1-ModelName"><code>ModelName</code></a>
+</td>
+<td>
+   <p>FamilyName represents the model type, like llama2, which will be auto injected
+to the labels with the key of <code>llmaz.io/model-family-name</code>.</p>
+</td>
+</tr>
+<tr><td><code>source</code> <B>[Required]</B><br/>
+<a href="#llmaz-io-v1alpha1-ModelSource"><code>ModelSource</code></a>
+</td>
+<td>
+   <p>Source represents the source of the model, there're several ways to load
+the model such as loading from huggingface, OCI registry, s3, host path and so on.</p>
+</td>
+</tr>
+<tr><td><code>inferenceConfig</code> <B>[Required]</B><br/>
+<a href="#llmaz-io-v1alpha1-InferenceConfig"><code>InferenceConfig</code></a>
+</td>
+<td>
+   <p>InferenceConfig represents the inference configurations for the model.</p>
+</td>
+</tr>
+</tbody>
+</table>
+
+## `ModelStatus`     {#llmaz-io-v1alpha1-ModelStatus}
+
+
+**Appears in:**
+
+- [OpenModel](#llmaz-io-v1alpha1-OpenModel)
+
+
+<p>ModelStatus defines the observed state of Model</p>
+
+
+<table class="table">
+<thead><tr><th width="30%">Field</th><th>Description</th></tr></thead>
+<tbody>
+    
+  
+<tr><td><code>conditions</code> <B>[Required]</B><br/>
+<a href="https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#condition-v1-meta"><code>[]k8s.io/apimachinery/pkg/apis/meta/v1.Condition</code></a>
+</td>
+<td>
+   <p>Conditions represents the Inference condition.</p>
+</td>
+</tr>
+</tbody>
+</table>
+
+## `URIProtocol`     {#llmaz-io-v1alpha1-URIProtocol}
+
+(Alias of `string`)
+
+**Appears in:**
+
+- [ModelSource](#llmaz-io-v1alpha1-ModelSource)
+
+
+<p>URIProtocol represents the protocol of the URI.</p>
+
+
+
+  
\ No newline at end of file
diff --git a/docs/reference/inference.v1alpha1.md b/docs/reference/inference.v1alpha1.md
new file mode 100644
index 0000000..9eca04a
--- /dev/null
+++ b/docs/reference/inference.v1alpha1.md
@@ -0,0 +1,792 @@
+---
+title: llmaz inference API
+content_type: tool-reference
+package: inference.llmaz.io/v1alpha1
+auto_generated: true
+description: Generated API reference documentation for inference.llmaz.io/v1alpha1.
+---
+
+
+## Resource Types
+
+
+- [Playground](#inference-llmaz-io-v1alpha1-Playground)
+- [Service](#inference-llmaz-io-v1alpha1-Service)
+  
+
+## `Playground`     {#inference-llmaz-io-v1alpha1-Playground}
+
+
+**Appears in:**
+
+
+
+<p>Playground is the Schema for the playgrounds API</p>
+
+
+<table class="table">
+<thead><tr><th width="30%">Field</th><th>Description</th></tr></thead>
+<tbody>
+    
+<tr><td><code>apiVersion</code><br/>string</td><td><code>inference.llmaz.io/v1alpha1</code></td></tr>
+<tr><td><code>kind</code><br/>string</td><td><code>Playground</code></td></tr>
+    
+  
+<tr><td><code>spec</code> <B>[Required]</B><br/>
+<a href="#inference-llmaz-io-v1alpha1-PlaygroundSpec"><code>PlaygroundSpec</code></a>
+</td>
+<td>
+   <span class="text-muted">No description provided.</span></td>
+</tr>
+<tr><td><code>status</code> <B>[Required]</B><br/>
+<a href="#inference-llmaz-io-v1alpha1-PlaygroundStatus"><code>PlaygroundStatus</code></a>
+</td>
+<td>
+   <span class="text-muted">No description provided.</span></td>
+</tr>
+</tbody>
+</table>
+
+## `Service`     {#inference-llmaz-io-v1alpha1-Service}
+
+
+**Appears in:**
+
+
+
+<p>Service is the Schema for the services API</p>
+
+
+<table class="table">
+<thead><tr><th width="30%">Field</th><th>Description</th></tr></thead>
+<tbody>
+    
+<tr><td><code>apiVersion</code><br/>string</td><td><code>inference.llmaz.io/v1alpha1</code></td></tr>
+<tr><td><code>kind</code><br/>string</td><td><code>Service</code></td></tr>
+    
+  
+<tr><td><code>spec</code> <B>[Required]</B><br/>
+<a href="#inference-llmaz-io-v1alpha1-ServiceSpec"><code>ServiceSpec</code></a>
+</td>
+<td>
+   <span class="text-muted">No description provided.</span></td>
+</tr>
+<tr><td><code>status</code> <B>[Required]</B><br/>
+<a href="#inference-llmaz-io-v1alpha1-ServiceStatus"><code>ServiceStatus</code></a>
+</td>
+<td>
+   <span class="text-muted">No description provided.</span></td>
+</tr>
+</tbody>
+</table>
+
+## `BackendName`     {#inference-llmaz-io-v1alpha1-BackendName}
+
+(Alias of `string`)
+
+**Appears in:**
+
+- [BackendRuntimeConfig](#inference-llmaz-io-v1alpha1-BackendRuntimeConfig)
+
+
+
+
+
+## `BackendRuntime`     {#inference-llmaz-io-v1alpha1-BackendRuntime}
+
+
+**Appears in:**
+
+
+
+<p>BackendRuntime is the Schema for the backendRuntime API</p>
+
+
+<table class="table">
+<thead><tr><th width="30%">Field</th><th>Description</th></tr></thead>
+<tbody>
+    
+  
+<tr><td><code>spec</code> <B>[Required]</B><br/>
+<a href="#inference-llmaz-io-v1alpha1-BackendRuntimeSpec"><code>BackendRuntimeSpec</code></a>
+</td>
+<td>
+   <span class="text-muted">No description provided.</span></td>
+</tr>
+<tr><td><code>status</code> <B>[Required]</B><br/>
+<a href="#inference-llmaz-io-v1alpha1-BackendRuntimeStatus"><code>BackendRuntimeStatus</code></a>
+</td>
+<td>
+   <span class="text-muted">No description provided.</span></td>
+</tr>
+</tbody>
+</table>
+
+## `BackendRuntimeArg`     {#inference-llmaz-io-v1alpha1-BackendRuntimeArg}
+
+
+**Appears in:**
+
+- [BackendRuntimeConfig](#inference-llmaz-io-v1alpha1-BackendRuntimeConfig)
+
+- [BackendRuntimeSpec](#inference-llmaz-io-v1alpha1-BackendRuntimeSpec)
+
+
+<p>BackendRuntimeArg is the preset arguments for ease of use.
+Three preset names are provided: default, speculative-decoding, model-parallelism,
+do not change the name.</p>
+
+
+<table class="table">
+<thead><tr><th width="30%">Field</th><th>Description</th></tr></thead>
+<tbody>
+    
+  
+<tr><td><code>name</code><br/>
+<code>string</code>
+</td>
+<td>
+   <p>Name represents the identifier of the backendRuntime argument.</p>
+</td>
+</tr>
+<tr><td><code>flags</code> <B>[Required]</B><br/>
+<code>[]string</code>
+</td>
+<td>
+   <p>Flags represents all the preset configurations.
+A flag wrapped in {{ .CONFIG }} is a configuration waiting to be rendered.</p>
+</td>
+</tr>
+</tbody>
+</table>
+
+## `BackendRuntimeConfig`     {#inference-llmaz-io-v1alpha1-BackendRuntimeConfig}
+
+
+**Appears in:**
+
+- [PlaygroundSpec](#inference-llmaz-io-v1alpha1-PlaygroundSpec)
+
+
+
+<table class="table">
+<thead><tr><th width="30%">Field</th><th>Description</th></tr></thead>
+<tbody>
+    
+  
+<tr><td><code>name</code><br/>
+<a href="#inference-llmaz-io-v1alpha1-BackendName"><code>BackendName</code></a>
+</td>
+<td>
+   <p>Name represents the inference backend under the hood, e.g. vLLM.</p>
+</td>
+</tr>
+<tr><td><code>version</code><br/>
+<code>string</code>
+</td>
+<td>
+   <p>Version represents the backend version if you want a different one
+from the default version.</p>
+</td>
+</tr>
+<tr><td><code>args</code> <B>[Required]</B><br/>
+<a href="#inference-llmaz-io-v1alpha1-BackendRuntimeArg"><code>BackendRuntimeArg</code></a>
+</td>
+<td>
+   <p>Args represents the specified arguments of the backendRuntime,
+will be appended to the backendRuntime.spec.Args.</p>
+</td>
+</tr>
+<tr><td><code>envs</code><br/>
+<a href="https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#envvar-v1-core"><code>[]k8s.io/api/core/v1.EnvVar</code></a>
+</td>
+<td>
+   <p>Envs represents the environments set to the container.</p>
+</td>
+</tr>
+<tr><td><code>resources</code> <B>[Required]</B><br/>
+<a href="#inference-llmaz-io-v1alpha1-ResourceRequirements"><code>ResourceRequirements</code></a>
+</td>
+<td>
+   <p>Resources represents the resource requirements for backend, like cpu/mem,
+accelerators like GPU should not be defined here, but at the model flavors,
+or the values here will be overwritten.</p>
+</td>
+</tr>
+</tbody>
+</table>
+
+## `BackendRuntimeSpec`     {#inference-llmaz-io-v1alpha1-BackendRuntimeSpec}
+
+
+**Appears in:**
+
+- [BackendRuntime](#inference-llmaz-io-v1alpha1-BackendRuntime)
+
+
+<p>BackendRuntimeSpec defines the desired state of BackendRuntime</p>
+
+
+<table class="table">
+<thead><tr><th width="30%">Field</th><th>Description</th></tr></thead>
+<tbody>
+    
+  
+<tr><td><code>commands</code><br/>
+<code>[]string</code>
+</td>
+<td>
+   <p>Commands represents the default commands for the backendRuntime.</p>
+</td>
+</tr>
+<tr><td><code>multiHostCommands</code><br/>
+<a href="#inference-llmaz-io-v1alpha1-MultiHostCommands"><code>MultiHostCommands</code></a>
+</td>
+<td>
+   <p>MultiHostCommands represents leader and worker commands for nodes with
+different roles.</p>
+</td>
+</tr>
+<tr><td><code>image</code> <B>[Required]</B><br/>
+<code>string</code>
+</td>
+<td>
+   <p>Image represents the default image registry of the backendRuntime.
+It will work together with version to make up a real image.</p>
+</td>
+</tr>
+<tr><td><code>version</code> <B>[Required]</B><br/>
+<code>string</code>
+</td>
+<td>
+   <p>Version represents the default version of the backendRuntime.
+It will be appended to the image as a tag.</p>
+</td>
+</tr>
+<tr><td><code>args</code> <B>[Required]</B><br/>
+<a href="#inference-llmaz-io-v1alpha1-BackendRuntimeArg"><code>[]BackendRuntimeArg</code></a>
+</td>
+<td>
+   <p>Args represents the preset arguments of the backendRuntime.
+They can be appended or overwritten by the Playground backendRuntimeConfig.</p>
+</td>
+</tr>
+<tr><td><code>envs</code><br/>
+<a href="https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#envvar-v1-core"><code>[]k8s.io/api/core/v1.EnvVar</code></a>
+</td>
+<td>
+   <p>Envs represents the environments set to the container.</p>
+</td>
+</tr>
+<tr><td><code>resources</code> <B>[Required]</B><br/>
+<a href="#inference-llmaz-io-v1alpha1-ResourceRequirements"><code>ResourceRequirements</code></a>
+</td>
+<td>
+   <p>Resources represents the resource requirements for backendRuntime, like cpu/mem,
+accelerators like GPU should not be defined here, but at the model flavors,
+or the values here will be overwritten.</p>
+</td>
+</tr>
+<tr><td><code>livenessProbe</code><br/>
+<a href="https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#probe-v1-core"><code>k8s.io/api/core/v1.Probe</code></a>
+</td>
+<td>
+   <p>Periodic probe of backend liveness.
+Backend will be restarted if the probe fails.
+Cannot be updated.</p>
+</td>
+</tr>
+<tr><td><code>readinessProbe</code><br/>
+<a href="https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#probe-v1-core"><code>k8s.io/api/core/v1.Probe</code></a>
+</td>
+<td>
+   <p>Periodic probe of backend readiness.
+Backend will be removed from service endpoints if the probe fails.</p>
+</td>
+</tr>
+<tr><td><code>startupProbe</code><br/>
+<a href="https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#probe-v1-core"><code>k8s.io/api/core/v1.Probe</code></a>
+</td>
+<td>
+   <p>StartupProbe indicates that the Backend has successfully initialized.
+If specified, no other probes are executed until this completes successfully.
+If this probe fails, the backend will be restarted, just as if the livenessProbe failed.
+This can be used to provide different probe parameters at the beginning of a backend's lifecycle,
+when it might take a long time to load data or warm a cache, than during steady-state operation.</p>
+</td>
+</tr>
+<tr><td><code>scaleTriggers</code><br/>
+<a href="#inference-llmaz-io-v1alpha1-NamedScaleTrigger"><code>[]NamedScaleTrigger</code></a>
+</td>
+<td>
+   <p>ScaleTriggers represents a set of triggers preset to be used by Playground.
+If the Playground does not specify a scale trigger, the trigger at index 0 will be used.</p>
+</td>
+</tr>
+</tbody>
+</table>
+
+## `BackendRuntimeStatus`     {#inference-llmaz-io-v1alpha1-BackendRuntimeStatus}
+
+
+**Appears in:**
+
+- [BackendRuntime](#inference-llmaz-io-v1alpha1-BackendRuntime)
+
+
+<p>BackendRuntimeStatus defines the observed state of BackendRuntime</p>
+
+
+<table class="table">
+<thead><tr><th width="30%">Field</th><th>Description</th></tr></thead>
+<tbody>
+    
+  
+<tr><td><code>conditions</code> <B>[Required]</B><br/>
+<a href="https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#condition-v1-meta"><code>[]k8s.io/apimachinery/pkg/apis/meta/v1.Condition</code></a>
+</td>
+<td>
+   <p>Conditions represents the Inference condition.</p>
+</td>
+</tr>
+</tbody>
+</table>
+
+## `ElasticConfig`     {#inference-llmaz-io-v1alpha1-ElasticConfig}
+
+
+**Appears in:**
+
+- [PlaygroundSpec](#inference-llmaz-io-v1alpha1-PlaygroundSpec)
+
+
+
+<table class="table">
+<thead><tr><th width="30%">Field</th><th>Description</th></tr></thead>
+<tbody>
+    
+  
+<tr><td><code>minReplicas</code><br/>
+<code>int32</code>
+</td>
+<td>
+   <p>MinReplicas indicates the minimum number of inference workloads based on the traffic.
+Default to 1.
+MinReplicas couldn't be 0 now, will support serverless in the future.</p>
+</td>
+</tr>
+<tr><td><code>maxReplicas</code><br/>
+<code>int32</code>
+</td>
+<td>
+   <p>MaxReplicas indicates the maximum number of inference workloads based on the traffic.
+Default to nil means there's no limit for the instance number.</p>
+</td>
+</tr>
+<tr><td><code>scaleTriggerRef</code><br/>
+<a href="#inference-llmaz-io-v1alpha1-ScaleTriggerRef"><code>ScaleTriggerRef</code></a>
+</td>
+<td>
+   <p>ScaleTriggerRef refers to the configured scaleTrigger in the backendRuntime
+with tuned target value.
+ScaleTriggerRef and ScaleTrigger can't be set at the same time.</p>
+</td>
+</tr>
+<tr><td><code>scaleTrigger</code><br/>
+<a href="#inference-llmaz-io-v1alpha1-ScaleTrigger"><code>ScaleTrigger</code></a>
+</td>
+<td>
+   <p>ScaleTrigger defines a set of triggers to scale the workloads.
+If not defined, trigger configured in backendRuntime will be used,
+otherwise, trigger defined here will overwrite the defaulted ones.
+ScaleTriggerRef and ScaleTrigger can't be set at the same time.</p>
+</td>
+</tr>
+</tbody>
+</table>
+
+## `HPATrigger`     {#inference-llmaz-io-v1alpha1-HPATrigger}
+
+
+**Appears in:**
+
+- [NamedScaleTrigger](#inference-llmaz-io-v1alpha1-NamedScaleTrigger)
+
+- [ScaleTrigger](#inference-llmaz-io-v1alpha1-ScaleTrigger)
+
+
+<p>HPATrigger represents the configuration of the HorizontalPodAutoscaler.
+Inspired by kubernetes.io/pkg/apis/autoscaling/types.go#HorizontalPodAutoscalerSpec.
+Note: the HPA component should be installed beforehand.</p>
+
+
+<table class="table">
+<thead><tr><th width="30%">Field</th><th>Description</th></tr></thead>
+<tbody>
+    
+  
+<tr><td><code>metrics</code><br/>
+<a href="https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#metricspec-v2-autoscaling"><code>[]k8s.io/api/autoscaling/v2.MetricSpec</code></a>
+</td>
+<td>
+   <p>metrics contains the specifications for which to use to calculate the
+desired replica count (the maximum replica count across all metrics will
+be used).  The desired replica count is calculated multiplying the
+ratio between the target value and the current value by the current
+number of pods.  Ergo, metrics used must decrease as the pod count is
+increased, and vice-versa.  See the individual metric source types for
+more information about how each type of metric must respond.</p>
+</td>
+</tr>
+<tr><td><code>behavior</code><br/>
+<a href="https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#horizontalpodautoscalerbehavior-v2-autoscaling"><code>k8s.io/api/autoscaling/v2.HorizontalPodAutoscalerBehavior</code></a>
+</td>
+<td>
+   <p>behavior configures the scaling behavior of the target
+in both Up and Down directions (scaleUp and scaleDown fields respectively).
+If not set, the default HPAScalingRules for scale up and scale down are used.</p>
+</td>
+</tr>
+</tbody>
+</table>
+
+## `MultiHostCommands`     {#inference-llmaz-io-v1alpha1-MultiHostCommands}
+
+
+**Appears in:**
+
+- [BackendRuntimeSpec](#inference-llmaz-io-v1alpha1-BackendRuntimeSpec)
+
+
+<p>MultiHostCommands represents leader &amp; worker commands for multiple nodes scenarios.</p>
+
+
+<table class="table">
+<thead><tr><th width="30%">Field</th><th>Description</th></tr></thead>
+<tbody>
+    
+  
+<tr><td><code>leader</code> <B>[Required]</B><br/>
+<code>[]string</code>
+</td>
+<td>
+   <span class="text-muted">No description provided.</span></td>
+</tr>
+<tr><td><code>worker</code> <B>[Required]</B><br/>
+<code>[]string</code>
+</td>
+<td>
+   <span class="text-muted">No description provided.</span></td>
+</tr>
+</tbody>
+</table>
+
+## `NamedScaleTrigger`     {#inference-llmaz-io-v1alpha1-NamedScaleTrigger}
+
+
+**Appears in:**
+
+- [BackendRuntimeSpec](#inference-llmaz-io-v1alpha1-BackendRuntimeSpec)
+
+
+<p>NamedScaleTrigger defines the rules to scale the workloads.
+Only one trigger could work at a time. The name is used to identify
+the trigger in backendRuntime.</p>
+
+
+<table class="table">
+<thead><tr><th width="30%">Field</th><th>Description</th></tr></thead>
+<tbody>
+    
+  
+<tr><td><code>name</code> <B>[Required]</B><br/>
+<code>string</code>
+</td>
+<td>
+   <p>Name represents the identifier of the scale trigger, e.g. some triggers defined for
+latency sensitive workloads, some are defined for throughput sensitive workloads.</p>
+</td>
+</tr>
+<tr><td><code>hpa</code> <B>[Required]</B><br/>
+<a href="#inference-llmaz-io-v1alpha1-HPATrigger"><code>HPATrigger</code></a>
+</td>
+<td>
+   <p>HPA represents the trigger configuration of the HorizontalPodAutoscaler.</p>
+</td>
+</tr>
+</tbody>
+</table>
+
+## `PlaygroundSpec`     {#inference-llmaz-io-v1alpha1-PlaygroundSpec}
+
+
+**Appears in:**
+
+- [Playground](#inference-llmaz-io-v1alpha1-Playground)
+
+
+<p>PlaygroundSpec defines the desired state of Playground</p>
+
+
+<table class="table">
+<thead><tr><th width="30%">Field</th><th>Description</th></tr></thead>
+<tbody>
+    
+  
+<tr><td><code>replicas</code><br/>
+<code>int32</code>
+</td>
+<td>
+   <p>Replicas represents the replica number of inference workloads.</p>
+</td>
+</tr>
+<tr><td><code>modelClaim</code><br/>
+<a href="#llmaz-io-v1alpha1-ModelClaim"><code>ModelClaim</code></a>
+</td>
+<td>
+   <p>ModelClaim represents claiming for one model, it's a simplified use case
+of modelClaims. Most of the time, modelClaim is enough.
+ModelClaim and modelClaims are mutually exclusive.</p>
+</td>
+</tr>
+<tr><td><code>modelClaims</code><br/>
+<a href="#llmaz-io-v1alpha1-ModelClaims"><code>ModelClaims</code></a>
+</td>
+<td>
+   <p>ModelClaims represents claiming for multiple models for more complicated
+use cases like speculative-decoding.
+ModelClaims and modelClaim are mutually exclusive.</p>
+</td>
+</tr>
+<tr><td><code>backendRuntimeConfig</code><br/>
+<a href="#inference-llmaz-io-v1alpha1-BackendRuntimeConfig"><code>BackendRuntimeConfig</code></a>
+</td>
+<td>
+   <p>BackendRuntimeConfig represents the inference backendRuntime configuration
+under the hood, e.g. vLLM, which is the default backendRuntime.</p>
+</td>
+</tr>
+<tr><td><code>elasticConfig</code><br/>
+<a href="#inference-llmaz-io-v1alpha1-ElasticConfig"><code>ElasticConfig</code></a>
+</td>
+<td>
+   <p>ElasticConfig defines the configuration for elastic usage,
+e.g. the max/min replicas.
+Note: this requires HPA to be installed first, otherwise an error will be reported.</p>
+</td>
+</tr>
+</tbody>
+</table>
+
+## `PlaygroundStatus`     {#inference-llmaz-io-v1alpha1-PlaygroundStatus}
+
+
+**Appears in:**
+
+- [Playground](#inference-llmaz-io-v1alpha1-Playground)
+
+
+<p>PlaygroundStatus defines the observed state of Playground</p>
+
+
+<table class="table">
+<thead><tr><th width="30%">Field</th><th>Description</th></tr></thead>
+<tbody>
+    
+  
+<tr><td><code>conditions</code> <B>[Required]</B><br/>
+<a href="https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#condition-v1-meta"><code>[]k8s.io/apimachinery/pkg/apis/meta/v1.Condition</code></a>
+</td>
+<td>
+   <p>Conditions represents the Inference condition.</p>
+</td>
+</tr>
+<tr><td><code>replicas</code> <B>[Required]</B><br/>
+<code>int32</code>
+</td>
+<td>
+   <p>Replicas track the replicas that have been created, whether ready or not.</p>
+</td>
+</tr>
+<tr><td><code>selector</code> <B>[Required]</B><br/>
+<code>string</code>
+</td>
+<td>
+   <p>Selector points to the string form of a label selector which will be used by HPA.</p>
+</td>
+</tr>
+</tbody>
+</table>
+
+## `ResourceRequirements`     {#inference-llmaz-io-v1alpha1-ResourceRequirements}
+
+
+**Appears in:**
+
+- [BackendRuntimeConfig](#inference-llmaz-io-v1alpha1-BackendRuntimeConfig)
+
+- [BackendRuntimeSpec](#inference-llmaz-io-v1alpha1-BackendRuntimeSpec)
+
+
+<p>TODO: Do not support DRA yet, we can support that once needed.</p>
+
+
+<table class="table">
+<thead><tr><th width="30%">Field</th><th>Description</th></tr></thead>
+<tbody>
+    
+  
+<tr><td><code>limits</code><br/>
+<a href="https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#resourcelist-v1-core"><code>k8s.io/api/core/v1.ResourceList</code></a>
+</td>
+<td>
+   <p>Limits describes the maximum amount of compute resources allowed.
+More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/</p>
+</td>
+</tr>
+<tr><td><code>requests</code><br/>
+<a href="https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#resourcelist-v1-core"><code>k8s.io/api/core/v1.ResourceList</code></a>
+</td>
+<td>
+   <p>Requests describes the minimum amount of compute resources required.
+If Requests is omitted for a container, it defaults to Limits if that is explicitly specified,
+otherwise to an implementation-defined value. Requests cannot exceed Limits.
+More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/</p>
+</td>
+</tr>
+</tbody>
+</table>
+
+## `ScaleTrigger`     {#inference-llmaz-io-v1alpha1-ScaleTrigger}
+
+
+**Appears in:**
+
+- [ElasticConfig](#inference-llmaz-io-v1alpha1-ElasticConfig)
+
+
+<p>ScaleTrigger defines the rules to scale the workloads.
+Only one trigger could work at a time, mostly used in Playground.</p>
+
+
+<table class="table">
+<thead><tr><th width="30%">Field</th><th>Description</th></tr></thead>
+<tbody>
+    
+  
+<tr><td><code>hpa</code> <B>[Required]</B><br/>
+<a href="#inference-llmaz-io-v1alpha1-HPATrigger"><code>HPATrigger</code></a>
+</td>
+<td>
+   <p>HPA represents the trigger configuration of the HorizontalPodAutoscaler.</p>
+</td>
+</tr>
+</tbody>
+</table>
+
+## `ScaleTriggerRef`     {#inference-llmaz-io-v1alpha1-ScaleTriggerRef}
+
+
+**Appears in:**
+
+- [ElasticConfig](#inference-llmaz-io-v1alpha1-ElasticConfig)
+
+
+<p>ScaleTriggerRef refers to the configured scaleTrigger in the backendRuntime.</p>
+
+
+<table class="table">
+<thead><tr><th width="30%">Field</th><th>Description</th></tr></thead>
+<tbody>
+    
+  
+<tr><td><code>name</code> <B>[Required]</B><br/>
+<code>string</code>
+</td>
+<td>
+   <p>Name represents the scale trigger name defined in the backendRuntime.scaleTriggers.</p>
+</td>
+</tr>
+</tbody>
+</table>
+
+## `ServiceSpec`     {#inference-llmaz-io-v1alpha1-ServiceSpec}
+
+
+**Appears in:**
+
+- [Service](#inference-llmaz-io-v1alpha1-Service)
+
+
+<p>ServiceSpec defines the desired state of Service.
+Service controller will maintain multi-flavor of workloads with
+different accelerators for cost or performance considerations.</p>
+
+
+<table class="table">
+<thead><tr><th width="30%">Field</th><th>Description</th></tr></thead>
+<tbody>
+    
+  
+<tr><td><code>modelClaims</code> <B>[Required]</B><br/>
+<a href="#llmaz-io-v1alpha1-ModelClaims"><code>ModelClaims</code></a>
+</td>
+<td>
+   <p>ModelClaims represents multiple claims for different models.</p>
+</td>
+</tr>
+<tr><td><code>workloadTemplate</code> <B>[Required]</B><br/>
+<code>sigs.k8s.io/lws/api/leaderworkerset/v1.LeaderWorkerSetSpec</code>
+</td>
+<td>
+   <p>WorkloadTemplate defines the underlying workload layout and configuration.
+Note: the LWS spec might be twisted with various LWS instances to support
+accelerator fungibility or other cutting-edge research.
+LWS supports both single-host and multi-host scenarios, for single host
+cases, only need to care about replicas, rolloutStrategy and workerTemplate.</p>
+</td>
+</tr>
+</tbody>
+</table>
+
+## `ServiceStatus`     {#inference-llmaz-io-v1alpha1-ServiceStatus}
+
+
+**Appears in:**
+
+- [Service](#inference-llmaz-io-v1alpha1-Service)
+
+
+<p>ServiceStatus defines the observed state of Service</p>
+
+
+<table class="table">
+<thead><tr><th width="30%">Field</th><th>Description</th></tr></thead>
+<tbody>
+    
+  
+<tr><td><code>conditions</code> <B>[Required]</B><br/>
+<a href="https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#condition-v1-meta"><code>[]k8s.io/apimachinery/pkg/apis/meta/v1.Condition</code></a>
+</td>
+<td>
+   <p>Conditions represents the Inference condition.</p>
+</td>
+</tr>
+<tr><td><code>replicas</code> <B>[Required]</B><br/>
+<code>int32</code>
+</td>
+<td>
+   <p>Replicas track the replicas that have been created, whether ready or not.</p>
+</td>
+</tr>
+<tr><td><code>selector</code> <B>[Required]</B><br/>
+<code>string</code>
+</td>
+<td>
+   <p>Selector points to the string form of a label selector, the HPA will be
+able to autoscale your resource.</p>
+</td>
+</tr>
+</tbody>
+</table>
+  
\ No newline at end of file
diff --git a/hack/genref/config.yaml b/hack/genref/config.yaml
new file mode 100644
index 0000000..427e154
--- /dev/null
+++ b/hack/genref/config.yaml
@@ -0,0 +1,23 @@
+hiddenMemberFields:  # member fields to hide from the generated field tables
+  - "TypeMeta"
+  - "ObjectMeta"
+
+apis:  # one entry per API group to document
+  - name: core
+    title: llmaz core API
+    package: github.com/inftyai/llmaz
+    path: api/core/v1alpha1
+  - name: inference
+    title: llmaz inference API
+    package: github.com/inftyai/llmaz
+    path: api/inference/v1alpha1
+
+externalPackages:  # link targets for types from other packages; each match is a regular expression
+  - match: ^k8s\.io/(api|apimachinery/pkg/apis)/
+    target: https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#{{- lower .TypeIdentifier -}}-{{- arrIndex .PackageSegments -1 -}}-{{- arrIndex .PackageSegments -2 -}}
+  - match: ^k8s\.io/apimachinery/pkg/api/resource\.Quantity$
+    target: https://pkg.go.dev/k8s.io/apimachinery/pkg/api/resource#Quantity
+  - match: ^k8s\.io/component-base/config/v1alpha1\.  # dots escaped so they only match a literal "."
+    target: https://pkg.go.dev/k8s.io/component-base/config/v1alpha1#{{- .TypeIdentifier -}}
+  - match: ^time\.Duration$
+    target: https://pkg.go.dev/time#Duration
diff --git a/hack/genref/markdown/members.tpl b/hack/genref/markdown/members.tpl
new file mode 100644
index 0000000..b970218
--- /dev/null
+++ b/hack/genref/markdown/members.tpl
@@ -0,0 +1,34 @@
+{{ define "members" }}
+  {{/* . is an apiType; one table row is rendered per member that is not hidden */}}
+  {{- range .GetMembers -}}
+    {{/* . is an apiMember */}}
+    {{- if not .Hidden }}
+<tr><td><code>{{ .FieldName }}</code>
+      {{- if not .IsOptional }} <B>[Required]</B>{{- end -}}
+<br/>
+{{/* Type cell: rendered as a link when the type has a .Link, as plain code otherwise */}}
+      {{- with .GetType -}}
+        {{- if .Link -}}
+<a href="{{ .Link }}"><code>{{ .DisplayName }}</code></a>
+        {{- else -}}
+<code>{{ .DisplayName }}</code>
+        {{- end -}}
+      {{- end }}
+</td>
+<td>
+   {{- if .IsInline -}}
+(Members of <code>{{ .FieldName }}</code> are embedded into this type.)
+   {{- end }}
+   {{ if .GetComment -}}
+   {{ .GetComment }}
+   {{- else -}}
+   <span class="text-muted">No description provided.</span>
+   {{- end }}
+   {{- if eq (.GetType.Name.Name) "ObjectMeta" -}}
+Refer to the Kubernetes API documentation for the fields of the <code>metadata</code> field.
+   {{- end -}}
+</td>
+</tr>
+    {{- end }}
+  {{- end }}
+{{ end }}
diff --git a/hack/genref/markdown/pkg.tpl b/hack/genref/markdown/pkg.tpl
new file mode 100644
index 0000000..313b069
--- /dev/null
+++ b/hack/genref/markdown/pkg.tpl
@@ -0,0 +1,46 @@
+{{ define "packages" -}}
+
+{{- range $idx, $val := .packages -}}
+{{/* Front matter is emitted only for a package whose .IsMain is set */}}
+  {{- if .IsMain -}}
+---
+title: {{ .Title }}
+content_type: tool-reference
+package: {{ .DisplayName }}
+auto_generated: true
+description: Generated API reference documentation for {{ if ne .GroupName "" -}} {{ .DisplayName }}{{ else -}} llmaz Configuration{{- end -}}.
+---
+{{ .GetComment -}}
+  {{- end -}}
+{{- end }}
+
+## Resource Types
+
+{{ range .packages -}}
+  {{ $isConfig := (eq .GroupName "") }}
+  {{- range .VisibleTypes -}}
+    {{- if or .IsExported (and $isConfig (eq .DisplayName "Configuration")) }}
+- [{{ .DisplayName }}]({{ .Link }})
+    {{- end -}}
+  {{- end -}}
+{{- end -}}
+
+{{ range .packages }}
+  {{ if ne .GroupName "" -}}
+    {{/* For a package with a group name, render every visible type that is referenced or exported. */}}
+    {{- range .VisibleTypes }}
+      {{- if or .Referenced .IsExported -}}
+{{ template "type" . }}
+      {{- end -}}
+    {{ end }}
+  {{ else }}
+    {{/* For a package w/o group name, $isConfig is always true in this branch, so every visible type is rendered. */}}
+    {{ $isConfig := (eq .GroupName "") }}
+    {{- range .VisibleTypes -}}
+      {{- if or .Referenced $isConfig -}}
+{{ template "type" . }}
+      {{- end -}}
+    {{- end }}
+  {{- end }}
+{{- end }}
+{{- end }}
diff --git a/hack/genref/markdown/type.tpl b/hack/genref/markdown/type.tpl
new file mode 100644
index 0000000..f64dbe6
--- /dev/null
+++ b/hack/genref/markdown/type.tpl
@@ -0,0 +1,37 @@
+{{ define "type" }}
+
+## `{{ .Name.Name }}`     {#{{ .Anchor }}}
+
+{{ if eq .Kind "Alias" -}}
+(Alias of `{{ .Underlying }}`)
+{{ end }}
+
+{{- with .References }}
+**Appears in:**
+{{ range . }}
+{{ if or .Referenced .IsExported -}}
+- [{{ .DisplayName }}]({{ .Link }})
+{{ end -}}
+{{- end -}}
+{{- end }}
+
+{{ if .GetComment -}}
+{{ .GetComment }}
+{{ end }}
+{{ if .GetMembers -}}
+<table class="table">
+<thead><tr><th width="30%">Field</th><th>Description</th></tr></thead>
+<tbody>
+    {{/* . is an apiType; the field table is only rendered when it has members */}}
+    {{- if .IsExported -}}
+{{/* Types with .IsExported set get apiVersion and kind rows ahead of the member rows */}}
+<tr><td><code>apiVersion</code><br/>string</td><td><code>{{- .APIGroup -}}</code></td></tr>
+<tr><td><code>kind</code><br/>string</td><td><code>{{- .Name.Name -}}</code></td></tr>
+    {{ end -}}
+
+{{/* The member rows themselves are produced by the "members" template */}}
+{{- template "members" . -}}
+</tbody>
+</table>
+{{- end -}}
+{{- end -}}

Field	Description
`apiVersion` string	`llmaz.io/v1alpha1`
`kind` string	`OpenModel`
`spec` [Required] +`ModelSpec` +	+ No description provided.
`status` [Required] +`ModelStatus` +	+ No description provided.
Field	Description
`name` [Required] +`FlavorName` +	+ Name represents the flavor name, which will be used in model claim. +
`requests` +`k8s.io/api/core/v1.ResourceList` +	+ Requests defines the required accelerators to serve the model for each replica, +like <nvidia.com/gpu: 8>. For multi-hosts cases, the requests here indicates +the resource requirements for each replica, usually equals to the TP size. +Not recommended to set the cpu and memory usage here: + + if using playground, you can define the cpu/mem usage at backendConfig. + if using inference service, you can define the cpu/mem at the container resources. +However, if you define the same accelerator requests at playground/service as well, +the requests will be overwritten by the flavor requests. + +
`nodeSelector` +`map[string]string` +	+ NodeSelector represents the node candidates for Pod placements, if a node doesn't +meet the nodeSelector, it will be filtered out in the resourceFungibility scheduler plugin. +If nodeSelector is empty, it means every node is a candidate. +
`params` +`map[string]string` +	+ Params stores other useful parameters and will be consumed by cluster-autoscaler / Karpenter +for autoscaling or be defined as model parallelism parameters like TP or PP size. +E.g. with autoscaling, when scaling up nodes with 8x Nvidia A100, the parameter can be injected +with <INSTANCE-TYPE: p4d.24xlarge> for AWS. +Preset parameters: TP, PP, INSTANCE-TYPE. +
Field	Description
`name` +`string` +	+ Name refers to the model registry, such as huggingface. +
`modelID` [Required] +`string` +	+ ModelID refers to the model identifier on model hub, +such as meta-llama/Meta-Llama-3-8B. +
`filename` [Required] +`string` +	+ Filename refers to a specified model file rather than the whole repo. +This is helpful to download a specified GGUF model rather than downloading +the whole repo which includes all kinds of quantized models. +TODO: this is only supported with Huggingface, add support for ModelScope +in the near future. +Note: once filename is set, allowPatterns and ignorePatterns should be left unset. +
`revision` +`string` +	+ Revision refers to a Git revision id which can be a branch name, a tag, or a commit hash. +
`allowPatterns` +`[]string` +	+ AllowPatterns refers to files matched with at least one pattern will be downloaded. +
`ignorePatterns` +`[]string` +	+ IgnorePatterns refers to files matched with any of the patterns will not be downloaded. +
Field	Description
`name` [Required] +`ModelName` +	+ Name represents the model name. +
`role` +`ModelRole` +	+ Role represents the model role once more than one model is required. +Such as a draft role, which means running with SpeculativeDecoding, +and default arguments for backend will be searched in backendRuntime +with the name of speculative-decoding. +
Field	Description
`familyName` [Required] +`ModelName` +	+ FamilyName represents the model type, like llama2, which will be auto injected +to the labels with the key of `llmaz.io/model-family-name`. +
`source` [Required] +`ModelSource` +	+ Source represents the source of the model, there're several ways to load +the model such as loading from huggingface, OCI registry, s3, host path and so on. +
`inferenceConfig` [Required] +`InferenceConfig` +	+ InferenceConfig represents the inference configurations for the model. +
Field	Description
`apiVersion` string	`inference.llmaz.io/v1alpha1`
`kind` string	`Playground`
`spec` [Required] +`PlaygroundSpec` +	+ No description provided.
`status` [Required] +`PlaygroundStatus` +	+ No description provided.
Field	Description
`spec` [Required] +`BackendRuntimeSpec` +	+ No description provided.
`status` [Required] +`BackendRuntimeStatus` +	+ No description provided.
Field	Description
`name` +`string` +	+ Name represents the identifier of the backendRuntime argument. +
`flags` [Required] +`[]string` +	+ Flags represents all the preset configurations. +Flag around with {{ .CONFIG }} is a configuration waiting for render. +
Field	Description
`name` +`BackendName` +	+ Name represents the inference backend under the hood, e.g. vLLM. +
`version` +`string` +	+ Version represents the backend version if you want a different one +from the default version. +
`args` [Required] +`BackendRuntimeArg` +	+ Args represents the specified arguments of the backendRuntime, +will be appended to the backendRuntime.spec.Args. +
`envs` +`[]k8s.io/api/core/v1.EnvVar` +	+ Envs represents the environments set to the container. +
`resources` [Required] +`ResourceRequirements` +	+ Resources represents the resource requirements for backend, like cpu/mem, +accelerators like GPU should not be defined here, but at the model flavors, +or the values here will be overwritten. +
Field	Description
`commands` +`[]string` +	+ Commands represents the default commands for the backendRuntime. +
`multiHostCommands` +`MultiHostCommands` +	+ MultiHostCommands represents leader and worker commands for nodes with +different roles. +
`image` [Required] +`string` +	+ Image represents the default image registry of the backendRuntime. +It will work together with version to make up a real image. +
`version` [Required] +`string` +	+ Version represents the default version of the backendRuntime. +It will be appended to the image as a tag. +
`args` [Required] +`[]BackendRuntimeArg` +	+ Args represents the preset arguments of the backendRuntime. +They can be appended or overwritten by the Playground backendRuntimeConfig. +
`envs` +`[]k8s.io/api/core/v1.EnvVar` +	+ Envs represents the environments set to the container. +
`resources` [Required] +`ResourceRequirements` +	+ Resources represents the resource requirements for backendRuntime, like cpu/mem, +accelerators like GPU should not be defined here, but at the model flavors, +or the values here will be overwritten. +
`livenessProbe` +`k8s.io/api/core/v1.Probe` +	+ Periodic probe of backend liveness. +Backend will be restarted if the probe fails. +Cannot be updated. +
`readinessProbe` +`k8s.io/api/core/v1.Probe` +	+ Periodic probe of backend readiness. +Backend will be removed from service endpoints if the probe fails. +
`startupProbe` +`k8s.io/api/core/v1.Probe` +	+ StartupProbe indicates that the Backend has successfully initialized. +If specified, no other probes are executed until this completes successfully. +If this probe fails, the backend will be restarted, just as if the livenessProbe failed. +This can be used to provide different probe parameters at the beginning of a backend's lifecycle, +when it might take a long time to load data or warm a cache, than during steady-state operation. +
`scaleTriggers` +`[]NamedScaleTrigger` +	+ ScaleTriggers represents a set of triggers preset to be used by Playground. +If the Playground does not specify a scale trigger, the trigger at index 0 will be used. +