Skip to content

Commit

Permalink
Merge pull request #33 from kthcloud/23-gpu-booking
Browse files Browse the repository at this point in the history
Add Dockerfiles for GPU Booking Pod Terminator and Admission Controller; update GPU booking template and enhance KUBECONFIG handling in MAIA scripts
  • Loading branch information
SimoneBendazzoli93 authored Feb 9, 2025
2 parents fcce02c + be60714 commit 1453407
Show file tree
Hide file tree
Showing 12 changed files with 555 additions and 9 deletions.
16 changes: 16 additions & 0 deletions MAIA/dashboard_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -188,6 +188,22 @@ def get_namespaces(id_token, api_urls, private_clusters = []):
for API_URL in api_urls:
if API_URL in private_clusters:
token = private_clusters[API_URL]
try:
response = requests.get(API_URL + "/api/v1/namespaces",
headers={"Authorization": "Bearer {}".format(token)}, verify=False)
except:
continue
else:
try:
response = requests.get(API_URL + "/api/v1/namespaces",
headers={"Authorization": "Bearer {}".format(id_token)}, verify=False)
except:
continue
namespaces = json.loads(response.text)
for namespace in namespaces['items']:
namespace_list.append(namespace['metadata']['name'])
return list(set(namespace_list))

def get_cluster_status(id_token, api_urls, cluster_names, private_clusters = []):
"""
Retrieve the status of clusters and their nodes.
Expand Down
10 changes: 10 additions & 0 deletions MAIA/maia_admin.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,8 @@ def get_minio_config_if_exists(project_id):
- "console_secret_key": The console secret key, if found.
- "secret_key": The MinIO root password, if found.
"""
if not "KUBECONFIG_LOCAL" in os.environ:
os.environ["KUBECONFIG_LOCAL"] = os.environ["KUBECONFIG"]
kubeconfig = yaml.safe_load(Path(os.environ["KUBECONFIG_LOCAL"]).read_text())
config.load_kube_config_from_dict(kubeconfig)

Expand Down Expand Up @@ -146,6 +148,8 @@ def get_mlflow_config_if_exists(project_id):
kubernetes.client.exceptions.ApiException
If there is an error communicating with the Kubernetes API.
"""
if not "KUBECONFIG_LOCAL" in os.environ:
os.environ["KUBECONFIG_LOCAL"] = os.environ["KUBECONFIG"]
kubeconfig = yaml.safe_load(Path(os.environ["KUBECONFIG_LOCAL"]).read_text())
config.load_kube_config_from_dict(kubeconfig)

Expand Down Expand Up @@ -216,6 +220,8 @@ def get_mysql_config_if_exists(project_id):
variable "KUBECONFIG" and that the MySQL deployment name starts with the project ID followed
by "-mysql-mkg".
"""
if not "KUBECONFIG_LOCAL" in os.environ:
os.environ["KUBECONFIG_LOCAL"] = os.environ["KUBECONFIG"]
kubeconfig = yaml.safe_load(Path(os.environ["KUBECONFIG_LOCAL"]).read_text())
config.load_kube_config_from_dict(kubeconfig)

Expand Down Expand Up @@ -994,6 +1000,10 @@ def create_maia_dashboard_values(config_folder, project_id, cluster_config_dict,
"paths": [
{ "path": "/maia/",
"pathType": "Prefix"
},
{
"path": "/maia-api/",
"pathType": "Prefix"
}
]
}
Expand Down
20 changes: 19 additions & 1 deletion MAIA/maia_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -706,7 +706,7 @@ def create_gpu_operator_values(config_folder, project_id, cluster_config_dict):

gpu_operator_values = {
"namespace": "gpu-operator",
"chart_version": "24.9.1",
"chart_version": "24.3.0",
"repo_url": "https://helm.ngc.nvidia.com/nvidia",
"chart_name": "gpu-operator"
} # TODO: Change this to updated values
Expand All @@ -732,13 +732,31 @@ def create_gpu_operator_values(config_folder, project_id, cluster_config_dict):
}
]
}


elif cluster_config_dict["k8s_distribution"] == "rke2":
gpu_operator_values["toolkit"] = {
"driver": {
"enabled": False
},
"env": [
{
"name": "CONTAINERD_SOCKET",
"value": "/run/k3s/containerd/containerd.sock"
}
,
{
"name": "CONTAINERD_CONFIG",
"value": "/var/lib/rancher/rke2/agent/etc/containerd/config.toml.tmpl"
},
{
"name": "CONTAINERD_RUNTIME_CLASS",
"value": "nvidia"
},
{
"name": "CONTAINERD_SET_AS_DEFAULT",
"value": "true"
}
]
}

Expand Down
10 changes: 8 additions & 2 deletions MAIA/maia_fn.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,8 @@ def get_ssh_port_dict(port_type,namespace,port_range, maia_metallb_ip=None):
A list of dictionaries with service names as keys and their corresponding used SSH ports as values.
Returns None if an exception occurs.
"""

if not "KUBECONFIG_LOCAL" in os.environ:
os.environ["KUBECONFIG_LOCAL"] = os.environ["KUBECONFIG"]
kubeconfig = yaml.safe_load(Path(os.environ["KUBECONFIG_LOCAL"]).read_text())
config.load_kube_config_from_dict(kubeconfig)

Expand Down Expand Up @@ -132,6 +133,8 @@ def get_ssh_ports(n_requested_ports, port_type, ip_range, maia_metallb_ip=None):
None
If an error occurs during the process.
"""
if not "KUBECONFIG_LOCAL" in os.environ:
os.environ["KUBECONFIG_LOCAL"] = os.environ["KUBECONFIG"]
kubeconfig = yaml.safe_load(Path(os.environ["KUBECONFIG_LOCAL"]).read_text())
config.load_kube_config_from_dict(kubeconfig)

Expand Down Expand Up @@ -370,6 +373,8 @@ def deploy_mysql(cluster_config, user_config, config_folder, mysql_configs):
A dictionary containing deployment details such as namespace, release name, chart name, repository URL, version, and values file path.
"""
namespace = user_config["group_ID"].lower().replace("_", "-")
if not "KUBECONFIG_LOCAL" in os.environ:
os.environ["KUBECONFIG_LOCAL"] = os.environ["KUBECONFIG"]
kubeconfig = yaml.safe_load(Path(os.environ["KUBECONFIG_LOCAL"]).read_text())

mysql_config = {
Expand Down Expand Up @@ -445,7 +450,8 @@ def deploy_mlflow(cluster_config, user_config, config_folder, mysql_config=None,
A dictionary containing deployment details such as namespace, release name, chart name, repository URL, chart version, and path to the values file.
"""
namespace = user_config["group_ID"].lower().replace("_", "-")

if not "KUBECONFIG_LOCAL" in os.environ:
os.environ["KUBECONFIG_LOCAL"] = os.environ["KUBECONFIG"]
kubeconfig = yaml.safe_load(Path(os.environ["KUBECONFIG_LOCAL"]).read_text())
config.load_kube_config_from_dict(kubeconfig)

Expand Down
24 changes: 21 additions & 3 deletions dashboard/apps/gpu_scheduler/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,11 @@ class GPUSchedulabilityAPIView(APIView):
def post(self, request, *args, **kwargs):
try:
user_email = request.data.get("user_email")
namespace = request.data.get("namespace")
if not user_email:
return Response({"error": "Missing user_email"}, status=400)
if not namespace:
return Response({"error": "Missing namespace"}, status=400)

secret_token = request.data.get("token")
if not secret_token or secret_token != settings.SECRET_KEY:
Expand All @@ -41,6 +44,7 @@ def post(self, request, *args, **kwargs):
# Calculate the number of days for the new booking
ending_time = datetime.strptime(booking_data["ending_time"], "%Y-%m-%d %H:%M:%S")
starting_time = datetime.strptime(booking_data["starting_time"], "%Y-%m-%d %H:%M:%S")
gpu = booking_data["gpu"]
new_booking_days = (ending_time - starting_time).days

# Verify that the sum of existing bookings and the new booking does not exceed 60 days
Expand All @@ -51,7 +55,9 @@ def post(self, request, *args, **kwargs):
GPUBooking.objects.create(
user_email=user_email,
start_date=booking_data["starting_time"],
end_date=booking_data["ending_time"]
end_date=booking_data["ending_time"],
namespace=namespace,
gpu=gpu
)
return Response({"message": "Booking created successfully"})

Expand All @@ -62,7 +68,7 @@ def post(self, request, *args, **kwargs):

current_time = datetime.now(timezone.utc)
is_schedulable = any(
status.start_date <= current_time and status.end_date >= current_time
status.start_date <= current_time and status.end_date >= current_time and status.namespace == namespace
for status in user_statuses
)
if is_schedulable:
Expand Down Expand Up @@ -119,9 +125,21 @@ def book_gpu(request):

@login_required(login_url="/maia/login/")
def gpu_booking_info(request):
    """
    Render the GPU booking overview page for the logged-in user.

    Superusers are offered the namespaces returned by ``get_namespaces``;
    every other user is offered one namespace per group they belong to
    (excluding the generic ``MAIA:users`` group), derived from the last
    ``:``-separated part of the group name, lower-cased and with ``_``
    replaced by ``-``.

    Parameters
    ----------
    request : HttpRequest
        The incoming request. The OIDC ID token is read from the session key
        ``oidc_id_token`` and the bookings are filtered by ``request.user.email``.

    Returns
    -------
    HttpResponse
        The rendered ``accounts/gpu_booking_info.html`` template with the
        context keys ``namespaces``, ``dashboard_version``, ``bookings`` and
        ``total_days``.
    """
    id_token = request.session.get('oidc_id_token')

    if request.user.is_superuser:
        namespaces = get_namespaces(id_token)
    else:
        namespaces = [
            str(group).split(":")[-1].lower().replace("_", "-")
            for group in request.user.groups.all()
            if str(group) != "MAIA:users"
        ]

    bookings = GPUBooking.objects.filter(user_email=request.user.email)

    # Whole days covered by all of the user's bookings.
    total_days = sum((booking.end_date - booking.start_date).days for booking in bookings)

    # Single return: the context must carry "namespaces" for the template; the
    # older return without it would have made this one unreachable.
    context = {
        "namespaces": namespaces,
        "dashboard_version": settings.DASHBOARD_VERSION,
        "bookings": bookings,
        "total_days": total_days,
    }
    return render(request, "accounts/gpu_booking_info.html", context)
6 changes: 3 additions & 3 deletions dashboard/apps/templates/accounts/gpu_booking.html
Original file line number Diff line number Diff line change
Expand Up @@ -68,11 +68,11 @@ <h5>Register a New GPU Booking</h5>
<span class="text-danger">{{ form.namespace.errors }}</span>
</div>

<div class="mb-3">
<div class="mb-3">
<label>Email</label>
{{ form.user_email }}
{{ form.user_email|attr:"disabled:true" }}
<span class="text-danger">{{ form.email.errors }}</span>
</div>
</div>

<div class="mb-3">
<label>GPU Request</label>
Expand Down
33 changes: 33 additions & 0 deletions docker/GPU_Booking_Admission_Controller/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# Multi-stage build for the GPU Booking admission webhook server.
# Stage 1 compiles webhook.go into a Linux binary; stage 2 copies only that
# binary into an Alpine-based runtime image.

# Build stage: official Go 1.23 toolchain image
FROM golang:1.23 AS builder

# Set working directory inside the container
WORKDIR /app

# Copy the Go module files first and download dependencies, so this layer can
# be reused from the build cache when only the source code changes
COPY go.mod go.sum ./
RUN go mod download

# Copy the webhook source code
COPY webhook.go ./

# Build the webhook server binary.
# CGO is disabled so the binary does not link against the builder's C library
# and can run on the Alpine image used below.
RUN CGO_ENABLED=0 GOOS=linux go build -o webhook webhook.go

# Runtime stage: small Alpine base image for the final container.
# NOTE(review): the `latest` tag is not pinned, so rebuilds may pick up a
# different Alpine release — consider pinning a specific version.
FROM alpine:latest

WORKDIR /root/

# Copy the compiled binary
COPY --from=builder /app/webhook .

# TLS certificates are not baked into the image (the COPY lines below are
# disabled). The CMD expects them at /tls/tls.crt and /tls/tls.key, presumably
# mounted at runtime (e.g. from a Kubernetes Secret) — TODO confirm.
#COPY certs/tls.crt /etc/webhook/tls.crt
#COPY certs/tls.key /etc/webhook/tls.key

# Expose the webhook port
EXPOSE 443

# Run the webhook server, pointing it at the TLS certificate and key files
CMD ["/root/webhook", "--tls-cert=/tls/tls.crt", "--tls-key=/tls/tls.key"]
28 changes: 28 additions & 0 deletions docker/GPU_Booking_Admission_Controller/go.mod
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
// Go module definition for the GPU Booking admission controller webhook.
// NOTE(review): the module path "webhook.go" looks like a file name rather
// than an import path — confirm this is intentional.
module webhook.go

go 1.23.0

toolchain go1.23.6

// Direct dependency.
require k8s.io/api v0.32.1

// Transitive dependencies (each marked "// indirect").
require (
github.com/fxamacker/cbor/v2 v2.7.0 // indirect
github.com/go-logr/logr v1.4.2 // indirect
github.com/gogo/protobuf v1.3.2 // indirect
github.com/google/gofuzz v1.2.0 // indirect
github.com/json-iterator/go v1.1.12 // indirect
github.com/kr/text v0.2.0 // indirect
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
github.com/modern-go/reflect2 v1.0.2 // indirect
github.com/x448/float16 v0.8.4 // indirect
golang.org/x/net v0.30.0 // indirect
golang.org/x/text v0.19.0 // indirect
gopkg.in/inf.v0 v0.9.1 // indirect
k8s.io/apimachinery v0.32.1 // indirect
k8s.io/klog/v2 v2.130.1 // indirect
k8s.io/utils v0.0.0-20241104100929-3ea5e8cea738 // indirect
sigs.k8s.io/json v0.0.0-20241010143419-9aa6b5e7a4b3 // indirect
sigs.k8s.io/structured-merge-diff/v4 v4.4.2 // indirect
sigs.k8s.io/yaml v1.4.0 // indirect
)
Loading

0 comments on commit 1453407

Please sign in to comment.