Skip to content

Commit

Permalink
Add commands to install the Node Feature Discovery Operator and the N…
Browse files Browse the repository at this point in the history
…VIDIA GPU Operator

Signed-off-by: Jens Müller <[email protected]>
  • Loading branch information
jensmueller-com committed Apr 29, 2024
1 parent 86b52b7 commit ab08c48
Show file tree
Hide file tree
Showing 5 changed files with 462 additions and 0 deletions.
15 changes: 15 additions & 0 deletions cpo/commands/cluster/operator/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Copyright 2024 IBM Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Description of this package, stored in the module's ``__doc__`` attribute.
# It is assigned explicitly rather than written as a docstring literal, so the
# text is still present when Python runs with -OO (which strips docstrings).
# NOTE(review): presumably consumed as the help text of the "operator" command
# group -- confirm against the code that loads the command packages.
__doc__ = "Commands to install various operators"
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
# Copyright 2024 IBM Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Optional

import click

import cpo.config.cluster_credentials_manager
import cpo.lib.click.utils
import cpo.utils.network

from cpo.lib.ansible.openshift_playbook_runner import OpenShiftPlaybookRunner
from cpo.lib.openshift.openshift_api_manager import OpenShiftAPIManager
from cpo.lib.openshift.utils.click import openshift_server_options
from cpo.utils.logging import loglevel_command


@loglevel_command(
    context_settings=cpo.lib.click.utils.create_default_map_from_dict(
        cpo.config.cluster_credentials_manager.cluster_credentials_manager.get_current_credentials()
    )
)
@openshift_server_options
@click.option("--project", default="openshift-nfd", help="OpenShift project", show_default=True)
@click.pass_context
def install_node_feature_discovery_operator(
    ctx: click.Context,
    server: Optional[str],
    username: Optional[str],
    password: Optional[str],
    token: Optional[str],
    insecure_skip_tls_verify: Optional[bool],
    use_cluster: Optional[str],
    project: str,
):
    """Install the Node Feature Discovery Operator"""

    # The snapshot has to be taken before any other local name is bound so
    # that it contains exactly the command parameters and nothing else.
    command_options = locals().copy()
    credentials = cpo.lib.click.utils.get_cluster_credentials(ctx, command_options)

    # The playbook receives the cluster version as "<major>.<minor>"; it uses
    # it as the tag of the operand image.
    server_version = OpenShiftAPIManager(credentials).get_version()
    playbook_variables = {
        "openshift_server_version": f"{server_version.major}.{server_version.minor}",
        "project": project,
    }

    playbook_runner = OpenShiftPlaybookRunner(
        "install_node_feature_discovery_operator_playbook.yaml",
        credentials,
        variables=playbook_variables,
    )
    playbook_runner.run_playbook()
56 changes: 56 additions & 0 deletions cpo/commands/cluster/operator/install_nvidia_gpu_operator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
# Copyright 2024 IBM Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Optional

import click

import cpo.config.cluster_credentials_manager
import cpo.lib.click.utils
import cpo.utils.network

from cpo.lib.ansible.openshift_playbook_runner import OpenShiftPlaybookRunner
from cpo.lib.openshift.utils.click import openshift_server_options
from cpo.utils.logging import loglevel_command


@loglevel_command(
    context_settings=cpo.lib.click.utils.create_default_map_from_dict(
        cpo.config.cluster_credentials_manager.cluster_credentials_manager.get_current_credentials()
    )
)
@openshift_server_options
@click.option("--project", default="nvidia-gpu-operator", help="OpenShift project", show_default=True)
@click.pass_context
def install_nvidia_gpu_operator(
    ctx: click.Context,
    server: Optional[str],
    username: Optional[str],
    password: Optional[str],
    token: Optional[str],
    insecure_skip_tls_verify: Optional[bool],
    use_cluster: Optional[str],
    project: str,
):
    """Install the NVIDIA GPU Operator"""

    # The snapshot has to be taken before any other local name is bound so
    # that it contains exactly the command parameters and nothing else.
    command_options = locals().copy()
    credentials = cpo.lib.click.utils.get_cluster_credentials(ctx, command_options)

    # The target project is the only variable handed to the playbook.
    playbook_variables = {"project": project}

    playbook_runner = OpenShiftPlaybookRunner(
        "install_nvidia_gpu_operator_playbook.yaml",
        credentials,
        variables=playbook_variables,
    )
    playbook_runner.run_playbook()
Original file line number Diff line number Diff line change
@@ -0,0 +1,151 @@
# Copyright 2024 IBM Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
---
# Single play that runs on the control node itself (local connection against
# localhost); the tasks reach the cluster through the kubeconfig passed in as
# the "kube_config" variable.
- connection: local
hosts: localhost
# The tasks below only reference playbook variables, so host fact gathering
# is switched off.
gather_facts: false
tasks:
# Ensures the target project exists before any namespaced resource is created
# in it.
- name: "Create Namespace resource (name: {{ project }})"
  kubernetes.core.k8s:
    kubeconfig: "{{ kube_config }}"
    resource_definition:
      # A manifest carries its API version under "apiVersion"; "api_version"
      # is the spelling of the module parameter and is not a manifest field.
      apiVersion: v1
      kind: Namespace
      metadata:
        name: "{{ project }}"
    state: present

# Creates the OperatorGroup that scopes the operator to the target project.
- name: "Create OperatorGroup resource (name: openshift-nfd)"
  kubernetes.core.k8s:
    kubeconfig: "{{ kube_config }}"
    resource_definition:
      apiVersion: operators.coreos.com/v1
      kind: OperatorGroup
      metadata:
        # Only "name" is set: "generateName" is ignored by the API server
        # whenever an explicit name is provided.
        name: openshift-nfd
        namespace: "{{ project }}"
      spec:
        targetNamespaces:
          # Must follow the project the group lives in; a hard-coded
          # "openshift-nfd" breaks installations using a custom --project.
          - "{{ project }}"
    state: present

# Subscribes the target project to the "nfd" package from the
# "redhat-operators" catalog source (stable channel, automatic install plan
# approval).
- name: "Create Subscription resource (name: nfd)"
  kubernetes.core.k8s:
    state: present
    kubeconfig: "{{ kube_config }}"
    resource_definition:
      kind: Subscription
      apiVersion: operators.coreos.com/v1alpha1
      metadata:
        namespace: "{{ project }}"
        name: nfd
      spec:
        name: nfd
        channel: stable
        source: redhat-operators
        sourceNamespace: openshift-marketplace
        installPlanApproval: Automatic

# Custom module (implementation not shown here); going by its name and the
# task title it waits until the listed CRDs exist -- the following task
# creates a resource of kind NodeFeatureDiscovery.
- name: "Waiting for creation of custom resource definitions"
  wait_for_custom_resource_definitions:
    custom_resource_definitions: ["NodeFeatureDiscovery", "NodeFeatureRule"]
    kubeconfig: "{{ kube_config }}"

# Creates the NodeFeatureDiscovery custom resource "nfd-instance" in the
# target project. The operand image tag is built from the
# "openshift_server_version" playbook variable.
# Everything below each "configData: |" is a literal block scalar: the lines,
# including the '#' lines inside it, are passed on as text, not parsed here.
# NOTE(review): kconfigFile "/path/to/kconfig" and the "example_kmod3" rule
# look like placeholders taken over from a sample manifest -- confirm that
# they are intended for real installations.
- name: "Create NodeFeatureDiscovery resource (name: nfd-instance)"
kubernetes.core.k8s:
kubeconfig: "{{ kube_config }}"
resource_definition:
apiVersion: nfd.openshift.io/v1
kind: NodeFeatureDiscovery
metadata:
name: nfd-instance
namespace: "{{ project }}"
spec:
instance: "" # instance is empty by default
topologyupdater: false # False by default
operand:
image: "registry.redhat.io/openshift4/ose-node-feature-discovery:v{{openshift_server_version}}"
imagePullPolicy: Always
workerConfig:
configData: |
core:
# labelWhiteList:
# noPublish: false
sleepInterval: 60s
# sources: [all]
# klog:
# addDirHeader: false
# alsologtostderr: false
# logBacktraceAt:
# logtostderr: true
# skipHeaders: false
# stderrthreshold: 2
# v: 0
# vmodule:
## NOTE: the following options are not dynamically run-time configurable
## and require a nfd-worker restart to take effect after being changed
# logDir:
# logFile:
# logFileMaxSize: 1800
# skipLogHeaders: false
sources:
cpu:
cpuid:
# NOTE: whitelist has priority over blacklist
attributeBlacklist:
- "BMI1"
- "BMI2"
- "CLMUL"
- "CMOV"
- "CX16"
- "ERMS"
- "F16C"
- "HTT"
- "LZCNT"
- "MMX"
- "MMXEXT"
- "NX"
- "POPCNT"
- "RDRAND"
- "RDSEED"
- "RDTSCP"
- "SGX"
- "SSE"
- "SSE2"
- "SSE3"
- "SSE4.1"
- "SSE4.2"
- "SSSE3"
attributeWhitelist:
kernel:
kconfigFile: "/path/to/kconfig"
configOpts:
- "NO_HZ"
- "X86"
- "DMI"
pci:
deviceClassWhitelist:
- "0200"
- "03"
- "12"
deviceLabelFields:
- "vendor"
customConfig:
configData: |
- name: "more.kernel.features"
matchOn:
- loadedKMod: ["example_kmod3"]
Loading

0 comments on commit ab08c48

Please sign in to comment.