chore(example): add example for gpus on ske cluster #10
14 changed files with 256 additions and 9 deletions
1
.gitignore
vendored
1
.gitignore
vendored
|
|
@ -67,3 +67,4 @@ go.work.sum
|
|||
### Jetbrains
|
||||
.idea
|
||||
ssh
|
||||
keys
|
||||
|
|
|
|||
|
|
@ -14,7 +14,7 @@
|
|||
|
||||
variable "stackit_project_id" {
|
||||
type = string
|
||||
default = "d75e6aab-b616-4b42-ae3b-aaf161ad626d"
|
||||
default = "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"
|
||||
}
|
||||
|
||||
variable "stackit_region" {
|
||||
|
|
|
|||
|
|
@ -29,5 +29,5 @@ variable "zone" {
|
|||
variable "STACKIT_PROJECT_ID" {
|
||||
type = string
|
||||
description = "STACKIT Project ID"
|
||||
default = "16ec118f-90d0-466d-8393-99eea504c536"
|
||||
default = "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"
|
||||
}
|
||||
|
|
|
|||
|
|
@ -33,7 +33,7 @@ resource "stackit_network_interface" "nic" {
|
|||
|
||||
data "stackit_security_group" "default" {
|
||||
project_id = var.STACKIT_PROJECT_ID
|
||||
security_group_id = "a6b4708e-b8ee-48ba-b084-a4892e9a73af"
|
||||
security_group_id = "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"
|
||||
}
|
||||
|
||||
data "stackit_network" "default" {
|
||||
|
|
|
|||
|
|
@ -24,7 +24,7 @@ variable "stackit_service_account_key_path" {
|
|||
|
||||
variable "stackit_org_id" {
|
||||
type = string
|
||||
default = "03a34540-3c1a-4794-b2c6-7111ecf824ef"
|
||||
default = "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"
|
||||
}
|
||||
|
||||
variable "owner_email" {
|
||||
|
|
|
|||
|
|
@ -14,7 +14,7 @@
|
|||
|
||||
variable "stackit_project_id" {
|
||||
type = string
|
||||
default = "d75e6aab-b616-4b42-ae3b-aaf161ad626d"
|
||||
default = "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"
|
||||
}
|
||||
|
||||
variable "stackit_region" {
|
||||
|
|
|
|||
63
examples/ske-gpu-operator/.terraform.lock.hcl
generated
Normal file
63
examples/ske-gpu-operator/.terraform.lock.hcl
generated
Normal file
|
|
@ -0,0 +1,63 @@
|
|||
# This file is maintained automatically by "terraform init".
|
||||
# Manual edits may be lost in future updates.
|
||||
|
||||
provider "registry.terraform.io/hashicorp/helm" {
|
||||
version = "3.1.1"
|
||||
hashes = [
|
||||
"h1:47CqNwkxctJtL/N/JuEj+8QMg8mRNI/NWeKO5/ydfZU=",
|
||||
"zh:1a6d5ce931708aec29d1f3d9e360c2a0c35ba5a54d03eeaff0ce3ca597cd0275",
|
||||
"zh:3411919ba2a5941801e677f0fea08bdd0ae22ba3c9ce3309f55554699e06524a",
|
||||
"zh:81b36138b8f2320dc7f877b50f9e38f4bc614affe68de885d322629dd0d16a29",
|
||||
"zh:95a2a0a497a6082ee06f95b38bd0f0d6924a65722892a856cfd914c0d117f104",
|
||||
"zh:9d3e78c2d1bb46508b972210ad706dd8c8b106f8b206ecf096cd211c54f46990",
|
||||
"zh:a79139abf687387a6efdbbb04289a0a8e7eaca2bd91cdc0ce68ea4f3286c2c34",
|
||||
"zh:aaa8784be125fbd50c48d84d6e171d3fb6ef84a221dbc5165c067ce05faab4c8",
|
||||
"zh:afecd301f469975c9d8f350cc482fe656e082b6ab0f677d1a816c3c615837cc1",
|
||||
"zh:c54c22b18d48ff9053d899d178d9ffef7d9d19785d9bf310a07d648b7aac075b",
|
||||
"zh:db2eefd55aea48e73384a555c72bac3f7d428e24147bedb64e1a039398e5b903",
|
||||
"zh:ee61666a233533fd2be971091cecc01650561f1585783c381b6f6e8a390198a4",
|
||||
"zh:f569b65999264a9416862bca5cd2a6177d94ccb0424f3a4ef424428912b9cb3c",
|
||||
]
|
||||
}
|
||||
|
||||
provider "registry.terraform.io/hashicorp/kubernetes" {
|
||||
version = "3.0.1"
|
||||
constraints = ">= 2.14.0"
|
||||
hashes = [
|
||||
"h1:P0c8knzZnouTNFIRij8IS7+pqd0OKaFDYX0j4GRsiqo=",
|
||||
"zh:02d55b0b2238fd17ffa12d5464593864e80f402b90b31f6e1bd02249b9727281",
|
||||
"zh:20b93a51bfeed82682b3c12f09bac3031f5bdb4977c47c97a042e4df4fb2f9ba",
|
||||
"zh:6e14486ecfaee38c09ccf33d4fdaf791409f90795c1b66e026c226fad8bc03c7",
|
||||
"zh:8d0656ff422df94575668e32c310980193fccb1c28117e5c78dd2d4050a760a6",
|
||||
"zh:9795119b30ec0c1baa99a79abace56ac850b6e6fbce60e7f6067792f6eb4b5f4",
|
||||
"zh:b388c87acc40f6bd9620f4e23f01f3c7b41d9b88a68d5255dec0a72f0bdec249",
|
||||
"zh:b59abd0a980649c2f97f172392f080eaeb18e486b603f83bf95f5d93aeccc090",
|
||||
"zh:ba6e3060fddf4a022087d8f09e38aa0001c705f21170c2ded3d1c26c12f70d97",
|
||||
"zh:c12626d044b1d5501cf95ca78cbe507c13ad1dd9f12d4736df66eb8e5f336eb8",
|
||||
"zh:c55203240d50f4cdeb3df1e1760630d677679f5b1a6ffd9eba23662a4ad05119",
|
||||
"zh:ea206a5a32d6e0d6e32f1849ad703da9a28355d9c516282a8458b5cf1502b2a1",
|
||||
"zh:f569b65999264a9416862bca5cd2a6177d94ccb0424f3a4ef424428912b9cb3c",
|
||||
]
|
||||
}
|
||||
|
||||
provider "registry.terraform.io/stackitcloud/stackit" {
|
||||
version = "0.91.0"
|
||||
constraints = "> 0.60.0"
|
||||
hashes = [
|
||||
"h1:8de9n+Roq6Z2Ltp9poBBBN9a4zSpx73VLpgFS5mTyoI=",
|
||||
"zh:0dde99e7b343fa01f8eefc378171fb8621bedb20f59157d6cc8e3d46c738105f",
|
||||
"zh:0ed12db90276ccd2d6f87135b7dd078657823c3ca33121c6a157d0bdf08f801e",
|
||||
"zh:160b32bcf1d01666784cf8469e10e0a38d4c3d24c80c0c5be470cc63ef27ea62",
|
||||
"zh:32e1909037235c24138b74131c6fb12ac99003f79750f1768ca5468cc05da6b0",
|
||||
"zh:4376f1cdafbb35ad5f220e28153741908390b23161d9eae3828f7830039ce8ef",
|
||||
"zh:458b054781ef6165d9136fc3d667f9bf37319e37d0f19300bbb63b703de2599d",
|
||||
"zh:54a1864cf1315a118c043f834e02f2a1ca0ecbc8c2a246460589a95847da6c80",
|
||||
"zh:83424712926ccef3c60cc011dfa298721bdbaee3598a0c8459da46bc6b7424cc",
|
||||
"zh:a3c38ebffdbca21dd177b06acf891bed1a903907ba252d0219d91ff0ecf9d861",
|
||||
"zh:c6325e583b77aa1e9df94e3b4b12479d7bf12c66a2ace71c1b8f64e46ac5c37e",
|
||||
"zh:de6db8deeee895af5670df2449c8b8c34df051277f8a6e2f19c5c9ec1f0ddb12",
|
||||
"zh:e18b05e7d8356caa6103c5c80b5ea373be3ff255b453cf577c68798ffe1b93ce",
|
||||
"zh:f4d9215f7a2888c882892642539b2edd3ea97cb25904e4fa358db4f001c3ccd0",
|
||||
"zh:f94d0c0c2bf843867122ababc8d8066d52257e68bbcb5c62a603f77c581e9668",
|
||||
]
|
||||
}
|
||||
9
examples/ske-gpu-operator/MAINTAINERS.md
Normal file
9
examples/ske-gpu-operator/MAINTAINERS.md
Normal file
|
|
@ -0,0 +1,9 @@
|
|||
# Maintainers
|
||||
|
||||
General maintainers:
|
||||
|
||||
- Mauritz Uphoff (mauritz.uphoff@digits.schwarz)
|
||||
|
||||
This example is actively maintained. The owner is responsible for reviewing and updating its dependencies and functionality on a monthly basis.
|
||||
For questions, issues, or feature requests, please email the general maintainers listed above.
|
||||
Please include the BP name and version in your request. We will track your request as an issue.
|
||||
7
examples/ske-gpu-operator/README.md
Normal file
7
examples/ske-gpu-operator/README.md
Normal file
|
|
@ -0,0 +1,7 @@
|
|||
# SKE Kubernetes GPU Operator Installation
|
||||
|
||||
## Overview
|
||||
|
||||
This example demonstrates how to deploy a STACKIT Kubernetes Engine (SKE) cluster with an NVIDIA H100 node pool and install the NVIDIA GPU Operator on it.
|
||||
|
||||
**Note:** Currently, GPU-enabled node pools on SKE are only supported when using Ubuntu as the node operating system.
|
||||
10
examples/ske-gpu-operator/gpu-operator-values.yaml.tftpl
Normal file
10
examples/ske-gpu-operator/gpu-operator-values.yaml.tftpl
Normal file
|
|
@ -0,0 +1,10 @@
|
|||
dcgm:
|
||||
enabled: true
|
||||
|
||||
dcgmExporter:
|
||||
enabled: true
|
||||
serviceMonitor:
|
||||
enabled: true
|
||||
additionalLabels:
|
||||
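# This label must match the ServiceMonitor selector of your Prometheus instance so that it scrapes the DCGM exporter (assumes a kube-prometheus-stack release named "kube-prometheus-stack"; adjust if yours differs).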
|
||||
release: kube-prometheus-stack
|
||||
157
examples/ske-gpu-operator/main.tf
Normal file
157
examples/ske-gpu-operator/main.tf
Normal file
|
|
@ -0,0 +1,157 @@
|
|||
# Copyright 2026 Schwarz Digits Cloud GmbH & Co. KG
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
terraform {
|
||||
required_providers {
|
||||
stackit = {
|
||||
source = "stackitcloud/stackit"
|
||||
version = ">=0.60.0"
|
||||
}
|
||||
kubernetes = {
|
||||
source = "hashicorp/kubernetes"
|
||||
version = ">=2.14.0"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
variable "project_id" {
|
||||
default = "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"
|
||||
}
|
||||
|
||||
variable "stackit_service_account_key_path" {
|
||||
default = ""
|
||||
}
|
||||
|
||||
provider "kubernetes" {
|
||||
host = yamldecode(stackit_ske_kubeconfig.this.kube_config).clusters.0.cluster.server
|
||||
client_certificate = base64decode(yamldecode(stackit_ske_kubeconfig.this.kube_config).users.0.user.client-certificate-data)
|
||||
client_key = base64decode(yamldecode(stackit_ske_kubeconfig.this.kube_config).users.0.user.client-key-data)
|
||||
cluster_ca_certificate = base64decode(yamldecode(stackit_ske_kubeconfig.this.kube_config).clusters.0.cluster.certificate-authority-data)
|
||||
}
|
||||
|
||||
provider "helm" {
|
||||
kubernetes = {
|
||||
host = yamldecode(stackit_ske_kubeconfig.this.kube_config).clusters.0.cluster.server
|
||||
client_certificate = base64decode(yamldecode(stackit_ske_kubeconfig.this.kube_config).users.0.user.client-certificate-data)
|
||||
client_key = base64decode(yamldecode(stackit_ske_kubeconfig.this.kube_config).users.0.user.client-key-data)
|
||||
cluster_ca_certificate = base64decode(yamldecode(stackit_ske_kubeconfig.this.kube_config).clusters.0.cluster.certificate-authority-data)
|
||||
}
|
||||
}
|
||||
|
||||
provider "stackit" {
|
||||
default_region = "eu01"
|
||||
service_account_key_path = var.stackit_service_account_key_path
|
||||
}
|
||||
|
||||
resource "stackit_ske_kubeconfig" "this" {
|
||||
project_id = var.project_id
|
||||
cluster_name = stackit_ske_cluster.this.name
|
||||
refresh = true
|
||||
|
||||
depends_on = [stackit_ske_cluster.this]
|
||||
}
|
||||
|
||||
data "stackit_ske_kubernetes_versions" "this" {
|
||||
version_state = "SUPPORTED"
|
||||
}
|
||||
|
||||
data "stackit_ske_machine_image_versions" "this" {
|
||||
version_state = "SUPPORTED"
|
||||
}
|
||||
|
||||
locals {
|
||||
flatcar_supported_version = one(flatten([
|
||||
for mi in data.stackit_ske_machine_image_versions.this.machine_images : [
|
||||
for v in mi.versions :
|
||||
v.version
|
||||
if mi.name == "flatcar"
|
||||
]
|
||||
]))
|
||||
ubuntu_supported_version = one(flatten([
|
||||
for mi in data.stackit_ske_machine_image_versions.this.machine_images : [
|
||||
for v in mi.versions :
|
||||
v.version
|
||||
if mi.name == "ubuntu"
|
||||
]
|
||||
]))
|
||||
gpu_operator_helm_values = templatefile("${path.module}/gpu-operator-values.yaml.tftpl", {})
|
||||
}
|
||||
|
||||
resource "stackit_ske_cluster" "this" {
|
||||
project_id = var.project_id
|
||||
name = "ske-gpu"
|
||||
kubernetes_version_min = data.stackit_ske_kubernetes_versions.this.kubernetes_versions.0.version
|
||||
|
||||
maintenance = {
|
||||
enable_kubernetes_version_updates = true
|
||||
enable_machine_image_version_updates = true
|
||||
start = "01:00:00Z"
|
||||
end = "02:00:00Z"
|
||||
}
|
||||
|
||||
node_pools = [
|
||||
{
|
||||
name = "standard"
|
||||
machine_type = "g2i.4"
|
||||
minimum = "3"
|
||||
maximum = "9"
|
||||
max_surge = "3"
|
||||
availability_zones = ["eu01-1", "eu01-2", "eu01-3"]
|
||||
os_version_min = local.flatcar_supported_version
|
||||
os_name = "flatcar"
|
||||
volume_size = 150
|
||||
volume_type = "storage_premium_perf6"
|
||||
},
|
||||
{
|
||||
name = "gpu-pool-h100-2"
|
||||
machine_type = "n3.14d.g1"
|
||||
os_version_min = local.ubuntu_supported_version
|
||||
os_name = "ubuntu"
|
||||
minimum = "1"
|
||||
maximum = "1"
|
||||
max_surge = "1"
|
||||
availability_zones = ["eu01-2"]
|
||||
volume_size = 150
|
||||
volume_type = "storage_premium_perf6"
|
||||
labels = {
|
||||
"dedicated" = "gpu"
|
||||
}
|
||||
taints = [
|
||||
{
|
||||
effect = "NoSchedule"
|
||||
key = "nvidia.com/gpu"
|
||||
value = "true"
|
||||
},
|
||||
]
|
||||
},
|
||||
]
|
||||
}
|
||||
|
||||
resource "kubernetes_namespace_v1" "gpu_operator" {
|
||||
metadata {
|
||||
name = "gpu-operator"
|
||||
}
|
||||
}
|
||||
|
||||
resource "helm_release" "gpu_operator" {
|
||||
name = "gpu-operator"
|
||||
namespace = kubernetes_namespace_v1.gpu_operator.metadata[0].name
|
||||
repository = "https://helm.ngc.nvidia.com/nvidia"
|
||||
chart = "gpu-operator"
|
||||
version = "25.3.1"
|
||||
|
||||
values = [
|
||||
local.gpu_operator_helm_values
|
||||
]
|
||||
}
|
||||
|
|
@ -14,7 +14,7 @@
|
|||
|
||||
variable "stackit_project_id" {
|
||||
type = string
|
||||
default = "d75e6aab-b616-4b42-ae3b-aaf161ad626d"
|
||||
default = "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"
|
||||
}
|
||||
|
||||
variable "stackit_region" {
|
||||
|
|
|
|||
|
|
@ -41,11 +41,11 @@ variable "LOCAL_SUBNET" {
|
|||
variable "STACKIT_PROJECT_ID" {
|
||||
type = string
|
||||
description = "STACKIT Project ID"
|
||||
default = "16ec118f-90d0-466d-8393-99eea504c536"
|
||||
default = "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"
|
||||
}
|
||||
|
||||
variable "STACKIT_ORG_ID" {
|
||||
type = string
|
||||
description = "STACKIT Org ID"
|
||||
default = "03a34540-3c1a-4794-b2c6-7111ecf824ef"
|
||||
default = "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"
|
||||
}
|
||||
|
|
|
|||
|
|
@ -22,7 +22,7 @@ resource "stackit_resourcemanager_project" "sfs-no-folder" {
|
|||
}
|
||||
|
||||
resource "stackit_resourcemanager_project" "sfs-folder" {
|
||||
parent_container_id = "bc229fa8-4be4-42d5-8808-514fe6d39074" #Folder ID Demos
|
||||
parent_container_id = "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" #Folder ID Demos
|
||||
name = "sfs-example-folder"
|
||||
labels = {
|
||||
"networkArea" = stackit_network_area.sfs.network_area_id
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue