chore(example): add example for gpus on ske cluster #10

Merged
mauritz.uphoff merged 2 commits from example/gpus-on-ske into main 2026-04-16 11:42:55 +00:00
14 changed files with 256 additions and 9 deletions

1
.gitignore vendored
View file

@ -67,3 +67,4 @@ go.work.sum
### Jetbrains
.idea
ssh
keys

View file

@ -14,7 +14,7 @@
variable "stackit_project_id" {
type = string
default = "d75e6aab-b616-4b42-ae3b-aaf161ad626d"
default = "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"
}
variable "stackit_region" {

View file

@ -29,5 +29,5 @@ variable "zone" {
variable "STACKIT_PROJECT_ID" {
type = string
description = "STACKIT Project ID"
default = "16ec118f-90d0-466d-8393-99eea504c536"
default = "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"
}

View file

@ -33,7 +33,7 @@ resource "stackit_network_interface" "nic" {
data "stackit_security_group" "default" {
project_id = var.STACKIT_PROJECT_ID
security_group_id = "a6b4708e-b8ee-48ba-b084-a4892e9a73af"
security_group_id = "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"
}
data "stackit_network" "default" {

View file

@ -24,7 +24,7 @@ variable "stackit_service_account_key_path" {
variable "stackit_org_id" {
type = string
default = "03a34540-3c1a-4794-b2c6-7111ecf824ef"
default = "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"
}
variable "owner_email" {

View file

@ -14,7 +14,7 @@
variable "stackit_project_id" {
type = string
default = "d75e6aab-b616-4b42-ae3b-aaf161ad626d"
default = "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"
}
variable "stackit_region" {

View file

@ -0,0 +1,63 @@
# This file is maintained automatically by "terraform init".
# Manual edits may be lost in future updates.
provider "registry.terraform.io/hashicorp/helm" {
version = "3.1.1"
hashes = [
"h1:47CqNwkxctJtL/N/JuEj+8QMg8mRNI/NWeKO5/ydfZU=",
"zh:1a6d5ce931708aec29d1f3d9e360c2a0c35ba5a54d03eeaff0ce3ca597cd0275",
"zh:3411919ba2a5941801e677f0fea08bdd0ae22ba3c9ce3309f55554699e06524a",
"zh:81b36138b8f2320dc7f877b50f9e38f4bc614affe68de885d322629dd0d16a29",
"zh:95a2a0a497a6082ee06f95b38bd0f0d6924a65722892a856cfd914c0d117f104",
"zh:9d3e78c2d1bb46508b972210ad706dd8c8b106f8b206ecf096cd211c54f46990",
"zh:a79139abf687387a6efdbbb04289a0a8e7eaca2bd91cdc0ce68ea4f3286c2c34",
"zh:aaa8784be125fbd50c48d84d6e171d3fb6ef84a221dbc5165c067ce05faab4c8",
"zh:afecd301f469975c9d8f350cc482fe656e082b6ab0f677d1a816c3c615837cc1",
"zh:c54c22b18d48ff9053d899d178d9ffef7d9d19785d9bf310a07d648b7aac075b",
"zh:db2eefd55aea48e73384a555c72bac3f7d428e24147bedb64e1a039398e5b903",
"zh:ee61666a233533fd2be971091cecc01650561f1585783c381b6f6e8a390198a4",
"zh:f569b65999264a9416862bca5cd2a6177d94ccb0424f3a4ef424428912b9cb3c",
]
}
provider "registry.terraform.io/hashicorp/kubernetes" {
version = "3.0.1"
constraints = ">= 2.14.0"
hashes = [
"h1:P0c8knzZnouTNFIRij8IS7+pqd0OKaFDYX0j4GRsiqo=",
"zh:02d55b0b2238fd17ffa12d5464593864e80f402b90b31f6e1bd02249b9727281",
"zh:20b93a51bfeed82682b3c12f09bac3031f5bdb4977c47c97a042e4df4fb2f9ba",
"zh:6e14486ecfaee38c09ccf33d4fdaf791409f90795c1b66e026c226fad8bc03c7",
"zh:8d0656ff422df94575668e32c310980193fccb1c28117e5c78dd2d4050a760a6",
"zh:9795119b30ec0c1baa99a79abace56ac850b6e6fbce60e7f6067792f6eb4b5f4",
"zh:b388c87acc40f6bd9620f4e23f01f3c7b41d9b88a68d5255dec0a72f0bdec249",
"zh:b59abd0a980649c2f97f172392f080eaeb18e486b603f83bf95f5d93aeccc090",
"zh:ba6e3060fddf4a022087d8f09e38aa0001c705f21170c2ded3d1c26c12f70d97",
"zh:c12626d044b1d5501cf95ca78cbe507c13ad1dd9f12d4736df66eb8e5f336eb8",
"zh:c55203240d50f4cdeb3df1e1760630d677679f5b1a6ffd9eba23662a4ad05119",
"zh:ea206a5a32d6e0d6e32f1849ad703da9a28355d9c516282a8458b5cf1502b2a1",
"zh:f569b65999264a9416862bca5cd2a6177d94ccb0424f3a4ef424428912b9cb3c",
]
}
provider "registry.terraform.io/stackitcloud/stackit" {
version = "0.91.0"
constraints = "> 0.60.0"
hashes = [
"h1:8de9n+Roq6Z2Ltp9poBBBN9a4zSpx73VLpgFS5mTyoI=",
"zh:0dde99e7b343fa01f8eefc378171fb8621bedb20f59157d6cc8e3d46c738105f",
"zh:0ed12db90276ccd2d6f87135b7dd078657823c3ca33121c6a157d0bdf08f801e",
"zh:160b32bcf1d01666784cf8469e10e0a38d4c3d24c80c0c5be470cc63ef27ea62",
"zh:32e1909037235c24138b74131c6fb12ac99003f79750f1768ca5468cc05da6b0",
"zh:4376f1cdafbb35ad5f220e28153741908390b23161d9eae3828f7830039ce8ef",
"zh:458b054781ef6165d9136fc3d667f9bf37319e37d0f19300bbb63b703de2599d",
"zh:54a1864cf1315a118c043f834e02f2a1ca0ecbc8c2a246460589a95847da6c80",
"zh:83424712926ccef3c60cc011dfa298721bdbaee3598a0c8459da46bc6b7424cc",
"zh:a3c38ebffdbca21dd177b06acf891bed1a903907ba252d0219d91ff0ecf9d861",
"zh:c6325e583b77aa1e9df94e3b4b12479d7bf12c66a2ace71c1b8f64e46ac5c37e",
"zh:de6db8deeee895af5670df2449c8b8c34df051277f8a6e2f19c5c9ec1f0ddb12",
"zh:e18b05e7d8356caa6103c5c80b5ea373be3ff255b453cf577c68798ffe1b93ce",
"zh:f4d9215f7a2888c882892642539b2edd3ea97cb25904e4fa358db4f001c3ccd0",
"zh:f94d0c0c2bf843867122ababc8d8066d52257e68bbcb5c62a603f77c581e9668",
]
}

View file

@ -0,0 +1,9 @@
# Maintainers
General maintainers:
- Mauritz Uphoff (mauritz.uphoff@digits.schwarz)
This example is actively maintained. The owner is responsible for reviewing and updating dependencies and functionalities on a monthly basis.
For questions, issues, or feature requests, please email the general maintainers listed above.
Please include the BP name and version in your request. We will track your request as an issue.

View file

@ -0,0 +1,7 @@
# SKE Kubernetes GPU Operator Installation
## Overview
This example demonstrates how to deploy an SKE cluster with an NVIDIA H100 node pool and install the GPU Operator.
**Note:** Currently, GPU-enabled node pools on SKE are only supported when using Ubuntu as the node operating system.

View file

@ -0,0 +1,10 @@
# Helm values passed to the NVIDIA GPU Operator release.
# NOTE(review): presumably this is the gpu-operator-values.yaml.tftpl file that
# main.tf loads via templatefile() — confirm the file name.

# Enable the DCGM component.
dcgm:
  enabled: true

# Enable the DCGM exporter and have a ServiceMonitor created for it.
dcgmExporter:
  enabled: true
  serviceMonitor:
    enabled: true
    additionalLabels:
      # this label needs to be set for prometheus to use the service monitor
      release: kube-prometheus-stack

View file

@ -0,0 +1,157 @@
# Copyright 2026 Schwarz Digits Cloud GmbH & Co. KG
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Provider requirements for this example.
terraform {
  required_providers {
    stackit = {
      source  = "stackitcloud/stackit"
      version = ">=0.60.0"
    }
    kubernetes = {
      source  = "hashicorp/kubernetes"
      version = ">=2.14.0"
    }
    # Declared explicitly because this configuration contains a
    # `provider "helm"` block and a `helm_release` resource. That provider
    # block assigns `kubernetes = { ... }` as an attribute, which is the
    # helm provider 3.x syntax, hence the 3.0.0 lower bound.
    helm = {
      source  = "hashicorp/helm"
      version = ">=3.0.0"
    }
  }
}
# STACKIT project used by the SKE cluster and kubeconfig resources below.
# The default is a placeholder and has to be overridden with a real project ID.
variable "project_id" {
  type        = string
  description = "ID of the STACKIT project in which the SKE cluster is created."
  default     = "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"
}
# Forwarded unchanged to the `service_account_key_path` argument of the
# stackit provider. Defaults to an empty string.
variable "stackit_service_account_key_path" {
  type        = string
  description = "Path to the STACKIT service account key file used by the stackit provider."
  default     = ""
}
# Kubernetes provider, configured from the kubeconfig generated for the SKE
# cluster. The kubeconfig YAML is decoded and the first cluster / first user
# entries are used; certificate and key material is base64-decoded.
provider "kubernetes" {
  host                   = yamldecode(stackit_ske_kubeconfig.this.kube_config)["clusters"][0]["cluster"]["server"]
  cluster_ca_certificate = base64decode(yamldecode(stackit_ske_kubeconfig.this.kube_config)["clusters"][0]["cluster"]["certificate-authority-data"])
  client_certificate     = base64decode(yamldecode(stackit_ske_kubeconfig.this.kube_config)["users"][0]["user"]["client-certificate-data"])
  client_key             = base64decode(yamldecode(stackit_ske_kubeconfig.this.kube_config)["users"][0]["user"]["client-key-data"])
}
# Helm provider. Its Kubernetes connection settings are read from the same
# generated SKE kubeconfig (first cluster / first user entry) as the
# kubernetes provider.
provider "helm" {
  kubernetes = {
    host                   = yamldecode(stackit_ske_kubeconfig.this.kube_config)["clusters"][0]["cluster"]["server"]
    cluster_ca_certificate = base64decode(yamldecode(stackit_ske_kubeconfig.this.kube_config)["clusters"][0]["cluster"]["certificate-authority-data"])
    client_certificate     = base64decode(yamldecode(stackit_ske_kubeconfig.this.kube_config)["users"][0]["user"]["client-certificate-data"])
    client_key             = base64decode(yamldecode(stackit_ske_kubeconfig.this.kube_config)["users"][0]["user"]["client-key-data"])
  }
}
# STACKIT provider: authenticates with the service account key file given by
# var.stackit_service_account_key_path and uses eu01 as the default region.
provider "stackit" {
  service_account_key_path = var.stackit_service_account_key_path
  default_region           = "eu01"
}
# Kubeconfig for the SKE cluster defined in this file. Its `kube_config`
# attribute feeds the kubernetes and helm provider configurations.
resource "stackit_ske_kubeconfig" "this" {
  depends_on = [stackit_ske_cluster.this]

  project_id   = var.project_id
  cluster_name = stackit_ske_cluster.this.name

  # NOTE(review): presumably lets the provider renew the kubeconfig instead of
  # keeping an expired one — confirm against the stackit provider docs.
  refresh = true
}
# Kubernetes versions offered by SKE, restricted to state "SUPPORTED".
# The cluster resource reads its minimum Kubernetes version from this list.
data "stackit_ske_kubernetes_versions" "this" {
  version_state = "SUPPORTED"
}
# Machine image versions offered by SKE, restricted to state "SUPPORTED".
# The locals block derives the flatcar and ubuntu versions from this data.
data "stackit_ske_machine_image_versions" "this" {
  version_state = "SUPPORTED"
}
locals {
  # Supported version of the "flatcar" machine image.
  # one() yields null for an empty list and raises an error when the list
  # holds more than one element, so this expects at most one matching version.
  flatcar_supported_version = one(flatten([
    for image in data.stackit_ske_machine_image_versions.this.machine_images :
    [for entry in image.versions : entry.version]
    if image.name == "flatcar"
  ]))

  # Supported version of the "ubuntu" machine image (same one() semantics as
  # above); used by the GPU node pool.
  ubuntu_supported_version = one(flatten([
    for image in data.stackit_ske_machine_image_versions.this.machine_images :
    [for entry in image.versions : entry.version]
    if image.name == "ubuntu"
  ]))

  # Helm values for the GPU Operator release, rendered from the template file
  # next to this module. No template variables are passed.
  gpu_operator_helm_values = templatefile("${path.module}/gpu-operator-values.yaml.tftpl", {})
}
# SKE cluster with two node pools: a general-purpose flatcar pool spread over
# three availability zones and a single-node ubuntu pool for GPU workloads.
resource "stackit_ske_cluster" "this" {
  project_id = var.project_id
  name       = "ske-gpu"

  # First entry of the SUPPORTED Kubernetes versions returned by the data source.
  kubernetes_version_min = data.stackit_ske_kubernetes_versions.this.kubernetes_versions[0].version

  # Automatic Kubernetes and machine image updates inside a daily one-hour window.
  maintenance = {
    enable_kubernetes_version_updates    = true
    enable_machine_image_version_updates = true
    start                                = "01:00:00Z"
    end                                  = "02:00:00Z"
  }

  node_pools = [
    {
      name               = "standard"
      machine_type       = "g2i.4"
      minimum            = 3
      maximum            = 9
      max_surge          = 3
      availability_zones = ["eu01-1", "eu01-2", "eu01-3"]
      os_version_min     = local.flatcar_supported_version
      os_name            = "flatcar"
      volume_size        = 150
      volume_type        = "storage_premium_perf6"
    },
    {
      # GPU pool: fixed at one node (minimum == maximum) in a single zone.
      name               = "gpu-pool-h100-2"
      machine_type       = "n3.14d.g1"
      os_version_min     = local.ubuntu_supported_version
      os_name            = "ubuntu"
      minimum            = 1
      maximum            = 1
      max_surge          = 1
      availability_zones = ["eu01-2"]
      volume_size        = 150
      volume_type        = "storage_premium_perf6"
      labels = {
        "dedicated" = "gpu"
      }
      # Keeps pods without a matching toleration off the GPU node.
      taints = [
        {
          effect = "NoSchedule"
          key    = "nvidia.com/gpu"
          value  = "true"
        },
      ]
    },
  ]
}
# Namespace that the GPU Operator Helm release is installed into.
resource "kubernetes_namespace_v1" "gpu_operator" {
  metadata {
    name = "gpu-operator"
  }
}
# Installs the NVIDIA GPU Operator chart, pinned to version 25.3.1, into the
# gpu-operator namespace with the values rendered in local.gpu_operator_helm_values.
resource "helm_release" "gpu_operator" {
  name      = "gpu-operator"
  namespace = kubernetes_namespace_v1.gpu_operator.metadata[0].name

  repository = "https://helm.ngc.nvidia.com/nvidia"
  chart      = "gpu-operator"
  version    = "25.3.1"

  values = [local.gpu_operator_helm_values]
}

View file

@ -14,7 +14,7 @@
variable "stackit_project_id" {
type = string
default = "d75e6aab-b616-4b42-ae3b-aaf161ad626d"
default = "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"
}
variable "stackit_region" {

View file

@ -41,11 +41,11 @@ variable "LOCAL_SUBNET" {
variable "STACKIT_PROJECT_ID" {
type = string
description = "STACKIT Project ID"
default = "16ec118f-90d0-466d-8393-99eea504c536"
default = "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"
}
variable "STACKIT_ORG_ID" {
type = string
description = "STACKIT Org ID"
default = "03a34540-3c1a-4794-b2c6-7111ecf824ef"
default = "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"
}

View file

@ -22,7 +22,7 @@ resource "stackit_resourcemanager_project" "sfs-no-folder" {
}
resource "stackit_resourcemanager_project" "sfs-folder" {
parent_container_id = "bc229fa8-4be4-42d5-8808-514fe6d39074" #Folder ID Demos
parent_container_id = "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" #Folder ID Demos
name = "sfs-example-folder"
labels = {
"networkArea" = stackit_network_area.sfs.network_area_id