From a8f06eaf287d3570153ffa844848f40c4d12e278 Mon Sep 17 00:00:00 2001 From: Mauritz Uphoff Date: Thu, 16 Apr 2026 13:25:22 +0200 Subject: [PATCH 1/2] chore(example): add example for gpus on ske cluster Signed-off-by: Mauritz Uphoff --- .gitignore | 1 + examples/ske-gpu-operator/.terraform.lock.hcl | 63 +++++++ examples/ske-gpu-operator/MAINTAINERS.md | 9 + examples/ske-gpu-operator/README.md | 7 + .../gpu-operator-values.yaml.tftpl | 10 ++ examples/ske-gpu-operator/main.tf | 157 ++++++++++++++++++ 6 files changed, 247 insertions(+) create mode 100644 examples/ske-gpu-operator/.terraform.lock.hcl create mode 100644 examples/ske-gpu-operator/MAINTAINERS.md create mode 100644 examples/ske-gpu-operator/README.md create mode 100644 examples/ske-gpu-operator/gpu-operator-values.yaml.tftpl create mode 100644 examples/ske-gpu-operator/main.tf diff --git a/.gitignore b/.gitignore index 26bffc0..4567528 100644 --- a/.gitignore +++ b/.gitignore @@ -67,3 +67,4 @@ go.work.sum ### Jetbrains .idea ssh +keys diff --git a/examples/ske-gpu-operator/.terraform.lock.hcl b/examples/ske-gpu-operator/.terraform.lock.hcl new file mode 100644 index 0000000..2cebd8e --- /dev/null +++ b/examples/ske-gpu-operator/.terraform.lock.hcl @@ -0,0 +1,63 @@ +# This file is maintained automatically by "terraform init". +# Manual edits may be lost in future updates. 
+ +provider "registry.terraform.io/hashicorp/helm" { + version = "3.1.1" + hashes = [ + "h1:47CqNwkxctJtL/N/JuEj+8QMg8mRNI/NWeKO5/ydfZU=", + "zh:1a6d5ce931708aec29d1f3d9e360c2a0c35ba5a54d03eeaff0ce3ca597cd0275", + "zh:3411919ba2a5941801e677f0fea08bdd0ae22ba3c9ce3309f55554699e06524a", + "zh:81b36138b8f2320dc7f877b50f9e38f4bc614affe68de885d322629dd0d16a29", + "zh:95a2a0a497a6082ee06f95b38bd0f0d6924a65722892a856cfd914c0d117f104", + "zh:9d3e78c2d1bb46508b972210ad706dd8c8b106f8b206ecf096cd211c54f46990", + "zh:a79139abf687387a6efdbbb04289a0a8e7eaca2bd91cdc0ce68ea4f3286c2c34", + "zh:aaa8784be125fbd50c48d84d6e171d3fb6ef84a221dbc5165c067ce05faab4c8", + "zh:afecd301f469975c9d8f350cc482fe656e082b6ab0f677d1a816c3c615837cc1", + "zh:c54c22b18d48ff9053d899d178d9ffef7d9d19785d9bf310a07d648b7aac075b", + "zh:db2eefd55aea48e73384a555c72bac3f7d428e24147bedb64e1a039398e5b903", + "zh:ee61666a233533fd2be971091cecc01650561f1585783c381b6f6e8a390198a4", + "zh:f569b65999264a9416862bca5cd2a6177d94ccb0424f3a4ef424428912b9cb3c", + ] +} + +provider "registry.terraform.io/hashicorp/kubernetes" { + version = "3.0.1" + constraints = ">= 2.14.0" + hashes = [ + "h1:P0c8knzZnouTNFIRij8IS7+pqd0OKaFDYX0j4GRsiqo=", + "zh:02d55b0b2238fd17ffa12d5464593864e80f402b90b31f6e1bd02249b9727281", + "zh:20b93a51bfeed82682b3c12f09bac3031f5bdb4977c47c97a042e4df4fb2f9ba", + "zh:6e14486ecfaee38c09ccf33d4fdaf791409f90795c1b66e026c226fad8bc03c7", + "zh:8d0656ff422df94575668e32c310980193fccb1c28117e5c78dd2d4050a760a6", + "zh:9795119b30ec0c1baa99a79abace56ac850b6e6fbce60e7f6067792f6eb4b5f4", + "zh:b388c87acc40f6bd9620f4e23f01f3c7b41d9b88a68d5255dec0a72f0bdec249", + "zh:b59abd0a980649c2f97f172392f080eaeb18e486b603f83bf95f5d93aeccc090", + "zh:ba6e3060fddf4a022087d8f09e38aa0001c705f21170c2ded3d1c26c12f70d97", + "zh:c12626d044b1d5501cf95ca78cbe507c13ad1dd9f12d4736df66eb8e5f336eb8", + "zh:c55203240d50f4cdeb3df1e1760630d677679f5b1a6ffd9eba23662a4ad05119", + "zh:ea206a5a32d6e0d6e32f1849ad703da9a28355d9c516282a8458b5cf1502b2a1", + 
"zh:f569b65999264a9416862bca5cd2a6177d94ccb0424f3a4ef424428912b9cb3c", + ] +} + +provider "registry.terraform.io/stackitcloud/stackit" { + version = "0.91.0" + constraints = "> 0.60.0" + hashes = [ + "h1:8de9n+Roq6Z2Ltp9poBBBN9a4zSpx73VLpgFS5mTyoI=", + "zh:0dde99e7b343fa01f8eefc378171fb8621bedb20f59157d6cc8e3d46c738105f", + "zh:0ed12db90276ccd2d6f87135b7dd078657823c3ca33121c6a157d0bdf08f801e", + "zh:160b32bcf1d01666784cf8469e10e0a38d4c3d24c80c0c5be470cc63ef27ea62", + "zh:32e1909037235c24138b74131c6fb12ac99003f79750f1768ca5468cc05da6b0", + "zh:4376f1cdafbb35ad5f220e28153741908390b23161d9eae3828f7830039ce8ef", + "zh:458b054781ef6165d9136fc3d667f9bf37319e37d0f19300bbb63b703de2599d", + "zh:54a1864cf1315a118c043f834e02f2a1ca0ecbc8c2a246460589a95847da6c80", + "zh:83424712926ccef3c60cc011dfa298721bdbaee3598a0c8459da46bc6b7424cc", + "zh:a3c38ebffdbca21dd177b06acf891bed1a903907ba252d0219d91ff0ecf9d861", + "zh:c6325e583b77aa1e9df94e3b4b12479d7bf12c66a2ace71c1b8f64e46ac5c37e", + "zh:de6db8deeee895af5670df2449c8b8c34df051277f8a6e2f19c5c9ec1f0ddb12", + "zh:e18b05e7d8356caa6103c5c80b5ea373be3ff255b453cf577c68798ffe1b93ce", + "zh:f4d9215f7a2888c882892642539b2edd3ea97cb25904e4fa358db4f001c3ccd0", + "zh:f94d0c0c2bf843867122ababc8d8066d52257e68bbcb5c62a603f77c581e9668", + ] +} diff --git a/examples/ske-gpu-operator/MAINTAINERS.md b/examples/ske-gpu-operator/MAINTAINERS.md new file mode 100644 index 0000000..1aaefce --- /dev/null +++ b/examples/ske-gpu-operator/MAINTAINERS.md @@ -0,0 +1,9 @@ +# Maintainers + +General maintainers: + +- Mauritz Uphoff (mauritz.uphoff@digits.schwarz) + +This example is actively maintained. The owner is responsible for reviewing and updating dependencies and functionalities on a monthly basis. +For questions, issues, or feature requests, please email general maintainers. +Please include the BP name and version in your request. We will track your request as an issue. 
+
diff --git a/examples/ske-gpu-operator/README.md b/examples/ske-gpu-operator/README.md
new file mode 100644
index 0000000..8dedc84
--- /dev/null
+++ b/examples/ske-gpu-operator/README.md
@@ -0,0 +1,7 @@
+# SKE Kubernetes GPU Operator Installation
+
+## Overview
+
+This example demonstrates how to deploy a SKE cluster with an NVIDIA H100 node pool and install the GPU Operator.
+
+**Note:** Currently, GPU-enabled node pools on SKE are only supported when using Ubuntu as the node operating system.
diff --git a/examples/ske-gpu-operator/gpu-operator-values.yaml.tftpl b/examples/ske-gpu-operator/gpu-operator-values.yaml.tftpl
new file mode 100644
index 0000000..c8208aa
--- /dev/null
+++ b/examples/ske-gpu-operator/gpu-operator-values.yaml.tftpl
@@ -0,0 +1,10 @@
+dcgm:
+  enabled: true
+
+dcgmExporter:
+  enabled: true
+  serviceMonitor:
+    enabled: true
+    additionalLabels:
+      # this label needs to be set for prometheus to use the service monitor
+      release: kube-prometheus-stack
diff --git a/examples/ske-gpu-operator/main.tf b/examples/ske-gpu-operator/main.tf
new file mode 100644
index 0000000..c4174e6
--- /dev/null
+++ b/examples/ske-gpu-operator/main.tf
@@ -0,0 +1,176 @@
+# Copyright 2026 Schwarz Digits Cloud GmbH & Co. KG
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+terraform {
+  required_providers {
+    stackit = {
+      source  = "stackitcloud/stackit"
+      version = ">=0.60.0"
+    }
+    kubernetes = {
+      source  = "hashicorp/kubernetes"
+      version = ">=2.14.0"
+    }
+    # The attribute-style `kubernetes = { ... }` provider configuration used
+    # below only exists in Helm provider v3, so require at least that release.
+    helm = {
+      source  = "hashicorp/helm"
+      version = ">=3.0.0"
+    }
+  }
+}
+
+variable "project_id" {
+  default = "xxx"
+}
+
+variable "stackit_service_account_key_path" {
+  default = ""
+}
+
+# Decode the kubeconfig once; both cluster-facing providers read their
+# endpoint and client credentials from it.
+locals {
+  kubeconfig = yamldecode(stackit_ske_kubeconfig.this.kube_config)
+}
+
+provider "kubernetes" {
+  host                   = local.kubeconfig.clusters[0].cluster.server
+  client_certificate     = base64decode(local.kubeconfig.users[0].user.client-certificate-data)
+  client_key             = base64decode(local.kubeconfig.users[0].user.client-key-data)
+  cluster_ca_certificate = base64decode(local.kubeconfig.clusters[0].cluster.certificate-authority-data)
+}
+
+provider "helm" {
+  kubernetes = {
+    host                   = local.kubeconfig.clusters[0].cluster.server
+    client_certificate     = base64decode(local.kubeconfig.users[0].user.client-certificate-data)
+    client_key             = base64decode(local.kubeconfig.users[0].user.client-key-data)
+    cluster_ca_certificate = base64decode(local.kubeconfig.clusters[0].cluster.certificate-authority-data)
+  }
+}
+
+provider "stackit" {
+  default_region           = "eu01"
+  service_account_key_path = var.stackit_service_account_key_path
+}
+
+# Kubeconfig for the cluster created below. Referencing the cluster name
+# already orders this resource after the cluster, so no depends_on is needed.
+resource "stackit_ske_kubeconfig" "this" {
+  project_id   = var.project_id
+  cluster_name = stackit_ske_cluster.this.name
+  refresh      = true
+}
+
+data "stackit_ske_kubernetes_versions" "this" {
+  version_state = "SUPPORTED"
+}
+
+data "stackit_ske_machine_image_versions" "this" {
+  version_state = "SUPPORTED"
+}
+
+locals {
+  # one() yields the single matching version (or null when none matches) and
+  # fails the plan if several SUPPORTED versions exist for the image.
+  flatcar_supported_version = one(flatten([
+    for mi in data.stackit_ske_machine_image_versions.this.machine_images : [
+      for v in mi.versions :
+      v.version
+      if mi.name == "flatcar"
+    ]
+  ]))
+  ubuntu_supported_version = one(flatten([
+    for mi in data.stackit_ske_machine_image_versions.this.machine_images : [
+      for v in mi.versions :
+      v.version
+      if mi.name == "ubuntu"
+    ]
+  ]))
+  gpu_operator_helm_values = templatefile("${path.module}/gpu-operator-values.yaml.tftpl", {})
+}
+
+resource "stackit_ske_cluster" "this" {
+  project_id             = var.project_id
+  name                   = "ske-gpu"
+  kubernetes_version_min = data.stackit_ske_kubernetes_versions.this.kubernetes_versions[0].version
+
+  maintenance = {
+    enable_kubernetes_version_updates    = true
+    enable_machine_image_version_updates = true
+    start                                = "01:00:00Z"
+    end                                  = "02:00:00Z"
+  }
+
+  node_pools = [
+    # General-purpose pool for regular workloads.
+    {
+      name               = "standard"
+      machine_type       = "g2i.4"
+      minimum            = 3
+      maximum            = 9
+      max_surge          = 3
+      availability_zones = ["eu01-1", "eu01-2", "eu01-3"]
+      os_version_min     = local.flatcar_supported_version
+      os_name            = "flatcar"
+      volume_size        = 150
+      volume_type        = "storage_premium_perf6"
+    },
+    # Single-node GPU pool; labelled and tainted so that only pods tolerating
+    # nvidia.com/gpu are scheduled onto it.
+    {
+      name               = "gpu-pool-h100-2"
+      machine_type       = "n3.14d.g1"
+      os_version_min     = local.ubuntu_supported_version
+      os_name            = "ubuntu"
+      minimum            = 1
+      maximum            = 1
+      max_surge          = 1
+      availability_zones = ["eu01-2"]
+      volume_size        = 150
+      volume_type        = "storage_premium_perf6"
+      labels = {
+        "dedicated" = "gpu"
+      }
+      taints = [
+        {
+          effect = "NoSchedule"
+          key    = "nvidia.com/gpu"
+          value  = "true"
+        },
+      ]
+    },
+  ]
+}
+
+resource "kubernetes_namespace_v1" "gpu_operator" {
+  metadata {
+    name = "gpu-operator"
+  }
+}
+
+# Installs the NVIDIA GPU Operator chart with the values rendered from
+# gpu-operator-values.yaml.tftpl.
+resource "helm_release" "gpu_operator" {
+  name       = "gpu-operator"
+  namespace  = kubernetes_namespace_v1.gpu_operator.metadata[0].name
+  repository = "https://helm.ngc.nvidia.com/nvidia"
+  chart      = "gpu-operator"
+  version    = "25.3.1"
+
+  values = [
+    local.gpu_operator_helm_values
+  ]
+}
-- 
2.49.1

From 5a650acc2f78fc17ba141f735445c30babea83a2 Mon Sep 17 00:00:00 2001
From: Mauritz Uphoff
Date: Thu, 16 Apr 2026 13:30:39 +0200
Subject: [PATCH 2/2] chore(example): remove visible ids

Signed-off-by: Mauritz Uphoff
---
examples/iaas-ha-vrrp/01-config.tf | 2 +- examples/iaas-volume-encryption/01-config.tf | 2 +- examples/iaas-volume-encryption/05-server.tf | 2 +- examples/resourcemanager-nested-folders/01-variables.tf | 2 +- examples/ske-external-secrets-sync/020-variables.tf | 2 +- examples/ske-gpu-operator/main.tf | 2 +- examples/ske-nginx-rate-limit/01-variables.tf | 2 +- examples/ske-stackit-sfs-integration/01-config.tf | 4 ++-- examples/ske-stackit-sfs-integration/04-project.tf | 2 +- 9 files changed, 10 insertions(+), 10 deletions(-) diff --git a/examples/iaas-ha-vrrp/01-config.tf b/examples/iaas-ha-vrrp/01-config.tf index a0793e7..e01eeda 100644 --- a/examples/iaas-ha-vrrp/01-config.tf +++ b/examples/iaas-ha-vrrp/01-config.tf @@ -14,7 +14,7 @@ variable "stackit_project_id" { type = string - default = "d75e6aab-b616-4b42-ae3b-aaf161ad626d" + default = "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" } variable "stackit_region" { diff --git a/examples/iaas-volume-encryption/01-config.tf b/examples/iaas-volume-encryption/01-config.tf index ba124eb..7fbb8c0 100644 --- a/examples/iaas-volume-encryption/01-config.tf +++ b/examples/iaas-volume-encryption/01-config.tf @@ -29,5 +29,5 @@ variable "zone" { variable "STACKIT_PROJECT_ID" { type = string description = "STACKIT Project ID" - default = "16ec118f-90d0-466d-8393-99eea504c536" + default = "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" } diff --git a/examples/iaas-volume-encryption/05-server.tf b/examples/iaas-volume-encryption/05-server.tf index 9cbc0d6..3c342fb 100644 --- a/examples/iaas-volume-encryption/05-server.tf +++ b/examples/iaas-volume-encryption/05-server.tf @@ -33,7 +33,7 @@ resource "stackit_network_interface" "nic" { data "stackit_security_group" "default" { project_id = var.STACKIT_PROJECT_ID - security_group_id = "a6b4708e-b8ee-48ba-b084-a4892e9a73af" + security_group_id = "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" } data "stackit_network" "default" { diff --git a/examples/resourcemanager-nested-folders/01-variables.tf 
b/examples/resourcemanager-nested-folders/01-variables.tf index 6170b51..7c85cff 100644 --- a/examples/resourcemanager-nested-folders/01-variables.tf +++ b/examples/resourcemanager-nested-folders/01-variables.tf @@ -24,7 +24,7 @@ variable "stackit_service_account_key_path" { variable "stackit_org_id" { type = string - default = "03a34540-3c1a-4794-b2c6-7111ecf824ef" + default = "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" } variable "owner_email" { diff --git a/examples/ske-external-secrets-sync/020-variables.tf b/examples/ske-external-secrets-sync/020-variables.tf index 360890d..f87f3ab 100644 --- a/examples/ske-external-secrets-sync/020-variables.tf +++ b/examples/ske-external-secrets-sync/020-variables.tf @@ -14,7 +14,7 @@ variable "stackit_project_id" { type = string - default = "d75e6aab-b616-4b42-ae3b-aaf161ad626d" + default = "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" } variable "stackit_region" { diff --git a/examples/ske-gpu-operator/main.tf b/examples/ske-gpu-operator/main.tf index c4174e6..b0b8c98 100644 --- a/examples/ske-gpu-operator/main.tf +++ b/examples/ske-gpu-operator/main.tf @@ -26,7 +26,7 @@ terraform { } variable "project_id" { - default = "xxx" + default = "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" } variable "stackit_service_account_key_path" { diff --git a/examples/ske-nginx-rate-limit/01-variables.tf b/examples/ske-nginx-rate-limit/01-variables.tf index 360890d..f87f3ab 100644 --- a/examples/ske-nginx-rate-limit/01-variables.tf +++ b/examples/ske-nginx-rate-limit/01-variables.tf @@ -14,7 +14,7 @@ variable "stackit_project_id" { type = string - default = "d75e6aab-b616-4b42-ae3b-aaf161ad626d" + default = "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" } variable "stackit_region" { diff --git a/examples/ske-stackit-sfs-integration/01-config.tf b/examples/ske-stackit-sfs-integration/01-config.tf index 10a65ba..8ecf802 100644 --- a/examples/ske-stackit-sfs-integration/01-config.tf +++ b/examples/ske-stackit-sfs-integration/01-config.tf @@ -41,11 +41,11 @@ variable 
"LOCAL_SUBNET" { variable "STACKIT_PROJECT_ID" { type = string description = "STACKIT Project ID" - default = "16ec118f-90d0-466d-8393-99eea504c536" + default = "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" } variable "STACKIT_ORG_ID" { type = string description = "STACKIT Org ID" - default = "03a34540-3c1a-4794-b2c6-7111ecf824ef" + default = "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" } diff --git a/examples/ske-stackit-sfs-integration/04-project.tf b/examples/ske-stackit-sfs-integration/04-project.tf index 4fd3604..69088af 100644 --- a/examples/ske-stackit-sfs-integration/04-project.tf +++ b/examples/ske-stackit-sfs-integration/04-project.tf @@ -22,7 +22,7 @@ resource "stackit_resourcemanager_project" "sfs-no-folder" { } resource "stackit_resourcemanager_project" "sfs-folder" { - parent_container_id = "bc229fa8-4be4-42d5-8808-514fe6d39074" #Folder ID Demos + parent_container_id = "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" #Folder ID Demos name = "sfs-example-folder" labels = { "networkArea" = stackit_network_area.sfs.network_area_id -- 2.49.1