Skip to content
Snippets Groups Projects
Commit 4008fb41 authored by Sebastian Karius's avatar Sebastian Karius
Browse files

Merge remote-tracking branch 'origin/main'

parents 803d82ef bf27e3ce
No related branches found
No related tags found
No related merge requests found
.gitignore 100644 → 100755
File mode changed from 100644 to 100755
README.md 100644 → 100755
File mode changed from 100644 to 100755
......@@ -14,10 +14,10 @@ LAMP (Linux Apache Mysql PHP) System als Basis für die Installation von Contao,
mkdir <path>/containerize/helm-values/LAMP
cd <path>/containerize/helm-values/LAMP
# Default Values ausgeben
helm inspect values lamp/lamp > values-default.yaml
helm inspect values containerize/ti-lamp > values-default.yaml
```
2. `values.yaml` anlegen und anpassen, sieh mitgelieferte `values.yaml`
2. `values.yaml` anlegen und anpassen, siehe mitgelieferte `values.yaml`
3. Installation
......@@ -26,10 +26,25 @@ LAMP (Linux Apache Mysql PHP) System als Basis für die Installation von Contao,
```bash
helm upgrade --install --wait contao \
-n contao --create-namespace \
lamp/lamp \
containerize/ti-lamp \
--values values.yaml \
--set mysql.rootPassword=<pass1> \
--set mysql.database=contao \
--set mysql.user=contao \
--set mysql.password=<pass2>
```
4. Upgrade / Anpassungen
```bash
helm upgrade --install --wait contao \
-n contao --create-namespace \
containerize/ti-lamp \
--values values.yaml \
--set mysql.rootPassword=<pass1> \
--set mysql.database=contao \
--set mysql.user=contao \
--set mysql.password=<pass2> \
--set webdav.user=contao \
--set webdav.password=<PASSWORD>
```
\ No newline at end of file
File mode changed from 100644 to 100755
......@@ -35,12 +35,18 @@ httpd:
</VirtualHost>
php:
# official PHP images ----
# repository: "php"
# tag: "8.3-fpm-alpine" # has no gd+intl
# tag: "8.3-fpm" # has no gd+intl
# local Repo -------------
repository: "mcr.informatik.uni-halle.de/studio-r215/containerize/php-fpm"
tag: "83"
# # official PHP images
# repository: "php"
# tag: "8.3-fpm-alpine" # has no gd+intl
# tag: "8.3-fpm" # has no gd+intl
# php.ini
ini: |
max_input_vars = 2000
upload_max_filesize = 50M
post_max_size = 55M
# resources:
# requests:
# memory: "128Mi"
......@@ -55,6 +61,11 @@ phpmyadmin:
tag: "apache"
subdomain: pma
webdav:
enabled: true
subdomain: webdv
ingress:
enabled: true
domain: cslsa.tikube.informatik.uni-halle.de
......
......@@ -5,6 +5,8 @@
Für die Nutzung von CUDA in Kubernetes Knoten:
Siehe https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/gpu-driver-configuration.html
Für die Nutzung mit vorinstallierten Treibern und Toolkit:
Siehe https://www.jimangel.io/posts/nvidia-rtx-gpu-kubernetes-setup/
### Installation
......@@ -34,7 +36,7 @@ Siehe https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/gpu-dr
helm status gpu-operator -n gpu-operator
```
#### Variante Bare-metal/Passthrough with pre-installed NVIDIA drivers
#### Variante Bare-metal/Passthrough mit pre-installed NVIDIA-Treiber
```yaml
driver:
......@@ -57,6 +59,44 @@ toolkit:
value: "true"
```
#### Variante Bare-metal/Passthrough mit pre-installed NVIDIA-Treiber und -Toolkit
- Toolkit installieren
```bash
# add nvidia-container-toolkit repo to apt sources
curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg \
&& curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list | \
sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | \
sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list
sudo apt update
# install container toolkit
sudo apt install -y nvidia-container-toolkit
# options: --dry-run
sudo nvidia-ctk runtime configure --runtime=containerd
reboot
```
```yaml
driver:
enabled: false
version: "550.127.05" # hier muss die passende Version des NVIDIA-Treibers der Basismaschine eingetragen werden
toolkit:
enabled: false
env:
- name: CONTAINERD_CONFIG
value: /var/lib/rancher/rke2/agent/etc/containerd/config.toml.tmpl
- name: CONTAINERD_SOCKET
value: /run/k3s/containerd/containerd.sock
- name: CONTAINERD_RUNTIME_CLASS
value: nvidia
- name: CONTAINERD_SET_AS_DEFAULT
value: "true"
```
### Time-Slicing
<https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/gpu-sharing.html>
......
......@@ -20,6 +20,18 @@ sandboxWorkloads:
enabled: false
defaultWorkload: "container"
hostPaths:
# rootFS represents the path to the root filesystem of the host.
# This is used by components that need to interact with the host filesystem
# and as such this must be a chroot-able filesystem.
# Examples include the MIG Manager and Toolkit Container which may need to
# stop, start, or restart systemd services
rootFS: "/"
# driverInstallDir represents the root at which driver files including libraries,
# config files, and executables can be found.
driverInstallDir: "/run/nvidia/driver"
daemonsets:
labels: {}
annotations: {}
......@@ -67,11 +79,11 @@ operator:
cleanupCRD: false
# upgrade CRD on chart upgrade, requires --disable-openapi-validation flag
# to be passed during helm upgrade.
upgradeCRD: false
upgradeCRD: true
initContainer:
image: cuda
repository: nvcr.io/nvidia
version: 12.4.1-base-ubi8
version: 12.6.2-base-ubi9
imagePullPolicy: IfNotPresent
tolerations:
- key: "node-role.kubernetes.io/master"
......@@ -131,7 +143,7 @@ driver:
usePrecompiled: false
repository: nvcr.io/nvidia
image: driver
version: "550.54.15"
version: "550.127.05"
imagePullPolicy: IfNotPresent
imagePullSecrets: []
startupProbe:
......@@ -181,7 +193,7 @@ driver:
repository: nvcr.io/nvidia/cloud-native
# When choosing a different version of k8s-driver-manager, DO NOT downgrade to a version lower than v0.6.4
# to ensure k8s-driver-manager stays compatible with gpu-operator starting from v24.3.0
version: v0.6.8
version: v0.7.0
imagePullPolicy: IfNotPresent
env:
- name: ENABLE_GPU_POD_EVICTION
......@@ -219,7 +231,7 @@ toolkit:
enabled: true
repository: nvcr.io/nvidia/k8s
image: container-toolkit
version: v1.15.0-ubuntu20.04
version: v1.17.0-ubuntu20.04
imagePullPolicy: IfNotPresent
imagePullSecrets: []
env: []
......@@ -230,7 +242,7 @@ devicePlugin:
enabled: true
repository: nvcr.io/nvidia
image: k8s-device-plugin
version: v0.15.0-ubi8
version: v0.17.0-ubi9
imagePullPolicy: IfNotPresent
imagePullSecrets: []
args: []
......@@ -271,7 +283,7 @@ devicePlugin:
config:
# Create a ConfigMap (default: false)
create: false
# ConfigMap name (either exiting or to create a new one with create=true above)
# ConfigMap name (either existing or to create a new one with create=true above)
name: ""
# Default config name within the ConfigMap
default: ""
......@@ -288,9 +300,8 @@ dcgm:
enabled: false
repository: nvcr.io/nvidia/cloud-native
image: dcgm
version: 3.3.5-1-ubuntu22.04
version: 3.3.8-1-ubuntu22.04
imagePullPolicy: IfNotPresent
hostPort: 5555
args: []
env: []
resources: {}
......@@ -299,7 +310,7 @@ dcgmExporter:
enabled: true
repository: nvcr.io/nvidia/k8s
image: dcgm-exporter
version: 3.3.5-3.4.1-ubuntu22.04
version: 3.3.8-3.6.0-ubuntu22.04
imagePullPolicy: IfNotPresent
env:
- name: DCGM_EXPORTER_LISTEN
......@@ -321,12 +332,31 @@ dcgmExporter:
# target_label: instance
# replacement: $1
# action: replace
# DCGM Exporter configuration
# This block is used to configure DCGM Exporter to emit a customized list of metrics.
# Use "name" to either point to an existing ConfigMap or to create a new one with a
# list of configurations (i.e. with create=true).
# When pointing to an existing ConfigMap, the ConfigMap must exist in the same namespace as the release.
# The metrics are expected to be listed under a key called `dcgm-metrics.csv`.
# Use "data" to build an integrated ConfigMap from a set of custom metrics as
# part of the chart. An example of some custom metrics is shown below. Note that
# the contents of "data" must be in CSV format and be valid DCGM Exporter metric configurations.
# config:
# name: custom-dcgm-exporter-metrics
# create: true
# data: |-
# Format
# If line starts with a '#' it is considered a comment
# DCGM FIELD, Prometheus metric type, help message
# Clocks
# DCGM_FI_DEV_SM_CLOCK, gauge, SM clock frequency (in MHz).
# DCGM_FI_DEV_MEM_CLOCK, gauge, Memory clock frequency (in MHz).
gfd:
enabled: true
repository: nvcr.io/nvidia
image: k8s-device-plugin
version: v0.15.0-ubi8
version: v0.17.0-ubi9
imagePullPolicy: IfNotPresent
imagePullSecrets: []
env:
......@@ -340,16 +370,52 @@ migManager:
enabled: true
repository: nvcr.io/nvidia/cloud-native
image: k8s-mig-manager
version: v0.7.0-ubuntu20.04
version: v0.10.0-ubuntu20.04
imagePullPolicy: IfNotPresent
imagePullSecrets: []
env:
- name: WITH_REBOOT
value: "false"
resources: {}
# MIG configuration
# Use "name" to either point to an existing ConfigMap or to create a new one with a list of configurations (i.e. with create=true).
# Use "data" to build an integrated ConfigMap from a set of configurations as
# part of this helm chart. An example of setting "data" might be:
# config:
# name: custom-mig-parted-configs
# create: true
# data: |-
# config.yaml: |-
# version: v1
# mig-configs:
# all-disabled:
# - devices: all
# mig-enabled: false
# custom-mig:
# - devices: [0]
# mig-enabled: false
# - devices: [1]
# mig-enabled: true
# mig-devices:
# "1g.10gb": 7
# - devices: [2]
# mig-enabled: true
# mig-devices:
# "2g.20gb": 2
# "3g.40gb": 1
# - devices: [3]
# mig-enabled: true
# mig-devices:
# "3g.40gb": 1
# "4g.40gb": 1
config:
name: "default-mig-parted-config"
default: "all-disabled"
# Create a ConfigMap (default: false)
create: false
# ConfigMap name (either existing or to create a new one with create=true above)
name: ""
# Data section for the ConfigMap to create (i.e only applies when create=true)
data: {}
gpuClientsConfig:
name: ""
......@@ -367,7 +433,7 @@ gds:
enabled: false
repository: nvcr.io/nvidia/cloud-native
image: nvidia-fs
version: "2.17.5"
version: "2.20.5"
imagePullPolicy: IfNotPresent
imagePullSecrets: []
env: []
......@@ -377,7 +443,7 @@ gdrcopy:
enabled: false
repository: nvcr.io/nvidia/cloud-native
image: gdrdrv
version: "v2.4.1"
version: "v2.4.1-2"
imagePullPolicy: IfNotPresent
imagePullSecrets: []
env: []
......@@ -397,7 +463,7 @@ vgpuManager:
repository: nvcr.io/nvidia/cloud-native
# When choosing a different version of k8s-driver-manager, DO NOT downgrade to a version lower than v0.6.4
# to ensure k8s-driver-manager stays compatible with gpu-operator starting from v24.3.0
version: v0.6.8
version: v0.7.0
imagePullPolicy: IfNotPresent
env:
- name: ENABLE_GPU_POD_EVICTION
......@@ -409,7 +475,7 @@ vgpuDeviceManager:
enabled: true
repository: nvcr.io/nvidia/cloud-native
image: vgpu-device-manager
version: "v0.2.6"
version: v0.2.8
imagePullPolicy: IfNotPresent
imagePullSecrets: []
env: []
......@@ -421,7 +487,7 @@ vfioManager:
enabled: true
repository: nvcr.io/nvidia
image: cuda
version: 12.4.1-base-ubi8
version: 12.6.2-base-ubi9
imagePullPolicy: IfNotPresent
imagePullSecrets: []
env: []
......@@ -431,7 +497,7 @@ vfioManager:
repository: nvcr.io/nvidia/cloud-native
# When choosing a different version of k8s-driver-manager, DO NOT downgrade to a version lower than v0.6.4
# to ensure k8s-driver-manager stays compatible with gpu-operator starting from v24.3.0
version: v0.6.8
version: v0.7.0
imagePullPolicy: IfNotPresent
env:
- name: ENABLE_GPU_POD_EVICTION
......@@ -457,7 +523,7 @@ kataManager:
pullSecret: ""
repository: nvcr.io/nvidia/cloud-native
image: k8s-kata-manager
version: v0.2.0
version: v0.2.2
imagePullPolicy: IfNotPresent
imagePullSecrets: []
env: []
......@@ -467,7 +533,7 @@ sandboxDevicePlugin:
enabled: true
repository: nvcr.io/nvidia
image: kubevirt-gpu-device-plugin
version: v1.2.7
version: v1.2.10
imagePullPolicy: IfNotPresent
imagePullSecrets: []
args: []
......@@ -489,6 +555,7 @@ ccManager:
node-feature-discovery:
enableNodeFeatureApi: true
priorityClassName: system-node-critical
gc:
enable: true
replicaCount: 1
......
# values for gpu-operator.
# values for gpu-operator
driver:
#enabled: true
enabled: false
usePrecompiled: true
repository: nvcr.io/nvidia
image: driver
#usePrecompiled: true
#repository: nvcr.io/nvidia
#image: driver
#version: "535.129.03"
#version: "535.161.08"
version: "550.54.15"
#version: "550.54.15"
#version: "550.90.07"
version: "550.127.05"
toolkit:
enabled: true
#enabled: true
enabled: false
#env: []
env:
- name: CONTAINERD_CONFIG
......@@ -24,3 +26,4 @@ toolkit:
- name: CONTAINERD_SET_AS_DEFAULT
value: "true"
File mode changed from 100644 to 100755
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment