# Default values for hami-vgpu.

# Overrides for the chart's name and computed fullname; empty means no override.
nameOverride: ""
fullnameOverride: ""
# No image pull secrets are configured by default.
imagePullSecrets: []
# NOTE(review): presumably the image tag/version consumed by the templates - confirm.
version: "v2.5.1"
# NVIDIA GPU parameters
resourceName: "nvidia.com/gpu"
resourceMem: "nvidia.com/gpumem"
resourceMemPercentage: "nvidia.com/gpumem-percentage"
resourceCores: "nvidia.com/gpucores"
resourcePriority: "nvidia.com/priority"

# MLU parameters (cambricon.com resources)
mluResourceName: "cambricon.com/vmlu"
mluResourceMem: "cambricon.com/mlu.smlu.vmemory"
mluResourceCores: "cambricon.com/mlu.smlu.vcore"

# Hygon DCU parameters
dcuResourceName: "hygon.com/dcunum"
dcuResourceMem: "hygon.com/dcumem"
dcuResourceCores: "hygon.com/dcucores"

# Iluvatar GPU parameters
iluvatarResourceName: "iluvatar.ai/vgpu"
iluvatarResourceMem: "iluvatar.ai/vcuda-memory"
iluvatarResourceCore: "iluvatar.ai/vcuda-core"

# Metax sGPU parameters
metaxResourceName: "metax-tech.com/sgpu"
metaxResourceCore: "metax-tech.com/vcore"
metaxResourceMem: "metax-tech.com/vmemory"

# NOTE(review): presumably the scheduler name that workloads reference - confirm against the templates.
schedulerName: "hami-scheduler"
# NOTE(review): presumably gates rendering of PodSecurityPolicy objects - confirm against the templates.
podSecurityPolicy:
  enabled: false
# Settings shared across components.
# NOTE(review): nesting was reconstructed from a copy whose indentation was lost - confirm against the templates.
global:
  gpuHookPath: /usr/local
  labels: {}
  annotations: {}
  # managedNodeSelector is presumably only applied when managedNodeSelectorEnable is true - TODO confirm.
  managedNodeSelectorEnable: false
  managedNodeSelector:
    usage: "gpu"
# Scheduler pod settings: kube-scheduler container, extender, admission webhook, cert patch job and Service.
# NOTE(review): nesting was reconstructed from a copy whose indentation was lost - confirm against the templates.
scheduler:
  # @param nodeName defines the node name and the nvidia-vgpu-scheduler-scheduler will schedule to the node.
  # If the nvidia-vgpu-scheduler-scheduler is installed as the default scheduler, the k8s default
  # scheduler pod must be removed from the cluster first, and a node name must be specified to skip
  # the schedule workflow.
  nodeName: ""
  # nodeLabelSelector:
  #   "gpu": "on"
  overwriteEnv: "false"
  defaultSchedulerPolicy:
    nodeSchedulerPolicy: binpack
    gpuSchedulerPolicy: spread
  metricsBindAddress: ":9395"
  livenessProbe: false
  leaderElect: true
  # replicas only takes effect when leaderElect is true; otherwise replicas is 1.
  replicas: 1
  kubeScheduler:
    # @param enabled indicates whether to run the kube-scheduler container in the scheduler pod; true by default.
    enabled: true
    image: registry.k8s.io/kube-scheduler
    imageTag: ""
    imagePullPolicy: IfNotPresent
    resources: {}
    # If you do want to specify resources, uncomment the following lines, adjust them as necessary,
    # and remove the curly braces after 'resources:'.
    #   limits:
    #     cpu: 1000m
    #     memory: 1000Mi
    #   requests:
    #     cpu: 100m
    #     memory: 100Mi
    # NOTE(review): two argument sets are provided (--config vs. --policy-config-file); presumably the
    # templates pick one based on the kube-scheduler version - confirm.
    extraNewArgs:
      - --config=/config/config.yaml
      - -v=4
    extraArgs:
      - --policy-config-file=/config/config.json
      - -v=4
  extender:
    image: "beclab/hami"
    imagePullPolicy: IfNotPresent
    resources: {}
    # If you do want to specify resources, uncomment the following lines, adjust them as necessary,
    # and remove the curly braces after 'resources:'.
    #   limits:
    #     cpu: 1000m
    #     memory: 1000Mi
    #   requests:
    #     cpu: 100m
    #     memory: 100Mi
    extraArgs:
      - --debug
      - -v=4
  podAnnotations: {}
  tolerations: []
  # serviceAccountName: "hami-vgpu-scheduler-sa"
  admissionWebhook:
    customURL:
      enabled: false
      # Must be an endpoint using https.
      # Host certs should be generated for this host.
      host: 127.0.0.1  # hostname or IP; can be your node's IP if you want to use https://<nodeIP>:<schedulerPort>/<path>
      port: 31998
      path: /webhook
    # Namespaces that the webhook will not be applied to, e.g.:
    #   - default
    #   - kube-system
    #   - istio-system
    whitelistNamespaces: []
    reinvocationPolicy: Never
    failurePolicy: Ignore
  patch:
    image: jettech/kube-webhook-certgen:v1.5.2
    imageNew: liangjw/kube-webhook-certgen:v1.1.1
    imagePullPolicy: IfNotPresent
    priorityClassName: ""
    podAnnotations: {}
    nodeSelector: {}
    tolerations: []
    runAsUser: 2000
  service:
    type: ClusterIP  # Service type; NodePort is also supported
    httpPort: 443  # HTTP port
    schedulerPort: 31998  # scheduler port (described as the NodePort for HTTP when type is NodePort)
    monitorPort: 31993  # Monitoring port
    labels: {}
    annotations: {}
# Device plugin DaemonSet settings, including its monitor container.
# NOTE(review): nesting was reconstructed from a copy whose indentation was lost - confirm against the templates.
devicePlugin:
  image: "beclab/hami"
  monitorimage: "beclab/hami"
  monitorctrPath: /usr/local/vgpu/containers
  imagePullPolicy: IfNotPresent
  deviceSplitCount: 100
  deviceMemoryScaling: 100
  deviceCoreScaling: 100
  runtimeClassName: ""
  migStrategy: "none"
  disablecorelimit: "false"
  passDeviceSpecsEnabled: false
  extraArgs:
    - -v=4

  service:
    type: ClusterIP  # Service type; NodePort is also supported
    httpPort: 31992
    labels: {}
    annotations: {}

  pluginPath: /var/lib/kubelet/device-plugins
  libPath: /usr/local/vgpu

  podAnnotations: {}
  nvidianodeSelector:
    gpu.bytetrade.io/cuda-supported: 'true'
  tolerations: []
  # The updateStrategy for the DevicePlugin DaemonSet.
  # To update the DaemonSet manually, set type to "OnDelete".
  # OnDelete is recommended because a DevicePlugin pod restart causes business pods to restart,
  # which is destructive. Otherwise, use RollingUpdate to roll DevicePlugin pods.
  # Note that this file currently configures RollingUpdate.
  updateStrategy:
    type: RollingUpdate
    rollingUpdate:
      maxUnavailable: 1

  resources: {}
  # If you do want to specify resources, uncomment the following lines, adjust them as necessary,
  # and remove the curly braces after 'resources:'.
  #   limits:
  #     cpu: 1000m
  #     memory: 1000Mi
  #   requests:
  #     cpu: 100m
  #     memory: 100Mi

  vgpuMonitor:
    resources: {}
    # If you do want to specify resources, uncomment the following lines, adjust them as necessary,
    # and remove the curly braces after 'resources:'.
    #   limits:
    #     cpu: 1000m
    #     memory: 1000Mi
    #   requests:
    #     cpu: 100m
    #     memory: 100Mi
# Per-vendor device settings.
# NOTE(review): nesting was reconstructed from a copy whose indentation was lost - confirm against the templates.
devices:
  mthreads:
    enabled: false
    customresources:
      - mthreads.com/vgpu
  nvidia:
    gpuCorePolicy: default
  ascend:
    enabled: false
    image: ""
    imagePullPolicy: IfNotPresent
    extraArgs: []
    nodeSelector:
      ascend: "on"
    tolerations: []
    # Each Ascend model is listed together with its "-memory" companion resource.
    customresources:
      - huawei.com/Ascend910A
      - huawei.com/Ascend910A-memory
      - huawei.com/Ascend910B2
      - huawei.com/Ascend910B2-memory
      - huawei.com/Ascend910B
      - huawei.com/Ascend910B-memory
      - huawei.com/Ascend910B4
      - huawei.com/Ascend910B4-memory
      - huawei.com/Ascend310P
      - huawei.com/Ascend310P-memory
# Values for the dcgm-exporter component.
# NOTE(review): nesting was reconstructed from a copy whose indentation was lost; every key up to the
# customMetrics example is assumed to belong to dcgmExporter - confirm against the templates.
dcgmExporter:
  image:
    repository: nvidia/dcgm-exporter
    pullPolicy: IfNotPresent
    tag: "4.1.1-4.0.4-ubuntu22.04"

  # "-f" already references /etc/dcgm-exporter/default-counters.csv, the file the original note
  # recommends in order to stop profiling metrics from DCGM. extraVolumeMounts below mounts the
  # "metrics" key of exporter-metrics-config-map at that path.
  arguments: ["-f", "/etc/dcgm-exporter/default-counters.csv"]
  # NOTE: in general, add any command line arguments to arguments above
  # and they will be passed through.
  # Use "-r", "<HOST>:<PORT>" to connect to an already running hostengine
  # Example arguments: ["-r", "host123:5555"]
  # Use "-n" to remove the hostname tag from the output.
  # Example arguments: ["-n"]
  # Use "-d" to specify the devices to monitor. -d must be followed by a string
  # in the following format: [f] or [g[:numeric_range][+]][i[:numeric_range]]
  # Where a numeric range is something like 0-4 or 0,2,4, etc.
  # Example arguments: ["-d", "g+i"] to monitor all GPUs and GPU instances or
  # ["-d", "g:0-3"] to monitor GPUs 0-3.
  # Use "-m" to specify the namespace and name of a configmap containing
  # the watched exporter fields.
  # Example arguments: ["-m", "default:exporter-metrics-config-map"]

  # Overrides the chart's name
  nameOverride: "nvidia-dcgm-exporter"

  # Overrides the chart's computed fullname
  fullnameOverride: ""

  # Overrides the deployment namespace
  namespaceOverride: ""

  # Defines the runtime class that will be used by the pod
  runtimeClassName: ""
  # Defines serviceAccount names for components.
  serviceAccount:
    # Specifies whether a service account should be created
    create: true
    # Annotations to add to the service account
    annotations: {}
    # The name of the service account to use.
    # If not set and create is true, a name is generated using the fullname template
    name: ""

  rollingUpdate:
    # Specifies maximum number of DaemonSet pods that can be unavailable during the update
    maxUnavailable: 1
    # Specifies maximum number of nodes with an existing available DaemonSet pod that can have an
    # updated DaemonSet pod during an update
    maxSurge: 0

  # Labels to be added to dcgm-exporter pods
  podLabels: {}

  # Annotations to be added to dcgm-exporter pods
  podAnnotations: {}
  # These annotations are required for Prometheus scraping:
  #   prometheus.io/scrape: "true"
  #   prometheus.io/port: "9400"

  # The SecurityContext for the dcgm-exporter pods
  podSecurityContext: {}
  #   fsGroup: 2000

  # The SecurityContext for the dcgm-exporter containers
  # (runs as root, UID 0, with the SYS_ADMIN capability added)
  securityContext:
    runAsNonRoot: false
    runAsUser: 0
    capabilities:
      add: ["SYS_ADMIN"]
    # readOnlyRootFilesystem: true

  # Defines the dcgm-exporter service
  service:
    # When enabled, the helm chart will create service
    enable: true
    type: ClusterIP
    clusterIP: ""
    port: 9400
    address: ":9400"
    # Annotations to add to the service
    annotations: {}

  # Allows to control pod resources
  resources: {}
  #   limits:
  #     cpu: 100m
  #     memory: 128Mi
  #   requests:
  #     cpu: 100m
  #     memory: 128Mi
  serviceMonitor:
    apiVersion: "monitoring.coreos.com/v1"
    enabled: true
    interval: 15s
    honorLabels: false
    additionalLabels: {}
    #   monitoring: prometheus
    relabelings: []
    #   - sourceLabels: [__meta_kubernetes_pod_node_name]
    #     separator: ;
    #     regex: ^(.*)$
    #     targetLabel: nodename
    #     replacement: $1
    #     action: replace

  nodeSelector: {}
  #   node: gpu

  tolerations: []
  #   - operator: Exists

  affinity: {}
  #   nodeAffinity:
  #     requiredDuringSchedulingIgnoredDuringExecution:
  #       nodeSelectorTerms:
  #         - matchExpressions:
  #             - key: nvidia-gpu
  #               operator: Exists

  extraHostVolumes: []
  #   - name: host-binaries
  #     hostPath: /opt/bin

  # Exposes the "metrics" key of exporter-metrics-config-map as default-counters.csv.
  extraConfigMapVolumes:
    - name: exporter-metrics-volume
      configMap:
        name: exporter-metrics-config-map
        items:
          - key: metrics
            path: default-counters.csv

  # Mounts that single file over the path passed to "-f" in arguments above.
  extraVolumeMounts:
    - name: exporter-metrics-volume
      mountPath: /etc/dcgm-exporter/default-counters.csv
      subPath: default-counters.csv

  extraEnv: []
  #   - name: EXTRA_VAR
  #     value: "TheStringValue"

  # Path to the kubelet socket for /pod-resources
  kubeletPath: "/var/lib/kubelet/pod-resources"

  # HTTPS configuration
  tlsServerConfig:
    # Enable or disable HTTPS configuration
    enabled: false
    # Use autogenerated self-signed TLS certificates. Not recommended for production environments.
    autoGenerated: true
    # Existing secret containing your own server key and certificate
    existingSecret: ""
    # Certificate file name
    certFilename: "tls.crt"
    # Key file name
    keyFilename: "tls.key"
    # CA certificate file name
    caFilename: "ca.crt"
    # Server policy for client authentication. Maps to ClientAuth Policies.
    # For more detail on clientAuth options:
    # https://golang.org/pkg/crypto/tls/#ClientAuthType
    #
    # NOTE: If you want to enable client authentication, you need to use
    # RequireAndVerifyClientCert. Other values are insecure.
    clientAuthType: ""
    # TLS Key for HTTPS - ignored if existingSecret is provided
    key: ""
    # TLS Certificate for HTTPS - ignored if existingSecret is provided
    cert: ""
    # CA Certificate for HTTPS - ignored if existingSecret is provided
    ca: ""

  basicAuth:
    # Object containing <user>:<password> key-value pairs for each user that will have access via basic authentication
    users: {}

  # Customized list of metrics to emit. Expected to be in the same format (CSV) as the default list.
  # Must be the complete list and is not additive. If unset, the default list will take effect.
  # customMetrics: |
  #   Format
  #   If line starts with a '#' it is considered a comment
  #   DCGM FIELD, Prometheus metric type, help message
# Values for the HAMi web UI (frontend + backend containers).
# NOTE(review): nesting was reconstructed from a copy whose indentation was lost; every key through
# externalPrometheus is assumed to belong to webui - confirm against the templates.
webui:
  replicaCount: 1

  # Per-vendor node selectors, written as "<label>=<value>".
  vendorNodeSelectors:
    NVIDIA: gpu.bytetrade.io/cuda-supported=true
    Ascend: ascend=on
    DCU: dcu=on
    MLU: mlu=on

  image:
    frontend:
      repository: projecthami/hami-webui-fe-oss
      pullPolicy: IfNotPresent
      # Overrides the image tag whose default is the chart appVersion.
      tag: "v1.0.5"
    backend:
      repository: projecthami/hami-webui-be-oss
      pullPolicy: IfNotPresent
      tag: "v1.0.5"

  imagePullSecrets: []
  nameOverride: "webui"
  fullnameOverride: ""
  namespaceOverride: ""

  serviceAccount:
    # Specifies whether a service account should be created
    create: true
    # Annotations to add to the service account
    annotations: {}
    # The name of the service account to use.
    # If not set and create is true, a name is generated using the fullname template
    name: ""

  podAnnotations: {}

  podSecurityContext: {}
  #   fsGroup: 2000

  securityContext: {}
  #   capabilities:
  #     drop:
  #       - ALL
  #   readOnlyRootFilesystem: true
  #   runAsNonRoot: true
  #   runAsUser: 1000

  service:
    type: ClusterIP
    port: 3000

  ingress:
    enabled: false
    className: ""
    annotations: {}
    #   kubernetes.io/ingress.class: nginx
    #   kubernetes.io/tls-acme: "true"
    hosts:
      - host: chart-example.local
        paths:
          - path: /
            pathType: ImplementationSpecific
    tls: []
    #   - secretName: chart-example-tls
    #     hosts:
    #       - chart-example.local

  # Resource requests/limits are set per container, with requests equal to limits.
  # Adjust or remove them for environments with little resources, such as Minikube.
  resources:
    frontend:
      limits:
        cpu: 200m
        memory: 500Mi
      requests:
        cpu: 200m
        memory: 500Mi
    backend:
      limits:
        cpu: 50m
        memory: 250Mi
      requests:
        cpu: 50m
        memory: 250Mi

  # Extra environment variables per container; both default to the Asia/Shanghai time zone.
  env:
    frontend:
      - name: TZ
        value: "Asia/Shanghai"
    backend:
      - name: TZ
        value: "Asia/Shanghai"

  serviceMonitor:
    enabled: true
    interval: 15s
    honorLabels: false
    additionalLabels:
      jobRelease: hami-webui-prometheus
    relabelings: []

  hamiServiceMonitor:
    enabled: true
    interval: 15s
    honorLabels: false
    additionalLabels:
      jobRelease: hami-webui-prometheus
    svcNamespace: kube-system
    relabelings: []

  nodeSelector: {}

  tolerations: []

  affinity: {}

  # NOTE(review): presumably the Prometheus endpoint the web UI queries when enabled - confirm.
  externalPrometheus:
    address: "http://prometheus-k8s.kubesphere-monitoring-system:9090"
    enabled: true