# Olares/frameworks/GPU/config/gpu/hami/values.yaml
# Default values for hami-vgpu.
nameOverride: ""
fullnameOverride: ""
imagePullSecrets: [ ]
version: "v2.5.0"
#Nvidia GPU Parameters
resourceName: "nvidia.com/gpu"
resourceMem: "nvidia.com/gpumem"
resourceMemPercentage: "nvidia.com/gpumem-percentage"
resourceCores: "nvidia.com/gpucores"
resourcePriority: "nvidia.com/priority"
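# Illustrative pod request built from the NVIDIA resource names above
# (per upstream HAMi conventions gpumem is in MiB and gpucores is a
# percentage of the card; verify against the deployed HAMi version):
#   resources:
#     limits:
#       nvidia.com/gpu: 1        # number of vGPUs
#       nvidia.com/gpumem: 3000  # device memory in MiB
#       nvidia.com/gpucores: 30  # percent of GPU cores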
#MLU Parameters
mluResourceName: "cambricon.com/vmlu"
mluResourceMem: "cambricon.com/mlu.smlu.vmemory"
mluResourceCores: "cambricon.com/mlu.smlu.vcore"
#Hygon DCU Parameters
dcuResourceName: "hygon.com/dcunum"
dcuResourceMem: "hygon.com/dcumem"
dcuResourceCores: "hygon.com/dcucores"
#Iluvatar GPU Parameters
iluvatarResourceName: "iluvatar.ai/vgpu"
iluvatarResourceMem: "iluvatar.ai/vcuda-memory"
iluvatarResourceCore: "iluvatar.ai/vcuda-core"
#Metax SGPU Parameters
metaxResourceName: "metax-tech.com/sgpu"
metaxResourceCore: "metax-tech.com/vcore"
metaxResourceMem: "metax-tech.com/vmemory"
schedulerName: "hami-scheduler"
podSecurityPolicy:
  enabled: false
global:
  gpuHookPath: /usr/local
  labels: {}
  annotations: {}
  managedNodeSelectorEnable: false
  managedNodeSelector:
    usage: "gpu"
scheduler:
  # @param nodeName pins the nvidia-vgpu-scheduler pod to a specific node.
  # If it is installed as the default scheduler, the k8s default scheduler pod
  # must first be removed from the cluster, and nodeName must be set so that
  # the scheduler pod itself skips the scheduling workflow.
  nodeName: ""
  #nodeLabelSelector:
  #  "gpu": "on"
  overwriteEnv: "false"
  defaultSchedulerPolicy:
    nodeSchedulerPolicy: binpack
    gpuSchedulerPolicy: spread
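  # These defaults can reportedly be overridden per pod via the upstream HAMi
  # annotations (assumed keys; verify against the HAMi version in use):
  #   metadata:
  #     annotations:
  #       hami.io/node-scheduler-policy: "spread"
  #       hami.io/gpu-scheduler-policy: "binpack"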
  metricsBindAddress: ":9395"
  livenessProbe: false
  leaderElect: true
  # replicas takes effect only when leaderElect is true; otherwise it is fixed at 1.
  replicas: 1
  kubeScheduler:
    # @param enabled indicates whether to run the kube-scheduler container in the scheduler pod; true by default.
    enabled: true
    image: registry.k8s.io/kube-scheduler
    imageTag: ""
    imagePullPolicy: IfNotPresent
    resources: {}
    # If you do want to specify resources, uncomment the following lines, adjust them as necessary,
    # and remove the curly braces after 'resources:'.
    #   limits:
    #     cpu: 1000m
    #     memory: 1000Mi
    #   requests:
    #     cpu: 100m
    #     memory: 100Mi
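    # Note (assumption based on kube-scheduler history): extraNewArgs matches
    # releases that consume a --config file, while extraArgs targets older
    # releases that still accept the since-removed --policy-config-file flag.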
    extraNewArgs:
      - --config=/config/config.yaml
      - -v=4
    extraArgs:
      - --policy-config-file=/config/config.json
      - -v=4
  extender:
    image: "beclab/hami"
    imagePullPolicy: IfNotPresent
    resources: {}
    # If you do want to specify resources, uncomment the following lines, adjust them as necessary,
    # and remove the curly braces after 'resources:'.
    #   limits:
    #     cpu: 1000m
    #     memory: 1000Mi
    #   requests:
    #     cpu: 100m
    #     memory: 100Mi
    extraArgs:
      - --debug
      - -v=4
  podAnnotations: {}
  tolerations: []
  #serviceAccountName: "hami-vgpu-scheduler-sa"
  admissionWebhook:
    customURL:
      enabled: false
      # must be an endpoint served over https;
      # host certs should be generated for it.
      host: 127.0.0.1 # hostname or IP; can be your node's IP if you want to use https://<nodeIP>:<schedulerPort>/<path>
      port: 31998
      path: /webhook
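      # With the values above the webhook URL composes to
      # https://127.0.0.1:31998/webhook (illustrative; customURL is disabled
      # by default and host must be reachable over https before enabling it).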
    whitelistNamespaces:
      # Specify the namespaces that the webhook will not be applied to.
      # - default
      # - kube-system
      # - istio-system
    reinvocationPolicy: Never
    failurePolicy: Ignore
  patch:
    image: jettech/kube-webhook-certgen:v1.5.2
    imageNew: liangjw/kube-webhook-certgen:v1.1.1
    imagePullPolicy: IfNotPresent
    priorityClassName: ""
    podAnnotations: {}
    nodeSelector: {}
    tolerations: []
    runAsUser: 2000
  service:
    type: NodePort # Default type is NodePort, can be changed to ClusterIP
    httpPort: 443 # HTTP port
    schedulerPort: 31998 # NodePort for HTTP
    monitorPort: 31993 # Monitoring port
    labels: {}
    annotations: {}
devicePlugin:
  image: "beclab/hami"
  monitorimage: "beclab/hami"
  monitorctrPath: /usr/local/vgpu/containers
  imagePullPolicy: IfNotPresent
  deviceSplitCount: 100
  deviceMemoryScaling: 100
  deviceCoreScaling: 100
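  # How these knobs interact, following upstream HAMi semantics (verify for
  # this fork): deviceSplitCount caps how many tasks may share one physical
  # GPU, while deviceMemoryScaling and deviceCoreScaling are overcommit
  # multipliers on the advertised capacity. For example, a scaling factor of 2
  # would advertise a 24 GiB card as 48 GiB of nvidia.com/gpumem.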
  runtimeClassName: ""
  migStrategy: "none"
  disablecorelimit: "false"
  passDeviceSpecsEnabled: false
  extraArgs:
    - -v=4
  service:
    type: NodePort # Default type is NodePort, can be changed to ClusterIP
    httpPort: 31992
    labels: {}
    annotations: {}
  pluginPath: /var/lib/kubelet/device-plugins
  libPath: /usr/local/vgpu
  podAnnotations: {}
  nvidianodeSelector:
    gpu.bytetrade.io/cuda-supported: 'true'
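  # The DaemonSet only lands on nodes carrying the selector label above; a node
  # can be opted in with, for example:
  #   kubectl label node <node-name> gpu.bytetrade.io/cuda-supported=true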
  tolerations: []
  # The updateStrategy for the DevicePlugin DaemonSet.
  # To update the DaemonSet manually, set type to "OnDelete".
  # We recommend the OnDelete strategy, because restarting a DevicePlugin pod
  # also restarts the business pods on that node, which is disruptive.
  # Otherwise, use the RollingUpdate strategy to roll DevicePlugin pods.
  updateStrategy:
    type: RollingUpdate
    rollingUpdate:
      maxUnavailable: 1
  resources: {}
  # If you do want to specify resources, uncomment the following lines, adjust them as necessary,
  # and remove the curly braces after 'resources:'.
  #   limits:
  #     cpu: 1000m
  #     memory: 1000Mi
  #   requests:
  #     cpu: 100m
  #     memory: 100Mi
vgpuMonitor:
  resources: {}
  # If you do want to specify resources, uncomment the following lines, adjust them as necessary,
  # and remove the curly braces after 'resources:'.
  #   limits:
  #     cpu: 1000m
  #     memory: 1000Mi
  #   requests:
  #     cpu: 100m
  #     memory: 100Mi
devices:
  mthreads:
    enabled: false
    customresources:
      - mthreads.com/vgpu
  nvidia:
    gpuCorePolicy: default
  ascend:
    enabled: false
    image: ""
    imagePullPolicy: IfNotPresent
    extraArgs: []
    nodeSelector:
      ascend: "on"
    tolerations: []
    customresources:
      - huawei.com/Ascend910A
      - huawei.com/Ascend910A-memory
      - huawei.com/Ascend910B2
      - huawei.com/Ascend910B2-memory
      - huawei.com/Ascend910B
      - huawei.com/Ascend910B-memory
      - huawei.com/Ascend910B4
      - huawei.com/Ascend910B4-memory
      - huawei.com/Ascend310P
      - huawei.com/Ascend310P-memory
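    # Illustrative request against one of the Ascend resources above (resource
    # names and memory units follow upstream HAMi conventions; verify before use):
    #   resources:
    #     limits:
    #       huawei.com/Ascend910B: 1
    #       huawei.com/Ascend910B-memory: 2000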