# Source: Olares/frameworks/GPU/config/gpu/hami/values.yaml
# Default values for hami-vgpu.
nameOverride: ""
fullnameOverride: ""
imagePullSecrets: []
version: "v2.5.1"
# Nvidia GPU Parameters
resourceName: "nvidia.com/gpu"
resourceMem: "nvidia.com/gpumem"
resourceMemPercentage: "nvidia.com/gpumem-percentage"
resourceCores: "nvidia.com/gpucores"
resourcePriority: "nvidia.com/priority"
# MLU Parameters
mluResourceName: "cambricon.com/vmlu"
mluResourceMem: "cambricon.com/mlu.smlu.vmemory"
mluResourceCores: "cambricon.com/mlu.smlu.vcore"
# Hygon DCU Parameters
dcuResourceName: "hygon.com/dcunum"
dcuResourceMem: "hygon.com/dcumem"
dcuResourceCores: "hygon.com/dcucores"
# Iluvatar GPU Parameters
iluvatarResourceName: "iluvatar.ai/vgpu"
iluvatarResourceMem: "iluvatar.ai/vcuda-memory"
iluvatarResourceCore: "iluvatar.ai/vcuda-core"
# Metax SGPU Parameters
metaxResourceName: "metax-tech.com/sgpu"
metaxResourceCore: "metax-tech.com/vcore"
metaxResourceMem: "metax-tech.com/vmemory"
schedulerName: "hami-scheduler"
podSecurityPolicy:
  enabled: false
global:
  gpuHookPath: /usr/local
  labels: {}
  annotations: {}
  managedNodeSelectorEnable: false
  managedNodeSelector:
    usage: "gpu"
scheduler:
  # @param nodeName defines the node name and the nvidia-vgpu-scheduler-scheduler will schedule to the node.
  # if we install the nvidia-vgpu-scheduler-scheduler as default scheduler, we need to remove the k8s default
  # scheduler pod from the cluster first, we must specify node name to skip the schedule workflow.
  nodeName: ""
  # nodeLabelSelector:
  #   "gpu": "on"
  overwriteEnv: "false"
  defaultSchedulerPolicy:
    nodeSchedulerPolicy: binpack
    gpuSchedulerPolicy: spread
  metricsBindAddress: ":9395"
  livenessProbe: false
  leaderElect: true
  # when leaderElect is true, replicas is available, otherwise replicas is 1.
  replicas: 1
  kubeScheduler:
    # @param enabled indicate whether to run kube-scheduler container in the scheduler pod, it's true by default.
    enabled: true
    image: registry.k8s.io/kube-scheduler
    imageTag: ""
    imagePullPolicy: IfNotPresent
    resources: {}
    # If you do want to specify resources, uncomment the following lines, adjust them as necessary,
    # and remove the curly braces after 'resources:'.
    #   limits:
    #     cpu: 1000m
    #     memory: 1000Mi
    #   requests:
    #     cpu: 100m
    #     memory: 100Mi
    extraNewArgs:
      - --config=/config/config.yaml
      - -v=4
    extraArgs:
      - --policy-config-file=/config/config.json
      - -v=4
  extender:
    image: "beclab/hami"
    imagePullPolicy: IfNotPresent
    resources: {}
    # If you do want to specify resources, uncomment the following lines, adjust them as necessary,
    # and remove the curly braces after 'resources:'.
    #   limits:
    #     cpu: 1000m
    #     memory: 1000Mi
    #   requests:
    #     cpu: 100m
    #     memory: 100Mi
    extraArgs:
      - --debug
      - -v=4
  podAnnotations: {}
  tolerations: []
  # serviceAccountName: "hami-vgpu-scheduler-sa"
  admissionWebhook:
    customURL:
      enabled: false
      # must be an endpoint using https.
      # should generate host certs here
      host: 127.0.0.1  # hostname or ip, can be your node's IP if you want to use https://<nodeIP>:<schedulerPort>/<path>
      port: 31998
      path: /webhook
    whitelistNamespaces:
    # Specify the namespaces that the webhook will not be applied to.
    # - default
    # - kube-system
    # - istio-system
    reinvocationPolicy: Never
    failurePolicy: Ignore
  patch:
    image: jettech/kube-webhook-certgen:v1.5.2
    imageNew: liangjw/kube-webhook-certgen:v1.1.1
    imagePullPolicy: IfNotPresent
    priorityClassName: ""
    podAnnotations: {}
    nodeSelector: {}
    tolerations: []
    runAsUser: 2000
  service:
    type: ClusterIP  # Default type is NodePort, can be changed to ClusterIP
    httpPort: 443  # HTTP port
    schedulerPort: 31998  # NodePort for HTTP
    monitorPort: 31993  # Monitoring port
    labels: {}
    annotations: {}
devicePlugin:
  image: "beclab/hami"
  monitorimage: "beclab/hami"
  monitorctrPath: /usr/local/vgpu/containers
  imagePullPolicy: IfNotPresent
  deviceSplitCount: 100
  deviceMemoryScaling: 100
  deviceCoreScaling: 100
  runtimeClassName: ""
  migStrategy: "none"
  disablecorelimit: "false"
  passDeviceSpecsEnabled: false
  extraArgs:
    - -v=4
  service:
    type: ClusterIP  # Default type is NodePort, can be changed to ClusterIP
    httpPort: 31992
    labels: {}
    annotations: {}
  pluginPath: /var/lib/kubelet/device-plugins
  libPath: /usr/local/vgpu
  podAnnotations: {}
  nvidianodeSelector:
    gpu.bytetrade.io/cuda-supported: 'true'
  tolerations: []
  # The updateStrategy for DevicePlugin DaemonSet.
  # If you want to update the DaemonSet by manual, set type as "OnDelete".
  # We recommend use OnDelete update strategy because DevicePlugin pod restart will cause business pod restart, this behavior is destructive.
  # Otherwise, you can use RollingUpdate update strategy to rolling update DevicePlugin pod.
  updateStrategy:
    type: RollingUpdate
    rollingUpdate:
      maxUnavailable: 1
  resources: {}
  # If you do want to specify resources, uncomment the following lines, adjust them as necessary,
  # and remove the curly braces after 'resources:'.
  #   limits:
  #     cpu: 1000m
  #     memory: 1000Mi
  #   requests:
  #     cpu: 100m
  #     memory: 100Mi
  # NOTE(review): vgpuMonitor nests under devicePlugin in the upstream HAMi chart — confirm against templates
  vgpuMonitor:
    resources: {}
    # If you do want to specify resources, uncomment the following lines, adjust them as necessary,
    # and remove the curly braces after 'resources:'.
    #   limits:
    #     cpu: 1000m
    #     memory: 1000Mi
    #   requests:
    #     cpu: 100m
    #     memory: 100Mi
devices:
  mthreads:
    enabled: false
    customresources:
      - mthreads.com/vgpu
  nvidia:
    gpuCorePolicy: default
  ascend:
    enabled: false
    image: ""
    imagePullPolicy: IfNotPresent
    extraArgs: []
    nodeSelector:
      ascend: "on"
    tolerations: []
    customresources:
      - huawei.com/Ascend910A
      - huawei.com/Ascend910A-memory
      - huawei.com/Ascend910B2
      - huawei.com/Ascend910B2-memory
      - huawei.com/Ascend910B
      - huawei.com/Ascend910B-memory
      - huawei.com/Ascend910B4
      - huawei.com/Ascend910B4-memory
      - huawei.com/Ascend310P
      - huawei.com/Ascend310P-memory
dcgmExporter:
  image:
    repository: nvidia/dcgm-exporter
    pullPolicy: IfNotPresent
    tag: 4.1.1-4.0.4-ubuntu22.04
  # Change the following reference to "/etc/dcgm-exporter/default-counters.csv"
  # to stop profiling metrics from DCGM
  arguments: ["-f", "/etc/dcgm-exporter/default-counters.csv"]
  # NOTE: in general, add any command line arguments to arguments above
  # and they will be passed through.
  # Use "-r", "<HOST>:<PORT>" to connect to an already running hostengine
  # Example arguments: ["-r", "host123:5555"]
  # Use "-n" to remove the hostname tag from the output.
  # Example arguments: ["-n"]
  # Use "-d" to specify the devices to monitor. -d must be followed by a string
  # in the following format: [f] or [g[:numeric_range][+]][i[:numeric_range]]
  # Where a numeric range is something like 0-4 or 0,2,4, etc.
  # Example arguments: ["-d", "g+i"] to monitor all GPUs and GPU instances or
  # ["-d", "g:0-3"] to monitor GPUs 0-3.
  # Use "-m" to specify the namespace and name of a configmap containing
  # the watched exporter fields.
  # Example arguments: ["-m", "default:exporter-metrics-config-map"]
  # Overrides the chart's name
  nameOverride: "nvidia-dcgm-exporter"
  # Overrides the chart's computed fullname
  fullnameOverride: ""
  # Overrides the deployment namespace
  namespaceOverride: ""
  # Defines the runtime class that will be used by the pod
  runtimeClassName: ""
  # Defines serviceAccount names for components.
  serviceAccount:
    # Specifies whether a service account should be created
    create: true
    # Annotations to add to the service account
    annotations: {}
    # The name of the service account to use.
    # If not set and create is true, a name is generated using the fullname template
    name:
  rollingUpdate:
    # Specifies maximum number of DaemonSet pods that can be unavailable during the update
    maxUnavailable: 1
    # Specifies maximum number of nodes with an existing available DaemonSet pod that can have an updated DaemonSet pod during an update
    maxSurge: 0
  # Labels to be added to dcgm-exporter pods
  podLabels: {}
  # Annotations to be added to dcgm-exporter pods
  # Using the prometheus annotations below is required for prometheus scraping
  # prometheus.io/scrape: "true"
  # prometheus.io/port: "9400"
  podAnnotations: {}
  # The SecurityContext for the dcgm-exporter pods
  podSecurityContext: {}
  # fsGroup: 2000
  # The SecurityContext for the dcgm-exporter containers
  securityContext:
    runAsNonRoot: false
    runAsUser: 0
    capabilities:
      add: ["SYS_ADMIN"]
    # readOnlyRootFilesystem: true
  # Defines the dcgm-exporter service
  service:
    # When enabled, the helm chart will create service
    enable: true
    type: ClusterIP
    clusterIP: ""
    port: 9400
    address: ":9400"
    # Annotations to add to the service
    annotations: {}
  # Allows to control pod resources
  resources: {}
  # limits:
  #   cpu: 100m
  #   memory: 128Mi
  # requests:
  #   cpu: 100m
  #   memory: 128Mi
  serviceMonitor:
    apiVersion: "monitoring.coreos.com/v1"
    enabled: true
    interval: 15s
    honorLabels: false
    additionalLabels: {}
    # monitoring: prometheus
    relabelings: []
    # - sourceLabels: [__meta_kubernetes_pod_node_name]
    #   separator: ;
    #   regex: ^(.*)$
    #   targetLabel: nodename
    #   replacement: $1
    #   action: replace
  nodeSelector: {}
  # node: gpu
  tolerations: []
  # - operator: Exists
  affinity: {}
  # nodeAffinity:
  #   requiredDuringSchedulingIgnoredDuringExecution:
  #     nodeSelectorTerms:
  #       - matchExpressions:
  #           - key: nvidia-gpu
  #             operator: Exists
  extraHostVolumes: []
  # - name: host-binaries
  #   hostPath: /opt/bin
  extraConfigMapVolumes:
    - name: exporter-metrics-volume
      configMap:
        name: exporter-metrics-config-map
        items:
          - key: metrics
            path: default-counters.csv
  extraVolumeMounts:
    - name: exporter-metrics-volume
      mountPath: /etc/dcgm-exporter/default-counters.csv
      subPath: default-counters.csv
  extraEnv: []
  # - name: EXTRA_VAR
  #   value: "TheStringValue"
  # Path to the kubelet socket for /pod-resources
  kubeletPath: "/var/lib/kubelet/pod-resources"
  # HTTPS configuration
  tlsServerConfig:
    # Enable or disable HTTPS configuration
    enabled: false
    # Use autogenerated self-signed TLS certificates. Not recommended for production environments.
    autoGenerated: true
    # Existing secret containing your own server key and certificate
    existingSecret: ""
    # Certificate file name
    certFilename: "tls.crt"
    # Key file name
    keyFilename: "tls.key"
    # CA certificate file name
    caFilename: "ca.crt"
    # Server policy for client authentication. Maps to ClientAuth Policies.
    # For more detail on clientAuth options:
    # https://golang.org/pkg/crypto/tls/#ClientAuthType
    #
    # NOTE: If you want to enable client authentication, you need to use
    # RequireAndVerifyClientCert. Other values are insecure.
    clientAuthType: ""
    # TLS Key for HTTPS - ignored if existingSecret is provided
    key: ""
    # TLS Certificate for HTTPS - ignored if existingSecret is provided
    cert: ""
    # CA Certificate for HTTPS - ignored if existingSecret is provided
    ca: ""
  basicAuth:
    # Object containing <user>:<passwords> key-value pairs for each user that will have access via basic authentication
    users: {}
  # Customized list of metrics to emit. Expected to be in the same format (CSV) as the default list.
  # Must be the complete list and is not additive. If unset, the default list will take effect.
  # customMetrics: |
  #   Format
  #   If line starts with a '#' it is considered a comment
  #   DCGM FIELD, Prometheus metric type, help message
webui:
  replicaCount: 1
  vendorNodeSelectors:
    NVIDIA: gpu.bytetrade.io/cuda-supported=true
    Ascend: ascend=on
    DCU: dcu=on
    MLU: mlu=on
  image:
    frontend:
      repository: projecthami/hami-webui-fe-oss
      pullPolicy: IfNotPresent
      # Overrides the image tag whose default is the chart appVersion.
      tag: "v1.0.5"
    backend:
      repository: projecthami/hami-webui-be-oss
      pullPolicy: IfNotPresent
      tag: "v1.0.5"
  imagePullSecrets: []
  nameOverride: "webui"
  fullnameOverride: ""
  namespaceOverride: ""
  serviceAccount:
    # Specifies whether a service account should be created
    create: true
    # Annotations to add to the service account
    annotations: {}
    # The name of the service account to use.
    # If not set and create is true, a name is generated using the fullname template
    name: ""
  podAnnotations: {}
  podSecurityContext: {}
  # fsGroup: 2000
  securityContext: {}
  # capabilities:
  #   drop:
  #     - ALL
  # readOnlyRootFilesystem: true
  # runAsNonRoot: true
  # runAsUser: 1000
  service:
    type: ClusterIP
    port: 3000
  ingress:
    enabled: false
    className: ""
    annotations: {}
    # kubernetes.io/ingress.class: nginx
    # kubernetes.io/tls-acme: "true"
    hosts:
      - host: chart-example.local
        paths:
          - path: /
            pathType: ImplementationSpecific
    tls: []
    # - secretName: chart-example-tls
    #   hosts:
    #     - chart-example.local
  resources:
    frontend:
      limits:
        cpu: 200m
        memory: 500Mi
      requests:
        cpu: 200m
        memory: 500Mi
    backend:
      limits:
        cpu: 50m
        memory: 250Mi
      requests:
        cpu: 50m
        memory: 250Mi
    # We usually recommend not to specify default resources and to leave this as a conscious
    # choice for the user. This also increases chances charts run on environments with little
    # resources, such as Minikube. If you do want to specify resources, uncomment the following
    # lines, adjust them as necessary, and remove the curly braces after 'resources:'.
    #   limits:
    #     cpu: 100m
    #     memory: 128Mi
    #   requests:
    #     cpu: 100m
    #     memory: 128Mi
  env:
    frontend:
      - name: TZ
        value: "Asia/Shanghai"
    backend:
      - name: TZ
        value: "Asia/Shanghai"
  serviceMonitor:
    enabled: true
    interval: 15s
    honorLabels: false
    additionalLabels:
      jobRelease: hami-webui-prometheus
    relabelings: []
  hamiServiceMonitor:
    enabled: true
    interval: 15s
    honorLabels: false
    additionalLabels:
      jobRelease: hami-webui-prometheus
    # NOTE(review): svcNamespace placed as a sibling of additionalLabels (namespace of the HAMi scheduler
    # service the monitor targets) — confirm against the hami-webui chart templates
    svcNamespace: kube-system
    relabelings: []
  nodeSelector: {}
  tolerations: []
  affinity: {}
  externalPrometheus:
    address: "http://prometheus-k8s.kubesphere-monitoring-system:9090"
    enabled: true