# Source file: Olares/frameworks/GPU/config/gpu/templates/orion-server.yaml
# Listing timestamp: 2024-04-29 20:12:53 +08:00
# Listing size: 117 lines, 3.1 KiB (YAML)

---
# ConfigMap carrying the probe script that the orionx-server DaemonSet
# mounts at /tmp/probe.sh and runs as its liveness check.
apiVersion: v1
kind: ConfigMap
metadata:
  name: probe.orionx-server
  namespace: {{ .Release.Namespace }}
data:
  # The script exits 1 (after printing a message) when `pidof oriond`
  # returns nothing; otherwise it falls through with the status of the
  # final `if` statement, i.e. success.
  probe.sh: |
    #!/bin/bash
    # Author: yuanzhilei@virtaitech.com
    #
    oriond=$(pidof oriond)
    if [[ -z $oriond ]]; then
      echo "oriond not exists."
      exit 1
    fi
---
# DaemonSet for the OrionX server container. The pod shares the host's
# network, IPC and PID namespaces and runs privileged (see the pod spec
# below). Its liveness is checked with the probe.sh script supplied by the
# `probe.orionx-server` ConfigMap.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: orionx-server
  namespace: {{ .Release.Namespace }}
  labels:
    k8s-app: virtaitech
  annotations:
    # Annotation values must be strings; `quote` keeps a number- or
    # boolean-looking chart value from being rendered as a non-string.
    gpu-server: {{ .Values.gpu.server | quote }}
spec:
  selector:
    matchLabels:
      app: orionx-server
  updateStrategy:
    type: RollingUpdate
  template:
    metadata:
      labels:
        app: orionx-server
    spec:
      priorityClassName: system-cluster-critical
      tolerations:
        # Legacy control-plane taint key.
        - key: node-role.kubernetes.io/master
          effect: NoSchedule
        # Replacement taint key used by newer Kubernetes releases; tolerated
        # as well so the pod can still be scheduled on control-plane nodes.
        - key: node-role.kubernetes.io/control-plane
          effect: NoSchedule
      hostNetwork: true
      # Keeps cluster DNS resolution working while hostNetwork is enabled.
      dnsPolicy: ClusterFirstWithHostNet
      hostIPC: true
      hostPID: true
      containers:
        - name: orionx-server
          image: eball/orionx-server:4.3.0-cu9.0-12.1-slim-bytetrade-20231211
          # command: ["bash", "-c"]
          # args: ["echo 'Sleeping for 3600 seconds ...'; sleep 3600s"]
          imagePullPolicy: Always
          securityContext:
            privileged: true
          env:
            # Env values must be strings, hence the explicit quoting of the
            # templated chart value.
            - name: ORION_CONTROLLER
              value: {{ .Values.gpu.server | quote }}
            # Populated from the IP of the node the pod is running on.
            - name: ORION_BIND_ADDR
              valueFrom:
                fieldRef:
                  fieldPath: status.hostIP
            # - name: ORION_BIND_NET
            #   value: "eth0"
            # - name: ORION_BIND_HOSTNAME
            #   value: "true"
            - name: ORION_SERVER_PORT
              value: "9960"
            - name: ORION_VGPU_COUNT
              value: "4"
            - name: ORION_LOG_LEVEL
              value: "INFO"
            - name: ORION_VGPU_EXPORTER_LISTEN_PORT
              value: "9401"
            - name: ORION_VGPU_EXPORTER_LISTEN_IP
              value: "0.0.0.0"
            - name: ORION_SERVICE_REGISTRY_ADDRESS
              value: {{ .Values.gpu.server | quote }}
            # - name: ENABLE_RDMA
            #   value: "true"
            # - name: ENABLE_COMPUTATION_THROTTLE
            #   value: "false"
            # - name: ORION_RDMA_NAME
            #   value: "mlx5_0"
            # - name: ORION_RDMA_PORT
            #   value: "1"
            # - name: ORION_RDMA_GID
            #   value: "3"
            # Populated from the name of the node the pod is scheduled on.
            - name: MY_NODE_NAME
              valueFrom:
                fieldRef:
                  fieldPath: spec.nodeName
            # $(MY_NODE_NAME) is expanded by Kubernetes; this entry must stay
            # after MY_NODE_NAME in the list for the reference to resolve.
            - name: ORION_SERVER_LABELS
              value: '{"pod_host_node": "$(MY_NODE_NAME)"}'
          # Runs the mounted probe script every 10 s, starting 50 s after the
          # container starts.
          livenessProbe:
            exec:
              command:
                - /tmp/probe.sh
            initialDelaySeconds: 50
            periodSeconds: 10
          volumeMounts:
            # subPath mounts only the probe.sh key as a single file.
            - name: probe
              mountPath: /tmp/probe.sh
              subPath: probe.sh
            - name: orioncomm
              mountPath: "/var/tmp/orion/comm/"
            - name: localtime
              mountPath: /etc/localtime
      volumes:
        - name: localtime
          hostPath:
            path: /etc/localtime
        - name: orioncomm
          hostPath:
            path: /var/tmp/orion/comm/
        - name: probe
          configMap:
            name: probe.orionx-server
            # Intentionally an unquoted octal integer (rwxr-xr-x) so the
            # probe script is executable; this field takes a number, not a
            # string.
            defaultMode: 0755