Compare commits

...

1 Commits

Author SHA1 Message Date
eball
0ee4459f99 hami: gpu slicing scheduler 2025-05-22 21:49:23 +08:00
5 changed files with 53 additions and 127 deletions

View File

@@ -0,0 +1,31 @@
# DaemonSet for the gpu-scheduler container. Pods are only placed on nodes
# carrying the label gpu.bytetrade.io/cuda-supported: 'true'.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: gpu-scheduler
  # Quoted so the rendered namespace is always a YAML string, even when the
  # release namespace is made up of digits only.
  namespace: "{{ .Release.Namespace }}"
spec:
  selector:
    matchLabels:
      name: gpu-scheduler
  template:
    metadata:
      labels:
        name: gpu-scheduler
    spec:
      priorityClassName: system-node-critical
      nodeSelector:
        gpu.bytetrade.io/cuda-supported: 'true'
      # Allow scheduling onto nodes tainted with nvidia.com/gpu:NoSchedule.
      tolerations:
        - key: nvidia.com/gpu
          operator: Exists
          effect: NoSchedule
      containers:
        - name: gpu-scheduler
          image: beclab/gpu-scheduler:v0.1.0
          imagePullPolicy: IfNotPresent
          ports:
            # Same port name and number as the gpu-scheduler Service.
            - name: ws
              containerPort: 6000
              protocol: TCP

View File

@@ -0,0 +1,22 @@
# Headless Service (clusterIP: None) selecting the pods labelled
# name: gpu-scheduler and exposing their "ws" port 6000.
apiVersion: v1
kind: Service
metadata:
  name: gpu-scheduler
  # Quoted so the rendered namespace is always a YAML string, even when the
  # release namespace is made up of digits only.
  namespace: "{{ .Release.Namespace }}"
spec:
  type: ClusterIP
  clusterIP: None
  clusterIPs:
    - None
  ipFamilyPolicy: SingleStack
  ipFamilies:
    - IPv4
  internalTrafficPolicy: Cluster
  sessionAffinity: None
  selector:
    name: gpu-scheduler
  ports:
    - name: ws
      protocol: TCP
      port: 6000
      targetPort: 6000

View File

@@ -1,42 +0,0 @@
# Copyright (c) 2023 Georgios Alexopoulos
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# A `ResourceQuota` object has to exist in the namespace before the K8s API
# server allows resources using the `system-node-critical` and
# `system-cluster-critical` PriorityClasses to be created there.
#
# This quota is scoped to the `system-cluster-critical` PriorityClass.
apiVersion: v1
kind: ResourceQuota
metadata:
  name: pods-system-cluster-critical
  namespace: nvshare-system
spec:
  scopeSelector:
    matchExpressions:
      - scopeName: PriorityClass
        operator: In
        values:
          - "system-cluster-critical"
---
# ResourceQuota in nvshare-system scoped to the `system-node-critical`
# PriorityClass.
apiVersion: v1
kind: ResourceQuota
metadata:
  name: pods-system-node-critical
  namespace: nvshare-system
spec:
  scopeSelector:
    matchExpressions:
      - scopeName: PriorityClass
        operator: In
        values:
          - "system-node-critical"

View File

@@ -1,19 +0,0 @@
# Copyright (c) 2023 Georgios Alexopoulos
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Namespace referenced by the nvshare ResourceQuota and DaemonSet manifests.
apiVersion: v1
kind: Namespace
metadata:
  name: nvshare-system

View File

@@ -1,66 +0,0 @@
# Copyright (c) 2023 Georgios Alexopoulos
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# DaemonSet for the nvshare-scheduler container. Pods are only placed on nodes
# carrying the label gpu.bytetrade.io/cuda-supported: 'true'. The host
# directory /var/run/nvshare is mounted into both the init container and the
# main container.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: nvshare-scheduler
  namespace: nvshare-system
spec:
  selector:
    matchLabels:
      name: nvshare-scheduler
  template:
    metadata:
      labels:
        name: nvshare-scheduler
    spec:
      priorityClassName: system-node-critical
      nodeSelector:
        gpu.bytetrade.io/cuda-supported: 'true'
      # Allow scheduling onto nodes tainted with nvidia.com/gpu:NoSchedule.
      tolerations:
        - key: nvidia.com/gpu
          operator: Exists
          effect: NoSchedule
      volumes:
        - name: nvshare-socket-directory
          hostPath:
            path: /var/run/nvshare
            type: DirectoryOrCreate
      initContainers:
        # Removes scheduler.sock when it exists as a *directory*; the trailing
        # `|| true` keeps the init container from failing when it does not.
        - name: init-dir
          image: busybox:1.28
          command:
            - sh
            - -c
            - "[ -d /var/run/nvshare/scheduler.sock ] && rm -rf /var/run/nvshare/scheduler.sock || true"
          volumeMounts:
            - name: nvshare-socket-directory
              mountPath: /var/run/nvshare
      containers:
        - name: nvshare-scheduler
          image: bytetrade/nvshare:nvshare-scheduler
          imagePullPolicy: IfNotPresent
          securityContext:
            allowPrivilegeEscalation: false
            capabilities:
              drop: ["ALL"]
          # Remove a leftover scheduler.sock before starting the scheduler.
          # `test -e` is used instead of `test -f` because `-f` is true only
          # for regular files and therefore never matches a Unix socket, which
          # would leave a stale socket in place.
          # NOTE(review): presumably nvshare-scheduler binds this path on
          # startup — confirm against the scheduler sources.
          command:
            - sh
            - -c
            - "test -e /var/run/nvshare/scheduler.sock && rm -rf /var/run/nvshare/scheduler.sock; pid1 nvshare-scheduler"
          volumeMounts:
            - name: nvshare-socket-directory
              mountPath: /var/run/nvshare