Compare commits
8 Commits
module-app
...
appservice
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
f6435fdffb | ||
|
|
695abe7397 | ||
|
|
5f48862758 | ||
|
|
2e9a703621 | ||
|
|
a76f78f148 | ||
|
|
6528b324ad | ||
|
|
0ab4558345 | ||
|
|
b789e8320f |
@@ -32,11 +32,13 @@ import (
|
||||
admissionv1 "k8s.io/api/admission/v1"
|
||||
appsv1 "k8s.io/api/apps/v1"
|
||||
corev1 "k8s.io/api/core/v1"
|
||||
"k8s.io/apimachinery/pkg/api/resource"
|
||||
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
||||
"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
|
||||
"k8s.io/apimachinery/pkg/types"
|
||||
"k8s.io/client-go/dynamic"
|
||||
"k8s.io/klog/v2"
|
||||
"k8s.io/utils/ptr"
|
||||
"sigs.k8s.io/controller-runtime/pkg/webhook/admission"
|
||||
)
|
||||
|
||||
@@ -303,8 +305,23 @@ func (h *Handler) gpuLimitMutate(ctx context.Context, req *admissionv1.Admission
|
||||
if gpuRequired == nil {
|
||||
return resp
|
||||
}
|
||||
if annotations[applicationGpuInjectKey] != "true" {
|
||||
|
||||
var injectContainer []string
|
||||
injectAll := false
|
||||
if injectValue, ok := annotations[applicationGpuInjectKey]; !ok || injectValue == "false" || injectValue == "" {
|
||||
return resp
|
||||
} else {
|
||||
if injectValue != "true" {
|
||||
injectToken := strings.Split(injectValue, ",")
|
||||
for _, token := range injectToken {
|
||||
c := strings.TrimSpace(token)
|
||||
if c != "" {
|
||||
injectContainer = append(injectContainer, c)
|
||||
}
|
||||
}
|
||||
} else {
|
||||
injectAll = true
|
||||
}
|
||||
}
|
||||
|
||||
GPUType := appcfg.GetSelectedGpuTypeValue()
|
||||
@@ -321,27 +338,43 @@ func (h *Handler) gpuLimitMutate(ctx context.Context, req *admissionv1.Admission
|
||||
},
|
||||
}
|
||||
|
||||
patchBytes, err := webhook.CreatePatchForDeployment(tpl, h.getGPUResourceTypeKey(GPUType), envs)
|
||||
gpuRequiredValue := gpuRequired.Value() / 1024 / 1024 // HAMi gpu memory format
|
||||
hamiFormatGpuRequired := resource.NewQuantity(gpuRequiredValue, resource.DecimalSI)
|
||||
patchBytes, err := webhook.CreatePatchForDeployment(
|
||||
tpl,
|
||||
injectAll,
|
||||
injectContainer,
|
||||
h.getGPUResourceTypeKey(GPUType),
|
||||
ptr.To(hamiFormatGpuRequired.String()),
|
||||
envs,
|
||||
)
|
||||
if err != nil {
|
||||
klog.Errorf("create patch error %v", err)
|
||||
return h.sidecarWebhook.AdmissionError(req.UID, err)
|
||||
}
|
||||
klog.Info("patchBytes:", string(patchBytes))
|
||||
h.sidecarWebhook.PatchAdmissionResponse(resp, patchBytes)
|
||||
if len(patchBytes) > 0 {
|
||||
h.sidecarWebhook.PatchAdmissionResponse(resp, patchBytes)
|
||||
}
|
||||
return resp
|
||||
}
|
||||
|
||||
// FIXME: should not hardcode
|
||||
func (h *Handler) getGPUResourceTypeKey(gpuType string) string {
|
||||
switch gpuType {
|
||||
case utils.NvidiaCardType:
|
||||
return constants.NvidiaGPU
|
||||
case utils.GB10ChipType:
|
||||
return constants.NvidiaGB10GPU
|
||||
return constants.NvidiaGPU
|
||||
case utils.AmdApuCardType:
|
||||
return constants.AMDGPU
|
||||
case utils.AmdGpuCardType:
|
||||
return constants.AMDGPU
|
||||
|
||||
case utils.StrixHaloChipType:
|
||||
return constants.AMDGPU
|
||||
case utils.CPUType:
|
||||
klog.Info("CPU type is selected, no GPU resource will be injected")
|
||||
return ""
|
||||
default:
|
||||
return ""
|
||||
}
|
||||
|
||||
@@ -340,8 +340,8 @@ func GetClusterResource(kubeConfig *rest.Config, token string) (*prometheus.Clus
|
||||
arches.Insert(n.Labels["kubernetes.io/arch"])
|
||||
if quantity, ok := n.Status.Capacity[constants.NvidiaGPU]; ok {
|
||||
total += quantity.AsApproximateFloat64()
|
||||
} else if quantity, ok = n.Status.Capacity[constants.NvidiaGB10GPU]; ok {
|
||||
total += quantity.AsApproximateFloat64()
|
||||
// } else if quantity, ok = n.Status.Capacity[constants.NvidiaGB10GPU]; ok {
|
||||
// total += quantity.AsApproximateFloat64()
|
||||
} else if quantity, ok = n.Status.Capacity[constants.AMDAPU]; ok {
|
||||
total += quantity.AsApproximateFloat64()
|
||||
}
|
||||
|
||||
@@ -84,10 +84,11 @@ const (
|
||||
EnvGPUType = "GPU_TYPE"
|
||||
|
||||
// gpu resource keys
|
||||
NvidiaGPU = "nvidia.com/gpu"
|
||||
NvidiaGB10GPU = "nvidia.com/gb10"
|
||||
AMDAPU = "amd.com/apu"
|
||||
AMDGPU = "amd.com/gpu"
|
||||
NvidiaGPU = "nvidia.com/gpu"
|
||||
NvidiaGPUMem = "nvidia.com/gpumem"
|
||||
// NvidiaGB10GPU = "nvidia.com/gb10"
|
||||
AMDAPU = "amd.com/apu"
|
||||
AMDGPU = "amd.com/gpu"
|
||||
|
||||
AuthorizationLevelOfPublic = "public"
|
||||
AuthorizationLevelOfPrivate = "private"
|
||||
|
||||
@@ -236,7 +236,8 @@ func CheckAppRequirement(token string, appConfig *appcfg.ApplicationConfig, op v
|
||||
}
|
||||
|
||||
// only support nvidia gpu managment by HAMi for now
|
||||
if appConfig.Requirement.GPU != nil && appConfig.GetSelectedGpuTypeValue() == utils.NvidiaCardType {
|
||||
if appConfig.Requirement.GPU != nil &&
|
||||
(appConfig.GetSelectedGpuTypeValue() == utils.NvidiaCardType || appConfig.GetSelectedGpuTypeValue() == utils.GB10ChipType) {
|
||||
if !appConfig.Requirement.GPU.IsZero() && metrics.GPU.Total <= 0 {
|
||||
return constants.GPU, constants.SystemGPUNotAvailable, fmt.Errorf(constants.SystemGPUNotAvailableMessage, op)
|
||||
|
||||
@@ -400,9 +401,9 @@ func GetClusterResource(token string) (*prometheus.ClusterMetrics, []string, err
|
||||
arches.Insert(n.Labels["kubernetes.io/arch"])
|
||||
if quantity, ok := n.Status.Capacity[constants.NvidiaGPU]; ok {
|
||||
total += quantity.AsApproximateFloat64()
|
||||
} else if quantity, ok = n.Status.Capacity[constants.NvidiaGB10GPU]; ok {
|
||||
total += quantity.AsApproximateFloat64()
|
||||
} else if quantity, ok = n.Status.Capacity[constants.AMDAPU]; ok {
|
||||
// } else if quantity, ok = n.Status.Capacity[constants.NvidiaGB10GPU]; ok {
|
||||
// total += quantity.AsApproximateFloat64()
|
||||
} else if quantity, ok = n.Status.Capacity[constants.AMDGPU]; ok {
|
||||
total += quantity.AsApproximateFloat64()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -5,8 +5,10 @@ const (
|
||||
)
|
||||
|
||||
const (
|
||||
NvidiaCardType = "nvidia" // handling by HAMi
|
||||
AmdGpuCardType = "amd-gpu" //
|
||||
AmdApuCardType = "amd-apu" // AMD APU with integrated GPU , AI Max 395 etc.
|
||||
GB10ChipType = "nvidia-gb10" // NVIDIA GB10 Superchip & unified system memory
|
||||
CPUType = "cpu" // force to use CPU, no GPU
|
||||
NvidiaCardType = "nvidia" // handling by HAMi
|
||||
AmdGpuCardType = "amd-gpu" //
|
||||
AmdApuCardType = "amd-apu" // AMD APU with integrated GPU , AI Max 395 etc.
|
||||
GB10ChipType = "nvidia-gb10" // NVIDIA GB10 Superchip & unified system memory
|
||||
StrixHaloChipType = "strix-halo" // AMD Strix Halo GPU & unified system memory
|
||||
)
|
||||
|
||||
@@ -543,21 +543,23 @@ type EnvKeyValue struct {
|
||||
}
|
||||
|
||||
// CreatePatchForDeployment add gpu env for deployment and returns patch bytes.
|
||||
func CreatePatchForDeployment(tpl *corev1.PodTemplateSpec, typeKey string, envKeyValues []EnvKeyValue) ([]byte, error) {
|
||||
patches, err := addResourceLimits(tpl, typeKey, envKeyValues)
|
||||
func CreatePatchForDeployment(tpl *corev1.PodTemplateSpec, injectAll bool, injectContainer []string, gpuTypeKey string, gpumem *string, envKeyValues []EnvKeyValue) ([]byte, error) {
|
||||
patches, err := addGpuResourceLimits(tpl, injectAll, injectContainer, gpuTypeKey, gpumem)
|
||||
if err != nil {
|
||||
return []byte{}, err
|
||||
}
|
||||
patches = append(patches, addEnvToPatch(tpl, envKeyValues)...)
|
||||
return json.Marshal(patches)
|
||||
}
|
||||
|
||||
func addResourceLimits(tpl *corev1.PodTemplateSpec, typeKey string, envKeyValues []EnvKeyValue) (patch []patchOp, err error) {
|
||||
func addGpuResourceLimits(tpl *corev1.PodTemplateSpec, injectAll bool, injectContainer []string, typeKey string, gpumem *string) (patch []patchOp, err error) {
|
||||
if typeKey == "" {
|
||||
klog.Warning("No gpu type selected, skip adding resource limits")
|
||||
return patch, nil
|
||||
}
|
||||
|
||||
if typeKey == constants.NvidiaGPU || typeKey == constants.NvidiaGB10GPU {
|
||||
// add runtime class for nvidia gpu, HAMi runtime class is "nvidia"
|
||||
if typeKey == constants.NvidiaGPU {
|
||||
if tpl.Spec.RuntimeClassName != nil {
|
||||
patch = append(patch, patchOp{
|
||||
Op: constants.PatchOpReplace,
|
||||
@@ -575,21 +577,31 @@ func addResourceLimits(tpl *corev1.PodTemplateSpec, typeKey string, envKeyValues
|
||||
|
||||
for i := range tpl.Spec.Containers {
|
||||
container := tpl.Spec.Containers[i]
|
||||
if !injectAll && !funk.Contains(injectContainer, container.Name) {
|
||||
continue
|
||||
}
|
||||
|
||||
if len(container.Resources.Limits) == 0 {
|
||||
limitsValues := map[string]interface{}{
|
||||
typeKey: "1",
|
||||
}
|
||||
|
||||
if gpumem != nil && *gpumem != "" && typeKey == constants.NvidiaGPU {
|
||||
limitsValues[constants.NvidiaGPUMem] = *gpumem
|
||||
}
|
||||
|
||||
patch = append(patch, patchOp{
|
||||
Op: constants.PatchOpAdd,
|
||||
Path: fmt.Sprintf(resourcePath, i),
|
||||
Value: map[string]interface{}{
|
||||
typeKey: "1",
|
||||
},
|
||||
Op: constants.PatchOpAdd,
|
||||
Path: fmt.Sprintf(resourcePath, i),
|
||||
Value: limitsValues,
|
||||
})
|
||||
|
||||
} else {
|
||||
t := make(map[string]map[string]string)
|
||||
t["limits"] = map[string]string{}
|
||||
for k, v := range container.Resources.Limits {
|
||||
if k.String() == constants.NvidiaGPU ||
|
||||
k.String() == constants.NvidiaGB10GPU ||
|
||||
k.String() == constants.NvidiaGPUMem ||
|
||||
k.String() == constants.AMDAPU {
|
||||
// unset all previous gpu limits
|
||||
continue
|
||||
@@ -597,12 +609,24 @@ func addResourceLimits(tpl *corev1.PodTemplateSpec, typeKey string, envKeyValues
|
||||
t["limits"][k.String()] = v.String()
|
||||
}
|
||||
t["limits"][typeKey] = "1"
|
||||
if gpumem != nil && *gpumem != "" && typeKey == constants.NvidiaGPU {
|
||||
t["limits"][constants.NvidiaGPUMem] = *gpumem
|
||||
}
|
||||
patch = append(patch, patchOp{
|
||||
Op: constants.PatchOpReplace,
|
||||
Path: fmt.Sprintf(resourcePath, i),
|
||||
Value: t["limits"],
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
return patch, nil
|
||||
}
|
||||
|
||||
func addEnvToPatch(tpl *corev1.PodTemplateSpec, envKeyValues []EnvKeyValue) (patch []patchOp) {
|
||||
for i := range tpl.Spec.Containers {
|
||||
container := tpl.Spec.Containers[i]
|
||||
|
||||
envNames := make([]string, 0)
|
||||
if len(container.Env) == 0 {
|
||||
value := make([]map[string]string, 0)
|
||||
@@ -643,7 +667,7 @@ func addResourceLimits(tpl *corev1.PodTemplateSpec, typeKey string, envKeyValues
|
||||
|
||||
}
|
||||
|
||||
return patch, nil
|
||||
return patch
|
||||
}
|
||||
|
||||
func genPatchesForEnv(op string, containerIdx, envIdx int, name, value string) (patch []patchOp) {
|
||||
|
||||
Reference in New Issue
Block a user