Compare commits

...

16 Commits

Author SHA1 Message Date
eball
f6435fdffb fix: revert unchanged file 2026-02-27 21:02:21 +08:00
eball
695abe7397 Revert "fix: revert unchanged file"
This reverts commit 5f48862758.
2026-02-27 21:00:37 +08:00
eball
5f48862758 fix: revert unchanged file 2026-02-27 20:59:01 +08:00
eball
2e9a703621 feat: adjust GPU memory format in deployment patching for compatibility 2026-02-27 20:50:51 +08:00
eball
a76f78f148 feat: update GPU resource patching to support selective container injection 2026-02-27 19:16:46 +08:00
eball
6528b324ad feat: enhance GPU resource management with memory limits and chip type handling 2026-02-27 13:53:49 +08:00
eball
0ab4558345 fix: handle CPU type selection in GPU resource management 2026-02-26 16:34:50 +08:00
eball
b789e8320f refactor: unify GPU resource handling and remove hardcoded values 2026-02-26 11:56:15 +08:00
hys
7048b58211 update appservice image tag to 0.5.3 2026-02-12 21:01:36 +08:00
hysyeah
40e8c87159 fix: stop app if it is hami cause unschedule no wait (#2531)
* fix: stop app if it is hami cause unschedule

* ignore param from req if size=0
2026-02-12 21:01:36 +08:00
hys
3c7440dfd3 fix: set amd apu/gpu limit key to amd.com/gpu 2026-02-12 21:01:36 +08:00
hys
360d30ce37 fix: add spec ports 2026-02-12 21:00:07 +08:00
hys
b9dc5db688 fix: check k8s request before into installing state 2026-02-12 21:00:07 +08:00
hys
1aad7ac347 fix: v2 app stop 2026-02-12 21:00:07 +08:00
hys
84e4cfdf75 fix: helm upgrade do not use atomic param and allow upgrade failed release 2026-02-12 21:00:07 +08:00
hys
7738eea929 fix: failed release upgrade 2026-02-12 21:00:07 +08:00
9 changed files with 132 additions and 34 deletions

View File

@@ -170,7 +170,7 @@ spec:
priorityClassName: "system-cluster-critical"
containers:
- name: app-service
image: beclab/app-service:0.5.2
image: beclab/app-service:0.5.3
imagePullPolicy: IfNotPresent
ports:
- containerPort: 6755

View File

@@ -32,11 +32,13 @@ import (
admissionv1 "k8s.io/api/admission/v1"
appsv1 "k8s.io/api/apps/v1"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
"k8s.io/apimachinery/pkg/types"
"k8s.io/client-go/dynamic"
"k8s.io/klog/v2"
"k8s.io/utils/ptr"
"sigs.k8s.io/controller-runtime/pkg/webhook/admission"
)
@@ -303,8 +305,23 @@ func (h *Handler) gpuLimitMutate(ctx context.Context, req *admissionv1.Admission
if gpuRequired == nil {
return resp
}
if annotations[applicationGpuInjectKey] != "true" {
var injectContainer []string
injectAll := false
if injectValue, ok := annotations[applicationGpuInjectKey]; !ok || injectValue == "false" || injectValue == "" {
return resp
} else {
if injectValue != "true" {
injectToken := strings.Split(injectValue, ",")
for _, token := range injectToken {
c := strings.TrimSpace(token)
if c != "" {
injectContainer = append(injectContainer, c)
}
}
} else {
injectAll = true
}
}
GPUType := appcfg.GetSelectedGpuTypeValue()
@@ -321,24 +338,43 @@ func (h *Handler) gpuLimitMutate(ctx context.Context, req *admissionv1.Admission
},
}
patchBytes, err := webhook.CreatePatchForDeployment(tpl, h.getGPUResourceTypeKey(GPUType), envs)
gpuRequiredValue := gpuRequired.Value() / 1024 / 1024 // HAMi gpu memory format
hamiFormatGpuRequired := resource.NewQuantity(gpuRequiredValue, resource.DecimalSI)
patchBytes, err := webhook.CreatePatchForDeployment(
tpl,
injectAll,
injectContainer,
h.getGPUResourceTypeKey(GPUType),
ptr.To(hamiFormatGpuRequired.String()),
envs,
)
if err != nil {
klog.Errorf("create patch error %v", err)
return h.sidecarWebhook.AdmissionError(req.UID, err)
}
klog.Info("patchBytes:", string(patchBytes))
h.sidecarWebhook.PatchAdmissionResponse(resp, patchBytes)
if len(patchBytes) > 0 {
h.sidecarWebhook.PatchAdmissionResponse(resp, patchBytes)
}
return resp
}
// FIXME: should not hardcode
func (h *Handler) getGPUResourceTypeKey(gpuType string) string {
switch gpuType {
case utils.NvidiaCardType:
return constants.NvidiaGPU
case utils.GB10ChipType:
return constants.NvidiaGB10GPU
return constants.NvidiaGPU
case utils.AmdApuCardType:
return constants.AMDAPU
return constants.AMDGPU
case utils.AmdGpuCardType:
return constants.AMDGPU
case utils.StrixHaloChipType:
return constants.AMDGPU
case utils.CPUType:
klog.Info("CPU type is selected, no GPU resource will be injected")
return ""
default:
return ""
}

View File

@@ -340,8 +340,8 @@ func GetClusterResource(kubeConfig *rest.Config, token string) (*prometheus.Clus
arches.Insert(n.Labels["kubernetes.io/arch"])
if quantity, ok := n.Status.Capacity[constants.NvidiaGPU]; ok {
total += quantity.AsApproximateFloat64()
} else if quantity, ok = n.Status.Capacity[constants.NvidiaGB10GPU]; ok {
total += quantity.AsApproximateFloat64()
// } else if quantity, ok = n.Status.Capacity[constants.NvidiaGB10GPU]; ok {
// total += quantity.AsApproximateFloat64()
} else if quantity, ok = n.Status.Capacity[constants.AMDAPU]; ok {
total += quantity.AsApproximateFloat64()
}

View File

@@ -31,6 +31,7 @@ import (
corev1 "k8s.io/api/core/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/fields"
"k8s.io/apimachinery/pkg/runtime/schema"
"k8s.io/apimachinery/pkg/types"
"k8s.io/client-go/dynamic"
@@ -587,7 +588,7 @@ func (h *HelmOps) isStartUp() (bool, error) {
}
klog.Infof("podSErvers: %v", podNames)
serverStarted, err := checkIfStartup(serverPods, true)
serverStarted, err := h.checkIfStartup(serverPods, true)
if err != nil {
klog.Errorf("v2 app %s server pods not ready: %v", h.app.AppName, err)
return false, err
@@ -606,7 +607,7 @@ func (h *HelmOps) isStartUp() (bool, error) {
return false, err
}
clientStarted, err := checkIfStartup(clientPods, false)
clientStarted, err := h.checkIfStartup(clientPods, false)
if err != nil {
return false, err
}
@@ -669,7 +670,7 @@ func (h *HelmOps) findServerPods() ([]corev1.Pod, error) {
return pods, nil
}
func checkIfStartup(pods []corev1.Pod, isServerSide bool) (bool, error) {
func (h *HelmOps) checkIfStartup(pods []corev1.Pod, isServerSide bool) (bool, error) {
if len(pods) == 0 {
return false, errors.New("no pod found")
}
@@ -678,6 +679,16 @@ func checkIfStartup(pods []corev1.Pod, isServerSide bool) (bool, error) {
for _, pod := range pods {
creationTime := pod.GetCreationTimestamp()
pendingDuration := time.Since(creationTime.Time)
pendingKind, err := h.getPendingKind(&pod)
if err != nil {
return false, err
}
if pendingKind == "hami-scheduler" {
if isServerSide {
return false, errcode.ErrServerSidePodPending
}
return false, errcode.ErrPodPending
}
if pod.Status.Phase == corev1.PodPending && pendingDuration > time.Minute*10 {
if isServerSide {
@@ -703,6 +714,28 @@ func checkIfStartup(pods []corev1.Pod, isServerSide bool) (bool, error) {
return false, nil
}
func (h *HelmOps) getPendingKind(pod *corev1.Pod) (string, error) {
fieldSelector := fields.OneTermEqualSelector("involvedObject.name", pod.Name).String()
events, err := h.client.KubeClient.Kubernetes().CoreV1().Events(pod.Namespace).List(h.ctx, metav1.ListOptions{
FieldSelector: fieldSelector,
})
if err != nil {
return "", err
}
eventFrom := ""
for _, event := range events.Items {
if event.Reason == "FailedScheduling" {
if event.ReportingController != "" {
eventFrom = event.ReportingController
} else {
eventFrom = event.Source.Component
}
break
}
}
return eventFrom, nil
}
type applicationSettingsSubPolicy struct {
URI string `json:"uri"`
Policy string `json:"policy"`

View File

@@ -84,9 +84,11 @@ const (
EnvGPUType = "GPU_TYPE"
// gpu resource keys
NvidiaGPU = "nvidia.com/gpu"
NvidiaGB10GPU = "nvidia.com/gb10"
AMDAPU = "amd.com/apu"
NvidiaGPU = "nvidia.com/gpu"
NvidiaGPUMem = "nvidia.com/gpumem"
// NvidiaGB10GPU = "nvidia.com/gb10"
AMDAPU = "amd.com/apu"
AMDGPU = "amd.com/gpu"
AuthorizationLevelOfPublic = "public"
AuthorizationLevelOfPrivate = "private"

View File

@@ -160,7 +160,7 @@ func (imc *ImageManagerClient) PollDownloadProgress(ctx context.Context, am *app
if t == "" {
imageSize := maxImageSize
info := findImageSize(imageList, ref.Name)
if info != nil {
if info != nil && info.Size != 0 {
//klog.Infof("get image:%s size:%d", ref.Name, info.Size)
imageSize = info.Size
}

View File

@@ -236,7 +236,8 @@ func CheckAppRequirement(token string, appConfig *appcfg.ApplicationConfig, op v
}
// only support nvidia gpu managment by HAMi for now
if appConfig.Requirement.GPU != nil && appConfig.GetSelectedGpuTypeValue() == utils.NvidiaCardType {
if appConfig.Requirement.GPU != nil &&
(appConfig.GetSelectedGpuTypeValue() == utils.NvidiaCardType || appConfig.GetSelectedGpuTypeValue() == utils.GB10ChipType) {
if !appConfig.Requirement.GPU.IsZero() && metrics.GPU.Total <= 0 {
return constants.GPU, constants.SystemGPUNotAvailable, fmt.Errorf(constants.SystemGPUNotAvailableMessage, op)
@@ -400,9 +401,9 @@ func GetClusterResource(token string) (*prometheus.ClusterMetrics, []string, err
arches.Insert(n.Labels["kubernetes.io/arch"])
if quantity, ok := n.Status.Capacity[constants.NvidiaGPU]; ok {
total += quantity.AsApproximateFloat64()
} else if quantity, ok = n.Status.Capacity[constants.NvidiaGB10GPU]; ok {
total += quantity.AsApproximateFloat64()
} else if quantity, ok = n.Status.Capacity[constants.AMDAPU]; ok {
// } else if quantity, ok = n.Status.Capacity[constants.NvidiaGB10GPU]; ok {
// total += quantity.AsApproximateFloat64()
} else if quantity, ok = n.Status.Capacity[constants.AMDGPU]; ok {
total += quantity.AsApproximateFloat64()
}
}

View File

@@ -5,8 +5,10 @@ const (
)
const (
NvidiaCardType = "nvidia" // handling by HAMi
AmdGpuCardType = "amd-gpu" //
AmdApuCardType = "amd-apu" // AMD APU with integrated GPU , AI Max 395 etc.
GB10ChipType = "nvidia-gb10" // NVIDIA GB10 Superchip & unified system memory
CPUType = "cpu" // force to use CPU, no GPU
NvidiaCardType = "nvidia" // handling by HAMi
AmdGpuCardType = "amd-gpu" //
AmdApuCardType = "amd-apu" // AMD APU with integrated GPU , AI Max 395 etc.
GB10ChipType = "nvidia-gb10" // NVIDIA GB10 Superchip & unified system memory
StrixHaloChipType = "strix-halo" // AMD Strix Halo GPU & unified system memory
)

View File

@@ -543,21 +543,23 @@ type EnvKeyValue struct {
}
// CreatePatchForDeployment add gpu env for deployment and returns patch bytes.
func CreatePatchForDeployment(tpl *corev1.PodTemplateSpec, typeKey string, envKeyValues []EnvKeyValue) ([]byte, error) {
patches, err := addResourceLimits(tpl, typeKey, envKeyValues)
func CreatePatchForDeployment(tpl *corev1.PodTemplateSpec, injectAll bool, injectContainer []string, gpuTypeKey string, gpumem *string, envKeyValues []EnvKeyValue) ([]byte, error) {
patches, err := addGpuResourceLimits(tpl, injectAll, injectContainer, gpuTypeKey, gpumem)
if err != nil {
return []byte{}, err
}
patches = append(patches, addEnvToPatch(tpl, envKeyValues)...)
return json.Marshal(patches)
}
func addResourceLimits(tpl *corev1.PodTemplateSpec, typeKey string, envKeyValues []EnvKeyValue) (patch []patchOp, err error) {
func addGpuResourceLimits(tpl *corev1.PodTemplateSpec, injectAll bool, injectContainer []string, typeKey string, gpumem *string) (patch []patchOp, err error) {
if typeKey == "" {
klog.Warning("No gpu type selected, skip adding resource limits")
return patch, nil
}
if typeKey == constants.NvidiaGPU || typeKey == constants.NvidiaGB10GPU {
// add runtime class for nvidia gpu, HAMi runtime class is "nvidia"
if typeKey == constants.NvidiaGPU {
if tpl.Spec.RuntimeClassName != nil {
patch = append(patch, patchOp{
Op: constants.PatchOpReplace,
@@ -575,21 +577,31 @@ func addResourceLimits(tpl *corev1.PodTemplateSpec, typeKey string, envKeyValues
for i := range tpl.Spec.Containers {
container := tpl.Spec.Containers[i]
if !injectAll && !funk.Contains(injectContainer, container.Name) {
continue
}
if len(container.Resources.Limits) == 0 {
limitsValues := map[string]interface{}{
typeKey: "1",
}
if gpumem != nil && *gpumem != "" && typeKey == constants.NvidiaGPU {
limitsValues[constants.NvidiaGPUMem] = *gpumem
}
patch = append(patch, patchOp{
Op: constants.PatchOpAdd,
Path: fmt.Sprintf(resourcePath, i),
Value: map[string]interface{}{
typeKey: "1",
},
Op: constants.PatchOpAdd,
Path: fmt.Sprintf(resourcePath, i),
Value: limitsValues,
})
} else {
t := make(map[string]map[string]string)
t["limits"] = map[string]string{}
for k, v := range container.Resources.Limits {
if k.String() == constants.NvidiaGPU ||
k.String() == constants.NvidiaGB10GPU ||
k.String() == constants.NvidiaGPUMem ||
k.String() == constants.AMDAPU {
// unset all previous gpu limits
continue
@@ -597,12 +609,24 @@ func addResourceLimits(tpl *corev1.PodTemplateSpec, typeKey string, envKeyValues
t["limits"][k.String()] = v.String()
}
t["limits"][typeKey] = "1"
if gpumem != nil && *gpumem != "" && typeKey == constants.NvidiaGPU {
t["limits"][constants.NvidiaGPUMem] = *gpumem
}
patch = append(patch, patchOp{
Op: constants.PatchOpReplace,
Path: fmt.Sprintf(resourcePath, i),
Value: t["limits"],
})
}
}
return patch, nil
}
func addEnvToPatch(tpl *corev1.PodTemplateSpec, envKeyValues []EnvKeyValue) (patch []patchOp) {
for i := range tpl.Spec.Containers {
container := tpl.Spec.Containers[i]
envNames := make([]string, 0)
if len(container.Env) == 0 {
value := make([]map[string]string, 0)
@@ -643,7 +667,7 @@ func addResourceLimits(tpl *corev1.PodTemplateSpec, typeKey string, envKeyValues
}
return patch, nil
return patch
}
func genPatchesForEnv(op string, containerIdx, envIdx int, name, value string) (patch []patchOp) {