Compare commits

...

16 Commits

Author SHA1 Message Date
eball
f6435fdffb fix: revert unchanged file 2026-02-27 21:02:21 +08:00
eball
695abe7397 Revert "fix: revert unchanged file"
This reverts commit 5f48862758.
2026-02-27 21:00:37 +08:00
eball
5f48862758 fix: revert unchanged file 2026-02-27 20:59:01 +08:00
eball
2e9a703621 feat: adjust GPU memory format in deployment patching for compatibility 2026-02-27 20:50:51 +08:00
eball
a76f78f148 feat: update GPU resource patching to support selective container injection 2026-02-27 19:16:46 +08:00
eball
6528b324ad feat: enhance GPU resource management with memory limits and chip type handling 2026-02-27 13:53:49 +08:00
eball
0ab4558345 fix: handle CPU type selection in GPU resource management 2026-02-26 16:34:50 +08:00
eball
b789e8320f refactor: unify GPU resource handling and remove hardcoded values 2026-02-26 11:56:15 +08:00
hys
7048b58211 update appservice image tag to 0.5.3 2026-02-12 21:01:36 +08:00
hysyeah
40e8c87159 fix: stop app if it is hami cause unschedule no wait (#2531)
* fix: stop app if it is hami cause unschedule

* ignore param from req if size=0
2026-02-12 21:01:36 +08:00
hys
3c7440dfd3 fix: set amd apu/gpu limit key to amd.com/gpu 2026-02-12 21:01:36 +08:00
hys
360d30ce37 fix: add spec ports 2026-02-12 21:00:07 +08:00
hys
b9dc5db688 fix: check k8s request before into installing state 2026-02-12 21:00:07 +08:00
hys
1aad7ac347 fix: v2 app stop 2026-02-12 21:00:07 +08:00
hys
84e4cfdf75 fix: helm upgrade do not use atomic param and allow upgrade failed release 2026-02-12 21:00:07 +08:00
hys
7738eea929 fix: failed release upgrade 2026-02-12 21:00:07 +08:00
9 changed files with 132 additions and 34 deletions

View File

@@ -170,7 +170,7 @@ spec:
priorityClassName: "system-cluster-critical"
containers:
- name: app-service
image: beclab/app-service:0.5.2
image: beclab/app-service:0.5.3
imagePullPolicy: IfNotPresent
ports:
- containerPort: 6755

View File

@@ -32,11 +32,13 @@ import (
admissionv1 "k8s.io/api/admission/v1"
appsv1 "k8s.io/api/apps/v1"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
"k8s.io/apimachinery/pkg/types"
"k8s.io/client-go/dynamic"
"k8s.io/klog/v2"
"k8s.io/utils/ptr"
"sigs.k8s.io/controller-runtime/pkg/webhook/admission"
)
@@ -303,8 +305,23 @@ func (h *Handler) gpuLimitMutate(ctx context.Context, req *admissionv1.Admission
if gpuRequired == nil {
return resp
}
if annotations[applicationGpuInjectKey] != "true" {
var injectContainer []string
injectAll := false
if injectValue, ok := annotations[applicationGpuInjectKey]; !ok || injectValue == "false" || injectValue == "" {
return resp
} else {
if injectValue != "true" {
injectToken := strings.Split(injectValue, ",")
for _, token := range injectToken {
c := strings.TrimSpace(token)
if c != "" {
injectContainer = append(injectContainer, c)
}
}
} else {
injectAll = true
}
}
GPUType := appcfg.GetSelectedGpuTypeValue()
@@ -321,24 +338,43 @@ func (h *Handler) gpuLimitMutate(ctx context.Context, req *admissionv1.Admission
},
}
patchBytes, err := webhook.CreatePatchForDeployment(tpl, h.getGPUResourceTypeKey(GPUType), envs)
gpuRequiredValue := gpuRequired.Value() / 1024 / 1024 // HAMi gpu memory format
hamiFormatGpuRequired := resource.NewQuantity(gpuRequiredValue, resource.DecimalSI)
patchBytes, err := webhook.CreatePatchForDeployment(
tpl,
injectAll,
injectContainer,
h.getGPUResourceTypeKey(GPUType),
ptr.To(hamiFormatGpuRequired.String()),
envs,
)
if err != nil {
klog.Errorf("create patch error %v", err)
return h.sidecarWebhook.AdmissionError(req.UID, err)
}
klog.Info("patchBytes:", string(patchBytes))
h.sidecarWebhook.PatchAdmissionResponse(resp, patchBytes)
if len(patchBytes) > 0 {
h.sidecarWebhook.PatchAdmissionResponse(resp, patchBytes)
}
return resp
}
// FIXME: should not hardcode
func (h *Handler) getGPUResourceTypeKey(gpuType string) string {
switch gpuType {
case utils.NvidiaCardType:
return constants.NvidiaGPU
case utils.GB10ChipType:
return constants.NvidiaGB10GPU
return constants.NvidiaGPU
case utils.AmdApuCardType:
return constants.AMDAPU
return constants.AMDGPU
case utils.AmdGpuCardType:
return constants.AMDGPU
case utils.StrixHaloChipType:
return constants.AMDGPU
case utils.CPUType:
klog.Info("CPU type is selected, no GPU resource will be injected")
return ""
default:
return ""
}

View File

@@ -340,8 +340,8 @@ func GetClusterResource(kubeConfig *rest.Config, token string) (*prometheus.Clus
arches.Insert(n.Labels["kubernetes.io/arch"])
if quantity, ok := n.Status.Capacity[constants.NvidiaGPU]; ok {
total += quantity.AsApproximateFloat64()
} else if quantity, ok = n.Status.Capacity[constants.NvidiaGB10GPU]; ok {
total += quantity.AsApproximateFloat64()
// } else if quantity, ok = n.Status.Capacity[constants.NvidiaGB10GPU]; ok {
// total += quantity.AsApproximateFloat64()
} else if quantity, ok = n.Status.Capacity[constants.AMDAPU]; ok {
total += quantity.AsApproximateFloat64()
}

View File

@@ -31,6 +31,7 @@ import (
corev1 "k8s.io/api/core/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/fields"
"k8s.io/apimachinery/pkg/runtime/schema"
"k8s.io/apimachinery/pkg/types"
"k8s.io/client-go/dynamic"
@@ -587,7 +588,7 @@ func (h *HelmOps) isStartUp() (bool, error) {
}
klog.Infof("podSErvers: %v", podNames)
serverStarted, err := checkIfStartup(serverPods, true)
serverStarted, err := h.checkIfStartup(serverPods, true)
if err != nil {
klog.Errorf("v2 app %s server pods not ready: %v", h.app.AppName, err)
return false, err
@@ -606,7 +607,7 @@ func (h *HelmOps) isStartUp() (bool, error) {
return false, err
}
clientStarted, err := checkIfStartup(clientPods, false)
clientStarted, err := h.checkIfStartup(clientPods, false)
if err != nil {
return false, err
}
@@ -669,7 +670,7 @@ func (h *HelmOps) findServerPods() ([]corev1.Pod, error) {
return pods, nil
}
func checkIfStartup(pods []corev1.Pod, isServerSide bool) (bool, error) {
func (h *HelmOps) checkIfStartup(pods []corev1.Pod, isServerSide bool) (bool, error) {
if len(pods) == 0 {
return false, errors.New("no pod found")
}
@@ -678,6 +679,16 @@ func checkIfStartup(pods []corev1.Pod, isServerSide bool) (bool, error) {
for _, pod := range pods {
creationTime := pod.GetCreationTimestamp()
pendingDuration := time.Since(creationTime.Time)
pendingKind, err := h.getPendingKind(&pod)
if err != nil {
return false, err
}
if pendingKind == "hami-scheduler" {
if isServerSide {
return false, errcode.ErrServerSidePodPending
}
return false, errcode.ErrPodPending
}
if pod.Status.Phase == corev1.PodPending && pendingDuration > time.Minute*10 {
if isServerSide {
@@ -703,6 +714,28 @@ func checkIfStartup(pods []corev1.Pod, isServerSide bool) (bool, error) {
return false, nil
}
func (h *HelmOps) getPendingKind(pod *corev1.Pod) (string, error) {
fieldSelector := fields.OneTermEqualSelector("involvedObject.name", pod.Name).String()
events, err := h.client.KubeClient.Kubernetes().CoreV1().Events(pod.Namespace).List(h.ctx, metav1.ListOptions{
FieldSelector: fieldSelector,
})
if err != nil {
return "", err
}
eventFrom := ""
for _, event := range events.Items {
if event.Reason == "FailedScheduling" {
if event.ReportingController != "" {
eventFrom = event.ReportingController
} else {
eventFrom = event.Source.Component
}
break
}
}
return eventFrom, nil
}
type applicationSettingsSubPolicy struct {
URI string `json:"uri"`
Policy string `json:"policy"`

View File

@@ -84,9 +84,11 @@ const (
EnvGPUType = "GPU_TYPE"
// gpu resource keys
NvidiaGPU = "nvidia.com/gpu"
NvidiaGB10GPU = "nvidia.com/gb10"
AMDAPU = "amd.com/apu"
NvidiaGPU = "nvidia.com/gpu"
NvidiaGPUMem = "nvidia.com/gpumem"
// NvidiaGB10GPU = "nvidia.com/gb10"
AMDAPU = "amd.com/apu"
AMDGPU = "amd.com/gpu"
AuthorizationLevelOfPublic = "public"
AuthorizationLevelOfPrivate = "private"

View File

@@ -160,7 +160,7 @@ func (imc *ImageManagerClient) PollDownloadProgress(ctx context.Context, am *app
if t == "" {
imageSize := maxImageSize
info := findImageSize(imageList, ref.Name)
if info != nil {
if info != nil && info.Size != 0 {
//klog.Infof("get image:%s size:%d", ref.Name, info.Size)
imageSize = info.Size
}

View File

@@ -236,7 +236,8 @@ func CheckAppRequirement(token string, appConfig *appcfg.ApplicationConfig, op v
}
// only support nvidia gpu managment by HAMi for now
if appConfig.Requirement.GPU != nil && appConfig.GetSelectedGpuTypeValue() == utils.NvidiaCardType {
if appConfig.Requirement.GPU != nil &&
(appConfig.GetSelectedGpuTypeValue() == utils.NvidiaCardType || appConfig.GetSelectedGpuTypeValue() == utils.GB10ChipType) {
if !appConfig.Requirement.GPU.IsZero() && metrics.GPU.Total <= 0 {
return constants.GPU, constants.SystemGPUNotAvailable, fmt.Errorf(constants.SystemGPUNotAvailableMessage, op)
@@ -400,9 +401,9 @@ func GetClusterResource(token string) (*prometheus.ClusterMetrics, []string, err
arches.Insert(n.Labels["kubernetes.io/arch"])
if quantity, ok := n.Status.Capacity[constants.NvidiaGPU]; ok {
total += quantity.AsApproximateFloat64()
} else if quantity, ok = n.Status.Capacity[constants.NvidiaGB10GPU]; ok {
total += quantity.AsApproximateFloat64()
} else if quantity, ok = n.Status.Capacity[constants.AMDAPU]; ok {
// } else if quantity, ok = n.Status.Capacity[constants.NvidiaGB10GPU]; ok {
// total += quantity.AsApproximateFloat64()
} else if quantity, ok = n.Status.Capacity[constants.AMDGPU]; ok {
total += quantity.AsApproximateFloat64()
}
}

View File

@@ -5,8 +5,10 @@ const (
)
const (
NvidiaCardType = "nvidia" // handling by HAMi
AmdGpuCardType = "amd-gpu" //
AmdApuCardType = "amd-apu" // AMD APU with integrated GPU , AI Max 395 etc.
GB10ChipType = "nvidia-gb10" // NVIDIA GB10 Superchip & unified system memory
CPUType = "cpu" // force to use CPU, no GPU
NvidiaCardType = "nvidia" // handling by HAMi
AmdGpuCardType = "amd-gpu" //
AmdApuCardType = "amd-apu" // AMD APU with integrated GPU , AI Max 395 etc.
GB10ChipType = "nvidia-gb10" // NVIDIA GB10 Superchip & unified system memory
StrixHaloChipType = "strix-halo" // AMD Strix Halo GPU & unified system memory
)

View File

@@ -543,21 +543,23 @@ type EnvKeyValue struct {
}
// CreatePatchForDeployment add gpu env for deployment and returns patch bytes.
func CreatePatchForDeployment(tpl *corev1.PodTemplateSpec, typeKey string, envKeyValues []EnvKeyValue) ([]byte, error) {
patches, err := addResourceLimits(tpl, typeKey, envKeyValues)
func CreatePatchForDeployment(tpl *corev1.PodTemplateSpec, injectAll bool, injectContainer []string, gpuTypeKey string, gpumem *string, envKeyValues []EnvKeyValue) ([]byte, error) {
patches, err := addGpuResourceLimits(tpl, injectAll, injectContainer, gpuTypeKey, gpumem)
if err != nil {
return []byte{}, err
}
patches = append(patches, addEnvToPatch(tpl, envKeyValues)...)
return json.Marshal(patches)
}
func addResourceLimits(tpl *corev1.PodTemplateSpec, typeKey string, envKeyValues []EnvKeyValue) (patch []patchOp, err error) {
func addGpuResourceLimits(tpl *corev1.PodTemplateSpec, injectAll bool, injectContainer []string, typeKey string, gpumem *string) (patch []patchOp, err error) {
if typeKey == "" {
klog.Warning("No gpu type selected, skip adding resource limits")
return patch, nil
}
if typeKey == constants.NvidiaGPU || typeKey == constants.NvidiaGB10GPU {
// add runtime class for nvidia gpu, HAMi runtime class is "nvidia"
if typeKey == constants.NvidiaGPU {
if tpl.Spec.RuntimeClassName != nil {
patch = append(patch, patchOp{
Op: constants.PatchOpReplace,
@@ -575,21 +577,31 @@ func addResourceLimits(tpl *corev1.PodTemplateSpec, typeKey string, envKeyValues
for i := range tpl.Spec.Containers {
container := tpl.Spec.Containers[i]
if !injectAll && !funk.Contains(injectContainer, container.Name) {
continue
}
if len(container.Resources.Limits) == 0 {
limitsValues := map[string]interface{}{
typeKey: "1",
}
if gpumem != nil && *gpumem != "" && typeKey == constants.NvidiaGPU {
limitsValues[constants.NvidiaGPUMem] = *gpumem
}
patch = append(patch, patchOp{
Op: constants.PatchOpAdd,
Path: fmt.Sprintf(resourcePath, i),
Value: map[string]interface{}{
typeKey: "1",
},
Op: constants.PatchOpAdd,
Path: fmt.Sprintf(resourcePath, i),
Value: limitsValues,
})
} else {
t := make(map[string]map[string]string)
t["limits"] = map[string]string{}
for k, v := range container.Resources.Limits {
if k.String() == constants.NvidiaGPU ||
k.String() == constants.NvidiaGB10GPU ||
k.String() == constants.NvidiaGPUMem ||
k.String() == constants.AMDAPU {
// unset all previous gpu limits
continue
@@ -597,12 +609,24 @@ func addResourceLimits(tpl *corev1.PodTemplateSpec, typeKey string, envKeyValues
t["limits"][k.String()] = v.String()
}
t["limits"][typeKey] = "1"
if gpumem != nil && *gpumem != "" && typeKey == constants.NvidiaGPU {
t["limits"][constants.NvidiaGPUMem] = *gpumem
}
patch = append(patch, patchOp{
Op: constants.PatchOpReplace,
Path: fmt.Sprintf(resourcePath, i),
Value: t["limits"],
})
}
}
return patch, nil
}
func addEnvToPatch(tpl *corev1.PodTemplateSpec, envKeyValues []EnvKeyValue) (patch []patchOp) {
for i := range tpl.Spec.Containers {
container := tpl.Spec.Containers[i]
envNames := make([]string, 0)
if len(container.Env) == 0 {
value := make([]map[string]string, 0)
@@ -643,7 +667,7 @@ func addResourceLimits(tpl *corev1.PodTemplateSpec, typeKey string, envKeyValues
}
return patch, nil
return patch
}
func genPatchesForEnv(op string, containerIdx, envIdx int, name, value string) (patch []patchOp) {