Compare commits

...

11 Commits

Author SHA1 Message Date
dkeven
1cff8c5c24 feat(appservice): support updating more fields in api & controller 2026-02-03 13:31:19 +08:00
eball
e72fc4b1f3 Merge branch 'main' into module-appservice
* main:
  fix a link issue
  desktop, settings, files, vault: fix multiple known issues (#2467)
  authelia: add user regulation for TOTP authentication attempts (#2466)
  fix(cli): unify config setting for release command (#2465)
  fix(cli): set node port range in minikube to allow smb service (#2460)
  settings, user service: update wallpaper style (#2463)
  bfl: enhance user login background handling with style support (#2464)
  feat: search upgrade to v0.1.6 (#2459)
  settings: add settings new version and update provider api (#2456)
  refactor(cli): unify config of command line options and envs (#2453)
  appservice: v2 app stop (#2455)

# Conflicts:
#	framework/app-service/.olares/config/cluster/deploy/appservice_deploy.yaml
2026-02-03 11:50:14 +08:00
eball
c809ce4c58 app-service: add support for selecting GPU types in application installation 2026-02-03 11:38:05 +08:00
eball
82a5cbe08b feat: add support for selecting GPU types in application installation (#2458)
* fix: failed release upgrade

* fix: helm upgrade no longer uses the atomic param and allows upgrading a failed release

* feat: add clickhouse support

* appservice image tag to 0.4.76

* feat: add icon field to nats event

* chores: get all node gpu types

* feat: add support for selecting GPU types in application installation

* feat: enhance GPU type selection logic in application installation

* feat: replace hardcoded GPU type with constant for supported GPU selection

* feat: update app config methods to include selected GPU type and enhance validation for NVIDIA GPUs

* feat: update supported GPU handling to include default options and improve validation logic

* feat: update GPU resource handling to unset previous limits before setting new ones

* feat: refactor permission parsing to use exported function and update related calls

---------

Co-authored-by: hys <hysyeah@gmail.com>
2026-02-03 11:19:24 +08:00
hys
a88cedb0ce set appservice image tag to 0.4.77 2026-01-28 20:27:10 +08:00
hys
06d0d36042 fix: add spec ports 2026-01-28 20:27:10 +08:00
hys
67deaf16ea fix: check k8s request before entering installing state 2026-01-28 20:27:10 +08:00
hys
b27854b863 fix: v2 app stop 2026-01-28 20:27:10 +08:00
hys
4fd22c4e20 feat: add icon field to nats event 2026-01-28 20:27:10 +08:00
hys
031d8164ff fix: helm upgrade no longer uses the atomic param and allows upgrading a failed release 2026-01-28 20:27:10 +08:00
hys
0c6def8f43 fix: failed release upgrade 2026-01-28 20:27:10 +08:00
25 changed files with 445 additions and 403 deletions

View File

@@ -170,7 +170,7 @@ spec:
priorityClassName: "system-cluster-critical"
containers:
- name: app-service
image: beclab/app-service:0.4.77
image: beclab/app-service:0.4.78
imagePullPolicy: IfNotPresent
ports:
- containerPort: 6755

View File

@@ -40,7 +40,7 @@ type UserEnvSyncController struct {
//+kubebuilder:rbac:groups="",resources=configmaps,verbs=get;list;watch
//+kubebuilder:rbac:groups=iam.kubesphere.io,resources=users,verbs=get;list;watch
//+kubebuilder:rbac:groups=sys.bytetrade.io,resources=userenvs,verbs=get;list;watch;create
//+kubebuilder:rbac:groups=sys.bytetrade.io,resources=userenvs,verbs=get;list;watch;create;patch;update
func (r *UserEnvSyncController) SetupWithManager(mgr ctrl.Manager) error {
cmPred := predicate.NewPredicateFuncs(func(obj client.Object) bool {
@@ -164,14 +164,63 @@ func (r *UserEnvSyncController) syncUserEnvForUser(ctx context.Context, username
return 0, fmt.Errorf("list userenvs in %s failed: %w", userNs, err)
}
existSet := make(map[string]struct{}, len(existing.Items))
existByName := make(map[string]*sysv1alpha1.UserEnv, len(existing.Items))
for i := range existing.Items {
existSet[existing.Items[i].EnvName] = struct{}{}
existByName[existing.Items[i].EnvName] = &existing.Items[i]
}
created := 0
for _, spec := range base {
if _, ok := existSet[spec.EnvName]; ok {
if ue, ok := existByName[spec.EnvName]; ok {
original := ue.DeepCopy()
updated := false
if ue.Default == "" && spec.Default != "" {
ue.Default = spec.Default
updated = true
}
if ue.Type == "" && spec.Type != "" {
ue.Type = spec.Type
updated = true
}
if ue.Title == "" && spec.Title != "" {
ue.Title = spec.Title
updated = true
}
if ue.Description == "" && spec.Description != "" {
ue.Description = spec.Description
updated = true
}
if ue.RemoteOptions == "" && spec.RemoteOptions != "" {
ue.RemoteOptions = spec.RemoteOptions
updated = true
}
if ue.Regex == "" && spec.Regex != "" {
ue.Regex = spec.Regex
updated = true
}
if len(spec.Options) > 0 {
existOpt := make(map[string]struct{}, len(ue.Options))
for _, it := range ue.Options {
existOpt[it.Value] = struct{}{}
}
for _, it := range spec.Options {
if _, exists := existOpt[it.Value]; exists {
continue
}
ue.Options = append(ue.Options, it)
existOpt[it.Value] = struct{}{}
updated = true
}
}
if updated {
if err := r.Patch(ctx, ue, client.MergeFrom(original)); err != nil {
return created, fmt.Errorf("patch userenv %s/%s failed: %w", ue.Namespace, ue.Name, err)
}
klog.Infof("UserEnvSync: patched userenv %s/%s for user %s", ue.Namespace, ue.Name, username)
}
continue
}
name, err := apputils.EnvNameToResourceName(spec.EnvName)

View File

@@ -126,15 +126,16 @@ type UpgradeRequest struct {
// InstallRequest represents a request to install an application.
type InstallRequest struct {
Dev bool `json:"devMode"`
RepoURL string `json:"repoUrl"`
CfgURL string `json:"cfgUrl"`
Source AppSource `json:"source"`
Images []Image `json:"images"`
Envs []sysv1alpha1.AppEnvVar `json:"envs"`
RawAppName string `json:"rawAppName"`
Title string `json:"title"`
Entrances []EntranceClone `json:"entrances"`
Dev bool `json:"devMode"`
RepoURL string `json:"repoUrl"`
CfgURL string `json:"cfgUrl"`
Source AppSource `json:"source"`
Images []Image `json:"images"`
Envs []sysv1alpha1.AppEnvVar `json:"envs"`
RawAppName string `json:"rawAppName"`
Title string `json:"title"`
Entrances []EntranceClone `json:"entrances"`
SelectedGpuType string `json:"selectedGpuType"`
}
type Image struct {

View File

@@ -3,11 +3,14 @@ package apiserver
import (
"context"
"encoding/json"
"fmt"
"os"
"sort"
"strconv"
"strings"
"golang.org/x/exp/maps"
"github.com/beclab/Olares/framework/app-service/api/app.bytetrade.io/v1alpha1"
"github.com/beclab/Olares/framework/app-service/pkg/apiserver/api"
"github.com/beclab/Olares/framework/app-service/pkg/appcfg"
@@ -946,12 +949,37 @@ func (h *Handler) oamValues(req *restful.Request, resp *restful.Response) {
api.HandleError(resp, req, err)
return
}
gpuType, err := utils.FindGpuTypeFromNodes(&nodes)
gpuTypes, err := utils.GetAllGpuTypesFromNodes(&nodes)
if err != nil {
klog.Errorf("get gpu type failed %v", gpuType)
klog.Errorf("get gpu type failed %v", err)
api.HandleError(resp, req, err)
return
}
gpuType := "none"
selectedGpuType := req.QueryParameter("gputype")
if len(gpuTypes) > 0 {
if selectedGpuType != "" {
if _, ok := gpuTypes[selectedGpuType]; ok {
gpuType = selectedGpuType
} else {
err := fmt.Errorf("selected gpu type %s not found in cluster", selectedGpuType)
klog.Error(err)
api.HandleError(resp, req, err)
return
}
} else {
if len(gpuTypes) == 1 {
gpuType = maps.Keys(gpuTypes)[0]
} else {
err := fmt.Errorf("multiple gpu types found in cluster, please specify one")
klog.Error(err)
api.HandleError(resp, req, err)
return
}
}
}
values["GPU"] = map[string]interface{}{
"Type": gpuType,
"Cuda": os.Getenv("OLARES_SYSTEM_CUDA_VERSION"),

View File

@@ -6,10 +6,12 @@ import (
"io"
"net/http"
"net/url"
"sync"
"time"
sysv1alpha1 "github.com/beclab/Olares/framework/app-service/api/sys.bytetrade.io/v1alpha1"
"github.com/beclab/Olares/framework/app-service/pkg/apiserver/api"
"github.com/beclab/Olares/framework/app-service/pkg/constants"
"github.com/beclab/Olares/framework/app-service/pkg/utils"
apputils "github.com/beclab/Olares/framework/app-service/pkg/utils/app"
"github.com/emicklei/go-restful/v3"
@@ -76,32 +78,74 @@ func (h *Handler) updateAppEnv(req *restful.Request, resp *restful.Response) {
return
}
var refEnvOnce sync.Once
var listErr error
refEnvs := make(map[string]string)
updated := false
original := targetAppEnv.DeepCopy()
for i, existingEnv := range targetAppEnv.Envs {
for _, env := range updatedEnvs {
if existingEnv.EnvName == env.EnvName {
if !existingEnv.Editable {
api.HandleBadRequest(resp, req, fmt.Errorf("app env '%s' is not editable", env.EnvName))
return
}
if existingEnv.Required && existingEnv.Default == "" && env.Value == "" {
api.HandleBadRequest(resp, req, fmt.Errorf("app env '%s' is required", env.EnvName))
return
}
if existingEnv.Value != env.Value {
if err := existingEnv.ValidateValue(env.Value); err != nil {
api.HandleBadRequest(resp, req, fmt.Errorf("failed to update app env '%s': %v", env.EnvName, err))
if existingEnv.EnvName != env.EnvName {
continue
}
if !existingEnv.Editable {
api.HandleBadRequest(resp, req, fmt.Errorf("app env '%s' is not editable", env.EnvName))
return
}
if existingEnv.Required && existingEnv.Default == "" && env.Value == "" && (env.ValueFrom == nil || env.ValueFrom.EnvName == "") {
api.HandleBadRequest(resp, req, fmt.Errorf("app env '%s' is required", env.EnvName))
return
}
if env.ValueFrom != nil && env.ValueFrom.EnvName != "" && (existingEnv.ValueFrom == nil || existingEnv.ValueFrom.EnvName != env.ValueFrom.EnvName) {
refEnvOnce.Do(func() {
sysenvs := new(sysv1alpha1.SystemEnvList)
listErr = h.ctrlClient.List(req.Request.Context(), sysenvs)
if listErr != nil {
return
}
targetAppEnv.Envs[i].Value = env.Value
updated = true
if existingEnv.ApplyOnChange {
targetAppEnv.NeedApply = true
userenvs := new(sysv1alpha1.UserEnvList)
listErr = h.ctrlClient.List(req.Request.Context(), userenvs, client.InNamespace(utils.UserspaceName(owner)))
for _, sysenv := range sysenvs.Items {
refEnvs[sysenv.EnvName] = sysenv.GetEffectiveValue()
}
for _, userenv := range userenvs.Items {
refEnvs[userenv.EnvName] = userenv.GetEffectiveValue()
}
})
if listErr != nil {
api.HandleInternalError(resp, req, fmt.Errorf("failed to list referenced envs: %s", listErr))
return
}
break
value, ok := refEnvs[env.ValueFrom.EnvName]
if !ok {
api.HandleBadRequest(resp, req, fmt.Errorf("app env '%s' references unknown env '%s'", env.EnvName, env.ValueFrom.EnvName))
return
}
if existingEnv.Required && value == "" {
api.HandleBadRequest(resp, req, fmt.Errorf("required app env '%s' references empty env '%s'", env.EnvName, env.ValueFrom.EnvName))
return
}
if existingEnv.ValidateValue(value) != nil {
api.HandleBadRequest(resp, req, fmt.Errorf("app env '%s' references invalid value '%s' from '%s': %v", env.EnvName, value, env.ValueFrom.EnvName, err))
return
}
targetAppEnv.Envs[i].ValueFrom = env.ValueFrom
targetAppEnv.Envs[i].Value = value
targetAppEnv.Envs[i].ValueFrom.Status = constants.EnvRefStatusSynced
updated = true
} else if existingEnv.Value != env.Value {
if err := existingEnv.ValidateValue(env.Value); err != nil {
api.HandleBadRequest(resp, req, fmt.Errorf("failed to update app env '%s': %v", env.EnvName, err))
return
}
targetAppEnv.Envs[i].Value = env.Value
updated = true
}
if updated && existingEnv.ApplyOnChange {
targetAppEnv.NeedApply = true
}
break
}
}

View File

@@ -1,174 +1,33 @@
package apiserver
import (
"fmt"
"sync"
"time"
"github.com/beclab/Olares/framework/app-service/pkg/apiserver/api"
"github.com/beclab/Olares/framework/app-service/pkg/client/clientset"
"github.com/beclab/Olares/framework/app-service/pkg/constants"
"github.com/beclab/Olares/framework/app-service/pkg/utils"
"golang.org/x/exp/maps"
"sigs.k8s.io/controller-runtime/pkg/client"
"github.com/emicklei/go-restful/v3"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/client-go/util/retry"
"k8s.io/klog/v2"
)
var running bool = false
var switchLock sync.Mutex
func (h *Handler) disableGpuManagedMemory(req *restful.Request, resp *restful.Response) {
if err := h.nvshareSwitch(req, false); err != nil {
api.HandleError(resp, req, &errors.StatusError{
ErrStatus: metav1.Status{Code: 400, Message: "operation failed, " + err.Error()},
})
func (h *Handler) getGpuTypes(req *restful.Request, resp *restful.Response) {
var nodes corev1.NodeList
err := h.ctrlClient.List(req.Request.Context(), &nodes, &client.ListOptions{})
if err != nil {
klog.Errorf("list node failed %v", err)
api.HandleError(resp, req, err)
return
}
resp.WriteAsJson(map[string]int{"code": 0})
}
func (h *Handler) enableGpuManagedMemory(req *restful.Request, resp *restful.Response) {
if err := h.nvshareSwitch(req, true); err != nil {
api.HandleError(resp, req, &errors.StatusError{
ErrStatus: metav1.Status{Code: 400, Message: "operation failed, " + err.Error()},
})
gpuTypes, err := utils.GetAllGpuTypesFromNodes(&nodes)
if err != nil {
klog.Errorf("get gpu type failed %v", err)
api.HandleError(resp, req, err)
return
}
resp.WriteAsJson(map[string]int{"code": 0})
}
func (h *Handler) nvshareSwitch(req *restful.Request, enable bool) error {
client := req.Attribute(constants.KubeSphereClientAttribute).(*clientset.ClientSet)
switchLock.Lock()
defer switchLock.Unlock()
if running {
return fmt.Errorf("last operation is still running")
}
deployments, err := client.KubeClient.Kubernetes().AppsV1().Deployments("").List(req.Request.Context(), metav1.ListOptions{})
if err != nil {
klog.Error("list deployment error, ", err)
return err
}
envValue := "0"
if enable {
envValue = "1"
}
for _, d := range deployments.Items {
shouldUpdate := false
for i, c := range d.Spec.Template.Spec.Containers {
found := false
for k := range c.Resources.Limits {
if k == constants.NvshareGPU {
found = true
break
}
}
if found {
// a gpu request container
addEnv := true
for n, env := range d.Spec.Template.Spec.Containers[i].Env {
if env.Name == constants.EnvNvshareManagedMemory {
addEnv = false
d.Spec.Template.Spec.Containers[i].Env[n].Value = envValue
break
}
}
if addEnv {
d.Spec.Template.Spec.Containers[i].Env =
append(d.Spec.Template.Spec.Containers[i].Env,
corev1.EnvVar{Name: constants.EnvNvshareManagedMemory, Value: envValue})
}
shouldUpdate = true
} // end found
} // end of container loop
if shouldUpdate {
err = retry.RetryOnConflict(retry.DefaultRetry, func() error {
deployment, err := client.KubeClient.Kubernetes().AppsV1().Deployments(d.Namespace).
Get(req.Request.Context(), d.Name, metav1.GetOptions{})
if err != nil {
return err
}
deployment.Spec.Template.Spec.Containers = d.Spec.Template.Spec.Containers
_, err = client.KubeClient.Kubernetes().AppsV1().Deployments(d.Namespace).
Update(req.Request.Context(), deployment, metav1.UpdateOptions{})
return err
})
if err != nil {
klog.Error("update deployment error, ", err, ", ", d.Name, ", ", d.Namespace)
return err
}
} // should update
} // end of deployment loop
// update terminus
err = retry.RetryOnConflict(retry.DefaultRetry, func() error {
terminus, err := utils.GetTerminus(req.Request.Context(), h.ctrlClient)
if err != nil {
return err
}
terminus.Spec.Settings[constants.EnvNvshareManagedMemory] = envValue
return h.ctrlClient.Update(req.Request.Context(), terminus)
})
if err != nil {
klog.Error("update terminus error, ", err)
return err
}
running = true
// delay 30s, assume the all pods will be reload in 30s.
delay := time.NewTimer(30 * time.Second)
go func() {
<-delay.C
switchLock.Lock()
defer switchLock.Unlock()
running = false
}()
return nil
}
func (h *Handler) getManagedMemoryValue(req *restful.Request, resp *restful.Response) {
terminus, err := utils.GetTerminus(req.Request.Context(), h.ctrlClient)
if err != nil {
klog.Error("get terminus value error, ", err)
api.HandleError(resp, req, &errors.StatusError{
ErrStatus: metav1.Status{Code: 400, Message: "get value error, " + err.Error()},
})
return
}
managed := true
if v, ok := terminus.Spec.Settings[constants.EnvNvshareManagedMemory]; ok && v == "0" {
managed = false
}
resp.WriteAsJson(&map[string]interface{}{
"managed_memory": managed,
"gpu_types": maps.Keys(gpuTypes),
},
)
}

View File

@@ -21,9 +21,12 @@ import (
"github.com/beclab/Olares/framework/app-service/pkg/utils"
apputils "github.com/beclab/Olares/framework/app-service/pkg/utils/app"
"github.com/beclab/Olares/framework/app-service/pkg/utils/config"
"golang.org/x/exp/maps"
"sigs.k8s.io/controller-runtime/pkg/client"
"github.com/emicklei/go-restful/v3"
"helm.sh/helm/v3/pkg/time"
corev1 "k8s.io/api/core/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
@@ -37,7 +40,7 @@ type depRequest struct {
type installHelperIntf interface {
getAdminUsers() (admin []string, isAdmin bool, err error)
getInstalledApps() (installed bool, app []*v1alpha1.Application, err error)
getAppConfig(adminUsers []string, marketSource string, isAdmin, appInstalled bool, installedApps []*v1alpha1.Application, chartVersion string) (err error)
getAppConfig(adminUsers []string, marketSource string, isAdmin, appInstalled bool, installedApps []*v1alpha1.Application, chartVersion, selectedGpuType string) (err error)
setAppConfig(req *api.InstallRequest, appName string)
validate(bool, []*v1alpha1.Application) error
setAppEnv(overrides []sysv1alpha1.AppEnvVar) error
@@ -105,6 +108,36 @@ func (h *Handler) install(req *restful.Request, resp *restful.Response) {
}
}
// check selected gpu type can be supported
// if selectedGpuType != "" , then check if the gpu type exists in cluster
// if selectedGpuType == "" , and only one gpu type exists in cluster, then use it
var nodes corev1.NodeList
err = h.ctrlClient.List(req.Request.Context(), &nodes, &client.ListOptions{})
if err != nil {
klog.Errorf("list node failed %v", err)
api.HandleError(resp, req, err)
return
}
gpuTypes, err := utils.GetAllGpuTypesFromNodes(&nodes)
if err != nil {
klog.Errorf("get gpu type failed %v", err)
api.HandleError(resp, req, err)
return
}
if insReq.SelectedGpuType != "" {
if _, ok := gpuTypes[insReq.SelectedGpuType]; !ok {
klog.Errorf("selected gpu type %s not found in cluster", insReq.SelectedGpuType)
api.HandleBadRequest(resp, req, fmt.Errorf("selected gpu type %s not found in cluster", insReq.SelectedGpuType))
return
}
} else {
if len(gpuTypes) == 1 {
insReq.SelectedGpuType = maps.Keys(gpuTypes)[0]
klog.Infof("only one gpu type %s found in cluster, use it as selected gpu type", insReq.SelectedGpuType)
}
}
apiVersion, appCfg, err := apputils.GetApiVersionFromAppConfig(req.Request.Context(), &apputils.ConfigOptions{
App: app,
RawAppName: rawAppName,
@@ -112,6 +145,7 @@ func (h *Handler) install(req *restful.Request, resp *restful.Response) {
RepoURL: insReq.RepoURL,
MarketSource: marketSource,
Version: chartVersion,
SelectedGpu: insReq.SelectedGpuType,
})
klog.Infof("chartVersion: %s", chartVersion)
if err != nil {
@@ -188,7 +222,7 @@ func (h *Handler) install(req *restful.Request, resp *restful.Response) {
return
}
err = helper.getAppConfig(adminUsers, marketSource, isAdmin, appInstalled, installedApps, chartVersion)
err = helper.getAppConfig(adminUsers, marketSource, isAdmin, appInstalled, installedApps, chartVersion, insReq.SelectedGpuType)
if err != nil {
klog.Errorf("Failed to get app config err=%v", err)
return
@@ -423,7 +457,7 @@ func (h *installHandlerHelper) getInstalledApps() (installed bool, app []*v1alph
return
}
func (h *installHandlerHelper) getAppConfig(adminUsers []string, marketSource string, isAdmin, appInstalled bool, installedApps []*v1alpha1.Application, chartVersion string) (err error) {
func (h *installHandlerHelper) getAppConfig(adminUsers []string, marketSource string, isAdmin, appInstalled bool, installedApps []*v1alpha1.Application, chartVersion, selectedGpuType string) (err error) {
var (
admin string
installAsAdmin bool
@@ -472,6 +506,7 @@ func (h *installHandlerHelper) getAppConfig(adminUsers []string, marketSource st
Admin: admin,
IsAdmin: installAsAdmin,
MarketSource: marketSource,
SelectedGpu: selectedGpuType,
})
if err != nil {
klog.Errorf("Failed to get appconfig err=%v", err)
@@ -685,7 +720,7 @@ func (h *installHandlerHelperV2) _validateClusterScope(isAdmin bool, installedAp
return nil
}
func (h *installHandlerHelperV2) getAppConfig(adminUsers []string, marketSource string, isAdmin, appInstalled bool, installedApps []*v1alpha1.Application, chartVersion string) (err error) {
func (h *installHandlerHelperV2) getAppConfig(adminUsers []string, marketSource string, isAdmin, appInstalled bool, installedApps []*v1alpha1.Application, chartVersion, selectedGpuType string) (err error) {
klog.Info("get app config for install handler v2")
var (
@@ -713,6 +748,7 @@ func (h *installHandlerHelperV2) getAppConfig(adminUsers []string, marketSource
Admin: admin,
MarketSource: marketSource,
IsAdmin: isAdmin,
SelectedGpu: selectedGpuType,
})
if err != nil {
klog.Errorf("Failed to get appconfig err=%v", err)

View File

@@ -13,6 +13,7 @@ import (
"github.com/beclab/Olares/framework/app-service/api/app.bytetrade.io/v1alpha1"
"github.com/beclab/Olares/framework/app-service/pkg/apiserver/api"
"github.com/beclab/Olares/framework/app-service/pkg/appcfg"
"github.com/beclab/Olares/framework/app-service/pkg/appinstaller"
"github.com/beclab/Olares/framework/app-service/pkg/appstate"
"github.com/beclab/Olares/framework/app-service/pkg/client/clientset"
"github.com/beclab/Olares/framework/app-service/pkg/constants"
@@ -520,6 +521,7 @@ type applicationPermission struct {
Permissions []permission `json:"permissions"`
}
// Deprecated
func (h *Handler) applicationPermissionList(req *restful.Request, resp *restful.Response) {
owner := req.Attribute(constants.UserContextAttribute).(string)
//token := req.HeaderParameter(constants.AuthorizationTokenKey)
@@ -572,46 +574,39 @@ func (h *Handler) applicationPermissionList(req *restful.Request, resp *restful.
func (h *Handler) getApplicationPermission(req *restful.Request, resp *restful.Response) {
app := req.PathParameter(ParamAppName)
owner := req.Attribute(constants.UserContextAttribute).(string)
client, err := dynamic.NewForConfig(h.kubeConfig)
name, err := apputils.FmtAppMgrName(app, owner, "")
if err != nil {
api.HandleError(resp, req, err)
return
}
var am v1alpha1.ApplicationManager
err = h.ctrlClient.Get(req.Request.Context(), types.NamespacedName{Name: name}, &am)
if err != nil {
api.HandleError(resp, req, err)
return
}
var appConfig appcfg.ApplicationConfig
err = am.GetAppConfig(&appConfig)
if err != nil {
klog.Errorf("Failed to get app config err=%v", err)
api.HandleError(resp, req, err)
return
}
var ret *applicationPermission
apClient := provider.NewApplicationPermissionRequest(client)
namespace := fmt.Sprintf("user-system-%s", owner)
aps, err := apClient.List(req.Request.Context(), namespace, metav1.ListOptions{})
if err != nil {
api.HandleError(resp, req, err)
return
}
for _, ap := range aps.Items {
if ap.Object == nil {
continue
}
appName, _, _ := unstructured.NestedString(ap.Object, "spec", "app")
if appName == app {
perms, _, _ := unstructured.NestedSlice(ap.Object, "spec", "permissions")
permissions := appinstaller.ParseAppPermission(appConfig.Permission)
for _, ap := range permissions {
if perms, ok := ap.([]appcfg.ProviderPermission); ok {
permissions := make([]permission, 0)
for _, p := range perms {
if perm, ok := p.(map[string]interface{}); ok {
ops := make([]string, 0)
for _, op := range perm["ops"].([]interface{}) {
if opStr, ok := op.(string); ok {
ops = append(ops, opStr)
}
}
permissions = append(permissions, permission{
DataType: perm["dataType"].(string),
Group: perm["group"].(string),
Version: perm["version"].(string),
Ops: ops,
})
}
permissions = append(permissions, permission{
DataType: p.ProviderName,
Group: p.AppName,
})
}
ret = &applicationPermission{
App: appName,
App: am.Spec.AppName,
Owner: owner,
Permissions: permissions,
}
@@ -642,6 +637,7 @@ type opApi struct {
URI string `json:"uri"`
}
// Deprecated
func (h *Handler) getProviderRegistry(req *restful.Request, resp *restful.Response) {
dataTypeReq := req.PathParameter(ParamDataType)
groupReq := req.PathParameter(ParamGroup)
@@ -708,56 +704,44 @@ func (h *Handler) getProviderRegistry(req *restful.Request, resp *restful.Respon
func (h *Handler) getApplicationProviderList(req *restful.Request, resp *restful.Response) {
owner := req.Attribute(constants.UserContextAttribute).(string)
app := req.PathParameter(ParamAppName)
client, err := dynamic.NewForConfig(h.kubeConfig)
name, err := apputils.FmtAppMgrName(app, owner, "")
if err != nil {
api.HandleError(resp, req, err)
return
}
var am v1alpha1.ApplicationManager
err = h.ctrlClient.Get(req.Request.Context(), types.NamespacedName{Name: name}, &am)
if err != nil {
api.HandleError(resp, req, err)
return
}
var appConfig appcfg.ApplicationConfig
err = am.GetAppConfig(&appConfig)
if err != nil {
klog.Errorf("Failed to get app config err=%v", err)
api.HandleError(resp, req, err)
return
}
ret := make([]providerRegistry, 0)
rClient := provider.NewRegistryRequest(client)
namespace := fmt.Sprintf("user-system-%s", owner)
prs, err := rClient.List(req.Request.Context(), namespace, metav1.ListOptions{})
if err != nil {
api.HandleError(resp, req, err)
return
}
for _, ap := range prs.Items {
if ap.Object == nil {
continue
}
deployment, _, _ := unstructured.NestedString(ap.Object, "spec", "deployment")
kind, _, _ := unstructured.NestedString(ap.Object, "spec", "kind")
if app == deployment && kind == "provider" {
dataType, _, _ := unstructured.NestedString(ap.Object, "spec", "dataType")
group, _, _ := unstructured.NestedString(ap.Object, "spec", "group")
description, _, _ := unstructured.NestedString(ap.Object, "spec", "description")
endpoint, _, _ := unstructured.NestedString(ap.Object, "spec", "endpoint")
ns, _, _ := unstructured.NestedString(ap.Object, "spec", "namespace")
version, _, _ := unstructured.NestedString(ap.Object, "spec", "version")
opApis := make([]opApi, 0)
opApiList, _, _ := unstructured.NestedSlice(ap.Object, "spec", "opApis")
for _, op := range opApiList {
if aop, ok := op.(map[string]interface{}); ok {
opApis = append(opApis, opApi{
Name: aop["name"].(string),
URI: aop["uri"].(string),
})
}
}
ret = append(ret, providerRegistry{
DataType: dataType,
Deployment: deployment,
Description: description,
Endpoint: endpoint,
Kind: kind,
Group: group,
Namespace: ns,
OpApis: opApis,
Version: version,
ns := am.Spec.AppNamespace
for _, ap := range appConfig.Provider {
dataType := ap.Name
endpoint := ap.Entrance
opApis := make([]opApi, 0)
for _, op := range ap.Paths {
opApis = append(opApis, opApi{
URI: op,
})
}
ret = append(ret, providerRegistry{
DataType: dataType,
Endpoint: endpoint,
Namespace: ns,
OpApis: opApis,
})
}
resp.WriteAsJson(ret)
}

View File

@@ -37,7 +37,6 @@ import (
"k8s.io/apimachinery/pkg/types"
"k8s.io/client-go/dynamic"
"k8s.io/klog/v2"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/webhook/admission"
)
@@ -308,36 +307,21 @@ func (h *Handler) gpuLimitMutate(ctx context.Context, req *admissionv1.Admission
return resp
}
GPUType, err := h.findNvidiaGpuFromNodes(ctx)
if err != nil && !errors.Is(err, api.ErrGPUNodeNotFound) {
return h.sidecarWebhook.AdmissionError(req.UID, err)
}
GPUType := appcfg.GetSelectedGpuTypeValue()
// no gpu found, no need to inject env, just return.
if GPUType == "" {
if GPUType == "none" || GPUType == "" {
return resp
}
terminus, err := utils.GetTerminus(ctx, h.ctrlClient)
if err != nil {
return h.sidecarWebhook.AdmissionError(req.UID, err)
}
nvshareManagedMemory := ""
if terminus.Spec.Settings != nil {
nvshareManagedMemory = terminus.Spec.Settings[constants.EnvNvshareManagedMemory]
envs := []webhook.EnvKeyValue{
{
Key: constants.EnvGPUType,
Value: GPUType,
},
}
envs := []webhook.EnvKeyValue{}
if nvshareManagedMemory != "" {
envs = append(envs, webhook.EnvKeyValue{
Key: constants.EnvNvshareManagedMemory,
Value: nvshareManagedMemory,
})
}
envs = append(envs, webhook.EnvKeyValue{Key: "NVSHARE_DEBUG", Value: "1"})
patchBytes, err := webhook.CreatePatchForDeployment(tpl, req.Namespace, gpuRequired, GPUType, envs)
patchBytes, err := webhook.CreatePatchForDeployment(tpl, h.getGPUResourceTypeKey(GPUType), envs)
if err != nil {
klog.Errorf("create patch error %v", err)
return h.sidecarWebhook.AdmissionError(req.UID, err)
@@ -347,33 +331,17 @@ func (h *Handler) gpuLimitMutate(ctx context.Context, req *admissionv1.Admission
return resp
}
func (h *Handler) findNvidiaGpuFromNodes(ctx context.Context) (string, error) {
var nodes corev1.NodeList
err := h.ctrlClient.List(ctx, &nodes, &client.ListOptions{})
if err != nil {
return "", err
func (h *Handler) getGPUResourceTypeKey(gpuType string) string {
switch gpuType {
case utils.NvidiaCardType:
return constants.NvidiaGPU
case utils.GB10ChipType:
return constants.NvidiaGB10GPU
case utils.AmdApuCardType:
return constants.AMDAPU
default:
return ""
}
// return nvshare gpu or virtaitech gpu in priority
gtype := ""
for _, n := range nodes.Items {
if _, ok := n.Status.Capacity[constants.NvidiaGPU]; ok {
if _, ok = n.Status.Capacity[constants.NvshareGPU]; ok {
return constants.NvshareGPU, nil
}
gtype = constants.NvidiaGPU
}
if _, ok := n.Status.Capacity[constants.VirtAiTechVGPU]; ok {
return constants.VirtAiTechVGPU, nil
}
}
if gtype != "" {
return gtype, nil
}
return "", api.ErrGPUNodeNotFound
}
func (h *Handler) providerRegistryValidate(req *restful.Request, resp *restful.Response) {

View File

@@ -340,7 +340,9 @@ func GetClusterResource(kubeConfig *rest.Config, token string) (*prometheus.Clus
arches.Insert(n.Labels["kubernetes.io/arch"])
if quantity, ok := n.Status.Capacity[constants.NvidiaGPU]; ok {
total += quantity.AsApproximateFloat64()
} else if quantity, ok = n.Status.Capacity[constants.VirtAiTechVGPU]; ok {
} else if quantity, ok = n.Status.Capacity[constants.NvidiaGB10GPU]; ok {
total += quantity.AsApproximateFloat64()
} else if quantity, ok = n.Status.Capacity[constants.AMDAPU]; ok {
total += quantity.AsApproximateFloat64()
}
}

View File

@@ -254,21 +254,9 @@ func addServiceToContainer(c *restful.Container, handler *Handler) error {
Param(ws.PathParameter(ParamEntranceName, "the name of a application entrance")).
Returns(http.StatusOK, "Success to set the application entrance policy", nil))
ws.Route(ws.POST("/gpu/disable/managed-memory").
To(handler.disableGpuManagedMemory).
Doc("disable nvshare's managed memory ").
Metadata(restfulspec.KeyOpenAPITags, MODULE_TAGS).
Returns(http.StatusOK, "Success to disable", nil))
ws.Route(ws.POST("/gpu/enable/managed-memory").
To(handler.enableGpuManagedMemory).
Doc("enable nvshare's managed memory ").
Metadata(restfulspec.KeyOpenAPITags, MODULE_TAGS).
Returns(http.StatusOK, "Success to enable", nil))
ws.Route(ws.GET("/gpu/managed-memory").
To(handler.getManagedMemoryValue).
Doc("get nvshare's managed memory enabled or not").
ws.Route(ws.GET("/gpu/types").
To(handler.getGpuTypes).
Doc("get all gpu types in the cluster").
Metadata(restfulspec.KeyOpenAPITags, MODULE_TAGS).
Returns(http.StatusOK, "Success to get ", &ResultResponse{}))

View File

@@ -56,14 +56,19 @@ type AppSpec struct {
Developer string `yaml:"developer" json:"developer"`
RequiredMemory string `yaml:"requiredMemory" json:"requiredMemory"`
RequiredDisk string `yaml:"requiredDisk" json:"requiredDisk"`
SupportClient SupportClient `yaml:"supportClient" json:"supportClient"`
RequiredGPU string `yaml:"requiredGpu" json:"requiredGpu"`
RequiredCPU string `yaml:"requiredCpu" json:"requiredCpu"`
LimitedMemory string `yaml:"limitedMemory" json:"limitedMemory"`
LimitedDisk string `yaml:"limitedDisk" json:"limitedDisk"`
LimitedGPU string `yaml:"limitedGPU" json:"limitedGPU"`
LimitedCPU string `yaml:"limitedCPU" json:"limitedCPU"`
SupportClient SupportClient `yaml:"supportClient" json:"supportClient"`
RunAsUser bool `yaml:"runAsUser" json:"runAsUser"`
RunAsInternal bool `yaml:"runAsInternal" json:"runAsInternal"`
PodGPUConsumePolicy string `yaml:"podGpuConsumePolicy" json:"podGpuConsumePolicy"`
SubCharts []Chart `yaml:"subCharts" json:"subCharts"`
Hardware Hardware `yaml:"hardware" json:"hardware"`
SupportedGpu []any `yaml:"supportedGpu,omitempty" json:"supportedGpu,omitempty"`
}
type Hardware struct {
@@ -188,6 +193,17 @@ type Provider struct {
Verbs []string `yaml:"verbs" json:"verbs"`
}
// SpecialResource describes an optional set of resource requirements and
// limits expressed as quantity strings.
//
// Every field is a pointer tagged with omitempty, so a key that is absent from
// the YAML/JSON input stays nil and can be told apart from an explicitly
// provided value.
//
// NOTE: the serialized keys are not uniformly cased: the Required* GPU/CPU keys
// are "requiredGpu"/"requiredCpu" while the Limited* ones are
// "limitedGPU"/"limitedCPU".
type SpecialResource struct {
// Required* fields: requested amounts for memory, disk, GPU and CPU.
RequiredMemory *string `yaml:"requiredMemory,omitempty" json:"requiredMemory,omitempty"`
RequiredDisk *string `yaml:"requiredDisk,omitempty" json:"requiredDisk,omitempty"`
RequiredGPU *string `yaml:"requiredGpu,omitempty" json:"requiredGpu,omitempty"`
RequiredCPU *string `yaml:"requiredCpu,omitempty" json:"requiredCpu,omitempty"`
// Limited* fields: upper bounds for memory, disk, GPU and CPU.
LimitedMemory *string `yaml:"limitedMemory,omitempty" json:"limitedMemory,omitempty"`
LimitedDisk *string `yaml:"limitedDisk,omitempty" json:"limitedDisk,omitempty"`
LimitedGPU *string `yaml:"limitedGPU,omitempty" json:"limitedGPU,omitempty"`
LimitedCPU *string `yaml:"limitedCPU,omitempty" json:"limitedCPU,omitempty"`
}
func (c *Chart) Namespace(owner string) string {
if c.Shared {
return fmt.Sprintf("%s-%s", c.Name, "shared")

View File

@@ -100,6 +100,7 @@ type ApplicationConfig struct {
PodsSelectors []metav1.LabelSelector
HardwareRequirement Hardware
SharedEntrances []v1alpha1.Entrance
SelectedGpuType string
}
func (c *ApplicationConfig) IsMiddleware() bool {
@@ -159,6 +160,13 @@ func (c *ApplicationConfig) GenSharedEntranceURL(ctx context.Context) ([]v1alpha
return app.GenSharedEntranceURL(ctx)
}
// GetSelectedGpuTypeValue returns the GPU type selected for this application.
// When no GPU type has been selected (SelectedGpuType is empty) it returns the
// literal string "none".
func (c *ApplicationConfig) GetSelectedGpuTypeValue() string {
	if selected := c.SelectedGpuType; selected != "" {
		return selected
	}
	return "none"
}
func (p *ProviderPermission) GetNamespace(ownerName string) string {
if p.Namespace != "" {
if p.Namespace == "user-space" || p.Namespace == "user-system" {

View File

@@ -752,7 +752,7 @@ func getApplicationPolicy(policies []appcfg.AppPolicy, entrances []appv1alpha1.E
return string(policyStr), nil
}
func parseAppPermission(data []appcfg.AppPermission) []appcfg.AppPermission {
func ParseAppPermission(data []appcfg.AppPermission) []appcfg.AppPermission {
permissions := make([]appcfg.AppPermission, 0)
for _, p := range data {
switch perm := p.(type) {

View File

@@ -78,7 +78,7 @@ func (h *HelmOps) Uninstall_(client kubernetes.Interface, actionConfig *action.C
return err
}
h.app.Permission = parseAppPermission(h.app.Permission)
h.app.Permission = ParseAppPermission(h.app.Permission)
var perm []appcfg.ProviderPermission
for _, p := range h.app.Permission {
if t, ok := p.([]appcfg.ProviderPermission); ok {

View File

@@ -50,7 +50,7 @@ func (h *HelmOps) SetValues() (values map[string]interface{}, err error) {
values["domain"] = entries
userspace := make(map[string]interface{})
h.app.Permission = parseAppPermission(h.app.Permission)
h.app.Permission = ParseAppPermission(h.app.Permission)
for _, p := range h.app.Permission {
switch perm := p.(type) {
case appcfg.AppDataPermission, appcfg.AppCachePermission, appcfg.UserDataPermission:
@@ -170,17 +170,12 @@ func (h *HelmOps) SetValues() (values map[string]interface{}, err error) {
values["cluster"] = map[string]interface{}{
"arch": arch,
}
gpuType, err := utils.FindGpuTypeFromNodes(nodes)
if err != nil {
klog.Errorf("Failed to get gpuType err=%v", err)
return values, err
}
values["GPU"] = map[string]interface{}{
"Type": gpuType,
"Type": h.app.GetSelectedGpuTypeValue(),
"Cuda": os.Getenv("OLARES_SYSTEM_CUDA_VERSION"),
}
values["gpu"] = gpuType
values["gpu"] = h.app.GetSelectedGpuTypeValue()
if h.app.OIDC.Enabled {
err = h.createOIDCClient(values, zone, h.app.Namespace)

View File

@@ -16,7 +16,6 @@ import (
apputils "github.com/beclab/Olares/framework/app-service/pkg/utils/app"
"github.com/pkg/errors"
corev1 "k8s.io/api/core/v1"
"k8s.io/klog/v2"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
@@ -176,19 +175,8 @@ func (p *DownloadingApp) exec(ctx context.Context) error {
},
}
var nodes corev1.NodeList
err = p.client.List(ctx, &nodes, &client.ListOptions{})
if err != nil {
klog.Errorf("list node failed %v", err)
return err
}
gpuType, err := utils.FindGpuTypeFromNodes(&nodes)
if err != nil {
klog.Errorf("get gpu type failed %v", gpuType)
return err
}
values["GPU"] = map[string]interface{}{
"Type": gpuType,
"Type": appConfig.GetSelectedGpuTypeValue(),
"Cuda": os.Getenv("OLARES_SYSTEM_CUDA_VERSION"),
}

View File

@@ -22,7 +22,6 @@ import (
"github.com/pkg/errors"
"helm.sh/helm/v3/pkg/action"
corev1 "k8s.io/api/core/v1"
"k8s.io/klog/v2"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
@@ -214,19 +213,8 @@ func (p *UpgradingApp) exec(ctx context.Context) error {
"username": p.manager.Spec.AppOwner,
},
}
var nodes corev1.NodeList
err = p.client.List(ctx, &nodes, &client.ListOptions{})
if err != nil {
klog.Errorf("list node failed %v", err)
return err
}
gpuType, err := utils.FindGpuTypeFromNodes(&nodes)
if err != nil {
klog.Errorf("get gpu type failed %v", gpuType)
return err
}
values["GPU"] = map[string]interface{}{
"Type": gpuType,
"Type": appConfig.GetSelectedGpuTypeValue(),
"Cuda": os.Getenv("OLARES_SYSTEM_CUDA_VERSION"),
}

View File

@@ -78,13 +78,15 @@ const (
SidecarInitContainerName = "olares-sidecar-init"
EnvoyConfigWorkDirName = "envoy-config"
ByteTradeAuthor = "bytetrade.io"
NvshareGPU = "nvshare.com/gpu"
NvidiaGPU = "nvidia.com/gpu"
VirtAiTechVGPU = "virtaitech.com/gpu"
PatchOpAdd = "add"
PatchOpReplace = "replace"
EnvNvshareManagedMemory = "NVSHARE_MANAGED_MEMORY"
ByteTradeAuthor = "bytetrade.io"
PatchOpAdd = "add"
PatchOpReplace = "replace"
EnvGPUType = "GPU_TYPE"
// gpu resource keys
NvidiaGPU = "nvidia.com/gpu"
NvidiaGB10GPU = "nvidia.com/gb10"
AMDAPU = "amd.com/apu"
AuthorizationLevelOfPublic = "public"
AuthorizationLevelOfPrivate = "private"

View File

@@ -273,11 +273,7 @@ func (c *Creator) installSysApps(ctx context.Context, bflPod *corev1.Pod) error
"arch": arch,
}
gpuType, err := utils.FindGpuTypeFromNodes(&nodes)
if err != nil {
return err
}
vals["gpu"] = gpuType
vals["gpu"] = "none" // unused currently
userIndex, userSubnet, err := c.getUserSubnet(ctx)
if err != nil {

View File

@@ -16,6 +16,7 @@ import (
corev1 "k8s.io/api/core/v1"
sysv1alpha1 "github.com/beclab/Olares/framework/app-service/api/sys.bytetrade.io/v1alpha1"
"github.com/go-viper/mapstructure/v2"
"sigs.k8s.io/controller-runtime/pkg/client"
"github.com/beclab/Olares/framework/app-service/api/app.bytetrade.io/v1alpha1"
@@ -674,6 +675,7 @@ type ConfigOptions struct {
MarketSource string
IsAdmin bool
RawAppName string
SelectedGpu string
}
// GetAppConfig get app installation configuration from app store
@@ -740,7 +742,7 @@ func getAppConfigFromRepo(ctx context.Context, options *ConfigOptions) (*appcfg.
return getAppConfigFromConfigurationFile(options, chartPath)
}
func toApplicationConfig(app, chart, rawAppName string, cfg *appcfg.AppConfiguration) (*appcfg.ApplicationConfig, string, error) {
func toApplicationConfig(app, chart, rawAppName, selectedGpu string, cfg *appcfg.AppConfiguration) (*appcfg.ApplicationConfig, string, error) {
var permission []appcfg.AppPermission
if cfg.Permission.AppData {
permission = append(permission, appcfg.AppDataRW)
@@ -788,6 +790,57 @@ func toApplicationConfig(app, chart, rawAppName string, cfg *appcfg.AppConfigura
return nil, chart, err
}
// set supportedGpu to ["nvidia","nvidia-gb10"] by default
if len(cfg.Spec.SupportedGpu) == 0 {
cfg.Spec.SupportedGpu = []interface{}{utils.NvidiaCardType, utils.GB10ChipType}
}
// try to get selected GPU type special resource requirement
if selectedGpu != "" {
found := false
for _, supportedGpu := range cfg.Spec.SupportedGpu {
if str, ok := supportedGpu.(string); ok && str == selectedGpu {
found = true
break
}
if supportedGpuResourceMap, ok := supportedGpu.(map[string]interface{}); ok {
if resourceRequirement, ok := supportedGpuResourceMap[selectedGpu].(map[string]interface{}); ok {
found = true
var specialResource appcfg.SpecialResource
err := mapstructure.Decode(resourceRequirement, &specialResource)
if err != nil {
return nil, chart, fmt.Errorf("failed to decode special resource for selected GPU type %s: %v", selectedGpu, err)
}
for _, resSetter := range []struct {
v **resource.Quantity
s *string
}{
{v: &mem, s: specialResource.RequiredMemory},
{v: &disk, s: specialResource.RequiredDisk},
{v: &cpu, s: specialResource.RequiredCPU},
{v: &gpu, s: specialResource.RequiredGPU},
} {
if resSetter.s != nil && *resSetter.s != "" {
*resSetter.v, err = valuePtr(resource.ParseQuantity(*resSetter.s))
if err != nil {
return nil, chart, fmt.Errorf("failed to parse special resource quantity %s: %v", *resSetter.s, err)
}
}
}
break
} // end if selected gpu's resource requirement found
} // end if supportedGpu is map
} // end for supportedGpu
if !found {
return nil, chart, fmt.Errorf("selected GPU type %s is not supported", selectedGpu)
}
}
// transform from Policy to AppPolicy
var policies []appcfg.AppPolicy
for _, p := range cfg.Options.Policies {
@@ -877,6 +930,7 @@ func toApplicationConfig(app, chart, rawAppName string, cfg *appcfg.AppConfigura
PodsSelectors: podSelectors,
HardwareRequirement: cfg.Spec.Hardware,
SharedEntrances: cfg.SharedEntrances,
SelectedGpuType: selectedGpu,
}, chart, nil
}
@@ -890,7 +944,7 @@ func getAppConfigFromConfigurationFile(opt *ConfigOptions, chartPath string) (*a
return nil, chartPath, err
}
return toApplicationConfig(opt.App, chartPath, opt.RawAppName, &cfg)
return toApplicationConfig(opt.App, chartPath, opt.RawAppName, opt.SelectedGpu, &cfg)
}
func checkVersionFormat(constraint string) error {

View File

@@ -234,7 +234,9 @@ func CheckAppRequirement(token string, appConfig *appcfg.ApplicationConfig, op v
return constants.CPU, constants.SystemCPUPressure, fmt.Errorf(constants.SystemCPUPressureMessage, op)
}
}
if appConfig.Requirement.GPU != nil {
// only support nvidia gpu management by HAMi for now
if appConfig.Requirement.GPU != nil && appConfig.GetSelectedGpuTypeValue() == utils.NvidiaCardType {
if !appConfig.Requirement.GPU.IsZero() && metrics.GPU.Total <= 0 {
return constants.GPU, constants.SystemGPUNotAvailable, fmt.Errorf(constants.SystemGPUNotAvailableMessage, op)
@@ -398,7 +400,9 @@ func GetClusterResource(token string) (*prometheus.ClusterMetrics, []string, err
arches.Insert(n.Labels["kubernetes.io/arch"])
if quantity, ok := n.Status.Capacity[constants.NvidiaGPU]; ok {
total += quantity.AsApproximateFloat64()
} else if quantity, ok = n.Status.Capacity[constants.VirtAiTechVGPU]; ok {
} else if quantity, ok = n.Status.Capacity[constants.NvidiaGB10GPU]; ok {
total += quantity.AsApproximateFloat64()
} else if quantity, ok = n.Status.Capacity[constants.AMDAPU]; ok {
total += quantity.AsApproximateFloat64()
}
}

View File

@@ -0,0 +1,12 @@
package utils
const (
// NodeGPUTypeLabel is the node label key whose value names the GPU type
// available on that node.
NodeGPUTypeLabel = "gpu.bytetrade.io/type"
)
// GPU type identifiers.
const (
NvidiaCardType = "nvidia" // NVIDIA GPU card; handled by HAMi
AmdGpuCardType = "amd-gpu" // AMD GPU (presumably a discrete card — TODO confirm)
AmdApuCardType = "amd-apu" // AMD APU with integrated GPU, e.g. AI Max 395
GB10ChipType = "nvidia-gb10" // NVIDIA GB10 Superchip with unified system memory
)

View File

@@ -103,24 +103,37 @@ func GetAllNodesTunnelIPCIDRs() (cidrs []string) {
return cidrs
}
func FindGpuTypeFromNodes(nodes *corev1.NodeList) (string, error) {
gpuType := "none"
// func FindGpuTypeFromNodes(nodes *corev1.NodeList) (string, error) {
// gpuType := "none"
// if nodes == nil {
// return gpuType, errors.New("empty node list")
// }
// for _, n := range nodes.Items {
// if _, ok := n.Status.Capacity[constants.NvidiaGPU]; ok {
// if _, ok = n.Status.Capacity[constants.NvshareGPU]; ok {
// return "nvshare", nil
// }
// gpuType = "nvidia"
// }
// if _, ok := n.Status.Capacity[constants.VirtAiTechVGPU]; ok {
// return "virtaitech", nil
// }
// }
// return gpuType, nil
// }
func GetAllGpuTypesFromNodes(nodes *corev1.NodeList) (map[string]struct{}, error) {
gpuTypes := make(map[string]struct{})
if nodes == nil {
return gpuType, errors.New("empty node list")
return gpuTypes, errors.New("empty node list")
}
for _, n := range nodes.Items {
if _, ok := n.Status.Capacity[constants.NvidiaGPU]; ok {
if _, ok = n.Status.Capacity[constants.NvshareGPU]; ok {
return "nvshare", nil
}
gpuType = "nvidia"
}
if _, ok := n.Status.Capacity[constants.VirtAiTechVGPU]; ok {
return "virtaitech", nil
if typeLabel, ok := n.Labels[NodeGPUTypeLabel]; ok {
gpuTypes[typeLabel] = struct{}{} // TODO: add driver version info
}
}
return gpuType, nil
return gpuTypes, nil
}
func IsNodeReady(node *corev1.Node) bool {

View File

@@ -30,7 +30,6 @@ import (
admissionv1 "k8s.io/api/admission/v1"
corev1 "k8s.io/api/core/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
"k8s.io/apimachinery/pkg/labels"
@@ -544,16 +543,21 @@ type EnvKeyValue struct {
}
// CreatePatchForDeployment add gpu env for deployment and returns patch bytes.
func CreatePatchForDeployment(tpl *corev1.PodTemplateSpec, namespace string, gpuRequired *resource.Quantity, typeKey string, envKeyValues []EnvKeyValue) ([]byte, error) {
patches, err := addResourceLimits(tpl, namespace, gpuRequired, typeKey, envKeyValues)
func CreatePatchForDeployment(tpl *corev1.PodTemplateSpec, typeKey string, envKeyValues []EnvKeyValue) ([]byte, error) {
patches, err := addResourceLimits(tpl, typeKey, envKeyValues)
if err != nil {
return []byte{}, err
}
return json.Marshal(patches)
}
func addResourceLimits(tpl *corev1.PodTemplateSpec, namespace string, gpuRequired *resource.Quantity, typeKey string, envKeyValues []EnvKeyValue) (patch []patchOp, err error) {
if typeKey == constants.NvidiaGPU || typeKey == constants.NvshareGPU {
func addResourceLimits(tpl *corev1.PodTemplateSpec, typeKey string, envKeyValues []EnvKeyValue) (patch []patchOp, err error) {
if typeKey == "" {
klog.Warning("No gpu type selected, skip adding resource limits")
return patch, nil
}
if typeKey == constants.NvidiaGPU || typeKey == constants.NvidiaGB10GPU {
if tpl.Spec.RuntimeClassName != nil {
patch = append(patch, patchOp{
Op: constants.PatchOpReplace,
@@ -584,7 +588,10 @@ func addResourceLimits(tpl *corev1.PodTemplateSpec, namespace string, gpuRequire
t := make(map[string]map[string]string)
t["limits"] = map[string]string{}
for k, v := range container.Resources.Limits {
if k.String() == constants.NvidiaGPU || k.String() == constants.NvshareGPU || k.String() == constants.VirtAiTechVGPU {
if k.String() == constants.NvidiaGPU ||
k.String() == constants.NvidiaGB10GPU ||
k.String() == constants.AMDAPU {
// unset all previous gpu limits
continue
}
t["limits"][k.String()] = v.String()