Compare commits
11 Commits
daemon/fix
...
appservice
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
1cff8c5c24 | ||
|
|
e72fc4b1f3 | ||
|
|
c809ce4c58 | ||
|
|
82a5cbe08b | ||
|
|
a88cedb0ce | ||
|
|
06d0d36042 | ||
|
|
67deaf16ea | ||
|
|
b27854b863 | ||
|
|
4fd22c4e20 | ||
|
|
031d8164ff | ||
|
|
0c6def8f43 |
@@ -170,7 +170,7 @@ spec:
|
||||
priorityClassName: "system-cluster-critical"
|
||||
containers:
|
||||
- name: app-service
|
||||
image: beclab/app-service:0.4.77
|
||||
image: beclab/app-service:0.4.78
|
||||
imagePullPolicy: IfNotPresent
|
||||
ports:
|
||||
- containerPort: 6755
|
||||
|
||||
@@ -40,7 +40,7 @@ type UserEnvSyncController struct {
|
||||
|
||||
//+kubebuilder:rbac:groups="",resources=configmaps,verbs=get;list;watch
|
||||
//+kubebuilder:rbac:groups=iam.kubesphere.io,resources=users,verbs=get;list;watch
|
||||
//+kubebuilder:rbac:groups=sys.bytetrade.io,resources=userenvs,verbs=get;list;watch;create
|
||||
//+kubebuilder:rbac:groups=sys.bytetrade.io,resources=userenvs,verbs=get;list;watch;create;patch;update
|
||||
|
||||
func (r *UserEnvSyncController) SetupWithManager(mgr ctrl.Manager) error {
|
||||
cmPred := predicate.NewPredicateFuncs(func(obj client.Object) bool {
|
||||
@@ -164,14 +164,63 @@ func (r *UserEnvSyncController) syncUserEnvForUser(ctx context.Context, username
|
||||
return 0, fmt.Errorf("list userenvs in %s failed: %w", userNs, err)
|
||||
}
|
||||
|
||||
existSet := make(map[string]struct{}, len(existing.Items))
|
||||
existByName := make(map[string]*sysv1alpha1.UserEnv, len(existing.Items))
|
||||
for i := range existing.Items {
|
||||
existSet[existing.Items[i].EnvName] = struct{}{}
|
||||
existByName[existing.Items[i].EnvName] = &existing.Items[i]
|
||||
}
|
||||
|
||||
created := 0
|
||||
for _, spec := range base {
|
||||
if _, ok := existSet[spec.EnvName]; ok {
|
||||
if ue, ok := existByName[spec.EnvName]; ok {
|
||||
original := ue.DeepCopy()
|
||||
updated := false
|
||||
|
||||
if ue.Default == "" && spec.Default != "" {
|
||||
ue.Default = spec.Default
|
||||
updated = true
|
||||
}
|
||||
if ue.Type == "" && spec.Type != "" {
|
||||
ue.Type = spec.Type
|
||||
updated = true
|
||||
}
|
||||
if ue.Title == "" && spec.Title != "" {
|
||||
ue.Title = spec.Title
|
||||
updated = true
|
||||
}
|
||||
if ue.Description == "" && spec.Description != "" {
|
||||
ue.Description = spec.Description
|
||||
updated = true
|
||||
}
|
||||
if ue.RemoteOptions == "" && spec.RemoteOptions != "" {
|
||||
ue.RemoteOptions = spec.RemoteOptions
|
||||
updated = true
|
||||
}
|
||||
if ue.Regex == "" && spec.Regex != "" {
|
||||
ue.Regex = spec.Regex
|
||||
updated = true
|
||||
}
|
||||
|
||||
if len(spec.Options) > 0 {
|
||||
existOpt := make(map[string]struct{}, len(ue.Options))
|
||||
for _, it := range ue.Options {
|
||||
existOpt[it.Value] = struct{}{}
|
||||
}
|
||||
for _, it := range spec.Options {
|
||||
if _, exists := existOpt[it.Value]; exists {
|
||||
continue
|
||||
}
|
||||
ue.Options = append(ue.Options, it)
|
||||
existOpt[it.Value] = struct{}{}
|
||||
updated = true
|
||||
}
|
||||
}
|
||||
|
||||
if updated {
|
||||
if err := r.Patch(ctx, ue, client.MergeFrom(original)); err != nil {
|
||||
return created, fmt.Errorf("patch userenv %s/%s failed: %w", ue.Namespace, ue.Name, err)
|
||||
}
|
||||
klog.Infof("UserEnvSync: patched userenv %s/%s for user %s", ue.Namespace, ue.Name, username)
|
||||
}
|
||||
continue
|
||||
}
|
||||
name, err := apputils.EnvNameToResourceName(spec.EnvName)
|
||||
|
||||
@@ -126,15 +126,16 @@ type UpgradeRequest struct {
|
||||
|
||||
// InstallRequest represents a request to install an application.
|
||||
type InstallRequest struct {
|
||||
Dev bool `json:"devMode"`
|
||||
RepoURL string `json:"repoUrl"`
|
||||
CfgURL string `json:"cfgUrl"`
|
||||
Source AppSource `json:"source"`
|
||||
Images []Image `json:"images"`
|
||||
Envs []sysv1alpha1.AppEnvVar `json:"envs"`
|
||||
RawAppName string `json:"rawAppName"`
|
||||
Title string `json:"title"`
|
||||
Entrances []EntranceClone `json:"entrances"`
|
||||
Dev bool `json:"devMode"`
|
||||
RepoURL string `json:"repoUrl"`
|
||||
CfgURL string `json:"cfgUrl"`
|
||||
Source AppSource `json:"source"`
|
||||
Images []Image `json:"images"`
|
||||
Envs []sysv1alpha1.AppEnvVar `json:"envs"`
|
||||
RawAppName string `json:"rawAppName"`
|
||||
Title string `json:"title"`
|
||||
Entrances []EntranceClone `json:"entrances"`
|
||||
SelectedGpuType string `json:"selectedGpuType"`
|
||||
}
|
||||
|
||||
type Image struct {
|
||||
|
||||
@@ -3,11 +3,14 @@ package apiserver
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"os"
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
"golang.org/x/exp/maps"
|
||||
|
||||
"github.com/beclab/Olares/framework/app-service/api/app.bytetrade.io/v1alpha1"
|
||||
"github.com/beclab/Olares/framework/app-service/pkg/apiserver/api"
|
||||
"github.com/beclab/Olares/framework/app-service/pkg/appcfg"
|
||||
@@ -946,12 +949,37 @@ func (h *Handler) oamValues(req *restful.Request, resp *restful.Response) {
|
||||
api.HandleError(resp, req, err)
|
||||
return
|
||||
}
|
||||
gpuType, err := utils.FindGpuTypeFromNodes(&nodes)
|
||||
gpuTypes, err := utils.GetAllGpuTypesFromNodes(&nodes)
|
||||
if err != nil {
|
||||
klog.Errorf("get gpu type failed %v", gpuType)
|
||||
klog.Errorf("get gpu type failed %v", err)
|
||||
api.HandleError(resp, req, err)
|
||||
return
|
||||
}
|
||||
|
||||
gpuType := "none"
|
||||
selectedGpuType := req.QueryParameter("gputype")
|
||||
if len(gpuTypes) > 0 {
|
||||
if selectedGpuType != "" {
|
||||
if _, ok := gpuTypes[selectedGpuType]; ok {
|
||||
gpuType = selectedGpuType
|
||||
} else {
|
||||
err := fmt.Errorf("selected gpu type %s not found in cluster", selectedGpuType)
|
||||
klog.Error(err)
|
||||
api.HandleError(resp, req, err)
|
||||
return
|
||||
}
|
||||
} else {
|
||||
if len(gpuTypes) == 1 {
|
||||
gpuType = maps.Keys(gpuTypes)[0]
|
||||
} else {
|
||||
err := fmt.Errorf("multiple gpu types found in cluster, please specify one")
|
||||
klog.Error(err)
|
||||
api.HandleError(resp, req, err)
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
values["GPU"] = map[string]interface{}{
|
||||
"Type": gpuType,
|
||||
"Cuda": os.Getenv("OLARES_SYSTEM_CUDA_VERSION"),
|
||||
|
||||
@@ -6,10 +6,12 @@ import (
|
||||
"io"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
sysv1alpha1 "github.com/beclab/Olares/framework/app-service/api/sys.bytetrade.io/v1alpha1"
|
||||
"github.com/beclab/Olares/framework/app-service/pkg/apiserver/api"
|
||||
"github.com/beclab/Olares/framework/app-service/pkg/constants"
|
||||
"github.com/beclab/Olares/framework/app-service/pkg/utils"
|
||||
apputils "github.com/beclab/Olares/framework/app-service/pkg/utils/app"
|
||||
"github.com/emicklei/go-restful/v3"
|
||||
@@ -76,32 +78,74 @@ func (h *Handler) updateAppEnv(req *restful.Request, resp *restful.Response) {
|
||||
return
|
||||
}
|
||||
|
||||
var refEnvOnce sync.Once
|
||||
var listErr error
|
||||
refEnvs := make(map[string]string)
|
||||
|
||||
updated := false
|
||||
original := targetAppEnv.DeepCopy()
|
||||
for i, existingEnv := range targetAppEnv.Envs {
|
||||
for _, env := range updatedEnvs {
|
||||
if existingEnv.EnvName == env.EnvName {
|
||||
if !existingEnv.Editable {
|
||||
api.HandleBadRequest(resp, req, fmt.Errorf("app env '%s' is not editable", env.EnvName))
|
||||
return
|
||||
}
|
||||
if existingEnv.Required && existingEnv.Default == "" && env.Value == "" {
|
||||
api.HandleBadRequest(resp, req, fmt.Errorf("app env '%s' is required", env.EnvName))
|
||||
return
|
||||
}
|
||||
if existingEnv.Value != env.Value {
|
||||
if err := existingEnv.ValidateValue(env.Value); err != nil {
|
||||
api.HandleBadRequest(resp, req, fmt.Errorf("failed to update app env '%s': %v", env.EnvName, err))
|
||||
if existingEnv.EnvName != env.EnvName {
|
||||
continue
|
||||
}
|
||||
if !existingEnv.Editable {
|
||||
api.HandleBadRequest(resp, req, fmt.Errorf("app env '%s' is not editable", env.EnvName))
|
||||
return
|
||||
}
|
||||
if existingEnv.Required && existingEnv.Default == "" && env.Value == "" && (env.ValueFrom == nil || env.ValueFrom.EnvName == "") {
|
||||
api.HandleBadRequest(resp, req, fmt.Errorf("app env '%s' is required", env.EnvName))
|
||||
return
|
||||
}
|
||||
if env.ValueFrom != nil && env.ValueFrom.EnvName != "" && (existingEnv.ValueFrom == nil || existingEnv.ValueFrom.EnvName != env.ValueFrom.EnvName) {
|
||||
refEnvOnce.Do(func() {
|
||||
sysenvs := new(sysv1alpha1.SystemEnvList)
|
||||
listErr = h.ctrlClient.List(req.Request.Context(), sysenvs)
|
||||
if listErr != nil {
|
||||
return
|
||||
}
|
||||
targetAppEnv.Envs[i].Value = env.Value
|
||||
updated = true
|
||||
if existingEnv.ApplyOnChange {
|
||||
targetAppEnv.NeedApply = true
|
||||
userenvs := new(sysv1alpha1.UserEnvList)
|
||||
listErr = h.ctrlClient.List(req.Request.Context(), userenvs, client.InNamespace(utils.UserspaceName(owner)))
|
||||
for _, sysenv := range sysenvs.Items {
|
||||
refEnvs[sysenv.EnvName] = sysenv.GetEffectiveValue()
|
||||
}
|
||||
for _, userenv := range userenvs.Items {
|
||||
refEnvs[userenv.EnvName] = userenv.GetEffectiveValue()
|
||||
}
|
||||
})
|
||||
if listErr != nil {
|
||||
api.HandleInternalError(resp, req, fmt.Errorf("failed to list referenced envs: %s", listErr))
|
||||
return
|
||||
}
|
||||
break
|
||||
value, ok := refEnvs[env.ValueFrom.EnvName]
|
||||
if !ok {
|
||||
api.HandleBadRequest(resp, req, fmt.Errorf("app env '%s' references unknown env '%s'", env.EnvName, env.ValueFrom.EnvName))
|
||||
return
|
||||
}
|
||||
if existingEnv.Required && value == "" {
|
||||
api.HandleBadRequest(resp, req, fmt.Errorf("required app env '%s' references empty env '%s'", env.EnvName, env.ValueFrom.EnvName))
|
||||
return
|
||||
}
|
||||
if existingEnv.ValidateValue(value) != nil {
|
||||
api.HandleBadRequest(resp, req, fmt.Errorf("app env '%s' references invalid value '%s' from '%s': %v", env.EnvName, value, env.ValueFrom.EnvName, err))
|
||||
return
|
||||
}
|
||||
targetAppEnv.Envs[i].ValueFrom = env.ValueFrom
|
||||
targetAppEnv.Envs[i].Value = value
|
||||
targetAppEnv.Envs[i].ValueFrom.Status = constants.EnvRefStatusSynced
|
||||
updated = true
|
||||
} else if existingEnv.Value != env.Value {
|
||||
if err := existingEnv.ValidateValue(env.Value); err != nil {
|
||||
api.HandleBadRequest(resp, req, fmt.Errorf("failed to update app env '%s': %v", env.EnvName, err))
|
||||
return
|
||||
}
|
||||
targetAppEnv.Envs[i].Value = env.Value
|
||||
updated = true
|
||||
}
|
||||
if updated && existingEnv.ApplyOnChange {
|
||||
targetAppEnv.NeedApply = true
|
||||
}
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -1,174 +1,33 @@
|
||||
package apiserver
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/beclab/Olares/framework/app-service/pkg/apiserver/api"
|
||||
"github.com/beclab/Olares/framework/app-service/pkg/client/clientset"
|
||||
"github.com/beclab/Olares/framework/app-service/pkg/constants"
|
||||
"github.com/beclab/Olares/framework/app-service/pkg/utils"
|
||||
"golang.org/x/exp/maps"
|
||||
"sigs.k8s.io/controller-runtime/pkg/client"
|
||||
|
||||
"github.com/emicklei/go-restful/v3"
|
||||
corev1 "k8s.io/api/core/v1"
|
||||
"k8s.io/apimachinery/pkg/api/errors"
|
||||
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
||||
"k8s.io/client-go/util/retry"
|
||||
"k8s.io/klog/v2"
|
||||
)
|
||||
|
||||
var running bool = false
|
||||
var switchLock sync.Mutex
|
||||
|
||||
func (h *Handler) disableGpuManagedMemory(req *restful.Request, resp *restful.Response) {
|
||||
if err := h.nvshareSwitch(req, false); err != nil {
|
||||
api.HandleError(resp, req, &errors.StatusError{
|
||||
ErrStatus: metav1.Status{Code: 400, Message: "operation failed, " + err.Error()},
|
||||
})
|
||||
|
||||
func (h *Handler) getGpuTypes(req *restful.Request, resp *restful.Response) {
|
||||
var nodes corev1.NodeList
|
||||
err := h.ctrlClient.List(req.Request.Context(), &nodes, &client.ListOptions{})
|
||||
if err != nil {
|
||||
klog.Errorf("list node failed %v", err)
|
||||
api.HandleError(resp, req, err)
|
||||
return
|
||||
}
|
||||
resp.WriteAsJson(map[string]int{"code": 0})
|
||||
}
|
||||
|
||||
func (h *Handler) enableGpuManagedMemory(req *restful.Request, resp *restful.Response) {
|
||||
if err := h.nvshareSwitch(req, true); err != nil {
|
||||
api.HandleError(resp, req, &errors.StatusError{
|
||||
ErrStatus: metav1.Status{Code: 400, Message: "operation failed, " + err.Error()},
|
||||
})
|
||||
|
||||
gpuTypes, err := utils.GetAllGpuTypesFromNodes(&nodes)
|
||||
if err != nil {
|
||||
klog.Errorf("get gpu type failed %v", err)
|
||||
api.HandleError(resp, req, err)
|
||||
return
|
||||
}
|
||||
resp.WriteAsJson(map[string]int{"code": 0})
|
||||
}
|
||||
|
||||
func (h *Handler) nvshareSwitch(req *restful.Request, enable bool) error {
|
||||
client := req.Attribute(constants.KubeSphereClientAttribute).(*clientset.ClientSet)
|
||||
switchLock.Lock()
|
||||
defer switchLock.Unlock()
|
||||
|
||||
if running {
|
||||
return fmt.Errorf("last operation is still running")
|
||||
}
|
||||
|
||||
deployments, err := client.KubeClient.Kubernetes().AppsV1().Deployments("").List(req.Request.Context(), metav1.ListOptions{})
|
||||
if err != nil {
|
||||
klog.Error("list deployment error, ", err)
|
||||
return err
|
||||
}
|
||||
|
||||
envValue := "0"
|
||||
if enable {
|
||||
envValue = "1"
|
||||
}
|
||||
|
||||
for _, d := range deployments.Items {
|
||||
shouldUpdate := false
|
||||
for i, c := range d.Spec.Template.Spec.Containers {
|
||||
found := false
|
||||
for k := range c.Resources.Limits {
|
||||
if k == constants.NvshareGPU {
|
||||
found = true
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
if found {
|
||||
// a gpu request container
|
||||
addEnv := true
|
||||
for n, env := range d.Spec.Template.Spec.Containers[i].Env {
|
||||
if env.Name == constants.EnvNvshareManagedMemory {
|
||||
addEnv = false
|
||||
d.Spec.Template.Spec.Containers[i].Env[n].Value = envValue
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
if addEnv {
|
||||
d.Spec.Template.Spec.Containers[i].Env =
|
||||
append(d.Spec.Template.Spec.Containers[i].Env,
|
||||
corev1.EnvVar{Name: constants.EnvNvshareManagedMemory, Value: envValue})
|
||||
}
|
||||
|
||||
shouldUpdate = true
|
||||
} // end found
|
||||
} // end of container loop
|
||||
|
||||
if shouldUpdate {
|
||||
err = retry.RetryOnConflict(retry.DefaultRetry, func() error {
|
||||
deployment, err := client.KubeClient.Kubernetes().AppsV1().Deployments(d.Namespace).
|
||||
Get(req.Request.Context(), d.Name, metav1.GetOptions{})
|
||||
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
deployment.Spec.Template.Spec.Containers = d.Spec.Template.Spec.Containers
|
||||
|
||||
_, err = client.KubeClient.Kubernetes().AppsV1().Deployments(d.Namespace).
|
||||
Update(req.Request.Context(), deployment, metav1.UpdateOptions{})
|
||||
|
||||
return err
|
||||
})
|
||||
|
||||
if err != nil {
|
||||
klog.Error("update deployment error, ", err, ", ", d.Name, ", ", d.Namespace)
|
||||
return err
|
||||
}
|
||||
} // should update
|
||||
} // end of deployment loop
|
||||
|
||||
// update terminus
|
||||
err = retry.RetryOnConflict(retry.DefaultRetry, func() error {
|
||||
terminus, err := utils.GetTerminus(req.Request.Context(), h.ctrlClient)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
terminus.Spec.Settings[constants.EnvNvshareManagedMemory] = envValue
|
||||
|
||||
return h.ctrlClient.Update(req.Request.Context(), terminus)
|
||||
})
|
||||
|
||||
if err != nil {
|
||||
klog.Error("update terminus error, ", err)
|
||||
|
||||
return err
|
||||
}
|
||||
|
||||
running = true
|
||||
// delay 30s, assume the all pods will be reload in 30s.
|
||||
delay := time.NewTimer(30 * time.Second)
|
||||
go func() {
|
||||
<-delay.C
|
||||
switchLock.Lock()
|
||||
defer switchLock.Unlock()
|
||||
|
||||
running = false
|
||||
}()
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (h *Handler) getManagedMemoryValue(req *restful.Request, resp *restful.Response) {
|
||||
terminus, err := utils.GetTerminus(req.Request.Context(), h.ctrlClient)
|
||||
if err != nil {
|
||||
klog.Error("get terminus value error, ", err)
|
||||
api.HandleError(resp, req, &errors.StatusError{
|
||||
ErrStatus: metav1.Status{Code: 400, Message: "get value error, " + err.Error()},
|
||||
})
|
||||
|
||||
return
|
||||
}
|
||||
|
||||
managed := true
|
||||
if v, ok := terminus.Spec.Settings[constants.EnvNvshareManagedMemory]; ok && v == "0" {
|
||||
managed = false
|
||||
}
|
||||
|
||||
resp.WriteAsJson(&map[string]interface{}{
|
||||
"managed_memory": managed,
|
||||
"gpu_types": maps.Keys(gpuTypes),
|
||||
},
|
||||
)
|
||||
}
|
||||
|
||||
@@ -21,9 +21,12 @@ import (
|
||||
"github.com/beclab/Olares/framework/app-service/pkg/utils"
|
||||
apputils "github.com/beclab/Olares/framework/app-service/pkg/utils/app"
|
||||
"github.com/beclab/Olares/framework/app-service/pkg/utils/config"
|
||||
"golang.org/x/exp/maps"
|
||||
"sigs.k8s.io/controller-runtime/pkg/client"
|
||||
|
||||
"github.com/emicklei/go-restful/v3"
|
||||
"helm.sh/helm/v3/pkg/time"
|
||||
corev1 "k8s.io/api/core/v1"
|
||||
apierrors "k8s.io/apimachinery/pkg/api/errors"
|
||||
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
||||
"k8s.io/apimachinery/pkg/types"
|
||||
@@ -37,7 +40,7 @@ type depRequest struct {
|
||||
type installHelperIntf interface {
|
||||
getAdminUsers() (admin []string, isAdmin bool, err error)
|
||||
getInstalledApps() (installed bool, app []*v1alpha1.Application, err error)
|
||||
getAppConfig(adminUsers []string, marketSource string, isAdmin, appInstalled bool, installedApps []*v1alpha1.Application, chartVersion string) (err error)
|
||||
getAppConfig(adminUsers []string, marketSource string, isAdmin, appInstalled bool, installedApps []*v1alpha1.Application, chartVersion, selectedGpuType string) (err error)
|
||||
setAppConfig(req *api.InstallRequest, appName string)
|
||||
validate(bool, []*v1alpha1.Application) error
|
||||
setAppEnv(overrides []sysv1alpha1.AppEnvVar) error
|
||||
@@ -105,6 +108,36 @@ func (h *Handler) install(req *restful.Request, resp *restful.Response) {
|
||||
}
|
||||
}
|
||||
|
||||
// check selected gpu type can be supported
|
||||
// if selectedGpuType != "" , then check if the gpu type exists in cluster
|
||||
// if selectedGpuType == "" , and only one gpu type exists in cluster, then use it
|
||||
var nodes corev1.NodeList
|
||||
err = h.ctrlClient.List(req.Request.Context(), &nodes, &client.ListOptions{})
|
||||
if err != nil {
|
||||
klog.Errorf("list node failed %v", err)
|
||||
api.HandleError(resp, req, err)
|
||||
return
|
||||
}
|
||||
gpuTypes, err := utils.GetAllGpuTypesFromNodes(&nodes)
|
||||
if err != nil {
|
||||
klog.Errorf("get gpu type failed %v", err)
|
||||
api.HandleError(resp, req, err)
|
||||
return
|
||||
}
|
||||
|
||||
if insReq.SelectedGpuType != "" {
|
||||
if _, ok := gpuTypes[insReq.SelectedGpuType]; !ok {
|
||||
klog.Errorf("selected gpu type %s not found in cluster", insReq.SelectedGpuType)
|
||||
api.HandleBadRequest(resp, req, fmt.Errorf("selected gpu type %s not found in cluster", insReq.SelectedGpuType))
|
||||
return
|
||||
}
|
||||
} else {
|
||||
if len(gpuTypes) == 1 {
|
||||
insReq.SelectedGpuType = maps.Keys(gpuTypes)[0]
|
||||
klog.Infof("only one gpu type %s found in cluster, use it as selected gpu type", insReq.SelectedGpuType)
|
||||
}
|
||||
}
|
||||
|
||||
apiVersion, appCfg, err := apputils.GetApiVersionFromAppConfig(req.Request.Context(), &apputils.ConfigOptions{
|
||||
App: app,
|
||||
RawAppName: rawAppName,
|
||||
@@ -112,6 +145,7 @@ func (h *Handler) install(req *restful.Request, resp *restful.Response) {
|
||||
RepoURL: insReq.RepoURL,
|
||||
MarketSource: marketSource,
|
||||
Version: chartVersion,
|
||||
SelectedGpu: insReq.SelectedGpuType,
|
||||
})
|
||||
klog.Infof("chartVersion: %s", chartVersion)
|
||||
if err != nil {
|
||||
@@ -188,7 +222,7 @@ func (h *Handler) install(req *restful.Request, resp *restful.Response) {
|
||||
return
|
||||
}
|
||||
|
||||
err = helper.getAppConfig(adminUsers, marketSource, isAdmin, appInstalled, installedApps, chartVersion)
|
||||
err = helper.getAppConfig(adminUsers, marketSource, isAdmin, appInstalled, installedApps, chartVersion, insReq.SelectedGpuType)
|
||||
if err != nil {
|
||||
klog.Errorf("Failed to get app config err=%v", err)
|
||||
return
|
||||
@@ -423,7 +457,7 @@ func (h *installHandlerHelper) getInstalledApps() (installed bool, app []*v1alph
|
||||
return
|
||||
}
|
||||
|
||||
func (h *installHandlerHelper) getAppConfig(adminUsers []string, marketSource string, isAdmin, appInstalled bool, installedApps []*v1alpha1.Application, chartVersion string) (err error) {
|
||||
func (h *installHandlerHelper) getAppConfig(adminUsers []string, marketSource string, isAdmin, appInstalled bool, installedApps []*v1alpha1.Application, chartVersion, selectedGpuType string) (err error) {
|
||||
var (
|
||||
admin string
|
||||
installAsAdmin bool
|
||||
@@ -472,6 +506,7 @@ func (h *installHandlerHelper) getAppConfig(adminUsers []string, marketSource st
|
||||
Admin: admin,
|
||||
IsAdmin: installAsAdmin,
|
||||
MarketSource: marketSource,
|
||||
SelectedGpu: selectedGpuType,
|
||||
})
|
||||
if err != nil {
|
||||
klog.Errorf("Failed to get appconfig err=%v", err)
|
||||
@@ -685,7 +720,7 @@ func (h *installHandlerHelperV2) _validateClusterScope(isAdmin bool, installedAp
|
||||
return nil
|
||||
}
|
||||
|
||||
func (h *installHandlerHelperV2) getAppConfig(adminUsers []string, marketSource string, isAdmin, appInstalled bool, installedApps []*v1alpha1.Application, chartVersion string) (err error) {
|
||||
func (h *installHandlerHelperV2) getAppConfig(adminUsers []string, marketSource string, isAdmin, appInstalled bool, installedApps []*v1alpha1.Application, chartVersion, selectedGpuType string) (err error) {
|
||||
klog.Info("get app config for install handler v2")
|
||||
|
||||
var (
|
||||
@@ -713,6 +748,7 @@ func (h *installHandlerHelperV2) getAppConfig(adminUsers []string, marketSource
|
||||
Admin: admin,
|
||||
MarketSource: marketSource,
|
||||
IsAdmin: isAdmin,
|
||||
SelectedGpu: selectedGpuType,
|
||||
})
|
||||
if err != nil {
|
||||
klog.Errorf("Failed to get appconfig err=%v", err)
|
||||
|
||||
@@ -13,6 +13,7 @@ import (
|
||||
"github.com/beclab/Olares/framework/app-service/api/app.bytetrade.io/v1alpha1"
|
||||
"github.com/beclab/Olares/framework/app-service/pkg/apiserver/api"
|
||||
"github.com/beclab/Olares/framework/app-service/pkg/appcfg"
|
||||
"github.com/beclab/Olares/framework/app-service/pkg/appinstaller"
|
||||
"github.com/beclab/Olares/framework/app-service/pkg/appstate"
|
||||
"github.com/beclab/Olares/framework/app-service/pkg/client/clientset"
|
||||
"github.com/beclab/Olares/framework/app-service/pkg/constants"
|
||||
@@ -520,6 +521,7 @@ type applicationPermission struct {
|
||||
Permissions []permission `json:"permissions"`
|
||||
}
|
||||
|
||||
// Deprecated
|
||||
func (h *Handler) applicationPermissionList(req *restful.Request, resp *restful.Response) {
|
||||
owner := req.Attribute(constants.UserContextAttribute).(string)
|
||||
//token := req.HeaderParameter(constants.AuthorizationTokenKey)
|
||||
@@ -572,46 +574,39 @@ func (h *Handler) applicationPermissionList(req *restful.Request, resp *restful.
|
||||
func (h *Handler) getApplicationPermission(req *restful.Request, resp *restful.Response) {
|
||||
app := req.PathParameter(ParamAppName)
|
||||
owner := req.Attribute(constants.UserContextAttribute).(string)
|
||||
client, err := dynamic.NewForConfig(h.kubeConfig)
|
||||
name, err := apputils.FmtAppMgrName(app, owner, "")
|
||||
if err != nil {
|
||||
api.HandleError(resp, req, err)
|
||||
return
|
||||
}
|
||||
var am v1alpha1.ApplicationManager
|
||||
err = h.ctrlClient.Get(req.Request.Context(), types.NamespacedName{Name: name}, &am)
|
||||
if err != nil {
|
||||
api.HandleError(resp, req, err)
|
||||
return
|
||||
}
|
||||
|
||||
var appConfig appcfg.ApplicationConfig
|
||||
err = am.GetAppConfig(&appConfig)
|
||||
if err != nil {
|
||||
klog.Errorf("Failed to get app config err=%v", err)
|
||||
api.HandleError(resp, req, err)
|
||||
return
|
||||
}
|
||||
|
||||
var ret *applicationPermission
|
||||
apClient := provider.NewApplicationPermissionRequest(client)
|
||||
namespace := fmt.Sprintf("user-system-%s", owner)
|
||||
aps, err := apClient.List(req.Request.Context(), namespace, metav1.ListOptions{})
|
||||
if err != nil {
|
||||
api.HandleError(resp, req, err)
|
||||
return
|
||||
}
|
||||
for _, ap := range aps.Items {
|
||||
if ap.Object == nil {
|
||||
continue
|
||||
}
|
||||
appName, _, _ := unstructured.NestedString(ap.Object, "spec", "app")
|
||||
if appName == app {
|
||||
perms, _, _ := unstructured.NestedSlice(ap.Object, "spec", "permissions")
|
||||
permissions := appinstaller.ParseAppPermission(appConfig.Permission)
|
||||
for _, ap := range permissions {
|
||||
if perms, ok := ap.([]appcfg.ProviderPermission); ok {
|
||||
permissions := make([]permission, 0)
|
||||
for _, p := range perms {
|
||||
if perm, ok := p.(map[string]interface{}); ok {
|
||||
ops := make([]string, 0)
|
||||
for _, op := range perm["ops"].([]interface{}) {
|
||||
if opStr, ok := op.(string); ok {
|
||||
ops = append(ops, opStr)
|
||||
}
|
||||
}
|
||||
permissions = append(permissions, permission{
|
||||
DataType: perm["dataType"].(string),
|
||||
Group: perm["group"].(string),
|
||||
Version: perm["version"].(string),
|
||||
Ops: ops,
|
||||
})
|
||||
}
|
||||
|
||||
permissions = append(permissions, permission{
|
||||
DataType: p.ProviderName,
|
||||
Group: p.AppName,
|
||||
})
|
||||
}
|
||||
ret = &applicationPermission{
|
||||
App: appName,
|
||||
App: am.Spec.AppName,
|
||||
Owner: owner,
|
||||
Permissions: permissions,
|
||||
}
|
||||
@@ -642,6 +637,7 @@ type opApi struct {
|
||||
URI string `json:"uri"`
|
||||
}
|
||||
|
||||
// Deprecated
|
||||
func (h *Handler) getProviderRegistry(req *restful.Request, resp *restful.Response) {
|
||||
dataTypeReq := req.PathParameter(ParamDataType)
|
||||
groupReq := req.PathParameter(ParamGroup)
|
||||
@@ -708,56 +704,44 @@ func (h *Handler) getProviderRegistry(req *restful.Request, resp *restful.Respon
|
||||
func (h *Handler) getApplicationProviderList(req *restful.Request, resp *restful.Response) {
|
||||
owner := req.Attribute(constants.UserContextAttribute).(string)
|
||||
app := req.PathParameter(ParamAppName)
|
||||
client, err := dynamic.NewForConfig(h.kubeConfig)
|
||||
|
||||
name, err := apputils.FmtAppMgrName(app, owner, "")
|
||||
if err != nil {
|
||||
api.HandleError(resp, req, err)
|
||||
return
|
||||
}
|
||||
var am v1alpha1.ApplicationManager
|
||||
err = h.ctrlClient.Get(req.Request.Context(), types.NamespacedName{Name: name}, &am)
|
||||
if err != nil {
|
||||
api.HandleError(resp, req, err)
|
||||
return
|
||||
}
|
||||
|
||||
var appConfig appcfg.ApplicationConfig
|
||||
err = am.GetAppConfig(&appConfig)
|
||||
if err != nil {
|
||||
klog.Errorf("Failed to get app config err=%v", err)
|
||||
api.HandleError(resp, req, err)
|
||||
return
|
||||
}
|
||||
|
||||
ret := make([]providerRegistry, 0)
|
||||
rClient := provider.NewRegistryRequest(client)
|
||||
namespace := fmt.Sprintf("user-system-%s", owner)
|
||||
prs, err := rClient.List(req.Request.Context(), namespace, metav1.ListOptions{})
|
||||
if err != nil {
|
||||
api.HandleError(resp, req, err)
|
||||
return
|
||||
}
|
||||
for _, ap := range prs.Items {
|
||||
if ap.Object == nil {
|
||||
continue
|
||||
}
|
||||
deployment, _, _ := unstructured.NestedString(ap.Object, "spec", "deployment")
|
||||
kind, _, _ := unstructured.NestedString(ap.Object, "spec", "kind")
|
||||
|
||||
if app == deployment && kind == "provider" {
|
||||
dataType, _, _ := unstructured.NestedString(ap.Object, "spec", "dataType")
|
||||
group, _, _ := unstructured.NestedString(ap.Object, "spec", "group")
|
||||
description, _, _ := unstructured.NestedString(ap.Object, "spec", "description")
|
||||
endpoint, _, _ := unstructured.NestedString(ap.Object, "spec", "endpoint")
|
||||
ns, _, _ := unstructured.NestedString(ap.Object, "spec", "namespace")
|
||||
version, _, _ := unstructured.NestedString(ap.Object, "spec", "version")
|
||||
opApis := make([]opApi, 0)
|
||||
opApiList, _, _ := unstructured.NestedSlice(ap.Object, "spec", "opApis")
|
||||
for _, op := range opApiList {
|
||||
if aop, ok := op.(map[string]interface{}); ok {
|
||||
opApis = append(opApis, opApi{
|
||||
Name: aop["name"].(string),
|
||||
URI: aop["uri"].(string),
|
||||
})
|
||||
}
|
||||
}
|
||||
ret = append(ret, providerRegistry{
|
||||
DataType: dataType,
|
||||
Deployment: deployment,
|
||||
Description: description,
|
||||
Endpoint: endpoint,
|
||||
Kind: kind,
|
||||
Group: group,
|
||||
Namespace: ns,
|
||||
OpApis: opApis,
|
||||
Version: version,
|
||||
ns := am.Spec.AppNamespace
|
||||
for _, ap := range appConfig.Provider {
|
||||
dataType := ap.Name
|
||||
endpoint := ap.Entrance
|
||||
opApis := make([]opApi, 0)
|
||||
for _, op := range ap.Paths {
|
||||
opApis = append(opApis, opApi{
|
||||
URI: op,
|
||||
})
|
||||
|
||||
}
|
||||
ret = append(ret, providerRegistry{
|
||||
DataType: dataType,
|
||||
Endpoint: endpoint,
|
||||
Namespace: ns,
|
||||
OpApis: opApis,
|
||||
})
|
||||
}
|
||||
resp.WriteAsJson(ret)
|
||||
}
|
||||
|
||||
@@ -37,7 +37,6 @@ import (
|
||||
"k8s.io/apimachinery/pkg/types"
|
||||
"k8s.io/client-go/dynamic"
|
||||
"k8s.io/klog/v2"
|
||||
"sigs.k8s.io/controller-runtime/pkg/client"
|
||||
"sigs.k8s.io/controller-runtime/pkg/webhook/admission"
|
||||
)
|
||||
|
||||
@@ -308,36 +307,21 @@ func (h *Handler) gpuLimitMutate(ctx context.Context, req *admissionv1.Admission
|
||||
return resp
|
||||
}
|
||||
|
||||
GPUType, err := h.findNvidiaGpuFromNodes(ctx)
|
||||
if err != nil && !errors.Is(err, api.ErrGPUNodeNotFound) {
|
||||
return h.sidecarWebhook.AdmissionError(req.UID, err)
|
||||
}
|
||||
GPUType := appcfg.GetSelectedGpuTypeValue()
|
||||
|
||||
// no gpu found, no need to inject env, just return.
|
||||
if GPUType == "" {
|
||||
if GPUType == "none" || GPUType == "" {
|
||||
return resp
|
||||
}
|
||||
|
||||
terminus, err := utils.GetTerminus(ctx, h.ctrlClient)
|
||||
if err != nil {
|
||||
return h.sidecarWebhook.AdmissionError(req.UID, err)
|
||||
}
|
||||
nvshareManagedMemory := ""
|
||||
if terminus.Spec.Settings != nil {
|
||||
nvshareManagedMemory = terminus.Spec.Settings[constants.EnvNvshareManagedMemory]
|
||||
envs := []webhook.EnvKeyValue{
|
||||
{
|
||||
Key: constants.EnvGPUType,
|
||||
Value: GPUType,
|
||||
},
|
||||
}
|
||||
|
||||
envs := []webhook.EnvKeyValue{}
|
||||
if nvshareManagedMemory != "" {
|
||||
envs = append(envs, webhook.EnvKeyValue{
|
||||
Key: constants.EnvNvshareManagedMemory,
|
||||
Value: nvshareManagedMemory,
|
||||
})
|
||||
}
|
||||
|
||||
envs = append(envs, webhook.EnvKeyValue{Key: "NVSHARE_DEBUG", Value: "1"})
|
||||
|
||||
patchBytes, err := webhook.CreatePatchForDeployment(tpl, req.Namespace, gpuRequired, GPUType, envs)
|
||||
patchBytes, err := webhook.CreatePatchForDeployment(tpl, h.getGPUResourceTypeKey(GPUType), envs)
|
||||
if err != nil {
|
||||
klog.Errorf("create patch error %v", err)
|
||||
return h.sidecarWebhook.AdmissionError(req.UID, err)
|
||||
@@ -347,33 +331,17 @@ func (h *Handler) gpuLimitMutate(ctx context.Context, req *admissionv1.Admission
|
||||
return resp
|
||||
}
|
||||
|
||||
func (h *Handler) findNvidiaGpuFromNodes(ctx context.Context) (string, error) {
|
||||
var nodes corev1.NodeList
|
||||
err := h.ctrlClient.List(ctx, &nodes, &client.ListOptions{})
|
||||
if err != nil {
|
||||
return "", err
|
||||
func (h *Handler) getGPUResourceTypeKey(gpuType string) string {
|
||||
switch gpuType {
|
||||
case utils.NvidiaCardType:
|
||||
return constants.NvidiaGPU
|
||||
case utils.GB10ChipType:
|
||||
return constants.NvidiaGB10GPU
|
||||
case utils.AmdApuCardType:
|
||||
return constants.AMDAPU
|
||||
default:
|
||||
return ""
|
||||
}
|
||||
|
||||
// return nvshare gpu or virtaitech gpu in priority
|
||||
gtype := ""
|
||||
for _, n := range nodes.Items {
|
||||
if _, ok := n.Status.Capacity[constants.NvidiaGPU]; ok {
|
||||
if _, ok = n.Status.Capacity[constants.NvshareGPU]; ok {
|
||||
return constants.NvshareGPU, nil
|
||||
}
|
||||
gtype = constants.NvidiaGPU
|
||||
}
|
||||
|
||||
if _, ok := n.Status.Capacity[constants.VirtAiTechVGPU]; ok {
|
||||
return constants.VirtAiTechVGPU, nil
|
||||
}
|
||||
}
|
||||
|
||||
if gtype != "" {
|
||||
return gtype, nil
|
||||
}
|
||||
|
||||
return "", api.ErrGPUNodeNotFound
|
||||
}
|
||||
|
||||
func (h *Handler) providerRegistryValidate(req *restful.Request, resp *restful.Response) {
|
||||
|
||||
@@ -340,7 +340,9 @@ func GetClusterResource(kubeConfig *rest.Config, token string) (*prometheus.Clus
|
||||
arches.Insert(n.Labels["kubernetes.io/arch"])
|
||||
if quantity, ok := n.Status.Capacity[constants.NvidiaGPU]; ok {
|
||||
total += quantity.AsApproximateFloat64()
|
||||
} else if quantity, ok = n.Status.Capacity[constants.VirtAiTechVGPU]; ok {
|
||||
} else if quantity, ok = n.Status.Capacity[constants.NvidiaGB10GPU]; ok {
|
||||
total += quantity.AsApproximateFloat64()
|
||||
} else if quantity, ok = n.Status.Capacity[constants.AMDAPU]; ok {
|
||||
total += quantity.AsApproximateFloat64()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -254,21 +254,9 @@ func addServiceToContainer(c *restful.Container, handler *Handler) error {
|
||||
Param(ws.PathParameter(ParamEntranceName, "the name of a application entrance")).
|
||||
Returns(http.StatusOK, "Success to set the application entrance policy", nil))
|
||||
|
||||
ws.Route(ws.POST("/gpu/disable/managed-memory").
|
||||
To(handler.disableGpuManagedMemory).
|
||||
Doc("disable nvshare's managed memory ").
|
||||
Metadata(restfulspec.KeyOpenAPITags, MODULE_TAGS).
|
||||
Returns(http.StatusOK, "Success to disable", nil))
|
||||
|
||||
ws.Route(ws.POST("/gpu/enable/managed-memory").
|
||||
To(handler.enableGpuManagedMemory).
|
||||
Doc("enable nvshare's managed memory ").
|
||||
Metadata(restfulspec.KeyOpenAPITags, MODULE_TAGS).
|
||||
Returns(http.StatusOK, "Success to enable", nil))
|
||||
|
||||
ws.Route(ws.GET("/gpu/managed-memory").
|
||||
To(handler.getManagedMemoryValue).
|
||||
Doc("get nvshare's managed memory enabled or not").
|
||||
ws.Route(ws.GET("/gpu/types").
|
||||
To(handler.getGpuTypes).
|
||||
Doc("get all gpu types in the cluster").
|
||||
Metadata(restfulspec.KeyOpenAPITags, MODULE_TAGS).
|
||||
Returns(http.StatusOK, "Success to get ", &ResultResponse{}))
|
||||
|
||||
|
||||
@@ -56,14 +56,19 @@ type AppSpec struct {
|
||||
Developer string `yaml:"developer" json:"developer"`
|
||||
RequiredMemory string `yaml:"requiredMemory" json:"requiredMemory"`
|
||||
RequiredDisk string `yaml:"requiredDisk" json:"requiredDisk"`
|
||||
SupportClient SupportClient `yaml:"supportClient" json:"supportClient"`
|
||||
RequiredGPU string `yaml:"requiredGpu" json:"requiredGpu"`
|
||||
RequiredCPU string `yaml:"requiredCpu" json:"requiredCpu"`
|
||||
LimitedMemory string `yaml:"limitedMemory" json:"limitedMemory"`
|
||||
LimitedDisk string `yaml:"limitedDisk" json:"limitedDisk"`
|
||||
LimitedGPU string `yaml:"limitedGPU" json:"limitedGPU"`
|
||||
LimitedCPU string `yaml:"limitedCPU" json:"limitedCPU"`
|
||||
SupportClient SupportClient `yaml:"supportClient" json:"supportClient"`
|
||||
RunAsUser bool `yaml:"runAsUser" json:"runAsUser"`
|
||||
RunAsInternal bool `yaml:"runAsInternal" json:"runAsInternal"`
|
||||
PodGPUConsumePolicy string `yaml:"podGpuConsumePolicy" json:"podGpuConsumePolicy"`
|
||||
SubCharts []Chart `yaml:"subCharts" json:"subCharts"`
|
||||
Hardware Hardware `yaml:"hardware" json:"hardware"`
|
||||
SupportedGpu []any `yaml:"supportedGpu,omitempty" json:"supportedGpu,omitempty"`
|
||||
}
|
||||
|
||||
type Hardware struct {
|
||||
@@ -188,6 +193,17 @@ type Provider struct {
|
||||
Verbs []string `yaml:"verbs" json:"verbs"`
|
||||
}
|
||||
|
||||
// SpecialResource holds optional resource requirement and limit overrides
// (memory, disk, GPU, CPU), each expressed as a string.
// Every field is a pointer with omitempty, so an entry may set only some of
// the values; nil fields are left out when marshaling.
// toApplicationConfig decodes one of these from the per-GPU-type map found in
// AppSpec.SupportedGpu and parses the non-empty Required* values as resource
// quantities.
// NOTE(review): the Limited* fields are not read by the code visible in this
// change — confirm where they are consumed.
type SpecialResource struct {
|
||||
RequiredMemory *string `yaml:"requiredMemory,omitempty" json:"requiredMemory,omitempty"`
|
||||
RequiredDisk *string `yaml:"requiredDisk,omitempty" json:"requiredDisk,omitempty"`
|
||||
RequiredGPU *string `yaml:"requiredGpu,omitempty" json:"requiredGpu,omitempty"`
|
||||
RequiredCPU *string `yaml:"requiredCpu,omitempty" json:"requiredCpu,omitempty"`
|
||||
LimitedMemory *string `yaml:"limitedMemory,omitempty" json:"limitedMemory,omitempty"`
|
||||
LimitedDisk *string `yaml:"limitedDisk,omitempty" json:"limitedDisk,omitempty"`
|
||||
LimitedGPU *string `yaml:"limitedGPU,omitempty" json:"limitedGPU,omitempty"`
|
||||
LimitedCPU *string `yaml:"limitedCPU,omitempty" json:"limitedCPU,omitempty"`
|
||||
}
|
||||
|
||||
func (c *Chart) Namespace(owner string) string {
|
||||
if c.Shared {
|
||||
return fmt.Sprintf("%s-%s", c.Name, "shared")
|
||||
|
||||
@@ -100,6 +100,7 @@ type ApplicationConfig struct {
|
||||
PodsSelectors []metav1.LabelSelector
|
||||
HardwareRequirement Hardware
|
||||
SharedEntrances []v1alpha1.Entrance
|
||||
SelectedGpuType string
|
||||
}
|
||||
|
||||
func (c *ApplicationConfig) IsMiddleware() bool {
|
||||
@@ -159,6 +160,13 @@ func (c *ApplicationConfig) GenSharedEntranceURL(ctx context.Context) ([]v1alpha
|
||||
return app.GenSharedEntranceURL(ctx)
|
||||
}
|
||||
|
||||
// GetSelectedGpuTypeValue returns the GPU type selected for this application.
// When SelectedGpuType is empty it returns the literal "none" instead of an
// empty string, so callers always receive a non-empty value.
func (c *ApplicationConfig) GetSelectedGpuTypeValue() string {
|
||||
if c.SelectedGpuType == "" {
|
||||
return "none"
|
||||
}
|
||||
return c.SelectedGpuType
|
||||
}
|
||||
|
||||
func (p *ProviderPermission) GetNamespace(ownerName string) string {
|
||||
if p.Namespace != "" {
|
||||
if p.Namespace == "user-space" || p.Namespace == "user-system" {
|
||||
|
||||
@@ -752,7 +752,7 @@ func getApplicationPolicy(policies []appcfg.AppPolicy, entrances []appv1alpha1.E
|
||||
return string(policyStr), nil
|
||||
}
|
||||
|
||||
func parseAppPermission(data []appcfg.AppPermission) []appcfg.AppPermission {
|
||||
func ParseAppPermission(data []appcfg.AppPermission) []appcfg.AppPermission {
|
||||
permissions := make([]appcfg.AppPermission, 0)
|
||||
for _, p := range data {
|
||||
switch perm := p.(type) {
|
||||
|
||||
@@ -78,7 +78,7 @@ func (h *HelmOps) Uninstall_(client kubernetes.Interface, actionConfig *action.C
|
||||
return err
|
||||
}
|
||||
|
||||
h.app.Permission = parseAppPermission(h.app.Permission)
|
||||
h.app.Permission = ParseAppPermission(h.app.Permission)
|
||||
var perm []appcfg.ProviderPermission
|
||||
for _, p := range h.app.Permission {
|
||||
if t, ok := p.([]appcfg.ProviderPermission); ok {
|
||||
|
||||
@@ -50,7 +50,7 @@ func (h *HelmOps) SetValues() (values map[string]interface{}, err error) {
|
||||
|
||||
values["domain"] = entries
|
||||
userspace := make(map[string]interface{})
|
||||
h.app.Permission = parseAppPermission(h.app.Permission)
|
||||
h.app.Permission = ParseAppPermission(h.app.Permission)
|
||||
for _, p := range h.app.Permission {
|
||||
switch perm := p.(type) {
|
||||
case appcfg.AppDataPermission, appcfg.AppCachePermission, appcfg.UserDataPermission:
|
||||
@@ -170,17 +170,12 @@ func (h *HelmOps) SetValues() (values map[string]interface{}, err error) {
|
||||
values["cluster"] = map[string]interface{}{
|
||||
"arch": arch,
|
||||
}
|
||||
gpuType, err := utils.FindGpuTypeFromNodes(nodes)
|
||||
if err != nil {
|
||||
klog.Errorf("Failed to get gpuType err=%v", err)
|
||||
return values, err
|
||||
}
|
||||
values["GPU"] = map[string]interface{}{
|
||||
"Type": gpuType,
|
||||
"Type": h.app.GetSelectedGpuTypeValue(),
|
||||
"Cuda": os.Getenv("OLARES_SYSTEM_CUDA_VERSION"),
|
||||
}
|
||||
|
||||
values["gpu"] = gpuType
|
||||
values["gpu"] = h.app.GetSelectedGpuTypeValue()
|
||||
|
||||
if h.app.OIDC.Enabled {
|
||||
err = h.createOIDCClient(values, zone, h.app.Namespace)
|
||||
|
||||
@@ -16,7 +16,6 @@ import (
|
||||
apputils "github.com/beclab/Olares/framework/app-service/pkg/utils/app"
|
||||
|
||||
"github.com/pkg/errors"
|
||||
corev1 "k8s.io/api/core/v1"
|
||||
"k8s.io/klog/v2"
|
||||
ctrl "sigs.k8s.io/controller-runtime"
|
||||
"sigs.k8s.io/controller-runtime/pkg/client"
|
||||
@@ -176,19 +175,8 @@ func (p *DownloadingApp) exec(ctx context.Context) error {
|
||||
},
|
||||
}
|
||||
|
||||
var nodes corev1.NodeList
|
||||
err = p.client.List(ctx, &nodes, &client.ListOptions{})
|
||||
if err != nil {
|
||||
klog.Errorf("list node failed %v", err)
|
||||
return err
|
||||
}
|
||||
gpuType, err := utils.FindGpuTypeFromNodes(&nodes)
|
||||
if err != nil {
|
||||
klog.Errorf("get gpu type failed %v", gpuType)
|
||||
return err
|
||||
}
|
||||
values["GPU"] = map[string]interface{}{
|
||||
"Type": gpuType,
|
||||
"Type": appConfig.GetSelectedGpuTypeValue(),
|
||||
"Cuda": os.Getenv("OLARES_SYSTEM_CUDA_VERSION"),
|
||||
}
|
||||
|
||||
|
||||
@@ -22,7 +22,6 @@ import (
|
||||
|
||||
"github.com/pkg/errors"
|
||||
"helm.sh/helm/v3/pkg/action"
|
||||
corev1 "k8s.io/api/core/v1"
|
||||
"k8s.io/klog/v2"
|
||||
ctrl "sigs.k8s.io/controller-runtime"
|
||||
"sigs.k8s.io/controller-runtime/pkg/client"
|
||||
@@ -214,19 +213,8 @@ func (p *UpgradingApp) exec(ctx context.Context) error {
|
||||
"username": p.manager.Spec.AppOwner,
|
||||
},
|
||||
}
|
||||
var nodes corev1.NodeList
|
||||
err = p.client.List(ctx, &nodes, &client.ListOptions{})
|
||||
if err != nil {
|
||||
klog.Errorf("list node failed %v", err)
|
||||
return err
|
||||
}
|
||||
gpuType, err := utils.FindGpuTypeFromNodes(&nodes)
|
||||
if err != nil {
|
||||
klog.Errorf("get gpu type failed %v", gpuType)
|
||||
return err
|
||||
}
|
||||
values["GPU"] = map[string]interface{}{
|
||||
"Type": gpuType,
|
||||
"Type": appConfig.GetSelectedGpuTypeValue(),
|
||||
"Cuda": os.Getenv("OLARES_SYSTEM_CUDA_VERSION"),
|
||||
}
|
||||
|
||||
|
||||
@@ -78,13 +78,15 @@ const (
|
||||
SidecarInitContainerName = "olares-sidecar-init"
|
||||
EnvoyConfigWorkDirName = "envoy-config"
|
||||
|
||||
ByteTradeAuthor = "bytetrade.io"
|
||||
NvshareGPU = "nvshare.com/gpu"
|
||||
NvidiaGPU = "nvidia.com/gpu"
|
||||
VirtAiTechVGPU = "virtaitech.com/gpu"
|
||||
PatchOpAdd = "add"
|
||||
PatchOpReplace = "replace"
|
||||
EnvNvshareManagedMemory = "NVSHARE_MANAGED_MEMORY"
|
||||
ByteTradeAuthor = "bytetrade.io"
|
||||
PatchOpAdd = "add"
|
||||
PatchOpReplace = "replace"
|
||||
EnvGPUType = "GPU_TYPE"
|
||||
|
||||
// gpu resource keys
|
||||
NvidiaGPU = "nvidia.com/gpu"
|
||||
NvidiaGB10GPU = "nvidia.com/gb10"
|
||||
AMDAPU = "amd.com/apu"
|
||||
|
||||
AuthorizationLevelOfPublic = "public"
|
||||
AuthorizationLevelOfPrivate = "private"
|
||||
|
||||
@@ -273,11 +273,7 @@ func (c *Creator) installSysApps(ctx context.Context, bflPod *corev1.Pod) error
|
||||
"arch": arch,
|
||||
}
|
||||
|
||||
gpuType, err := utils.FindGpuTypeFromNodes(&nodes)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
vals["gpu"] = gpuType
|
||||
vals["gpu"] = "none" // unused currently
|
||||
|
||||
userIndex, userSubnet, err := c.getUserSubnet(ctx)
|
||||
if err != nil {
|
||||
|
||||
@@ -16,6 +16,7 @@ import (
|
||||
corev1 "k8s.io/api/core/v1"
|
||||
|
||||
sysv1alpha1 "github.com/beclab/Olares/framework/app-service/api/sys.bytetrade.io/v1alpha1"
|
||||
"github.com/go-viper/mapstructure/v2"
|
||||
"sigs.k8s.io/controller-runtime/pkg/client"
|
||||
|
||||
"github.com/beclab/Olares/framework/app-service/api/app.bytetrade.io/v1alpha1"
|
||||
@@ -674,6 +675,7 @@ type ConfigOptions struct {
|
||||
MarketSource string
|
||||
IsAdmin bool
|
||||
RawAppName string
|
||||
SelectedGpu string
|
||||
}
|
||||
|
||||
// GetAppConfig get app installation configuration from app store
|
||||
@@ -740,7 +742,7 @@ func getAppConfigFromRepo(ctx context.Context, options *ConfigOptions) (*appcfg.
|
||||
return getAppConfigFromConfigurationFile(options, chartPath)
|
||||
}
|
||||
|
||||
func toApplicationConfig(app, chart, rawAppName string, cfg *appcfg.AppConfiguration) (*appcfg.ApplicationConfig, string, error) {
|
||||
func toApplicationConfig(app, chart, rawAppName, selectedGpu string, cfg *appcfg.AppConfiguration) (*appcfg.ApplicationConfig, string, error) {
|
||||
var permission []appcfg.AppPermission
|
||||
if cfg.Permission.AppData {
|
||||
permission = append(permission, appcfg.AppDataRW)
|
||||
@@ -788,6 +790,57 @@ func toApplicationConfig(app, chart, rawAppName string, cfg *appcfg.AppConfigura
|
||||
return nil, chart, err
|
||||
}
|
||||
|
||||
// set supportedGpu to ["nvidia","nvidia-gb10"] by default
|
||||
if len(cfg.Spec.SupportedGpu) == 0 {
|
||||
cfg.Spec.SupportedGpu = []interface{}{utils.NvidiaCardType, utils.GB10ChipType}
|
||||
}
|
||||
|
||||
// try to get selected GPU type special resource requirement
|
||||
if selectedGpu != "" {
|
||||
found := false
|
||||
for _, supportedGpu := range cfg.Spec.SupportedGpu {
|
||||
if str, ok := supportedGpu.(string); ok && str == selectedGpu {
|
||||
found = true
|
||||
break
|
||||
}
|
||||
|
||||
if supportedGpuResourceMap, ok := supportedGpu.(map[string]interface{}); ok {
|
||||
if resourceRequirement, ok := supportedGpuResourceMap[selectedGpu].(map[string]interface{}); ok {
|
||||
found = true
|
||||
var specialResource appcfg.SpecialResource
|
||||
err := mapstructure.Decode(resourceRequirement, &specialResource)
|
||||
if err != nil {
|
||||
return nil, chart, fmt.Errorf("failed to decode special resource for selected GPU type %s: %v", selectedGpu, err)
|
||||
}
|
||||
|
||||
for _, resSetter := range []struct {
|
||||
v **resource.Quantity
|
||||
s *string
|
||||
}{
|
||||
{v: &mem, s: specialResource.RequiredMemory},
|
||||
{v: &disk, s: specialResource.RequiredDisk},
|
||||
{v: &cpu, s: specialResource.RequiredCPU},
|
||||
{v: &gpu, s: specialResource.RequiredGPU},
|
||||
} {
|
||||
|
||||
if resSetter.s != nil && *resSetter.s != "" {
|
||||
*resSetter.v, err = valuePtr(resource.ParseQuantity(*resSetter.s))
|
||||
if err != nil {
|
||||
return nil, chart, fmt.Errorf("failed to parse special resource quantity %s: %v", *resSetter.s, err)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
break
|
||||
} // end if selected gpu's resource requirement found
|
||||
} // end if supportedGpu is map
|
||||
} // end for supportedGpu
|
||||
|
||||
if !found {
|
||||
return nil, chart, fmt.Errorf("selected GPU type %s is not supported", selectedGpu)
|
||||
}
|
||||
}
|
||||
|
||||
// transform from Policy to AppPolicy
|
||||
var policies []appcfg.AppPolicy
|
||||
for _, p := range cfg.Options.Policies {
|
||||
@@ -877,6 +930,7 @@ func toApplicationConfig(app, chart, rawAppName string, cfg *appcfg.AppConfigura
|
||||
PodsSelectors: podSelectors,
|
||||
HardwareRequirement: cfg.Spec.Hardware,
|
||||
SharedEntrances: cfg.SharedEntrances,
|
||||
SelectedGpuType: selectedGpu,
|
||||
}, chart, nil
|
||||
}
|
||||
|
||||
@@ -890,7 +944,7 @@ func getAppConfigFromConfigurationFile(opt *ConfigOptions, chartPath string) (*a
|
||||
return nil, chartPath, err
|
||||
}
|
||||
|
||||
return toApplicationConfig(opt.App, chartPath, opt.RawAppName, &cfg)
|
||||
return toApplicationConfig(opt.App, chartPath, opt.RawAppName, opt.SelectedGpu, &cfg)
|
||||
}
|
||||
|
||||
func checkVersionFormat(constraint string) error {
|
||||
|
||||
@@ -234,7 +234,9 @@ func CheckAppRequirement(token string, appConfig *appcfg.ApplicationConfig, op v
|
||||
return constants.CPU, constants.SystemCPUPressure, fmt.Errorf(constants.SystemCPUPressureMessage, op)
|
||||
}
|
||||
}
|
||||
if appConfig.Requirement.GPU != nil {
|
||||
|
||||
// only support nvidia gpu management by HAMi for now
|
||||
if appConfig.Requirement.GPU != nil && appConfig.GetSelectedGpuTypeValue() == utils.NvidiaCardType {
|
||||
if !appConfig.Requirement.GPU.IsZero() && metrics.GPU.Total <= 0 {
|
||||
return constants.GPU, constants.SystemGPUNotAvailable, fmt.Errorf(constants.SystemGPUNotAvailableMessage, op)
|
||||
|
||||
@@ -398,7 +400,9 @@ func GetClusterResource(token string) (*prometheus.ClusterMetrics, []string, err
|
||||
arches.Insert(n.Labels["kubernetes.io/arch"])
|
||||
if quantity, ok := n.Status.Capacity[constants.NvidiaGPU]; ok {
|
||||
total += quantity.AsApproximateFloat64()
|
||||
} else if quantity, ok = n.Status.Capacity[constants.VirtAiTechVGPU]; ok {
|
||||
} else if quantity, ok = n.Status.Capacity[constants.NvidiaGB10GPU]; ok {
|
||||
total += quantity.AsApproximateFloat64()
|
||||
} else if quantity, ok = n.Status.Capacity[constants.AMDAPU]; ok {
|
||||
total += quantity.AsApproximateFloat64()
|
||||
}
|
||||
}
|
||||
|
||||
12
framework/app-service/pkg/utils/gpu_types.go
Normal file
12
framework/app-service/pkg/utils/gpu_types.go
Normal file
@@ -0,0 +1,12 @@
|
||||
package utils
|
||||
|
||||
const (
|
||||
// NodeGPUTypeLabel is the node label key whose value names the GPU type of
// a node; GetAllGpuTypesFromNodes collects the values of this label.
NodeGPUTypeLabel = "gpu.bytetrade.io/type"
|
||||
)
|
||||
|
||||
// GPU type identifiers, used as the selected/supported GPU type values of an
// application.
const (
|
||||
NvidiaCardType = "nvidia" // NVIDIA GPU, handled by HAMi
|
||||
AmdGpuCardType = "amd-gpu" // presumably a discrete AMD GPU — TODO confirm
|
||||
AmdApuCardType = "amd-apu" // AMD APU with integrated GPU (AI Max 395, etc.)
|
||||
GB10ChipType = "nvidia-gb10" // NVIDIA GB10 Superchip with unified system memory
|
||||
)
|
||||
@@ -103,24 +103,37 @@ func GetAllNodesTunnelIPCIDRs() (cidrs []string) {
|
||||
return cidrs
|
||||
}
|
||||
|
||||
func FindGpuTypeFromNodes(nodes *corev1.NodeList) (string, error) {
|
||||
gpuType := "none"
|
||||
// func FindGpuTypeFromNodes(nodes *corev1.NodeList) (string, error) {
|
||||
// gpuType := "none"
|
||||
// if nodes == nil {
|
||||
// return gpuType, errors.New("empty node list")
|
||||
// }
|
||||
// for _, n := range nodes.Items {
|
||||
// if _, ok := n.Status.Capacity[constants.NvidiaGPU]; ok {
|
||||
// if _, ok = n.Status.Capacity[constants.NvshareGPU]; ok {
|
||||
// return "nvshare", nil
|
||||
|
||||
// }
|
||||
// gpuType = "nvidia"
|
||||
// }
|
||||
// if _, ok := n.Status.Capacity[constants.VirtAiTechVGPU]; ok {
|
||||
// return "virtaitech", nil
|
||||
// }
|
||||
// }
|
||||
// return gpuType, nil
|
||||
// }
|
||||
|
||||
func GetAllGpuTypesFromNodes(nodes *corev1.NodeList) (map[string]struct{}, error) {
|
||||
gpuTypes := make(map[string]struct{})
|
||||
if nodes == nil {
|
||||
return gpuType, errors.New("empty node list")
|
||||
return gpuTypes, errors.New("empty node list")
|
||||
}
|
||||
for _, n := range nodes.Items {
|
||||
if _, ok := n.Status.Capacity[constants.NvidiaGPU]; ok {
|
||||
if _, ok = n.Status.Capacity[constants.NvshareGPU]; ok {
|
||||
return "nvshare", nil
|
||||
|
||||
}
|
||||
gpuType = "nvidia"
|
||||
}
|
||||
if _, ok := n.Status.Capacity[constants.VirtAiTechVGPU]; ok {
|
||||
return "virtaitech", nil
|
||||
if typeLabel, ok := n.Labels[NodeGPUTypeLabel]; ok {
|
||||
gpuTypes[typeLabel] = struct{}{} // TODO: add driver version info
|
||||
}
|
||||
}
|
||||
return gpuType, nil
|
||||
return gpuTypes, nil
|
||||
}
|
||||
|
||||
func IsNodeReady(node *corev1.Node) bool {
|
||||
|
||||
@@ -30,7 +30,6 @@ import (
|
||||
admissionv1 "k8s.io/api/admission/v1"
|
||||
corev1 "k8s.io/api/core/v1"
|
||||
apierrors "k8s.io/apimachinery/pkg/api/errors"
|
||||
"k8s.io/apimachinery/pkg/api/resource"
|
||||
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
||||
"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
|
||||
"k8s.io/apimachinery/pkg/labels"
|
||||
@@ -544,16 +543,21 @@ type EnvKeyValue struct {
|
||||
}
|
||||
|
||||
// CreatePatchForDeployment add gpu env for deployment and returns patch bytes.
|
||||
func CreatePatchForDeployment(tpl *corev1.PodTemplateSpec, namespace string, gpuRequired *resource.Quantity, typeKey string, envKeyValues []EnvKeyValue) ([]byte, error) {
|
||||
patches, err := addResourceLimits(tpl, namespace, gpuRequired, typeKey, envKeyValues)
|
||||
func CreatePatchForDeployment(tpl *corev1.PodTemplateSpec, typeKey string, envKeyValues []EnvKeyValue) ([]byte, error) {
|
||||
patches, err := addResourceLimits(tpl, typeKey, envKeyValues)
|
||||
if err != nil {
|
||||
return []byte{}, err
|
||||
}
|
||||
return json.Marshal(patches)
|
||||
}
|
||||
|
||||
func addResourceLimits(tpl *corev1.PodTemplateSpec, namespace string, gpuRequired *resource.Quantity, typeKey string, envKeyValues []EnvKeyValue) (patch []patchOp, err error) {
|
||||
if typeKey == constants.NvidiaGPU || typeKey == constants.NvshareGPU {
|
||||
func addResourceLimits(tpl *corev1.PodTemplateSpec, typeKey string, envKeyValues []EnvKeyValue) (patch []patchOp, err error) {
|
||||
if typeKey == "" {
|
||||
klog.Warning("No gpu type selected, skip adding resource limits")
|
||||
return patch, nil
|
||||
}
|
||||
|
||||
if typeKey == constants.NvidiaGPU || typeKey == constants.NvidiaGB10GPU {
|
||||
if tpl.Spec.RuntimeClassName != nil {
|
||||
patch = append(patch, patchOp{
|
||||
Op: constants.PatchOpReplace,
|
||||
@@ -584,7 +588,10 @@ func addResourceLimits(tpl *corev1.PodTemplateSpec, namespace string, gpuRequire
|
||||
t := make(map[string]map[string]string)
|
||||
t["limits"] = map[string]string{}
|
||||
for k, v := range container.Resources.Limits {
|
||||
if k.String() == constants.NvidiaGPU || k.String() == constants.NvshareGPU || k.String() == constants.VirtAiTechVGPU {
|
||||
if k.String() == constants.NvidiaGPU ||
|
||||
k.String() == constants.NvidiaGB10GPU ||
|
||||
k.String() == constants.AMDAPU {
|
||||
// unset all previous gpu limits
|
||||
continue
|
||||
}
|
||||
t["limits"][k.String()] = v.String()
|
||||
|
||||
Reference in New Issue
Block a user