Compare commits

...

13 Commits

Author SHA1 Message Date
hysyeah
c17d281df3 fix: app uninstall delete data (#2478) 2026-02-03 20:16:33 +08:00
dkeven
cf8e3cd017 feat(appservice): support updating more fields in api & controller (#2472) 2026-02-03 13:57:41 +08:00
Power-One-2025
e6e85b40d2 fix a link issue 2026-02-03 13:57:41 +08:00
berg
f9f8d2a328 desktop, settings, files, vault: fix multiple known issues (#2467)
feat: update login, system frontend, user service version
2026-02-03 13:57:41 +08:00
eball
e070d5eb37 authelia: add user regulation for TOTP authentication attempts (#2466) 2026-02-03 13:57:41 +08:00
dkeven
aa7afba374 fix(cli): unify config setting for release command (#2465) 2026-02-03 13:57:41 +08:00
eball
63fab4a454 app-service: add support for selecting GPU types in application installation 2026-02-03 13:57:41 +08:00
eball
46b1e62d3a feat: add support for selecting GPU types in application installation (#2458)
* fix: failed release upgrade

* fix: helm upgrade do not use atomic param and allow upgrade failed release

* feat: add clickhouse support

* appservice image tag to 0.4.76

* feat: add icon field to nats event

* chores: get all node gpu types

* feat: add support for selecting GPU types in application installation

* feat: enhance GPU type selection logic in application installation

* feat: replace hardcoded GPU type with constant for supported GPU selection

* feat: update app config methods to include selected GPU type and enhance validation for NVIDIA GPUs

* feat: update supported GPU handling to include default options and improve validation logic

* feat: update GPU resource handling to unset previous limits before setting new ones

* feat: refactor permission parsing to use exported function and update related calls

---------

Co-authored-by: hys <hysyeah@gmail.com>
2026-02-03 13:57:41 +08:00
hys
f6b1f2c544 fix: add spec ports 2026-02-03 13:57:39 +08:00
hys
f1bec97238 fix: check k8s request before into installing state 2026-02-03 13:56:15 +08:00
hys
9acaaf4f09 fix: v2 app stop 2026-02-03 13:56:13 +08:00
hys
acf813bf5a fix: helm upgrade do not use atomic param and allow upgrade failed release 2026-02-03 13:54:51 +08:00
hys
19bcc5195e fix: failed release upgrade 2026-02-03 13:54:51 +08:00
32 changed files with 578 additions and 450 deletions

View File

@@ -317,7 +317,7 @@ spec:
chown -R 1000:1000 /uploadstemp && \
chown -R 1000:1000 /appdata
- name: olares-app-init
image: beclab/system-frontend:v1.8.4
image: beclab/system-frontend:v1.8.5
imagePullPolicy: IfNotPresent
command:
- /bin/sh
@@ -439,7 +439,7 @@ spec:
- name: NATS_SUBJECT_VAULT
value: os.vault.{{ .Values.bfl.username}}
- name: user-service
image: beclab/user-service:v0.0.84
image: beclab/user-service:v0.0.85
imagePullPolicy: IfNotPresent
ports:
- containerPort: 3000

View File

@@ -8,26 +8,27 @@ import (
"strings"
"time"
"github.com/beclab/Olares/cli/pkg/core/common"
"github.com/beclab/Olares/cli/cmd/config"
"github.com/beclab/Olares/cli/pkg/common"
corecommon "github.com/beclab/Olares/cli/pkg/core/common"
"github.com/beclab/Olares/cli/pkg/core/util"
"github.com/beclab/Olares/cli/pkg/release/builder"
"github.com/spf13/cobra"
"github.com/spf13/viper"
)
func NewCmdRelease() *cobra.Command {
var (
baseDir string
version string
cdn string
ignoreMissingImages bool
extract bool
)
cmd := &cobra.Command{
Use: "release",
Short: "Build release based on a local Olares repository",
Run: func(cmd *cobra.Command, args []string) {
baseDir := viper.GetString(common.FlagBaseDir)
version := viper.GetString(common.FlagVersion)
cdn := viper.GetString(common.FlagCDNService)
ignoreMissingImages := viper.GetBool(common.FlagIgnoreMissingImages)
extract := viper.GetBool(common.FlagExtract)
cwd, err := os.Getwd()
if err != nil {
fmt.Printf("failed to get current working directory: %s\n", err)
@@ -43,7 +44,7 @@ func NewCmdRelease() *cobra.Command {
fmt.Printf("failed to get current user: %s\n", err)
os.Exit(1)
}
baseDir = filepath.Join(usr.HomeDir, common.DefaultBaseDir)
baseDir = filepath.Join(usr.HomeDir, corecommon.DefaultBaseDir)
fmt.Printf("--base-dir unspecified, using: %s\n", baseDir)
time.Sleep(1 * time.Second)
}
@@ -75,11 +76,20 @@ func NewCmdRelease() *cobra.Command {
},
}
cmd.Flags().StringVarP(&baseDir, "base-dir", "b", "", "base directory of Olares, where this release will be extracted to as a new version if --extract/-e is not disabled, defaults to $HOME/"+common.DefaultBaseDir)
cmd.Flags().StringVarP(&version, "version", "v", "", "version of this release, defaults to 0.0.0-local-dev-{yyyymmddhhmmss}")
cmd.Flags().StringVar(&cdn, "cdn-service", common.DefaultOlaresCDNService, "CDN used for downloading checksums of dependencies and images")
cmd.Flags().BoolVar(&ignoreMissingImages, "ignore-missing-images", true, "ignore missing images when downloading cheksums from CDN, only disable this if no new image is added, or the build may fail because the image is not uploaded to the CDN yet")
cmd.Flags().BoolVarP(&extract, "extract", "e", true, "extract this release to --base-dir after build, this can be disabled if only the release file itself is needed")
flagSetter := config.NewFlagSetterFor(cmd)
config.AddBaseDirFlagBy(flagSetter)
config.AddVersionFlagBy(flagSetter)
config.AddCDNServiceFlagBy(flagSetter)
flagSetter.Add(common.FlagIgnoreMissingImages,
"",
true,
"ignore missing images when downloading checksums from CDN, only disable this if no new image is added, or the build may fail because the image is not uploaded to the CDN yet",
)
flagSetter.Add(common.FlagExtract,
"e",
true,
"extract this release to --base-dir after build, this can be disabled if only the release file itself is needed",
)
return cmd
}

View File

@@ -261,12 +261,14 @@ const (
FlagOSPassword = "os-password"
EnvLegacyEncryptedOSPassword = "TERMINUS_OS_PASSWORD"
FlagCDNService = "cdn-service"
FlagManifest = "manifest"
FlagURLOverride = "url-override"
FlagReleaseID = "release-id"
FlagKubeType = "kube-type"
FlagLegacyKubeType = "kube"
FlagCDNService = "cdn-service"
FlagExtract = "extract"
FlagIgnoreMissingImages = "ignore-missing-images"
FlagManifest = "manifest"
FlagURLOverride = "url-override"
FlagReleaseID = "release-id"
FlagKubeType = "kube-type"
FlagLegacyKubeType = "kube"
FlagEnableJuiceFS = "enable-juicefs"
FlagLegacyEnableJuiceFS = "with-juicefs"

View File

@@ -429,7 +429,7 @@ const side = {
},
{
text: "本地访问 Olares",
link: "/manual/best-practices/local-access",
link: "/zh/manual/best-practices/local-access",
},
],
},

View File

@@ -170,7 +170,7 @@ spec:
priorityClassName: "system-cluster-critical"
containers:
- name: app-service
image: beclab/app-service:0.4.77
image: beclab/app-service:0.4.78
imagePullPolicy: IfNotPresent
ports:
- containerPort: 6755

View File

@@ -40,7 +40,7 @@ type UserEnvSyncController struct {
//+kubebuilder:rbac:groups="",resources=configmaps,verbs=get;list;watch
//+kubebuilder:rbac:groups=iam.kubesphere.io,resources=users,verbs=get;list;watch
//+kubebuilder:rbac:groups=sys.bytetrade.io,resources=userenvs,verbs=get;list;watch;create
//+kubebuilder:rbac:groups=sys.bytetrade.io,resources=userenvs,verbs=get;list;watch;create;patch;update
func (r *UserEnvSyncController) SetupWithManager(mgr ctrl.Manager) error {
cmPred := predicate.NewPredicateFuncs(func(obj client.Object) bool {
@@ -164,14 +164,63 @@ func (r *UserEnvSyncController) syncUserEnvForUser(ctx context.Context, username
return 0, fmt.Errorf("list userenvs in %s failed: %w", userNs, err)
}
existSet := make(map[string]struct{}, len(existing.Items))
existByName := make(map[string]*sysv1alpha1.UserEnv, len(existing.Items))
for i := range existing.Items {
existSet[existing.Items[i].EnvName] = struct{}{}
existByName[existing.Items[i].EnvName] = &existing.Items[i]
}
created := 0
for _, spec := range base {
if _, ok := existSet[spec.EnvName]; ok {
if ue, ok := existByName[spec.EnvName]; ok {
original := ue.DeepCopy()
updated := false
if ue.Default == "" && spec.Default != "" {
ue.Default = spec.Default
updated = true
}
if ue.Type == "" && spec.Type != "" {
ue.Type = spec.Type
updated = true
}
if ue.Title == "" && spec.Title != "" {
ue.Title = spec.Title
updated = true
}
if ue.Description == "" && spec.Description != "" {
ue.Description = spec.Description
updated = true
}
if ue.RemoteOptions == "" && spec.RemoteOptions != "" {
ue.RemoteOptions = spec.RemoteOptions
updated = true
}
if ue.Regex == "" && spec.Regex != "" {
ue.Regex = spec.Regex
updated = true
}
if len(spec.Options) > 0 {
existOpt := make(map[string]struct{}, len(ue.Options))
for _, it := range ue.Options {
existOpt[it.Value] = struct{}{}
}
for _, it := range spec.Options {
if _, exists := existOpt[it.Value]; exists {
continue
}
ue.Options = append(ue.Options, it)
existOpt[it.Value] = struct{}{}
updated = true
}
}
if updated {
if err := r.Patch(ctx, ue, client.MergeFrom(original)); err != nil {
return created, fmt.Errorf("patch userenv %s/%s failed: %w", ue.Namespace, ue.Name, err)
}
klog.Infof("UserEnvSync: patched userenv %s/%s for user %s", ue.Namespace, ue.Name, username)
}
continue
}
name, err := apputils.EnvNameToResourceName(spec.EnvName)

View File

@@ -13,6 +13,7 @@ const (
AppMarketSourceKey = constants.AppMarketSourceKey
AppInstallSourceKey = "bytetrade.io/install-source"
AppUninstallAllKey = "bytetrade.io/uninstall-all"
AppDeleteDataKey = "bytetrade.io/delete-data"
AppStopAllKey = "bytetrade.io/stop-all"
AppResumeAllKey = "bytetrade.io/resume-all"
AppImagesKey = "bytetrade.io/images"
@@ -126,15 +127,16 @@ type UpgradeRequest struct {
// InstallRequest represents a request to install an application.
type InstallRequest struct {
Dev bool `json:"devMode"`
RepoURL string `json:"repoUrl"`
CfgURL string `json:"cfgUrl"`
Source AppSource `json:"source"`
Images []Image `json:"images"`
Envs []sysv1alpha1.AppEnvVar `json:"envs"`
RawAppName string `json:"rawAppName"`
Title string `json:"title"`
Entrances []EntranceClone `json:"entrances"`
Dev bool `json:"devMode"`
RepoURL string `json:"repoUrl"`
CfgURL string `json:"cfgUrl"`
Source AppSource `json:"source"`
Images []Image `json:"images"`
Envs []sysv1alpha1.AppEnvVar `json:"envs"`
RawAppName string `json:"rawAppName"`
Title string `json:"title"`
Entrances []EntranceClone `json:"entrances"`
SelectedGpuType string `json:"selectedGpuType"`
}
type Image struct {
@@ -144,7 +146,8 @@ type Image struct {
// UninstallRequest represents a request to uninstall an application.
type UninstallRequest struct {
All bool `json:"all"`
All bool `json:"all"`
DeleteData bool `json:"deleteData"`
}
// StopRequest represents a request to stop an application.

View File

@@ -3,11 +3,14 @@ package apiserver
import (
"context"
"encoding/json"
"fmt"
"os"
"sort"
"strconv"
"strings"
"golang.org/x/exp/maps"
"github.com/beclab/Olares/framework/app-service/api/app.bytetrade.io/v1alpha1"
"github.com/beclab/Olares/framework/app-service/pkg/apiserver/api"
"github.com/beclab/Olares/framework/app-service/pkg/appcfg"
@@ -946,12 +949,37 @@ func (h *Handler) oamValues(req *restful.Request, resp *restful.Response) {
api.HandleError(resp, req, err)
return
}
gpuType, err := utils.FindGpuTypeFromNodes(&nodes)
gpuTypes, err := utils.GetAllGpuTypesFromNodes(&nodes)
if err != nil {
klog.Errorf("get gpu type failed %v", gpuType)
klog.Errorf("get gpu type failed %v", err)
api.HandleError(resp, req, err)
return
}
gpuType := "none"
selectedGpuType := req.QueryParameter("gputype")
if len(gpuTypes) > 0 {
if selectedGpuType != "" {
if _, ok := gpuTypes[selectedGpuType]; ok {
gpuType = selectedGpuType
} else {
err := fmt.Errorf("selected gpu type %s not found in cluster", selectedGpuType)
klog.Error(err)
api.HandleError(resp, req, err)
return
}
} else {
if len(gpuTypes) == 1 {
gpuType = maps.Keys(gpuTypes)[0]
} else {
err := fmt.Errorf("multiple gpu types found in cluster, please specify one")
klog.Error(err)
api.HandleError(resp, req, err)
return
}
}
}
values["GPU"] = map[string]interface{}{
"Type": gpuType,
"Cuda": os.Getenv("OLARES_SYSTEM_CUDA_VERSION"),

View File

@@ -6,10 +6,12 @@ import (
"io"
"net/http"
"net/url"
"sync"
"time"
sysv1alpha1 "github.com/beclab/Olares/framework/app-service/api/sys.bytetrade.io/v1alpha1"
"github.com/beclab/Olares/framework/app-service/pkg/apiserver/api"
"github.com/beclab/Olares/framework/app-service/pkg/constants"
"github.com/beclab/Olares/framework/app-service/pkg/utils"
apputils "github.com/beclab/Olares/framework/app-service/pkg/utils/app"
"github.com/emicklei/go-restful/v3"
@@ -76,32 +78,74 @@ func (h *Handler) updateAppEnv(req *restful.Request, resp *restful.Response) {
return
}
var refEnvOnce sync.Once
var listErr error
refEnvs := make(map[string]string)
updated := false
original := targetAppEnv.DeepCopy()
for i, existingEnv := range targetAppEnv.Envs {
for _, env := range updatedEnvs {
if existingEnv.EnvName == env.EnvName {
if !existingEnv.Editable {
api.HandleBadRequest(resp, req, fmt.Errorf("app env '%s' is not editable", env.EnvName))
return
}
if existingEnv.Required && existingEnv.Default == "" && env.Value == "" {
api.HandleBadRequest(resp, req, fmt.Errorf("app env '%s' is required", env.EnvName))
return
}
if existingEnv.Value != env.Value {
if err := existingEnv.ValidateValue(env.Value); err != nil {
api.HandleBadRequest(resp, req, fmt.Errorf("failed to update app env '%s': %v", env.EnvName, err))
if existingEnv.EnvName != env.EnvName {
continue
}
if !existingEnv.Editable {
api.HandleBadRequest(resp, req, fmt.Errorf("app env '%s' is not editable", env.EnvName))
return
}
if existingEnv.Required && existingEnv.Default == "" && env.Value == "" && (env.ValueFrom == nil || env.ValueFrom.EnvName == "") {
api.HandleBadRequest(resp, req, fmt.Errorf("app env '%s' is required", env.EnvName))
return
}
if env.ValueFrom != nil && env.ValueFrom.EnvName != "" && (existingEnv.ValueFrom == nil || existingEnv.ValueFrom.EnvName != env.ValueFrom.EnvName) {
refEnvOnce.Do(func() {
sysenvs := new(sysv1alpha1.SystemEnvList)
listErr = h.ctrlClient.List(req.Request.Context(), sysenvs)
if listErr != nil {
return
}
targetAppEnv.Envs[i].Value = env.Value
updated = true
if existingEnv.ApplyOnChange {
targetAppEnv.NeedApply = true
userenvs := new(sysv1alpha1.UserEnvList)
listErr = h.ctrlClient.List(req.Request.Context(), userenvs, client.InNamespace(utils.UserspaceName(owner)))
for _, sysenv := range sysenvs.Items {
refEnvs[sysenv.EnvName] = sysenv.GetEffectiveValue()
}
for _, userenv := range userenvs.Items {
refEnvs[userenv.EnvName] = userenv.GetEffectiveValue()
}
})
if listErr != nil {
api.HandleInternalError(resp, req, fmt.Errorf("failed to list referenced envs: %s", listErr))
return
}
break
value, ok := refEnvs[env.ValueFrom.EnvName]
if !ok {
api.HandleBadRequest(resp, req, fmt.Errorf("app env '%s' references unknown env '%s'", env.EnvName, env.ValueFrom.EnvName))
return
}
if existingEnv.Required && value == "" {
api.HandleBadRequest(resp, req, fmt.Errorf("required app env '%s' references empty env '%s'", env.EnvName, env.ValueFrom.EnvName))
return
}
if existingEnv.ValidateValue(value) != nil {
api.HandleBadRequest(resp, req, fmt.Errorf("app env '%s' references invalid value '%s' from '%s': %v", env.EnvName, value, env.ValueFrom.EnvName, err))
return
}
targetAppEnv.Envs[i].ValueFrom = env.ValueFrom
targetAppEnv.Envs[i].Value = value
targetAppEnv.Envs[i].ValueFrom.Status = constants.EnvRefStatusSynced
updated = true
} else if existingEnv.Value != env.Value {
if err := existingEnv.ValidateValue(env.Value); err != nil {
api.HandleBadRequest(resp, req, fmt.Errorf("failed to update app env '%s': %v", env.EnvName, err))
return
}
targetAppEnv.Envs[i].Value = env.Value
updated = true
}
if updated && existingEnv.ApplyOnChange {
targetAppEnv.NeedApply = true
}
break
}
}

View File

@@ -1,174 +1,33 @@
package apiserver
import (
"fmt"
"sync"
"time"
"github.com/beclab/Olares/framework/app-service/pkg/apiserver/api"
"github.com/beclab/Olares/framework/app-service/pkg/client/clientset"
"github.com/beclab/Olares/framework/app-service/pkg/constants"
"github.com/beclab/Olares/framework/app-service/pkg/utils"
"golang.org/x/exp/maps"
"sigs.k8s.io/controller-runtime/pkg/client"
"github.com/emicklei/go-restful/v3"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/client-go/util/retry"
"k8s.io/klog/v2"
)
var running bool = false
var switchLock sync.Mutex
func (h *Handler) disableGpuManagedMemory(req *restful.Request, resp *restful.Response) {
if err := h.nvshareSwitch(req, false); err != nil {
api.HandleError(resp, req, &errors.StatusError{
ErrStatus: metav1.Status{Code: 400, Message: "operation failed, " + err.Error()},
})
func (h *Handler) getGpuTypes(req *restful.Request, resp *restful.Response) {
var nodes corev1.NodeList
err := h.ctrlClient.List(req.Request.Context(), &nodes, &client.ListOptions{})
if err != nil {
klog.Errorf("list node failed %v", err)
api.HandleError(resp, req, err)
return
}
resp.WriteAsJson(map[string]int{"code": 0})
}
func (h *Handler) enableGpuManagedMemory(req *restful.Request, resp *restful.Response) {
if err := h.nvshareSwitch(req, true); err != nil {
api.HandleError(resp, req, &errors.StatusError{
ErrStatus: metav1.Status{Code: 400, Message: "operation failed, " + err.Error()},
})
gpuTypes, err := utils.GetAllGpuTypesFromNodes(&nodes)
if err != nil {
klog.Errorf("get gpu type failed %v", err)
api.HandleError(resp, req, err)
return
}
resp.WriteAsJson(map[string]int{"code": 0})
}
func (h *Handler) nvshareSwitch(req *restful.Request, enable bool) error {
client := req.Attribute(constants.KubeSphereClientAttribute).(*clientset.ClientSet)
switchLock.Lock()
defer switchLock.Unlock()
if running {
return fmt.Errorf("last operation is still running")
}
deployments, err := client.KubeClient.Kubernetes().AppsV1().Deployments("").List(req.Request.Context(), metav1.ListOptions{})
if err != nil {
klog.Error("list deployment error, ", err)
return err
}
envValue := "0"
if enable {
envValue = "1"
}
for _, d := range deployments.Items {
shouldUpdate := false
for i, c := range d.Spec.Template.Spec.Containers {
found := false
for k := range c.Resources.Limits {
if k == constants.NvshareGPU {
found = true
break
}
}
if found {
// a gpu request container
addEnv := true
for n, env := range d.Spec.Template.Spec.Containers[i].Env {
if env.Name == constants.EnvNvshareManagedMemory {
addEnv = false
d.Spec.Template.Spec.Containers[i].Env[n].Value = envValue
break
}
}
if addEnv {
d.Spec.Template.Spec.Containers[i].Env =
append(d.Spec.Template.Spec.Containers[i].Env,
corev1.EnvVar{Name: constants.EnvNvshareManagedMemory, Value: envValue})
}
shouldUpdate = true
} // end found
} // end of container loop
if shouldUpdate {
err = retry.RetryOnConflict(retry.DefaultRetry, func() error {
deployment, err := client.KubeClient.Kubernetes().AppsV1().Deployments(d.Namespace).
Get(req.Request.Context(), d.Name, metav1.GetOptions{})
if err != nil {
return err
}
deployment.Spec.Template.Spec.Containers = d.Spec.Template.Spec.Containers
_, err = client.KubeClient.Kubernetes().AppsV1().Deployments(d.Namespace).
Update(req.Request.Context(), deployment, metav1.UpdateOptions{})
return err
})
if err != nil {
klog.Error("update deployment error, ", err, ", ", d.Name, ", ", d.Namespace)
return err
}
} // should update
} // end of deployment loop
// update terminus
err = retry.RetryOnConflict(retry.DefaultRetry, func() error {
terminus, err := utils.GetTerminus(req.Request.Context(), h.ctrlClient)
if err != nil {
return err
}
terminus.Spec.Settings[constants.EnvNvshareManagedMemory] = envValue
return h.ctrlClient.Update(req.Request.Context(), terminus)
})
if err != nil {
klog.Error("update terminus error, ", err)
return err
}
running = true
// delay 30s, assume the all pods will be reload in 30s.
delay := time.NewTimer(30 * time.Second)
go func() {
<-delay.C
switchLock.Lock()
defer switchLock.Unlock()
running = false
}()
return nil
}
func (h *Handler) getManagedMemoryValue(req *restful.Request, resp *restful.Response) {
terminus, err := utils.GetTerminus(req.Request.Context(), h.ctrlClient)
if err != nil {
klog.Error("get terminus value error, ", err)
api.HandleError(resp, req, &errors.StatusError{
ErrStatus: metav1.Status{Code: 400, Message: "get value error, " + err.Error()},
})
return
}
managed := true
if v, ok := terminus.Spec.Settings[constants.EnvNvshareManagedMemory]; ok && v == "0" {
managed = false
}
resp.WriteAsJson(&map[string]interface{}{
"managed_memory": managed,
"gpu_types": maps.Keys(gpuTypes),
},
)
}

View File

@@ -21,9 +21,12 @@ import (
"github.com/beclab/Olares/framework/app-service/pkg/utils"
apputils "github.com/beclab/Olares/framework/app-service/pkg/utils/app"
"github.com/beclab/Olares/framework/app-service/pkg/utils/config"
"golang.org/x/exp/maps"
"sigs.k8s.io/controller-runtime/pkg/client"
"github.com/emicklei/go-restful/v3"
"helm.sh/helm/v3/pkg/time"
corev1 "k8s.io/api/core/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
@@ -37,7 +40,7 @@ type depRequest struct {
type installHelperIntf interface {
getAdminUsers() (admin []string, isAdmin bool, err error)
getInstalledApps() (installed bool, app []*v1alpha1.Application, err error)
getAppConfig(adminUsers []string, marketSource string, isAdmin, appInstalled bool, installedApps []*v1alpha1.Application, chartVersion string) (err error)
getAppConfig(adminUsers []string, marketSource string, isAdmin, appInstalled bool, installedApps []*v1alpha1.Application, chartVersion, selectedGpuType string) (err error)
setAppConfig(req *api.InstallRequest, appName string)
validate(bool, []*v1alpha1.Application) error
setAppEnv(overrides []sysv1alpha1.AppEnvVar) error
@@ -105,6 +108,36 @@ func (h *Handler) install(req *restful.Request, resp *restful.Response) {
}
}
// check selected gpu type can be supported
// if selectedGpuType != "" , then check if the gpu type exists in cluster
// if selectedGpuType == "" , and only one gpu type exists in cluster, then use it
var nodes corev1.NodeList
err = h.ctrlClient.List(req.Request.Context(), &nodes, &client.ListOptions{})
if err != nil {
klog.Errorf("list node failed %v", err)
api.HandleError(resp, req, err)
return
}
gpuTypes, err := utils.GetAllGpuTypesFromNodes(&nodes)
if err != nil {
klog.Errorf("get gpu type failed %v", err)
api.HandleError(resp, req, err)
return
}
if insReq.SelectedGpuType != "" {
if _, ok := gpuTypes[insReq.SelectedGpuType]; !ok {
klog.Errorf("selected gpu type %s not found in cluster", insReq.SelectedGpuType)
api.HandleBadRequest(resp, req, fmt.Errorf("selected gpu type %s not found in cluster", insReq.SelectedGpuType))
return
}
} else {
if len(gpuTypes) == 1 {
insReq.SelectedGpuType = maps.Keys(gpuTypes)[0]
klog.Infof("only one gpu type %s found in cluster, use it as selected gpu type", insReq.SelectedGpuType)
}
}
apiVersion, appCfg, err := apputils.GetApiVersionFromAppConfig(req.Request.Context(), &apputils.ConfigOptions{
App: app,
RawAppName: rawAppName,
@@ -112,6 +145,7 @@ func (h *Handler) install(req *restful.Request, resp *restful.Response) {
RepoURL: insReq.RepoURL,
MarketSource: marketSource,
Version: chartVersion,
SelectedGpu: insReq.SelectedGpuType,
})
klog.Infof("chartVersion: %s", chartVersion)
if err != nil {
@@ -188,7 +222,7 @@ func (h *Handler) install(req *restful.Request, resp *restful.Response) {
return
}
err = helper.getAppConfig(adminUsers, marketSource, isAdmin, appInstalled, installedApps, chartVersion)
err = helper.getAppConfig(adminUsers, marketSource, isAdmin, appInstalled, installedApps, chartVersion, insReq.SelectedGpuType)
if err != nil {
klog.Errorf("Failed to get app config err=%v", err)
return
@@ -423,7 +457,7 @@ func (h *installHandlerHelper) getInstalledApps() (installed bool, app []*v1alph
return
}
func (h *installHandlerHelper) getAppConfig(adminUsers []string, marketSource string, isAdmin, appInstalled bool, installedApps []*v1alpha1.Application, chartVersion string) (err error) {
func (h *installHandlerHelper) getAppConfig(adminUsers []string, marketSource string, isAdmin, appInstalled bool, installedApps []*v1alpha1.Application, chartVersion, selectedGpuType string) (err error) {
var (
admin string
installAsAdmin bool
@@ -472,6 +506,7 @@ func (h *installHandlerHelper) getAppConfig(adminUsers []string, marketSource st
Admin: admin,
IsAdmin: installAsAdmin,
MarketSource: marketSource,
SelectedGpu: selectedGpuType,
})
if err != nil {
klog.Errorf("Failed to get appconfig err=%v", err)
@@ -685,7 +720,7 @@ func (h *installHandlerHelperV2) _validateClusterScope(isAdmin bool, installedAp
return nil
}
func (h *installHandlerHelperV2) getAppConfig(adminUsers []string, marketSource string, isAdmin, appInstalled bool, installedApps []*v1alpha1.Application, chartVersion string) (err error) {
func (h *installHandlerHelperV2) getAppConfig(adminUsers []string, marketSource string, isAdmin, appInstalled bool, installedApps []*v1alpha1.Application, chartVersion, selectedGpuType string) (err error) {
klog.Info("get app config for install handler v2")
var (
@@ -713,6 +748,7 @@ func (h *installHandlerHelperV2) getAppConfig(adminUsers []string, marketSource
Admin: admin,
MarketSource: marketSource,
IsAdmin: isAdmin,
SelectedGpu: selectedGpuType,
})
if err != nil {
klog.Errorf("Failed to get appconfig err=%v", err)

View File

@@ -77,6 +77,7 @@ func (h *Handler) uninstall(req *restful.Request, resp *restful.Response) {
}
am.Annotations[api.AppTokenKey] = token
am.Annotations[api.AppUninstallAllKey] = fmt.Sprintf("%t", request.All)
am.Annotations[api.AppDeleteDataKey] = fmt.Sprintf("%t", request.DeleteData)
err = h.ctrlClient.Update(req.Request.Context(), &am)
if err != nil {
api.HandleError(resp, req, err)

View File

@@ -13,6 +13,7 @@ import (
"github.com/beclab/Olares/framework/app-service/api/app.bytetrade.io/v1alpha1"
"github.com/beclab/Olares/framework/app-service/pkg/apiserver/api"
"github.com/beclab/Olares/framework/app-service/pkg/appcfg"
"github.com/beclab/Olares/framework/app-service/pkg/appinstaller"
"github.com/beclab/Olares/framework/app-service/pkg/appstate"
"github.com/beclab/Olares/framework/app-service/pkg/client/clientset"
"github.com/beclab/Olares/framework/app-service/pkg/constants"
@@ -520,6 +521,7 @@ type applicationPermission struct {
Permissions []permission `json:"permissions"`
}
// Deprecated
func (h *Handler) applicationPermissionList(req *restful.Request, resp *restful.Response) {
owner := req.Attribute(constants.UserContextAttribute).(string)
//token := req.HeaderParameter(constants.AuthorizationTokenKey)
@@ -572,46 +574,39 @@ func (h *Handler) applicationPermissionList(req *restful.Request, resp *restful.
func (h *Handler) getApplicationPermission(req *restful.Request, resp *restful.Response) {
app := req.PathParameter(ParamAppName)
owner := req.Attribute(constants.UserContextAttribute).(string)
client, err := dynamic.NewForConfig(h.kubeConfig)
name, err := apputils.FmtAppMgrName(app, owner, "")
if err != nil {
api.HandleError(resp, req, err)
return
}
var am v1alpha1.ApplicationManager
err = h.ctrlClient.Get(req.Request.Context(), types.NamespacedName{Name: name}, &am)
if err != nil {
api.HandleError(resp, req, err)
return
}
var appConfig appcfg.ApplicationConfig
err = am.GetAppConfig(&appConfig)
if err != nil {
klog.Errorf("Failed to get app config err=%v", err)
api.HandleError(resp, req, err)
return
}
var ret *applicationPermission
apClient := provider.NewApplicationPermissionRequest(client)
namespace := fmt.Sprintf("user-system-%s", owner)
aps, err := apClient.List(req.Request.Context(), namespace, metav1.ListOptions{})
if err != nil {
api.HandleError(resp, req, err)
return
}
for _, ap := range aps.Items {
if ap.Object == nil {
continue
}
appName, _, _ := unstructured.NestedString(ap.Object, "spec", "app")
if appName == app {
perms, _, _ := unstructured.NestedSlice(ap.Object, "spec", "permissions")
permissions := appinstaller.ParseAppPermission(appConfig.Permission)
for _, ap := range permissions {
if perms, ok := ap.([]appcfg.ProviderPermission); ok {
permissions := make([]permission, 0)
for _, p := range perms {
if perm, ok := p.(map[string]interface{}); ok {
ops := make([]string, 0)
for _, op := range perm["ops"].([]interface{}) {
if opStr, ok := op.(string); ok {
ops = append(ops, opStr)
}
}
permissions = append(permissions, permission{
DataType: perm["dataType"].(string),
Group: perm["group"].(string),
Version: perm["version"].(string),
Ops: ops,
})
}
permissions = append(permissions, permission{
DataType: p.ProviderName,
Group: p.AppName,
})
}
ret = &applicationPermission{
App: appName,
App: am.Spec.AppName,
Owner: owner,
Permissions: permissions,
}
@@ -642,6 +637,7 @@ type opApi struct {
URI string `json:"uri"`
}
// Deprecated
func (h *Handler) getProviderRegistry(req *restful.Request, resp *restful.Response) {
dataTypeReq := req.PathParameter(ParamDataType)
groupReq := req.PathParameter(ParamGroup)
@@ -708,56 +704,44 @@ func (h *Handler) getProviderRegistry(req *restful.Request, resp *restful.Respon
func (h *Handler) getApplicationProviderList(req *restful.Request, resp *restful.Response) {
owner := req.Attribute(constants.UserContextAttribute).(string)
app := req.PathParameter(ParamAppName)
client, err := dynamic.NewForConfig(h.kubeConfig)
name, err := apputils.FmtAppMgrName(app, owner, "")
if err != nil {
api.HandleError(resp, req, err)
return
}
var am v1alpha1.ApplicationManager
err = h.ctrlClient.Get(req.Request.Context(), types.NamespacedName{Name: name}, &am)
if err != nil {
api.HandleError(resp, req, err)
return
}
var appConfig appcfg.ApplicationConfig
err = am.GetAppConfig(&appConfig)
if err != nil {
klog.Errorf("Failed to get app config err=%v", err)
api.HandleError(resp, req, err)
return
}
ret := make([]providerRegistry, 0)
rClient := provider.NewRegistryRequest(client)
namespace := fmt.Sprintf("user-system-%s", owner)
prs, err := rClient.List(req.Request.Context(), namespace, metav1.ListOptions{})
if err != nil {
api.HandleError(resp, req, err)
return
}
for _, ap := range prs.Items {
if ap.Object == nil {
continue
}
deployment, _, _ := unstructured.NestedString(ap.Object, "spec", "deployment")
kind, _, _ := unstructured.NestedString(ap.Object, "spec", "kind")
if app == deployment && kind == "provider" {
dataType, _, _ := unstructured.NestedString(ap.Object, "spec", "dataType")
group, _, _ := unstructured.NestedString(ap.Object, "spec", "group")
description, _, _ := unstructured.NestedString(ap.Object, "spec", "description")
endpoint, _, _ := unstructured.NestedString(ap.Object, "spec", "endpoint")
ns, _, _ := unstructured.NestedString(ap.Object, "spec", "namespace")
version, _, _ := unstructured.NestedString(ap.Object, "spec", "version")
opApis := make([]opApi, 0)
opApiList, _, _ := unstructured.NestedSlice(ap.Object, "spec", "opApis")
for _, op := range opApiList {
if aop, ok := op.(map[string]interface{}); ok {
opApis = append(opApis, opApi{
Name: aop["name"].(string),
URI: aop["uri"].(string),
})
}
}
ret = append(ret, providerRegistry{
DataType: dataType,
Deployment: deployment,
Description: description,
Endpoint: endpoint,
Kind: kind,
Group: group,
Namespace: ns,
OpApis: opApis,
Version: version,
ns := am.Spec.AppNamespace
for _, ap := range appConfig.Provider {
dataType := ap.Name
endpoint := ap.Entrance
opApis := make([]opApi, 0)
for _, op := range ap.Paths {
opApis = append(opApis, opApi{
URI: op,
})
}
ret = append(ret, providerRegistry{
DataType: dataType,
Endpoint: endpoint,
Namespace: ns,
OpApis: opApis,
})
}
resp.WriteAsJson(ret)
}

View File

@@ -37,7 +37,6 @@ import (
"k8s.io/apimachinery/pkg/types"
"k8s.io/client-go/dynamic"
"k8s.io/klog/v2"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/webhook/admission"
)
@@ -308,36 +307,21 @@ func (h *Handler) gpuLimitMutate(ctx context.Context, req *admissionv1.Admission
return resp
}
GPUType, err := h.findNvidiaGpuFromNodes(ctx)
if err != nil && !errors.Is(err, api.ErrGPUNodeNotFound) {
return h.sidecarWebhook.AdmissionError(req.UID, err)
}
GPUType := appcfg.GetSelectedGpuTypeValue()
// no gpu found, no need to inject env, just return.
if GPUType == "" {
if GPUType == "none" || GPUType == "" {
return resp
}
terminus, err := utils.GetTerminus(ctx, h.ctrlClient)
if err != nil {
return h.sidecarWebhook.AdmissionError(req.UID, err)
}
nvshareManagedMemory := ""
if terminus.Spec.Settings != nil {
nvshareManagedMemory = terminus.Spec.Settings[constants.EnvNvshareManagedMemory]
envs := []webhook.EnvKeyValue{
{
Key: constants.EnvGPUType,
Value: GPUType,
},
}
envs := []webhook.EnvKeyValue{}
if nvshareManagedMemory != "" {
envs = append(envs, webhook.EnvKeyValue{
Key: constants.EnvNvshareManagedMemory,
Value: nvshareManagedMemory,
})
}
envs = append(envs, webhook.EnvKeyValue{Key: "NVSHARE_DEBUG", Value: "1"})
patchBytes, err := webhook.CreatePatchForDeployment(tpl, req.Namespace, gpuRequired, GPUType, envs)
patchBytes, err := webhook.CreatePatchForDeployment(tpl, h.getGPUResourceTypeKey(GPUType), envs)
if err != nil {
klog.Errorf("create patch error %v", err)
return h.sidecarWebhook.AdmissionError(req.UID, err)
@@ -347,33 +331,17 @@ func (h *Handler) gpuLimitMutate(ctx context.Context, req *admissionv1.Admission
return resp
}
func (h *Handler) findNvidiaGpuFromNodes(ctx context.Context) (string, error) {
var nodes corev1.NodeList
err := h.ctrlClient.List(ctx, &nodes, &client.ListOptions{})
if err != nil {
return "", err
func (h *Handler) getGPUResourceTypeKey(gpuType string) string {
switch gpuType {
case utils.NvidiaCardType:
return constants.NvidiaGPU
case utils.GB10ChipType:
return constants.NvidiaGB10GPU
case utils.AmdApuCardType:
return constants.AMDAPU
default:
return ""
}
// return nvshare gpu or virtaitech gpu in priority
gtype := ""
for _, n := range nodes.Items {
if _, ok := n.Status.Capacity[constants.NvidiaGPU]; ok {
if _, ok = n.Status.Capacity[constants.NvshareGPU]; ok {
return constants.NvshareGPU, nil
}
gtype = constants.NvidiaGPU
}
if _, ok := n.Status.Capacity[constants.VirtAiTechVGPU]; ok {
return constants.VirtAiTechVGPU, nil
}
}
if gtype != "" {
return gtype, nil
}
return "", api.ErrGPUNodeNotFound
}
func (h *Handler) providerRegistryValidate(req *restful.Request, resp *restful.Response) {

View File

@@ -340,7 +340,9 @@ func GetClusterResource(kubeConfig *rest.Config, token string) (*prometheus.Clus
arches.Insert(n.Labels["kubernetes.io/arch"])
if quantity, ok := n.Status.Capacity[constants.NvidiaGPU]; ok {
total += quantity.AsApproximateFloat64()
} else if quantity, ok = n.Status.Capacity[constants.VirtAiTechVGPU]; ok {
} else if quantity, ok = n.Status.Capacity[constants.NvidiaGB10GPU]; ok {
total += quantity.AsApproximateFloat64()
} else if quantity, ok = n.Status.Capacity[constants.AMDAPU]; ok {
total += quantity.AsApproximateFloat64()
}
}

View File

@@ -254,21 +254,9 @@ func addServiceToContainer(c *restful.Container, handler *Handler) error {
Param(ws.PathParameter(ParamEntranceName, "the name of a application entrance")).
Returns(http.StatusOK, "Success to set the application entrance policy", nil))
ws.Route(ws.POST("/gpu/disable/managed-memory").
To(handler.disableGpuManagedMemory).
Doc("disable nvshare's managed memory ").
Metadata(restfulspec.KeyOpenAPITags, MODULE_TAGS).
Returns(http.StatusOK, "Success to disable", nil))
ws.Route(ws.POST("/gpu/enable/managed-memory").
To(handler.enableGpuManagedMemory).
Doc("enable nvshare's managed memory ").
Metadata(restfulspec.KeyOpenAPITags, MODULE_TAGS).
Returns(http.StatusOK, "Success to enable", nil))
ws.Route(ws.GET("/gpu/managed-memory").
To(handler.getManagedMemoryValue).
Doc("get nvshare's managed memory enabled or not").
ws.Route(ws.GET("/gpu/types").
To(handler.getGpuTypes).
Doc("get all gpu types in the cluster").
Metadata(restfulspec.KeyOpenAPITags, MODULE_TAGS).
Returns(http.StatusOK, "Success to get ", &ResultResponse{}))

View File

@@ -56,14 +56,19 @@ type AppSpec struct {
Developer string `yaml:"developer" json:"developer"`
RequiredMemory string `yaml:"requiredMemory" json:"requiredMemory"`
RequiredDisk string `yaml:"requiredDisk" json:"requiredDisk"`
SupportClient SupportClient `yaml:"supportClient" json:"supportClient"`
RequiredGPU string `yaml:"requiredGpu" json:"requiredGpu"`
RequiredCPU string `yaml:"requiredCpu" json:"requiredCpu"`
LimitedMemory string `yaml:"limitedMemory" json:"limitedMemory"`
LimitedDisk string `yaml:"limitedDisk" json:"limitedDisk"`
LimitedGPU string `yaml:"limitedGPU" json:"limitedGPU"`
LimitedCPU string `yaml:"limitedCPU" json:"limitedCPU"`
SupportClient SupportClient `yaml:"supportClient" json:"supportClient"`
RunAsUser bool `yaml:"runAsUser" json:"runAsUser"`
RunAsInternal bool `yaml:"runAsInternal" json:"runAsInternal"`
PodGPUConsumePolicy string `yaml:"podGpuConsumePolicy" json:"podGpuConsumePolicy"`
SubCharts []Chart `yaml:"subCharts" json:"subCharts"`
Hardware Hardware `yaml:"hardware" json:"hardware"`
SupportedGpu []any `yaml:"supportedGpu,omitempty" json:"supportedGpu,omitempty"`
}
type Hardware struct {
@@ -188,6 +193,17 @@ type Provider struct {
Verbs []string `yaml:"verbs" json:"verbs"`
}
// SpecialResource holds optional resource requirement/limit overrides that an
// app manifest can attach to one specific GPU type. toApplicationConfig decodes
// the map entry keyed by the selected GPU type in supportedGpu into this struct.
// Every field is a pointer so that an absent key stays nil and can be told
// apart from a value that was explicitly provided.
type SpecialResource struct {
// Required* values: when non-nil and non-empty, toApplicationConfig parses them
// as resource quantities and uses them in place of the app's memory, disk, GPU
// and CPU requirements.
RequiredMemory *string `yaml:"requiredMemory,omitempty" json:"requiredMemory,omitempty"`
RequiredDisk *string `yaml:"requiredDisk,omitempty" json:"requiredDisk,omitempty"`
RequiredGPU *string `yaml:"requiredGpu,omitempty" json:"requiredGpu,omitempty"`
RequiredCPU *string `yaml:"requiredCpu,omitempty" json:"requiredCpu,omitempty"`
// Limited* values: decoded alongside the Required* ones.
// NOTE(review): the override loop in toApplicationConfig does not appear to
// read these — confirm whether they are consumed elsewhere.
LimitedMemory *string `yaml:"limitedMemory,omitempty" json:"limitedMemory,omitempty"`
LimitedDisk *string `yaml:"limitedDisk,omitempty" json:"limitedDisk,omitempty"`
LimitedGPU *string `yaml:"limitedGPU,omitempty" json:"limitedGPU,omitempty"`
LimitedCPU *string `yaml:"limitedCPU,omitempty" json:"limitedCPU,omitempty"`
}
func (c *Chart) Namespace(owner string) string {
if c.Shared {
return fmt.Sprintf("%s-%s", c.Name, "shared")

View File

@@ -100,6 +100,7 @@ type ApplicationConfig struct {
PodsSelectors []metav1.LabelSelector
HardwareRequirement Hardware
SharedEntrances []v1alpha1.Entrance
SelectedGpuType string
}
func (c *ApplicationConfig) IsMiddleware() bool {
@@ -159,6 +160,13 @@ func (c *ApplicationConfig) GenSharedEntranceURL(ctx context.Context) ([]v1alpha
return app.GenSharedEntranceURL(ctx)
}
// GetSelectedGpuTypeValue returns the GPU type selected for this application,
// or the literal "none" when no GPU type has been selected.
func (c *ApplicationConfig) GetSelectedGpuTypeValue() string {
	if selected := c.SelectedGpuType; selected != "" {
		return selected
	}
	return "none"
}
func (p *ProviderPermission) GetNamespace(ownerName string) string {
if p.Namespace != "" {
if p.Namespace == "user-space" || p.Namespace == "user-system" {

View File

@@ -752,7 +752,7 @@ func getApplicationPolicy(policies []appcfg.AppPolicy, entrances []appv1alpha1.E
return string(policyStr), nil
}
func parseAppPermission(data []appcfg.AppPermission) []appcfg.AppPermission {
func ParseAppPermission(data []appcfg.AppPermission) []appcfg.AppPermission {
permissions := make([]appcfg.AppPermission, 0)
for _, p := range data {
switch perm := p.(type) {

View File

@@ -29,8 +29,14 @@ func (h *HelmOps) UninstallAll() error {
if err != nil {
return err
}
appName := fmt.Sprintf("%s-%s", h.app.Namespace, h.app.AppName)
appmgr, err := h.client.AppClient.AppV1alpha1().ApplicationManagers().Get(h.ctx, appName, metav1.GetOptions{})
if err != nil {
return err
}
deleteData := appmgr.Annotations["bytetrade.io/delete-data"] == "true"
appCacheDirs, err := apputils.TryToGetAppdataDirFromDeployment(h.ctx, h.app.Namespace, h.app.AppName, h.app.OwnerName)
appCacheDirs, appDataDirs, err := apputils.TryToGetAppdataDirFromDeployment(h.ctx, h.app.Namespace, h.app.AppName, h.app.OwnerName, deleteData)
if err != nil {
klog.Warningf("get app %s cache dir failed %v", h.app.AppName, err)
}
@@ -48,6 +54,13 @@ func (h *HelmOps) UninstallAll() error {
klog.Errorf("Failed to clear app cache dirs %v err=%v", appCacheDirs, err)
return err
}
if deleteData {
	// Capture ClearData's result. Without the assignment, the check below
	// would re-test the err left over from the preceding cache-clearing step
	// and a data-dir cleanup failure could never be reported.
	err = h.ClearData(client, appDataDirs)
	if err != nil {
		klog.Errorf("Failed to clear app data dirs %v err=%v", appDataDirs, err)
		return err
	}
}
err = h.DeleteNamespace(client, h.app.Namespace)
if err != nil {
@@ -78,7 +91,7 @@ func (h *HelmOps) Uninstall_(client kubernetes.Interface, actionConfig *action.C
return err
}
h.app.Permission = parseAppPermission(h.app.Permission)
h.app.Permission = ParseAppPermission(h.app.Permission)
var perm []appcfg.ProviderPermission
for _, p := range h.app.Permission {
if t, ok := p.([]appcfg.ProviderPermission); ok {
@@ -117,7 +130,7 @@ func (h *HelmOps) ClearCache(client kubernetes.Interface, appCacheDirs []string)
formattedAppCacheDirs := apputils.FormatCacheDirs(appCacheDirs)
for _, n := range nodes.Items {
URL := fmt.Sprintf(constants.AppDataDirURL, n.Name)
URL := fmt.Sprintf(constants.AppCacheDirURL, n.Name)
c.SetHeader("X-Terminus-Node", n.Name)
c.SetHeader("X-Bfl-User", h.app.OwnerName)
res, e := c.R().SetBody(map[string]interface{}{
@@ -137,6 +150,32 @@ func (h *HelmOps) ClearCache(client kubernetes.Interface, appCacheDirs []string)
return nil
}
// ClearData sends a DELETE request to the files service (constants.AppDataDirURL)
// asking it to remove the given app data directories on behalf of the app owner.
//
// The call is best-effort: a failed request or a non-OK response is only
// logged, and the returned error is always nil. The client parameter is not
// used by this method. Nothing is sent when appDataDirs is empty.
func (h *HelmOps) ClearData(client kubernetes.Interface, appDataDirs []string) error {
	if len(appDataDirs) == 0 {
		return nil
	}
	klog.Infof("clear app data dirs: %v", appDataDirs)

	// Short timeout so an unresponsive files service cannot stall the uninstall.
	httpClient := resty.New().SetTimeout(2 * time.Second).SetAuthToken(h.token)

	payload := map[string]interface{}{
		"dirents": apputils.FormatCacheDirs(appDataDirs),
	}
	httpClient.SetHeader("X-Bfl-User", h.app.OwnerName)

	resp, reqErr := httpClient.R().SetBody(payload).Delete(constants.AppDataDirURL)
	if reqErr != nil {
		klog.Errorf("Failed to delete data dir err=%v", reqErr)
		return nil
	}
	if resp.StatusCode() != http.StatusOK {
		klog.Infof("delete app data failed with: %v", resp.String())
	}
	return nil
}
func (h *HelmOps) ClearMiddlewareRequests(middlewareNamespace string) {
// delete middleware requests crd
for _, mt := range middlewareTypes {

View File

@@ -50,7 +50,7 @@ func (h *HelmOps) SetValues() (values map[string]interface{}, err error) {
values["domain"] = entries
userspace := make(map[string]interface{})
h.app.Permission = parseAppPermission(h.app.Permission)
h.app.Permission = ParseAppPermission(h.app.Permission)
for _, p := range h.app.Permission {
switch perm := p.(type) {
case appcfg.AppDataPermission, appcfg.AppCachePermission, appcfg.UserDataPermission:
@@ -170,17 +170,12 @@ func (h *HelmOps) SetValues() (values map[string]interface{}, err error) {
values["cluster"] = map[string]interface{}{
"arch": arch,
}
gpuType, err := utils.FindGpuTypeFromNodes(nodes)
if err != nil {
klog.Errorf("Failed to get gpuType err=%v", err)
return values, err
}
values["GPU"] = map[string]interface{}{
"Type": gpuType,
"Type": h.app.GetSelectedGpuTypeValue(),
"Cuda": os.Getenv("OLARES_SYSTEM_CUDA_VERSION"),
}
values["gpu"] = gpuType
values["gpu"] = h.app.GetSelectedGpuTypeValue()
if h.app.OIDC.Enabled {
err = h.createOIDCClient(values, zone, h.app.Namespace)

View File

@@ -16,7 +16,6 @@ import (
apputils "github.com/beclab/Olares/framework/app-service/pkg/utils/app"
"github.com/pkg/errors"
corev1 "k8s.io/api/core/v1"
"k8s.io/klog/v2"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
@@ -176,19 +175,8 @@ func (p *DownloadingApp) exec(ctx context.Context) error {
},
}
var nodes corev1.NodeList
err = p.client.List(ctx, &nodes, &client.ListOptions{})
if err != nil {
klog.Errorf("list node failed %v", err)
return err
}
gpuType, err := utils.FindGpuTypeFromNodes(&nodes)
if err != nil {
klog.Errorf("get gpu type failed %v", gpuType)
return err
}
values["GPU"] = map[string]interface{}{
"Type": gpuType,
"Type": appConfig.GetSelectedGpuTypeValue(),
"Cuda": os.Getenv("OLARES_SYSTEM_CUDA_VERSION"),
}

View File

@@ -22,7 +22,6 @@ import (
"github.com/pkg/errors"
"helm.sh/helm/v3/pkg/action"
corev1 "k8s.io/api/core/v1"
"k8s.io/klog/v2"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
@@ -214,19 +213,8 @@ func (p *UpgradingApp) exec(ctx context.Context) error {
"username": p.manager.Spec.AppOwner,
},
}
var nodes corev1.NodeList
err = p.client.List(ctx, &nodes, &client.ListOptions{})
if err != nil {
klog.Errorf("list node failed %v", err)
return err
}
gpuType, err := utils.FindGpuTypeFromNodes(&nodes)
if err != nil {
klog.Errorf("get gpu type failed %v", gpuType)
return err
}
values["GPU"] = map[string]interface{}{
"Type": gpuType,
"Type": appConfig.GetSelectedGpuTypeValue(),
"Cuda": os.Getenv("OLARES_SYSTEM_CUDA_VERSION"),
}

View File

@@ -78,20 +78,23 @@ const (
SidecarInitContainerName = "olares-sidecar-init"
EnvoyConfigWorkDirName = "envoy-config"
ByteTradeAuthor = "bytetrade.io"
NvshareGPU = "nvshare.com/gpu"
NvidiaGPU = "nvidia.com/gpu"
VirtAiTechVGPU = "virtaitech.com/gpu"
PatchOpAdd = "add"
PatchOpReplace = "replace"
EnvNvshareManagedMemory = "NVSHARE_MANAGED_MEMORY"
ByteTradeAuthor = "bytetrade.io"
PatchOpAdd = "add"
PatchOpReplace = "replace"
EnvGPUType = "GPU_TYPE"
// gpu resource keys
NvidiaGPU = "nvidia.com/gpu"
NvidiaGB10GPU = "nvidia.com/gb10"
AMDAPU = "amd.com/apu"
AuthorizationLevelOfPublic = "public"
AuthorizationLevelOfPrivate = "private"
DependencyTypeSystem = "system"
DependencyTypeApp = "application"
AppDataDirURL = "http://files-service.os-framework/api/resources/cache/%s/"
AppCacheDirURL = "http://files-service.os-framework/api/resources/cache/%s/"
AppDataDirURL = "http://files-service.os-framework/api/resources/drive/Data/"
UserSpaceDirKey = "userspace_hostpath"
UserAppDataDirKey = "appcache_hostpath"

View File

@@ -273,11 +273,7 @@ func (c *Creator) installSysApps(ctx context.Context, bflPod *corev1.Pod) error
"arch": arch,
}
gpuType, err := utils.FindGpuTypeFromNodes(&nodes)
if err != nil {
return err
}
vals["gpu"] = gpuType
vals["gpu"] = "none" // unused currently
userIndex, userSubnet, err := c.getUserSubnet(ctx)
if err != nil {

View File

@@ -16,12 +16,12 @@ import (
corev1 "k8s.io/api/core/v1"
sysv1alpha1 "github.com/beclab/Olares/framework/app-service/api/sys.bytetrade.io/v1alpha1"
"github.com/go-viper/mapstructure/v2"
"sigs.k8s.io/controller-runtime/pkg/client"
"github.com/beclab/Olares/framework/app-service/api/app.bytetrade.io/v1alpha1"
"github.com/beclab/Olares/framework/app-service/pkg/appcfg"
"github.com/beclab/Olares/framework/app-service/pkg/constants"
"github.com/beclab/Olares/framework/app-service/pkg/generated/clientset/versioned"
"github.com/beclab/Olares/framework/app-service/pkg/users/userspace"
"github.com/beclab/Olares/framework/app-service/pkg/utils"
"github.com/beclab/Olares/framework/app-service/pkg/utils/files"
@@ -553,7 +553,7 @@ func parseDestination(dest string) (string, string, error) {
return alias, tokens[len(tokens)-1], nil
}
func TryToGetAppdataDirFromDeployment(ctx context.Context, namespace, name, owner string) (appdirs []string, err error) {
func TryToGetAppdataDirFromDeployment(ctx context.Context, namespace, name, owner string, appData bool) (appCacheDirs []string, appDataDirs []string, err error) {
userspaceNs := utils.UserspaceName(owner)
config, err := ctrl.GetConfig()
if err != nil {
@@ -567,7 +567,6 @@ func TryToGetAppdataDirFromDeployment(ctx context.Context, namespace, name, owne
if err != nil {
return
}
appName := fmt.Sprintf("%s-%s", namespace, name)
appCachePath := sts.GetAnnotations()["appcache_hostpath"]
if len(appCachePath) == 0 {
err = errors.New("empty appcache_hostpath")
@@ -576,20 +575,23 @@ func TryToGetAppdataDirFromDeployment(ctx context.Context, namespace, name, owne
if !strings.HasSuffix(appCachePath, "/") {
appCachePath += "/"
}
dClient, err := versioned.NewForConfig(config)
if err != nil {
userspacePath := sts.GetAnnotations()["userspace_hostpath"]
if len(userspacePath) == 0 {
err = errors.New("empty userspace_hostpath annotation")
return
}
appCRD, err := dClient.AppV1alpha1().Applications().Get(ctx, appName, metav1.GetOptions{})
if err != nil {
return
appDataPath := filepath.Join(userspacePath, "Data")
if !strings.HasSuffix(appDataPath, "/") {
appDataPath += "/"
}
deploymentName := appCRD.Spec.DeploymentName
deploymentName := name
deployment, err := clientset.AppsV1().Deployments(namespace).
Get(context.Background(), deploymentName, metav1.GetOptions{})
if err != nil {
if apierrors.IsNotFound(err) {
return tryToGetAppdataDirFromSts(ctx, namespace, deploymentName, appCachePath)
return tryToGetAppdataDirFromSts(ctx, namespace, deploymentName, appCachePath, appDataPath)
}
return
}
@@ -601,15 +603,31 @@ func TryToGetAppdataDirFromDeployment(ctx context.Context, namespace, name, owne
if appDirSet.Has(appDir) {
continue
}
appdirs = append(appdirs, appDir)
appCacheDirs = append(appCacheDirs, appDir)
appDirSet.Insert(appDir)
}
}
}
return appdirs, nil
if appData {
appDirSet := sets.NewString()
for _, v := range deployment.Spec.Template.Spec.Volumes {
if v.HostPath != nil && strings.HasPrefix(v.HostPath.Path, appDataPath) && len(v.HostPath.Path) > len(appDataPath) {
appDir := GetFirstSubDir(v.HostPath.Path, appDataPath)
if appDir != "" {
if appDirSet.Has(appDir) {
continue
}
appDataDirs = append(appDataDirs, appDir)
appDirSet.Insert(appDir)
}
}
}
}
return appCacheDirs, appDataDirs, nil
}
func tryToGetAppdataDirFromSts(ctx context.Context, namespace, stsName, baseDir string) (appdirs []string, err error) {
func tryToGetAppdataDirFromSts(ctx context.Context, namespace, stsName, appCacheDir, appDataDir string) (appCacheDirs []string, appDataDirs []string, err error) {
config, err := ctrl.GetConfig()
if err != nil {
return
@@ -626,18 +644,32 @@ func tryToGetAppdataDirFromSts(ctx context.Context, namespace, stsName, baseDir
}
appDirSet := sets.NewString()
for _, v := range sts.Spec.Template.Spec.Volumes {
if v.HostPath != nil && strings.HasPrefix(v.HostPath.Path, baseDir) && len(v.HostPath.Path) > len(baseDir) {
appDir := GetFirstSubDir(v.HostPath.Path, baseDir)
if v.HostPath != nil && strings.HasPrefix(v.HostPath.Path, appCacheDir) && len(v.HostPath.Path) > len(appCacheDir) {
appDir := GetFirstSubDir(v.HostPath.Path, appCacheDir)
if appDir != "" {
if appDirSet.Has(appDir) {
continue
}
appdirs = append(appdirs, appDir)
appCacheDirs = append(appCacheDirs, appDir)
appDirSet.Insert(appDir)
}
}
}
return appdirs, nil
appDirSet = sets.NewString()
for _, v := range sts.Spec.Template.Spec.Volumes {
if v.HostPath != nil && strings.HasPrefix(v.HostPath.Path, appDataDir) && len(v.HostPath.Path) > len(appDataDir) {
appDir := GetFirstSubDir(v.HostPath.Path, appDataDir)
if appDir != "" {
if appDirSet.Has(appDir) {
continue
}
appDataDirs = append(appDataDirs, appDir)
appDirSet.Insert(appDir)
}
}
}
return appCacheDirs, appDataDirs, nil
}
func GetFirstSubDir(fullPath, basePath string) string {
@@ -674,6 +706,7 @@ type ConfigOptions struct {
MarketSource string
IsAdmin bool
RawAppName string
SelectedGpu string
}
// GetAppConfig get app installation configuration from app store
@@ -740,7 +773,7 @@ func getAppConfigFromRepo(ctx context.Context, options *ConfigOptions) (*appcfg.
return getAppConfigFromConfigurationFile(options, chartPath)
}
func toApplicationConfig(app, chart, rawAppName string, cfg *appcfg.AppConfiguration) (*appcfg.ApplicationConfig, string, error) {
func toApplicationConfig(app, chart, rawAppName, selectedGpu string, cfg *appcfg.AppConfiguration) (*appcfg.ApplicationConfig, string, error) {
var permission []appcfg.AppPermission
if cfg.Permission.AppData {
permission = append(permission, appcfg.AppDataRW)
@@ -788,6 +821,57 @@ func toApplicationConfig(app, chart, rawAppName string, cfg *appcfg.AppConfigura
return nil, chart, err
}
// set supportedGpu to ["nvidia","nvidia-gb10"] by default
if len(cfg.Spec.SupportedGpu) == 0 {
cfg.Spec.SupportedGpu = []interface{}{utils.NvidiaCardType, utils.GB10ChipType}
}
// try to get selected GPU type special resource requirement
if selectedGpu != "" {
found := false
for _, supportedGpu := range cfg.Spec.SupportedGpu {
if str, ok := supportedGpu.(string); ok && str == selectedGpu {
found = true
break
}
if supportedGpuResourceMap, ok := supportedGpu.(map[string]interface{}); ok {
if resourceRequirement, ok := supportedGpuResourceMap[selectedGpu].(map[string]interface{}); ok {
found = true
var specialResource appcfg.SpecialResource
err := mapstructure.Decode(resourceRequirement, &specialResource)
if err != nil {
return nil, chart, fmt.Errorf("failed to decode special resource for selected GPU type %s: %v", selectedGpu, err)
}
for _, resSetter := range []struct {
v **resource.Quantity
s *string
}{
{v: &mem, s: specialResource.RequiredMemory},
{v: &disk, s: specialResource.RequiredDisk},
{v: &cpu, s: specialResource.RequiredCPU},
{v: &gpu, s: specialResource.RequiredGPU},
} {
if resSetter.s != nil && *resSetter.s != "" {
*resSetter.v, err = valuePtr(resource.ParseQuantity(*resSetter.s))
if err != nil {
return nil, chart, fmt.Errorf("failed to parse special resource quantity %s: %v", *resSetter.s, err)
}
}
}
break
} // end if selected gpu's resource requirement found
} // end if supportedGpu is map
} // end for supportedGpu
if !found {
return nil, chart, fmt.Errorf("selected GPU type %s is not supported", selectedGpu)
}
}
// transform from Policy to AppPolicy
var policies []appcfg.AppPolicy
for _, p := range cfg.Options.Policies {
@@ -877,6 +961,7 @@ func toApplicationConfig(app, chart, rawAppName string, cfg *appcfg.AppConfigura
PodsSelectors: podSelectors,
HardwareRequirement: cfg.Spec.Hardware,
SharedEntrances: cfg.SharedEntrances,
SelectedGpuType: selectedGpu,
}, chart, nil
}
@@ -890,7 +975,7 @@ func getAppConfigFromConfigurationFile(opt *ConfigOptions, chartPath string) (*a
return nil, chartPath, err
}
return toApplicationConfig(opt.App, chartPath, opt.RawAppName, &cfg)
return toApplicationConfig(opt.App, chartPath, opt.RawAppName, opt.SelectedGpu, &cfg)
}
func checkVersionFormat(constraint string) error {

View File

@@ -234,7 +234,9 @@ func CheckAppRequirement(token string, appConfig *appcfg.ApplicationConfig, op v
return constants.CPU, constants.SystemCPUPressure, fmt.Errorf(constants.SystemCPUPressureMessage, op)
}
}
if appConfig.Requirement.GPU != nil {
// only support nvidia gpu management by HAMi for now
if appConfig.Requirement.GPU != nil && appConfig.GetSelectedGpuTypeValue() == utils.NvidiaCardType {
if !appConfig.Requirement.GPU.IsZero() && metrics.GPU.Total <= 0 {
return constants.GPU, constants.SystemGPUNotAvailable, fmt.Errorf(constants.SystemGPUNotAvailableMessage, op)
@@ -398,7 +400,9 @@ func GetClusterResource(token string) (*prometheus.ClusterMetrics, []string, err
arches.Insert(n.Labels["kubernetes.io/arch"])
if quantity, ok := n.Status.Capacity[constants.NvidiaGPU]; ok {
total += quantity.AsApproximateFloat64()
} else if quantity, ok = n.Status.Capacity[constants.VirtAiTechVGPU]; ok {
} else if quantity, ok = n.Status.Capacity[constants.NvidiaGB10GPU]; ok {
total += quantity.AsApproximateFloat64()
} else if quantity, ok = n.Status.Capacity[constants.AMDAPU]; ok {
total += quantity.AsApproximateFloat64()
}
}

View File

@@ -0,0 +1,12 @@
package utils
const (
// NodeGPUTypeLabel is the node label key whose value names the GPU type of a
// node; GetAllGpuTypesFromNodes collects these label values across nodes.
NodeGPUTypeLabel = "gpu.bytetrade.io/type"
)
// GPU type names. Values selected at install time are compared against these
// names.
const (
NvidiaCardType = "nvidia" // NVIDIA GPU, handled by HAMi
AmdGpuCardType = "amd-gpu" // presumably a discrete AMD GPU — TODO confirm; getGPUResourceTypeKey maps no resource key for it
AmdApuCardType = "amd-apu" // AMD APU with integrated GPU, e.g. AI Max 395
GB10ChipType = "nvidia-gb10" // NVIDIA GB10 Superchip with unified system memory
)

View File

@@ -103,24 +103,37 @@ func GetAllNodesTunnelIPCIDRs() (cidrs []string) {
return cidrs
}
func FindGpuTypeFromNodes(nodes *corev1.NodeList) (string, error) {
gpuType := "none"
// func FindGpuTypeFromNodes(nodes *corev1.NodeList) (string, error) {
// gpuType := "none"
// if nodes == nil {
// return gpuType, errors.New("empty node list")
// }
// for _, n := range nodes.Items {
// if _, ok := n.Status.Capacity[constants.NvidiaGPU]; ok {
// if _, ok = n.Status.Capacity[constants.NvshareGPU]; ok {
// return "nvshare", nil
// }
// gpuType = "nvidia"
// }
// if _, ok := n.Status.Capacity[constants.VirtAiTechVGPU]; ok {
// return "virtaitech", nil
// }
// }
// return gpuType, nil
// }
func GetAllGpuTypesFromNodes(nodes *corev1.NodeList) (map[string]struct{}, error) {
gpuTypes := make(map[string]struct{})
if nodes == nil {
return gpuType, errors.New("empty node list")
return gpuTypes, errors.New("empty node list")
}
for _, n := range nodes.Items {
if _, ok := n.Status.Capacity[constants.NvidiaGPU]; ok {
if _, ok = n.Status.Capacity[constants.NvshareGPU]; ok {
return "nvshare", nil
}
gpuType = "nvidia"
}
if _, ok := n.Status.Capacity[constants.VirtAiTechVGPU]; ok {
return "virtaitech", nil
if typeLabel, ok := n.Labels[NodeGPUTypeLabel]; ok {
gpuTypes[typeLabel] = struct{}{} // TODO: add driver version info
}
}
return gpuType, nil
return gpuTypes, nil
}
func IsNodeReady(node *corev1.Node) bool {

View File

@@ -30,7 +30,6 @@ import (
admissionv1 "k8s.io/api/admission/v1"
corev1 "k8s.io/api/core/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
"k8s.io/apimachinery/pkg/labels"
@@ -544,16 +543,21 @@ type EnvKeyValue struct {
}
// CreatePatchForDeployment add gpu env for deployment and returns patch bytes.
func CreatePatchForDeployment(tpl *corev1.PodTemplateSpec, namespace string, gpuRequired *resource.Quantity, typeKey string, envKeyValues []EnvKeyValue) ([]byte, error) {
patches, err := addResourceLimits(tpl, namespace, gpuRequired, typeKey, envKeyValues)
func CreatePatchForDeployment(tpl *corev1.PodTemplateSpec, typeKey string, envKeyValues []EnvKeyValue) ([]byte, error) {
patches, err := addResourceLimits(tpl, typeKey, envKeyValues)
if err != nil {
return []byte{}, err
}
return json.Marshal(patches)
}
func addResourceLimits(tpl *corev1.PodTemplateSpec, namespace string, gpuRequired *resource.Quantity, typeKey string, envKeyValues []EnvKeyValue) (patch []patchOp, err error) {
if typeKey == constants.NvidiaGPU || typeKey == constants.NvshareGPU {
func addResourceLimits(tpl *corev1.PodTemplateSpec, typeKey string, envKeyValues []EnvKeyValue) (patch []patchOp, err error) {
if typeKey == "" {
klog.Warning("No gpu type selected, skip adding resource limits")
return patch, nil
}
if typeKey == constants.NvidiaGPU || typeKey == constants.NvidiaGB10GPU {
if tpl.Spec.RuntimeClassName != nil {
patch = append(patch, patchOp{
Op: constants.PatchOpReplace,
@@ -584,7 +588,10 @@ func addResourceLimits(tpl *corev1.PodTemplateSpec, namespace string, gpuRequire
t := make(map[string]map[string]string)
t["limits"] = map[string]string{}
for k, v := range container.Resources.Limits {
if k.String() == constants.NvidiaGPU || k.String() == constants.NvshareGPU || k.String() == constants.VirtAiTechVGPU {
if k.String() == constants.NvidiaGPU ||
k.String() == constants.NvidiaGB10GPU ||
k.String() == constants.AMDAPU {
// unset all previous gpu limits
continue
}
t["limits"][k.String()] = v.String()

View File

@@ -431,7 +431,7 @@ spec:
privileged: true
containers:
- name: authelia
image: beclab/auth:0.2.45
image: beclab/auth:0.2.46
imagePullPolicy: IfNotPresent
ports:
- containerPort: 9091

View File

@@ -29,7 +29,7 @@ spec:
name: check-auth
containers:
- name: auth-front
image: beclab/login:v1.7.4
image: beclab/login:v1.8.5
imagePullPolicy: IfNotPresent
ports:
- containerPort: 80