Compare commits
13 Commits
feat/env_s
...
module-app
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
c17d281df3 | ||
|
|
cf8e3cd017 | ||
|
|
e6e85b40d2 | ||
|
|
f9f8d2a328 | ||
|
|
e070d5eb37 | ||
|
|
aa7afba374 | ||
|
|
63fab4a454 | ||
|
|
46b1e62d3a | ||
|
|
f6b1f2c544 | ||
|
|
f1bec97238 | ||
|
|
9acaaf4f09 | ||
|
|
acf813bf5a | ||
|
|
19bcc5195e |
@@ -317,7 +317,7 @@ spec:
|
||||
chown -R 1000:1000 /uploadstemp && \
|
||||
chown -R 1000:1000 /appdata
|
||||
- name: olares-app-init
|
||||
image: beclab/system-frontend:v1.8.4
|
||||
image: beclab/system-frontend:v1.8.5
|
||||
imagePullPolicy: IfNotPresent
|
||||
command:
|
||||
- /bin/sh
|
||||
@@ -439,7 +439,7 @@ spec:
|
||||
- name: NATS_SUBJECT_VAULT
|
||||
value: os.vault.{{ .Values.bfl.username}}
|
||||
- name: user-service
|
||||
image: beclab/user-service:v0.0.84
|
||||
image: beclab/user-service:v0.0.85
|
||||
imagePullPolicy: IfNotPresent
|
||||
ports:
|
||||
- containerPort: 3000
|
||||
|
||||
@@ -8,26 +8,27 @@ import (
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/beclab/Olares/cli/pkg/core/common"
|
||||
"github.com/beclab/Olares/cli/cmd/config"
|
||||
"github.com/beclab/Olares/cli/pkg/common"
|
||||
corecommon "github.com/beclab/Olares/cli/pkg/core/common"
|
||||
"github.com/beclab/Olares/cli/pkg/core/util"
|
||||
"github.com/beclab/Olares/cli/pkg/release/builder"
|
||||
|
||||
"github.com/spf13/cobra"
|
||||
"github.com/spf13/viper"
|
||||
)
|
||||
|
||||
func NewCmdRelease() *cobra.Command {
|
||||
var (
|
||||
baseDir string
|
||||
version string
|
||||
cdn string
|
||||
ignoreMissingImages bool
|
||||
extract bool
|
||||
)
|
||||
|
||||
cmd := &cobra.Command{
|
||||
Use: "release",
|
||||
Short: "Build release based on a local Olares repository",
|
||||
Run: func(cmd *cobra.Command, args []string) {
|
||||
baseDir := viper.GetString(common.FlagBaseDir)
|
||||
version := viper.GetString(common.FlagVersion)
|
||||
cdn := viper.GetString(common.FlagCDNService)
|
||||
ignoreMissingImages := viper.GetBool(common.FlagIgnoreMissingImages)
|
||||
extract := viper.GetBool(common.FlagExtract)
|
||||
|
||||
cwd, err := os.Getwd()
|
||||
if err != nil {
|
||||
fmt.Printf("failed to get current working directory: %s\n", err)
|
||||
@@ -43,7 +44,7 @@ func NewCmdRelease() *cobra.Command {
|
||||
fmt.Printf("failed to get current user: %s\n", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
baseDir = filepath.Join(usr.HomeDir, common.DefaultBaseDir)
|
||||
baseDir = filepath.Join(usr.HomeDir, corecommon.DefaultBaseDir)
|
||||
fmt.Printf("--base-dir unspecified, using: %s\n", baseDir)
|
||||
time.Sleep(1 * time.Second)
|
||||
}
|
||||
@@ -75,11 +76,20 @@ func NewCmdRelease() *cobra.Command {
|
||||
},
|
||||
}
|
||||
|
||||
cmd.Flags().StringVarP(&baseDir, "base-dir", "b", "", "base directory of Olares, where this release will be extracted to as a new version if --extract/-e is not disabled, defaults to $HOME/"+common.DefaultBaseDir)
|
||||
cmd.Flags().StringVarP(&version, "version", "v", "", "version of this release, defaults to 0.0.0-local-dev-{yyyymmddhhmmss}")
|
||||
cmd.Flags().StringVar(&cdn, "cdn-service", common.DefaultOlaresCDNService, "CDN used for downloading checksums of dependencies and images")
|
||||
cmd.Flags().BoolVar(&ignoreMissingImages, "ignore-missing-images", true, "ignore missing images when downloading cheksums from CDN, only disable this if no new image is added, or the build may fail because the image is not uploaded to the CDN yet")
|
||||
cmd.Flags().BoolVarP(&extract, "extract", "e", true, "extract this release to --base-dir after build, this can be disabled if only the release file itself is needed")
|
||||
flagSetter := config.NewFlagSetterFor(cmd)
|
||||
config.AddBaseDirFlagBy(flagSetter)
|
||||
config.AddVersionFlagBy(flagSetter)
|
||||
config.AddCDNServiceFlagBy(flagSetter)
|
||||
flagSetter.Add(common.FlagIgnoreMissingImages,
|
||||
"",
|
||||
true,
|
||||
"ignore missing images when downloading checksums from CDN, only disable this if no new image is added, or the build may fail because the image is not uploaded to the CDN yet",
|
||||
)
|
||||
flagSetter.Add(common.FlagExtract,
|
||||
"e",
|
||||
true,
|
||||
"extract this release to --base-dir after build, this can be disabled if only the release file itself is needed",
|
||||
)
|
||||
|
||||
return cmd
|
||||
}
|
||||
|
||||
@@ -261,12 +261,14 @@ const (
|
||||
FlagOSPassword = "os-password"
|
||||
EnvLegacyEncryptedOSPassword = "TERMINUS_OS_PASSWORD"
|
||||
|
||||
FlagCDNService = "cdn-service"
|
||||
FlagManifest = "manifest"
|
||||
FlagURLOverride = "url-override"
|
||||
FlagReleaseID = "release-id"
|
||||
FlagKubeType = "kube-type"
|
||||
FlagLegacyKubeType = "kube"
|
||||
FlagCDNService = "cdn-service"
|
||||
FlagExtract = "extract"
|
||||
FlagIgnoreMissingImages = "ignore-missing-images"
|
||||
FlagManifest = "manifest"
|
||||
FlagURLOverride = "url-override"
|
||||
FlagReleaseID = "release-id"
|
||||
FlagKubeType = "kube-type"
|
||||
FlagLegacyKubeType = "kube"
|
||||
|
||||
FlagEnableJuiceFS = "enable-juicefs"
|
||||
FlagLegacyEnableJuiceFS = "with-juicefs"
|
||||
|
||||
@@ -429,7 +429,7 @@ const side = {
|
||||
},
|
||||
{
|
||||
text: "本地访问 Olares",
|
||||
link: "/manual/best-practices/local-access",
|
||||
link: "/zh/manual/best-practices/local-access",
|
||||
},
|
||||
],
|
||||
},
|
||||
|
||||
@@ -170,7 +170,7 @@ spec:
|
||||
priorityClassName: "system-cluster-critical"
|
||||
containers:
|
||||
- name: app-service
|
||||
image: beclab/app-service:0.4.77
|
||||
image: beclab/app-service:0.4.78
|
||||
imagePullPolicy: IfNotPresent
|
||||
ports:
|
||||
- containerPort: 6755
|
||||
|
||||
@@ -40,7 +40,7 @@ type UserEnvSyncController struct {
|
||||
|
||||
//+kubebuilder:rbac:groups="",resources=configmaps,verbs=get;list;watch
|
||||
//+kubebuilder:rbac:groups=iam.kubesphere.io,resources=users,verbs=get;list;watch
|
||||
//+kubebuilder:rbac:groups=sys.bytetrade.io,resources=userenvs,verbs=get;list;watch;create
|
||||
//+kubebuilder:rbac:groups=sys.bytetrade.io,resources=userenvs,verbs=get;list;watch;create;patch;update
|
||||
|
||||
func (r *UserEnvSyncController) SetupWithManager(mgr ctrl.Manager) error {
|
||||
cmPred := predicate.NewPredicateFuncs(func(obj client.Object) bool {
|
||||
@@ -164,14 +164,63 @@ func (r *UserEnvSyncController) syncUserEnvForUser(ctx context.Context, username
|
||||
return 0, fmt.Errorf("list userenvs in %s failed: %w", userNs, err)
|
||||
}
|
||||
|
||||
existSet := make(map[string]struct{}, len(existing.Items))
|
||||
existByName := make(map[string]*sysv1alpha1.UserEnv, len(existing.Items))
|
||||
for i := range existing.Items {
|
||||
existSet[existing.Items[i].EnvName] = struct{}{}
|
||||
existByName[existing.Items[i].EnvName] = &existing.Items[i]
|
||||
}
|
||||
|
||||
created := 0
|
||||
for _, spec := range base {
|
||||
if _, ok := existSet[spec.EnvName]; ok {
|
||||
if ue, ok := existByName[spec.EnvName]; ok {
|
||||
original := ue.DeepCopy()
|
||||
updated := false
|
||||
|
||||
if ue.Default == "" && spec.Default != "" {
|
||||
ue.Default = spec.Default
|
||||
updated = true
|
||||
}
|
||||
if ue.Type == "" && spec.Type != "" {
|
||||
ue.Type = spec.Type
|
||||
updated = true
|
||||
}
|
||||
if ue.Title == "" && spec.Title != "" {
|
||||
ue.Title = spec.Title
|
||||
updated = true
|
||||
}
|
||||
if ue.Description == "" && spec.Description != "" {
|
||||
ue.Description = spec.Description
|
||||
updated = true
|
||||
}
|
||||
if ue.RemoteOptions == "" && spec.RemoteOptions != "" {
|
||||
ue.RemoteOptions = spec.RemoteOptions
|
||||
updated = true
|
||||
}
|
||||
if ue.Regex == "" && spec.Regex != "" {
|
||||
ue.Regex = spec.Regex
|
||||
updated = true
|
||||
}
|
||||
|
||||
if len(spec.Options) > 0 {
|
||||
existOpt := make(map[string]struct{}, len(ue.Options))
|
||||
for _, it := range ue.Options {
|
||||
existOpt[it.Value] = struct{}{}
|
||||
}
|
||||
for _, it := range spec.Options {
|
||||
if _, exists := existOpt[it.Value]; exists {
|
||||
continue
|
||||
}
|
||||
ue.Options = append(ue.Options, it)
|
||||
existOpt[it.Value] = struct{}{}
|
||||
updated = true
|
||||
}
|
||||
}
|
||||
|
||||
if updated {
|
||||
if err := r.Patch(ctx, ue, client.MergeFrom(original)); err != nil {
|
||||
return created, fmt.Errorf("patch userenv %s/%s failed: %w", ue.Namespace, ue.Name, err)
|
||||
}
|
||||
klog.Infof("UserEnvSync: patched userenv %s/%s for user %s", ue.Namespace, ue.Name, username)
|
||||
}
|
||||
continue
|
||||
}
|
||||
name, err := apputils.EnvNameToResourceName(spec.EnvName)
|
||||
|
||||
@@ -13,6 +13,7 @@ const (
|
||||
AppMarketSourceKey = constants.AppMarketSourceKey
|
||||
AppInstallSourceKey = "bytetrade.io/install-source"
|
||||
AppUninstallAllKey = "bytetrade.io/uninstall-all"
|
||||
AppDeleteDataKey = "bytetrade.io/delete-data"
|
||||
AppStopAllKey = "bytetrade.io/stop-all"
|
||||
AppResumeAllKey = "bytetrade.io/resume-all"
|
||||
AppImagesKey = "bytetrade.io/images"
|
||||
@@ -126,15 +127,16 @@ type UpgradeRequest struct {
|
||||
|
||||
// InstallRequest represents a request to install an application.
|
||||
type InstallRequest struct {
|
||||
Dev bool `json:"devMode"`
|
||||
RepoURL string `json:"repoUrl"`
|
||||
CfgURL string `json:"cfgUrl"`
|
||||
Source AppSource `json:"source"`
|
||||
Images []Image `json:"images"`
|
||||
Envs []sysv1alpha1.AppEnvVar `json:"envs"`
|
||||
RawAppName string `json:"rawAppName"`
|
||||
Title string `json:"title"`
|
||||
Entrances []EntranceClone `json:"entrances"`
|
||||
Dev bool `json:"devMode"`
|
||||
RepoURL string `json:"repoUrl"`
|
||||
CfgURL string `json:"cfgUrl"`
|
||||
Source AppSource `json:"source"`
|
||||
Images []Image `json:"images"`
|
||||
Envs []sysv1alpha1.AppEnvVar `json:"envs"`
|
||||
RawAppName string `json:"rawAppName"`
|
||||
Title string `json:"title"`
|
||||
Entrances []EntranceClone `json:"entrances"`
|
||||
SelectedGpuType string `json:"selectedGpuType"`
|
||||
}
|
||||
|
||||
type Image struct {
|
||||
@@ -144,7 +146,8 @@ type Image struct {
|
||||
|
||||
// UninstallRequest represents a request to uninstall an application.
|
||||
type UninstallRequest struct {
|
||||
All bool `json:"all"`
|
||||
All bool `json:"all"`
|
||||
DeleteData bool `json:"deleteData"`
|
||||
}
|
||||
|
||||
// StopRequest represents a request to stop an application.
|
||||
|
||||
@@ -3,11 +3,14 @@ package apiserver
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"os"
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
"golang.org/x/exp/maps"
|
||||
|
||||
"github.com/beclab/Olares/framework/app-service/api/app.bytetrade.io/v1alpha1"
|
||||
"github.com/beclab/Olares/framework/app-service/pkg/apiserver/api"
|
||||
"github.com/beclab/Olares/framework/app-service/pkg/appcfg"
|
||||
@@ -946,12 +949,37 @@ func (h *Handler) oamValues(req *restful.Request, resp *restful.Response) {
|
||||
api.HandleError(resp, req, err)
|
||||
return
|
||||
}
|
||||
gpuType, err := utils.FindGpuTypeFromNodes(&nodes)
|
||||
gpuTypes, err := utils.GetAllGpuTypesFromNodes(&nodes)
|
||||
if err != nil {
|
||||
klog.Errorf("get gpu type failed %v", gpuType)
|
||||
klog.Errorf("get gpu type failed %v", err)
|
||||
api.HandleError(resp, req, err)
|
||||
return
|
||||
}
|
||||
|
||||
gpuType := "none"
|
||||
selectedGpuType := req.QueryParameter("gputype")
|
||||
if len(gpuTypes) > 0 {
|
||||
if selectedGpuType != "" {
|
||||
if _, ok := gpuTypes[selectedGpuType]; ok {
|
||||
gpuType = selectedGpuType
|
||||
} else {
|
||||
err := fmt.Errorf("selected gpu type %s not found in cluster", selectedGpuType)
|
||||
klog.Error(err)
|
||||
api.HandleError(resp, req, err)
|
||||
return
|
||||
}
|
||||
} else {
|
||||
if len(gpuTypes) == 1 {
|
||||
gpuType = maps.Keys(gpuTypes)[0]
|
||||
} else {
|
||||
err := fmt.Errorf("multiple gpu types found in cluster, please specify one")
|
||||
klog.Error(err)
|
||||
api.HandleError(resp, req, err)
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
values["GPU"] = map[string]interface{}{
|
||||
"Type": gpuType,
|
||||
"Cuda": os.Getenv("OLARES_SYSTEM_CUDA_VERSION"),
|
||||
|
||||
@@ -6,10 +6,12 @@ import (
|
||||
"io"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
sysv1alpha1 "github.com/beclab/Olares/framework/app-service/api/sys.bytetrade.io/v1alpha1"
|
||||
"github.com/beclab/Olares/framework/app-service/pkg/apiserver/api"
|
||||
"github.com/beclab/Olares/framework/app-service/pkg/constants"
|
||||
"github.com/beclab/Olares/framework/app-service/pkg/utils"
|
||||
apputils "github.com/beclab/Olares/framework/app-service/pkg/utils/app"
|
||||
"github.com/emicklei/go-restful/v3"
|
||||
@@ -76,32 +78,74 @@ func (h *Handler) updateAppEnv(req *restful.Request, resp *restful.Response) {
|
||||
return
|
||||
}
|
||||
|
||||
var refEnvOnce sync.Once
|
||||
var listErr error
|
||||
refEnvs := make(map[string]string)
|
||||
|
||||
updated := false
|
||||
original := targetAppEnv.DeepCopy()
|
||||
for i, existingEnv := range targetAppEnv.Envs {
|
||||
for _, env := range updatedEnvs {
|
||||
if existingEnv.EnvName == env.EnvName {
|
||||
if !existingEnv.Editable {
|
||||
api.HandleBadRequest(resp, req, fmt.Errorf("app env '%s' is not editable", env.EnvName))
|
||||
return
|
||||
}
|
||||
if existingEnv.Required && existingEnv.Default == "" && env.Value == "" {
|
||||
api.HandleBadRequest(resp, req, fmt.Errorf("app env '%s' is required", env.EnvName))
|
||||
return
|
||||
}
|
||||
if existingEnv.Value != env.Value {
|
||||
if err := existingEnv.ValidateValue(env.Value); err != nil {
|
||||
api.HandleBadRequest(resp, req, fmt.Errorf("failed to update app env '%s': %v", env.EnvName, err))
|
||||
if existingEnv.EnvName != env.EnvName {
|
||||
continue
|
||||
}
|
||||
if !existingEnv.Editable {
|
||||
api.HandleBadRequest(resp, req, fmt.Errorf("app env '%s' is not editable", env.EnvName))
|
||||
return
|
||||
}
|
||||
if existingEnv.Required && existingEnv.Default == "" && env.Value == "" && (env.ValueFrom == nil || env.ValueFrom.EnvName == "") {
|
||||
api.HandleBadRequest(resp, req, fmt.Errorf("app env '%s' is required", env.EnvName))
|
||||
return
|
||||
}
|
||||
if env.ValueFrom != nil && env.ValueFrom.EnvName != "" && (existingEnv.ValueFrom == nil || existingEnv.ValueFrom.EnvName != env.ValueFrom.EnvName) {
|
||||
refEnvOnce.Do(func() {
|
||||
sysenvs := new(sysv1alpha1.SystemEnvList)
|
||||
listErr = h.ctrlClient.List(req.Request.Context(), sysenvs)
|
||||
if listErr != nil {
|
||||
return
|
||||
}
|
||||
targetAppEnv.Envs[i].Value = env.Value
|
||||
updated = true
|
||||
if existingEnv.ApplyOnChange {
|
||||
targetAppEnv.NeedApply = true
|
||||
userenvs := new(sysv1alpha1.UserEnvList)
|
||||
listErr = h.ctrlClient.List(req.Request.Context(), userenvs, client.InNamespace(utils.UserspaceName(owner)))
|
||||
for _, sysenv := range sysenvs.Items {
|
||||
refEnvs[sysenv.EnvName] = sysenv.GetEffectiveValue()
|
||||
}
|
||||
for _, userenv := range userenvs.Items {
|
||||
refEnvs[userenv.EnvName] = userenv.GetEffectiveValue()
|
||||
}
|
||||
})
|
||||
if listErr != nil {
|
||||
api.HandleInternalError(resp, req, fmt.Errorf("failed to list referenced envs: %s", listErr))
|
||||
return
|
||||
}
|
||||
break
|
||||
value, ok := refEnvs[env.ValueFrom.EnvName]
|
||||
if !ok {
|
||||
api.HandleBadRequest(resp, req, fmt.Errorf("app env '%s' references unknown env '%s'", env.EnvName, env.ValueFrom.EnvName))
|
||||
return
|
||||
}
|
||||
if existingEnv.Required && value == "" {
|
||||
api.HandleBadRequest(resp, req, fmt.Errorf("required app env '%s' references empty env '%s'", env.EnvName, env.ValueFrom.EnvName))
|
||||
return
|
||||
}
|
||||
if existingEnv.ValidateValue(value) != nil {
|
||||
api.HandleBadRequest(resp, req, fmt.Errorf("app env '%s' references invalid value '%s' from '%s': %v", env.EnvName, value, env.ValueFrom.EnvName, err))
|
||||
return
|
||||
}
|
||||
targetAppEnv.Envs[i].ValueFrom = env.ValueFrom
|
||||
targetAppEnv.Envs[i].Value = value
|
||||
targetAppEnv.Envs[i].ValueFrom.Status = constants.EnvRefStatusSynced
|
||||
updated = true
|
||||
} else if existingEnv.Value != env.Value {
|
||||
if err := existingEnv.ValidateValue(env.Value); err != nil {
|
||||
api.HandleBadRequest(resp, req, fmt.Errorf("failed to update app env '%s': %v", env.EnvName, err))
|
||||
return
|
||||
}
|
||||
targetAppEnv.Envs[i].Value = env.Value
|
||||
updated = true
|
||||
}
|
||||
if updated && existingEnv.ApplyOnChange {
|
||||
targetAppEnv.NeedApply = true
|
||||
}
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -1,174 +1,33 @@
|
||||
package apiserver
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/beclab/Olares/framework/app-service/pkg/apiserver/api"
|
||||
"github.com/beclab/Olares/framework/app-service/pkg/client/clientset"
|
||||
"github.com/beclab/Olares/framework/app-service/pkg/constants"
|
||||
"github.com/beclab/Olares/framework/app-service/pkg/utils"
|
||||
"golang.org/x/exp/maps"
|
||||
"sigs.k8s.io/controller-runtime/pkg/client"
|
||||
|
||||
"github.com/emicklei/go-restful/v3"
|
||||
corev1 "k8s.io/api/core/v1"
|
||||
"k8s.io/apimachinery/pkg/api/errors"
|
||||
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
||||
"k8s.io/client-go/util/retry"
|
||||
"k8s.io/klog/v2"
|
||||
)
|
||||
|
||||
var running bool = false
|
||||
var switchLock sync.Mutex
|
||||
|
||||
func (h *Handler) disableGpuManagedMemory(req *restful.Request, resp *restful.Response) {
|
||||
if err := h.nvshareSwitch(req, false); err != nil {
|
||||
api.HandleError(resp, req, &errors.StatusError{
|
||||
ErrStatus: metav1.Status{Code: 400, Message: "operation failed, " + err.Error()},
|
||||
})
|
||||
|
||||
func (h *Handler) getGpuTypes(req *restful.Request, resp *restful.Response) {
|
||||
var nodes corev1.NodeList
|
||||
err := h.ctrlClient.List(req.Request.Context(), &nodes, &client.ListOptions{})
|
||||
if err != nil {
|
||||
klog.Errorf("list node failed %v", err)
|
||||
api.HandleError(resp, req, err)
|
||||
return
|
||||
}
|
||||
resp.WriteAsJson(map[string]int{"code": 0})
|
||||
}
|
||||
|
||||
func (h *Handler) enableGpuManagedMemory(req *restful.Request, resp *restful.Response) {
|
||||
if err := h.nvshareSwitch(req, true); err != nil {
|
||||
api.HandleError(resp, req, &errors.StatusError{
|
||||
ErrStatus: metav1.Status{Code: 400, Message: "operation failed, " + err.Error()},
|
||||
})
|
||||
|
||||
gpuTypes, err := utils.GetAllGpuTypesFromNodes(&nodes)
|
||||
if err != nil {
|
||||
klog.Errorf("get gpu type failed %v", err)
|
||||
api.HandleError(resp, req, err)
|
||||
return
|
||||
}
|
||||
resp.WriteAsJson(map[string]int{"code": 0})
|
||||
}
|
||||
|
||||
func (h *Handler) nvshareSwitch(req *restful.Request, enable bool) error {
|
||||
client := req.Attribute(constants.KubeSphereClientAttribute).(*clientset.ClientSet)
|
||||
switchLock.Lock()
|
||||
defer switchLock.Unlock()
|
||||
|
||||
if running {
|
||||
return fmt.Errorf("last operation is still running")
|
||||
}
|
||||
|
||||
deployments, err := client.KubeClient.Kubernetes().AppsV1().Deployments("").List(req.Request.Context(), metav1.ListOptions{})
|
||||
if err != nil {
|
||||
klog.Error("list deployment error, ", err)
|
||||
return err
|
||||
}
|
||||
|
||||
envValue := "0"
|
||||
if enable {
|
||||
envValue = "1"
|
||||
}
|
||||
|
||||
for _, d := range deployments.Items {
|
||||
shouldUpdate := false
|
||||
for i, c := range d.Spec.Template.Spec.Containers {
|
||||
found := false
|
||||
for k := range c.Resources.Limits {
|
||||
if k == constants.NvshareGPU {
|
||||
found = true
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
if found {
|
||||
// a gpu request container
|
||||
addEnv := true
|
||||
for n, env := range d.Spec.Template.Spec.Containers[i].Env {
|
||||
if env.Name == constants.EnvNvshareManagedMemory {
|
||||
addEnv = false
|
||||
d.Spec.Template.Spec.Containers[i].Env[n].Value = envValue
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
if addEnv {
|
||||
d.Spec.Template.Spec.Containers[i].Env =
|
||||
append(d.Spec.Template.Spec.Containers[i].Env,
|
||||
corev1.EnvVar{Name: constants.EnvNvshareManagedMemory, Value: envValue})
|
||||
}
|
||||
|
||||
shouldUpdate = true
|
||||
} // end found
|
||||
} // end of container loop
|
||||
|
||||
if shouldUpdate {
|
||||
err = retry.RetryOnConflict(retry.DefaultRetry, func() error {
|
||||
deployment, err := client.KubeClient.Kubernetes().AppsV1().Deployments(d.Namespace).
|
||||
Get(req.Request.Context(), d.Name, metav1.GetOptions{})
|
||||
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
deployment.Spec.Template.Spec.Containers = d.Spec.Template.Spec.Containers
|
||||
|
||||
_, err = client.KubeClient.Kubernetes().AppsV1().Deployments(d.Namespace).
|
||||
Update(req.Request.Context(), deployment, metav1.UpdateOptions{})
|
||||
|
||||
return err
|
||||
})
|
||||
|
||||
if err != nil {
|
||||
klog.Error("update deployment error, ", err, ", ", d.Name, ", ", d.Namespace)
|
||||
return err
|
||||
}
|
||||
} // should update
|
||||
} // end of deployment loop
|
||||
|
||||
// update terminus
|
||||
err = retry.RetryOnConflict(retry.DefaultRetry, func() error {
|
||||
terminus, err := utils.GetTerminus(req.Request.Context(), h.ctrlClient)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
terminus.Spec.Settings[constants.EnvNvshareManagedMemory] = envValue
|
||||
|
||||
return h.ctrlClient.Update(req.Request.Context(), terminus)
|
||||
})
|
||||
|
||||
if err != nil {
|
||||
klog.Error("update terminus error, ", err)
|
||||
|
||||
return err
|
||||
}
|
||||
|
||||
running = true
|
||||
// delay 30s, assume the all pods will be reload in 30s.
|
||||
delay := time.NewTimer(30 * time.Second)
|
||||
go func() {
|
||||
<-delay.C
|
||||
switchLock.Lock()
|
||||
defer switchLock.Unlock()
|
||||
|
||||
running = false
|
||||
}()
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (h *Handler) getManagedMemoryValue(req *restful.Request, resp *restful.Response) {
|
||||
terminus, err := utils.GetTerminus(req.Request.Context(), h.ctrlClient)
|
||||
if err != nil {
|
||||
klog.Error("get terminus value error, ", err)
|
||||
api.HandleError(resp, req, &errors.StatusError{
|
||||
ErrStatus: metav1.Status{Code: 400, Message: "get value error, " + err.Error()},
|
||||
})
|
||||
|
||||
return
|
||||
}
|
||||
|
||||
managed := true
|
||||
if v, ok := terminus.Spec.Settings[constants.EnvNvshareManagedMemory]; ok && v == "0" {
|
||||
managed = false
|
||||
}
|
||||
|
||||
resp.WriteAsJson(&map[string]interface{}{
|
||||
"managed_memory": managed,
|
||||
"gpu_types": maps.Keys(gpuTypes),
|
||||
},
|
||||
)
|
||||
}
|
||||
|
||||
@@ -21,9 +21,12 @@ import (
|
||||
"github.com/beclab/Olares/framework/app-service/pkg/utils"
|
||||
apputils "github.com/beclab/Olares/framework/app-service/pkg/utils/app"
|
||||
"github.com/beclab/Olares/framework/app-service/pkg/utils/config"
|
||||
"golang.org/x/exp/maps"
|
||||
"sigs.k8s.io/controller-runtime/pkg/client"
|
||||
|
||||
"github.com/emicklei/go-restful/v3"
|
||||
"helm.sh/helm/v3/pkg/time"
|
||||
corev1 "k8s.io/api/core/v1"
|
||||
apierrors "k8s.io/apimachinery/pkg/api/errors"
|
||||
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
||||
"k8s.io/apimachinery/pkg/types"
|
||||
@@ -37,7 +40,7 @@ type depRequest struct {
|
||||
type installHelperIntf interface {
|
||||
getAdminUsers() (admin []string, isAdmin bool, err error)
|
||||
getInstalledApps() (installed bool, app []*v1alpha1.Application, err error)
|
||||
getAppConfig(adminUsers []string, marketSource string, isAdmin, appInstalled bool, installedApps []*v1alpha1.Application, chartVersion string) (err error)
|
||||
getAppConfig(adminUsers []string, marketSource string, isAdmin, appInstalled bool, installedApps []*v1alpha1.Application, chartVersion, selectedGpuType string) (err error)
|
||||
setAppConfig(req *api.InstallRequest, appName string)
|
||||
validate(bool, []*v1alpha1.Application) error
|
||||
setAppEnv(overrides []sysv1alpha1.AppEnvVar) error
|
||||
@@ -105,6 +108,36 @@ func (h *Handler) install(req *restful.Request, resp *restful.Response) {
|
||||
}
|
||||
}
|
||||
|
||||
// check selected gpu type can be supported
|
||||
// if selectedGpuType != "" , then check if the gpu type exists in cluster
|
||||
// if selectedGpuType == "" , and only one gpu type exists in cluster, then use it
|
||||
var nodes corev1.NodeList
|
||||
err = h.ctrlClient.List(req.Request.Context(), &nodes, &client.ListOptions{})
|
||||
if err != nil {
|
||||
klog.Errorf("list node failed %v", err)
|
||||
api.HandleError(resp, req, err)
|
||||
return
|
||||
}
|
||||
gpuTypes, err := utils.GetAllGpuTypesFromNodes(&nodes)
|
||||
if err != nil {
|
||||
klog.Errorf("get gpu type failed %v", err)
|
||||
api.HandleError(resp, req, err)
|
||||
return
|
||||
}
|
||||
|
||||
if insReq.SelectedGpuType != "" {
|
||||
if _, ok := gpuTypes[insReq.SelectedGpuType]; !ok {
|
||||
klog.Errorf("selected gpu type %s not found in cluster", insReq.SelectedGpuType)
|
||||
api.HandleBadRequest(resp, req, fmt.Errorf("selected gpu type %s not found in cluster", insReq.SelectedGpuType))
|
||||
return
|
||||
}
|
||||
} else {
|
||||
if len(gpuTypes) == 1 {
|
||||
insReq.SelectedGpuType = maps.Keys(gpuTypes)[0]
|
||||
klog.Infof("only one gpu type %s found in cluster, use it as selected gpu type", insReq.SelectedGpuType)
|
||||
}
|
||||
}
|
||||
|
||||
apiVersion, appCfg, err := apputils.GetApiVersionFromAppConfig(req.Request.Context(), &apputils.ConfigOptions{
|
||||
App: app,
|
||||
RawAppName: rawAppName,
|
||||
@@ -112,6 +145,7 @@ func (h *Handler) install(req *restful.Request, resp *restful.Response) {
|
||||
RepoURL: insReq.RepoURL,
|
||||
MarketSource: marketSource,
|
||||
Version: chartVersion,
|
||||
SelectedGpu: insReq.SelectedGpuType,
|
||||
})
|
||||
klog.Infof("chartVersion: %s", chartVersion)
|
||||
if err != nil {
|
||||
@@ -188,7 +222,7 @@ func (h *Handler) install(req *restful.Request, resp *restful.Response) {
|
||||
return
|
||||
}
|
||||
|
||||
err = helper.getAppConfig(adminUsers, marketSource, isAdmin, appInstalled, installedApps, chartVersion)
|
||||
err = helper.getAppConfig(adminUsers, marketSource, isAdmin, appInstalled, installedApps, chartVersion, insReq.SelectedGpuType)
|
||||
if err != nil {
|
||||
klog.Errorf("Failed to get app config err=%v", err)
|
||||
return
|
||||
@@ -423,7 +457,7 @@ func (h *installHandlerHelper) getInstalledApps() (installed bool, app []*v1alph
|
||||
return
|
||||
}
|
||||
|
||||
func (h *installHandlerHelper) getAppConfig(adminUsers []string, marketSource string, isAdmin, appInstalled bool, installedApps []*v1alpha1.Application, chartVersion string) (err error) {
|
||||
func (h *installHandlerHelper) getAppConfig(adminUsers []string, marketSource string, isAdmin, appInstalled bool, installedApps []*v1alpha1.Application, chartVersion, selectedGpuType string) (err error) {
|
||||
var (
|
||||
admin string
|
||||
installAsAdmin bool
|
||||
@@ -472,6 +506,7 @@ func (h *installHandlerHelper) getAppConfig(adminUsers []string, marketSource st
|
||||
Admin: admin,
|
||||
IsAdmin: installAsAdmin,
|
||||
MarketSource: marketSource,
|
||||
SelectedGpu: selectedGpuType,
|
||||
})
|
||||
if err != nil {
|
||||
klog.Errorf("Failed to get appconfig err=%v", err)
|
||||
@@ -685,7 +720,7 @@ func (h *installHandlerHelperV2) _validateClusterScope(isAdmin bool, installedAp
|
||||
return nil
|
||||
}
|
||||
|
||||
func (h *installHandlerHelperV2) getAppConfig(adminUsers []string, marketSource string, isAdmin, appInstalled bool, installedApps []*v1alpha1.Application, chartVersion string) (err error) {
|
||||
func (h *installHandlerHelperV2) getAppConfig(adminUsers []string, marketSource string, isAdmin, appInstalled bool, installedApps []*v1alpha1.Application, chartVersion, selectedGpuType string) (err error) {
|
||||
klog.Info("get app config for install handler v2")
|
||||
|
||||
var (
|
||||
@@ -713,6 +748,7 @@ func (h *installHandlerHelperV2) getAppConfig(adminUsers []string, marketSource
|
||||
Admin: admin,
|
||||
MarketSource: marketSource,
|
||||
IsAdmin: isAdmin,
|
||||
SelectedGpu: selectedGpuType,
|
||||
})
|
||||
if err != nil {
|
||||
klog.Errorf("Failed to get appconfig err=%v", err)
|
||||
|
||||
@@ -77,6 +77,7 @@ func (h *Handler) uninstall(req *restful.Request, resp *restful.Response) {
|
||||
}
|
||||
am.Annotations[api.AppTokenKey] = token
|
||||
am.Annotations[api.AppUninstallAllKey] = fmt.Sprintf("%t", request.All)
|
||||
am.Annotations[api.AppDeleteDataKey] = fmt.Sprintf("%t", request.DeleteData)
|
||||
err = h.ctrlClient.Update(req.Request.Context(), &am)
|
||||
if err != nil {
|
||||
api.HandleError(resp, req, err)
|
||||
|
||||
@@ -13,6 +13,7 @@ import (
|
||||
"github.com/beclab/Olares/framework/app-service/api/app.bytetrade.io/v1alpha1"
|
||||
"github.com/beclab/Olares/framework/app-service/pkg/apiserver/api"
|
||||
"github.com/beclab/Olares/framework/app-service/pkg/appcfg"
|
||||
"github.com/beclab/Olares/framework/app-service/pkg/appinstaller"
|
||||
"github.com/beclab/Olares/framework/app-service/pkg/appstate"
|
||||
"github.com/beclab/Olares/framework/app-service/pkg/client/clientset"
|
||||
"github.com/beclab/Olares/framework/app-service/pkg/constants"
|
||||
@@ -520,6 +521,7 @@ type applicationPermission struct {
|
||||
Permissions []permission `json:"permissions"`
|
||||
}
|
||||
|
||||
// Deprecated
|
||||
func (h *Handler) applicationPermissionList(req *restful.Request, resp *restful.Response) {
|
||||
owner := req.Attribute(constants.UserContextAttribute).(string)
|
||||
//token := req.HeaderParameter(constants.AuthorizationTokenKey)
|
||||
@@ -572,46 +574,39 @@ func (h *Handler) applicationPermissionList(req *restful.Request, resp *restful.
|
||||
func (h *Handler) getApplicationPermission(req *restful.Request, resp *restful.Response) {
|
||||
app := req.PathParameter(ParamAppName)
|
||||
owner := req.Attribute(constants.UserContextAttribute).(string)
|
||||
client, err := dynamic.NewForConfig(h.kubeConfig)
|
||||
name, err := apputils.FmtAppMgrName(app, owner, "")
|
||||
if err != nil {
|
||||
api.HandleError(resp, req, err)
|
||||
return
|
||||
}
|
||||
var am v1alpha1.ApplicationManager
|
||||
err = h.ctrlClient.Get(req.Request.Context(), types.NamespacedName{Name: name}, &am)
|
||||
if err != nil {
|
||||
api.HandleError(resp, req, err)
|
||||
return
|
||||
}
|
||||
|
||||
var appConfig appcfg.ApplicationConfig
|
||||
err = am.GetAppConfig(&appConfig)
|
||||
if err != nil {
|
||||
klog.Errorf("Failed to get app config err=%v", err)
|
||||
api.HandleError(resp, req, err)
|
||||
return
|
||||
}
|
||||
|
||||
var ret *applicationPermission
|
||||
apClient := provider.NewApplicationPermissionRequest(client)
|
||||
namespace := fmt.Sprintf("user-system-%s", owner)
|
||||
aps, err := apClient.List(req.Request.Context(), namespace, metav1.ListOptions{})
|
||||
if err != nil {
|
||||
api.HandleError(resp, req, err)
|
||||
return
|
||||
}
|
||||
for _, ap := range aps.Items {
|
||||
if ap.Object == nil {
|
||||
continue
|
||||
}
|
||||
appName, _, _ := unstructured.NestedString(ap.Object, "spec", "app")
|
||||
if appName == app {
|
||||
perms, _, _ := unstructured.NestedSlice(ap.Object, "spec", "permissions")
|
||||
permissions := appinstaller.ParseAppPermission(appConfig.Permission)
|
||||
for _, ap := range permissions {
|
||||
if perms, ok := ap.([]appcfg.ProviderPermission); ok {
|
||||
permissions := make([]permission, 0)
|
||||
for _, p := range perms {
|
||||
if perm, ok := p.(map[string]interface{}); ok {
|
||||
ops := make([]string, 0)
|
||||
for _, op := range perm["ops"].([]interface{}) {
|
||||
if opStr, ok := op.(string); ok {
|
||||
ops = append(ops, opStr)
|
||||
}
|
||||
}
|
||||
permissions = append(permissions, permission{
|
||||
DataType: perm["dataType"].(string),
|
||||
Group: perm["group"].(string),
|
||||
Version: perm["version"].(string),
|
||||
Ops: ops,
|
||||
})
|
||||
}
|
||||
|
||||
permissions = append(permissions, permission{
|
||||
DataType: p.ProviderName,
|
||||
Group: p.AppName,
|
||||
})
|
||||
}
|
||||
ret = &applicationPermission{
|
||||
App: appName,
|
||||
App: am.Spec.AppName,
|
||||
Owner: owner,
|
||||
Permissions: permissions,
|
||||
}
|
||||
@@ -642,6 +637,7 @@ type opApi struct {
|
||||
URI string `json:"uri"`
|
||||
}
|
||||
|
||||
// Deprecated
|
||||
func (h *Handler) getProviderRegistry(req *restful.Request, resp *restful.Response) {
|
||||
dataTypeReq := req.PathParameter(ParamDataType)
|
||||
groupReq := req.PathParameter(ParamGroup)
|
||||
@@ -708,56 +704,44 @@ func (h *Handler) getProviderRegistry(req *restful.Request, resp *restful.Respon
|
||||
func (h *Handler) getApplicationProviderList(req *restful.Request, resp *restful.Response) {
|
||||
owner := req.Attribute(constants.UserContextAttribute).(string)
|
||||
app := req.PathParameter(ParamAppName)
|
||||
client, err := dynamic.NewForConfig(h.kubeConfig)
|
||||
|
||||
name, err := apputils.FmtAppMgrName(app, owner, "")
|
||||
if err != nil {
|
||||
api.HandleError(resp, req, err)
|
||||
return
|
||||
}
|
||||
var am v1alpha1.ApplicationManager
|
||||
err = h.ctrlClient.Get(req.Request.Context(), types.NamespacedName{Name: name}, &am)
|
||||
if err != nil {
|
||||
api.HandleError(resp, req, err)
|
||||
return
|
||||
}
|
||||
|
||||
var appConfig appcfg.ApplicationConfig
|
||||
err = am.GetAppConfig(&appConfig)
|
||||
if err != nil {
|
||||
klog.Errorf("Failed to get app config err=%v", err)
|
||||
api.HandleError(resp, req, err)
|
||||
return
|
||||
}
|
||||
|
||||
ret := make([]providerRegistry, 0)
|
||||
rClient := provider.NewRegistryRequest(client)
|
||||
namespace := fmt.Sprintf("user-system-%s", owner)
|
||||
prs, err := rClient.List(req.Request.Context(), namespace, metav1.ListOptions{})
|
||||
if err != nil {
|
||||
api.HandleError(resp, req, err)
|
||||
return
|
||||
}
|
||||
for _, ap := range prs.Items {
|
||||
if ap.Object == nil {
|
||||
continue
|
||||
}
|
||||
deployment, _, _ := unstructured.NestedString(ap.Object, "spec", "deployment")
|
||||
kind, _, _ := unstructured.NestedString(ap.Object, "spec", "kind")
|
||||
|
||||
if app == deployment && kind == "provider" {
|
||||
dataType, _, _ := unstructured.NestedString(ap.Object, "spec", "dataType")
|
||||
group, _, _ := unstructured.NestedString(ap.Object, "spec", "group")
|
||||
description, _, _ := unstructured.NestedString(ap.Object, "spec", "description")
|
||||
endpoint, _, _ := unstructured.NestedString(ap.Object, "spec", "endpoint")
|
||||
ns, _, _ := unstructured.NestedString(ap.Object, "spec", "namespace")
|
||||
version, _, _ := unstructured.NestedString(ap.Object, "spec", "version")
|
||||
opApis := make([]opApi, 0)
|
||||
opApiList, _, _ := unstructured.NestedSlice(ap.Object, "spec", "opApis")
|
||||
for _, op := range opApiList {
|
||||
if aop, ok := op.(map[string]interface{}); ok {
|
||||
opApis = append(opApis, opApi{
|
||||
Name: aop["name"].(string),
|
||||
URI: aop["uri"].(string),
|
||||
})
|
||||
}
|
||||
}
|
||||
ret = append(ret, providerRegistry{
|
||||
DataType: dataType,
|
||||
Deployment: deployment,
|
||||
Description: description,
|
||||
Endpoint: endpoint,
|
||||
Kind: kind,
|
||||
Group: group,
|
||||
Namespace: ns,
|
||||
OpApis: opApis,
|
||||
Version: version,
|
||||
ns := am.Spec.AppNamespace
|
||||
for _, ap := range appConfig.Provider {
|
||||
dataType := ap.Name
|
||||
endpoint := ap.Entrance
|
||||
opApis := make([]opApi, 0)
|
||||
for _, op := range ap.Paths {
|
||||
opApis = append(opApis, opApi{
|
||||
URI: op,
|
||||
})
|
||||
|
||||
}
|
||||
ret = append(ret, providerRegistry{
|
||||
DataType: dataType,
|
||||
Endpoint: endpoint,
|
||||
Namespace: ns,
|
||||
OpApis: opApis,
|
||||
})
|
||||
}
|
||||
resp.WriteAsJson(ret)
|
||||
}
|
||||
|
||||
@@ -37,7 +37,6 @@ import (
|
||||
"k8s.io/apimachinery/pkg/types"
|
||||
"k8s.io/client-go/dynamic"
|
||||
"k8s.io/klog/v2"
|
||||
"sigs.k8s.io/controller-runtime/pkg/client"
|
||||
"sigs.k8s.io/controller-runtime/pkg/webhook/admission"
|
||||
)
|
||||
|
||||
@@ -308,36 +307,21 @@ func (h *Handler) gpuLimitMutate(ctx context.Context, req *admissionv1.Admission
|
||||
return resp
|
||||
}
|
||||
|
||||
GPUType, err := h.findNvidiaGpuFromNodes(ctx)
|
||||
if err != nil && !errors.Is(err, api.ErrGPUNodeNotFound) {
|
||||
return h.sidecarWebhook.AdmissionError(req.UID, err)
|
||||
}
|
||||
GPUType := appcfg.GetSelectedGpuTypeValue()
|
||||
|
||||
// no gpu found, no need to inject env, just return.
|
||||
if GPUType == "" {
|
||||
if GPUType == "none" || GPUType == "" {
|
||||
return resp
|
||||
}
|
||||
|
||||
terminus, err := utils.GetTerminus(ctx, h.ctrlClient)
|
||||
if err != nil {
|
||||
return h.sidecarWebhook.AdmissionError(req.UID, err)
|
||||
}
|
||||
nvshareManagedMemory := ""
|
||||
if terminus.Spec.Settings != nil {
|
||||
nvshareManagedMemory = terminus.Spec.Settings[constants.EnvNvshareManagedMemory]
|
||||
envs := []webhook.EnvKeyValue{
|
||||
{
|
||||
Key: constants.EnvGPUType,
|
||||
Value: GPUType,
|
||||
},
|
||||
}
|
||||
|
||||
envs := []webhook.EnvKeyValue{}
|
||||
if nvshareManagedMemory != "" {
|
||||
envs = append(envs, webhook.EnvKeyValue{
|
||||
Key: constants.EnvNvshareManagedMemory,
|
||||
Value: nvshareManagedMemory,
|
||||
})
|
||||
}
|
||||
|
||||
envs = append(envs, webhook.EnvKeyValue{Key: "NVSHARE_DEBUG", Value: "1"})
|
||||
|
||||
patchBytes, err := webhook.CreatePatchForDeployment(tpl, req.Namespace, gpuRequired, GPUType, envs)
|
||||
patchBytes, err := webhook.CreatePatchForDeployment(tpl, h.getGPUResourceTypeKey(GPUType), envs)
|
||||
if err != nil {
|
||||
klog.Errorf("create patch error %v", err)
|
||||
return h.sidecarWebhook.AdmissionError(req.UID, err)
|
||||
@@ -347,33 +331,17 @@ func (h *Handler) gpuLimitMutate(ctx context.Context, req *admissionv1.Admission
|
||||
return resp
|
||||
}
|
||||
|
||||
func (h *Handler) findNvidiaGpuFromNodes(ctx context.Context) (string, error) {
|
||||
var nodes corev1.NodeList
|
||||
err := h.ctrlClient.List(ctx, &nodes, &client.ListOptions{})
|
||||
if err != nil {
|
||||
return "", err
|
||||
func (h *Handler) getGPUResourceTypeKey(gpuType string) string {
|
||||
switch gpuType {
|
||||
case utils.NvidiaCardType:
|
||||
return constants.NvidiaGPU
|
||||
case utils.GB10ChipType:
|
||||
return constants.NvidiaGB10GPU
|
||||
case utils.AmdApuCardType:
|
||||
return constants.AMDAPU
|
||||
default:
|
||||
return ""
|
||||
}
|
||||
|
||||
// return nvshare gpu or virtaitech gpu in priority
|
||||
gtype := ""
|
||||
for _, n := range nodes.Items {
|
||||
if _, ok := n.Status.Capacity[constants.NvidiaGPU]; ok {
|
||||
if _, ok = n.Status.Capacity[constants.NvshareGPU]; ok {
|
||||
return constants.NvshareGPU, nil
|
||||
}
|
||||
gtype = constants.NvidiaGPU
|
||||
}
|
||||
|
||||
if _, ok := n.Status.Capacity[constants.VirtAiTechVGPU]; ok {
|
||||
return constants.VirtAiTechVGPU, nil
|
||||
}
|
||||
}
|
||||
|
||||
if gtype != "" {
|
||||
return gtype, nil
|
||||
}
|
||||
|
||||
return "", api.ErrGPUNodeNotFound
|
||||
}
|
||||
|
||||
func (h *Handler) providerRegistryValidate(req *restful.Request, resp *restful.Response) {
|
||||
|
||||
@@ -340,7 +340,9 @@ func GetClusterResource(kubeConfig *rest.Config, token string) (*prometheus.Clus
|
||||
arches.Insert(n.Labels["kubernetes.io/arch"])
|
||||
if quantity, ok := n.Status.Capacity[constants.NvidiaGPU]; ok {
|
||||
total += quantity.AsApproximateFloat64()
|
||||
} else if quantity, ok = n.Status.Capacity[constants.VirtAiTechVGPU]; ok {
|
||||
} else if quantity, ok = n.Status.Capacity[constants.NvidiaGB10GPU]; ok {
|
||||
total += quantity.AsApproximateFloat64()
|
||||
} else if quantity, ok = n.Status.Capacity[constants.AMDAPU]; ok {
|
||||
total += quantity.AsApproximateFloat64()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -254,21 +254,9 @@ func addServiceToContainer(c *restful.Container, handler *Handler) error {
|
||||
Param(ws.PathParameter(ParamEntranceName, "the name of a application entrance")).
|
||||
Returns(http.StatusOK, "Success to set the application entrance policy", nil))
|
||||
|
||||
ws.Route(ws.POST("/gpu/disable/managed-memory").
|
||||
To(handler.disableGpuManagedMemory).
|
||||
Doc("disable nvshare's managed memory ").
|
||||
Metadata(restfulspec.KeyOpenAPITags, MODULE_TAGS).
|
||||
Returns(http.StatusOK, "Success to disable", nil))
|
||||
|
||||
ws.Route(ws.POST("/gpu/enable/managed-memory").
|
||||
To(handler.enableGpuManagedMemory).
|
||||
Doc("enable nvshare's managed memory ").
|
||||
Metadata(restfulspec.KeyOpenAPITags, MODULE_TAGS).
|
||||
Returns(http.StatusOK, "Success to enable", nil))
|
||||
|
||||
ws.Route(ws.GET("/gpu/managed-memory").
|
||||
To(handler.getManagedMemoryValue).
|
||||
Doc("get nvshare's managed memory enabled or not").
|
||||
ws.Route(ws.GET("/gpu/types").
|
||||
To(handler.getGpuTypes).
|
||||
Doc("get all gpu types in the cluster").
|
||||
Metadata(restfulspec.KeyOpenAPITags, MODULE_TAGS).
|
||||
Returns(http.StatusOK, "Success to get ", &ResultResponse{}))
|
||||
|
||||
|
||||
@@ -56,14 +56,19 @@ type AppSpec struct {
|
||||
Developer string `yaml:"developer" json:"developer"`
|
||||
RequiredMemory string `yaml:"requiredMemory" json:"requiredMemory"`
|
||||
RequiredDisk string `yaml:"requiredDisk" json:"requiredDisk"`
|
||||
SupportClient SupportClient `yaml:"supportClient" json:"supportClient"`
|
||||
RequiredGPU string `yaml:"requiredGpu" json:"requiredGpu"`
|
||||
RequiredCPU string `yaml:"requiredCpu" json:"requiredCpu"`
|
||||
LimitedMemory string `yaml:"limitedMemory" json:"limitedMemory"`
|
||||
LimitedDisk string `yaml:"limitedDisk" json:"limitedDisk"`
|
||||
LimitedGPU string `yaml:"limitedGPU" json:"limitedGPU"`
|
||||
LimitedCPU string `yaml:"limitedCPU" json:"limitedCPU"`
|
||||
SupportClient SupportClient `yaml:"supportClient" json:"supportClient"`
|
||||
RunAsUser bool `yaml:"runAsUser" json:"runAsUser"`
|
||||
RunAsInternal bool `yaml:"runAsInternal" json:"runAsInternal"`
|
||||
PodGPUConsumePolicy string `yaml:"podGpuConsumePolicy" json:"podGpuConsumePolicy"`
|
||||
SubCharts []Chart `yaml:"subCharts" json:"subCharts"`
|
||||
Hardware Hardware `yaml:"hardware" json:"hardware"`
|
||||
SupportedGpu []any `yaml:"supportedGpu,omitempty" json:"supportedGpu,omitempty"`
|
||||
}
|
||||
|
||||
type Hardware struct {
|
||||
@@ -188,6 +193,17 @@ type Provider struct {
|
||||
Verbs []string `yaml:"verbs" json:"verbs"`
|
||||
}
|
||||
|
||||
type SpecialResource struct {
|
||||
RequiredMemory *string `yaml:"requiredMemory,omitempty" json:"requiredMemory,omitempty"`
|
||||
RequiredDisk *string `yaml:"requiredDisk,omitempty" json:"requiredDisk,omitempty"`
|
||||
RequiredGPU *string `yaml:"requiredGpu,omitempty" json:"requiredGpu,omitempty"`
|
||||
RequiredCPU *string `yaml:"requiredCpu,omitempty" json:"requiredCpu,omitempty"`
|
||||
LimitedMemory *string `yaml:"limitedMemory,omitempty" json:"limitedMemory,omitempty"`
|
||||
LimitedDisk *string `yaml:"limitedDisk,omitempty" json:"limitedDisk,omitempty"`
|
||||
LimitedGPU *string `yaml:"limitedGPU,omitempty" json:"limitedGPU,omitempty"`
|
||||
LimitedCPU *string `yaml:"limitedCPU,omitempty" json:"limitedCPU,omitempty"`
|
||||
}
|
||||
|
||||
func (c *Chart) Namespace(owner string) string {
|
||||
if c.Shared {
|
||||
return fmt.Sprintf("%s-%s", c.Name, "shared")
|
||||
|
||||
@@ -100,6 +100,7 @@ type ApplicationConfig struct {
|
||||
PodsSelectors []metav1.LabelSelector
|
||||
HardwareRequirement Hardware
|
||||
SharedEntrances []v1alpha1.Entrance
|
||||
SelectedGpuType string
|
||||
}
|
||||
|
||||
func (c *ApplicationConfig) IsMiddleware() bool {
|
||||
@@ -159,6 +160,13 @@ func (c *ApplicationConfig) GenSharedEntranceURL(ctx context.Context) ([]v1alpha
|
||||
return app.GenSharedEntranceURL(ctx)
|
||||
}
|
||||
|
||||
func (c *ApplicationConfig) GetSelectedGpuTypeValue() string {
|
||||
if c.SelectedGpuType == "" {
|
||||
return "none"
|
||||
}
|
||||
return c.SelectedGpuType
|
||||
}
|
||||
|
||||
func (p *ProviderPermission) GetNamespace(ownerName string) string {
|
||||
if p.Namespace != "" {
|
||||
if p.Namespace == "user-space" || p.Namespace == "user-system" {
|
||||
|
||||
@@ -752,7 +752,7 @@ func getApplicationPolicy(policies []appcfg.AppPolicy, entrances []appv1alpha1.E
|
||||
return string(policyStr), nil
|
||||
}
|
||||
|
||||
func parseAppPermission(data []appcfg.AppPermission) []appcfg.AppPermission {
|
||||
func ParseAppPermission(data []appcfg.AppPermission) []appcfg.AppPermission {
|
||||
permissions := make([]appcfg.AppPermission, 0)
|
||||
for _, p := range data {
|
||||
switch perm := p.(type) {
|
||||
|
||||
@@ -29,8 +29,14 @@ func (h *HelmOps) UninstallAll() error {
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
appName := fmt.Sprintf("%s-%s", h.app.Namespace, h.app.AppName)
|
||||
appmgr, err := h.client.AppClient.AppV1alpha1().ApplicationManagers().Get(h.ctx, appName, metav1.GetOptions{})
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
deleteData := appmgr.Annotations["bytetrade.io/delete-data"] == "true"
|
||||
|
||||
appCacheDirs, err := apputils.TryToGetAppdataDirFromDeployment(h.ctx, h.app.Namespace, h.app.AppName, h.app.OwnerName)
|
||||
appCacheDirs, appDataDirs, err := apputils.TryToGetAppdataDirFromDeployment(h.ctx, h.app.Namespace, h.app.AppName, h.app.OwnerName, deleteData)
|
||||
if err != nil {
|
||||
klog.Warningf("get app %s cache dir failed %v", h.app.AppName, err)
|
||||
}
|
||||
@@ -48,6 +54,13 @@ func (h *HelmOps) UninstallAll() error {
|
||||
klog.Errorf("Failed to clear app cache dirs %v err=%v", appCacheDirs, err)
|
||||
return err
|
||||
}
|
||||
if deleteData {
|
||||
h.ClearData(client, appDataDirs)
|
||||
if err != nil {
|
||||
klog.Errorf("Failed to clear app data dirs %v err=%v", appDataDirs, err)
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
err = h.DeleteNamespace(client, h.app.Namespace)
|
||||
if err != nil {
|
||||
@@ -78,7 +91,7 @@ func (h *HelmOps) Uninstall_(client kubernetes.Interface, actionConfig *action.C
|
||||
return err
|
||||
}
|
||||
|
||||
h.app.Permission = parseAppPermission(h.app.Permission)
|
||||
h.app.Permission = ParseAppPermission(h.app.Permission)
|
||||
var perm []appcfg.ProviderPermission
|
||||
for _, p := range h.app.Permission {
|
||||
if t, ok := p.([]appcfg.ProviderPermission); ok {
|
||||
@@ -117,7 +130,7 @@ func (h *HelmOps) ClearCache(client kubernetes.Interface, appCacheDirs []string)
|
||||
formattedAppCacheDirs := apputils.FormatCacheDirs(appCacheDirs)
|
||||
|
||||
for _, n := range nodes.Items {
|
||||
URL := fmt.Sprintf(constants.AppDataDirURL, n.Name)
|
||||
URL := fmt.Sprintf(constants.AppCacheDirURL, n.Name)
|
||||
c.SetHeader("X-Terminus-Node", n.Name)
|
||||
c.SetHeader("X-Bfl-User", h.app.OwnerName)
|
||||
res, e := c.R().SetBody(map[string]interface{}{
|
||||
@@ -137,6 +150,32 @@ func (h *HelmOps) ClearCache(client kubernetes.Interface, appCacheDirs []string)
|
||||
return nil
|
||||
}
|
||||
|
||||
func (h *HelmOps) ClearData(client kubernetes.Interface, appDataDirs []string) error {
|
||||
if len(appDataDirs) > 0 {
|
||||
klog.Infof("clear app data dirs: %v", appDataDirs)
|
||||
|
||||
c := resty.New().SetTimeout(2 * time.Second).
|
||||
SetAuthToken(h.token)
|
||||
|
||||
formattedAppDataDirs := apputils.FormatCacheDirs(appDataDirs)
|
||||
|
||||
URL := constants.AppDataDirURL
|
||||
c.SetHeader("X-Bfl-User", h.app.OwnerName)
|
||||
res, e := c.R().SetBody(map[string]interface{}{
|
||||
"dirents": formattedAppDataDirs,
|
||||
}).Delete(URL)
|
||||
if e != nil {
|
||||
klog.Errorf("Failed to delete data dir err=%v", e)
|
||||
return nil
|
||||
}
|
||||
if res.StatusCode() != http.StatusOK {
|
||||
klog.Infof("delete app data failed with: %v", res.String())
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (h *HelmOps) ClearMiddlewareRequests(middlewareNamespace string) {
|
||||
// delete middleware requests crd
|
||||
for _, mt := range middlewareTypes {
|
||||
|
||||
@@ -50,7 +50,7 @@ func (h *HelmOps) SetValues() (values map[string]interface{}, err error) {
|
||||
|
||||
values["domain"] = entries
|
||||
userspace := make(map[string]interface{})
|
||||
h.app.Permission = parseAppPermission(h.app.Permission)
|
||||
h.app.Permission = ParseAppPermission(h.app.Permission)
|
||||
for _, p := range h.app.Permission {
|
||||
switch perm := p.(type) {
|
||||
case appcfg.AppDataPermission, appcfg.AppCachePermission, appcfg.UserDataPermission:
|
||||
@@ -170,17 +170,12 @@ func (h *HelmOps) SetValues() (values map[string]interface{}, err error) {
|
||||
values["cluster"] = map[string]interface{}{
|
||||
"arch": arch,
|
||||
}
|
||||
gpuType, err := utils.FindGpuTypeFromNodes(nodes)
|
||||
if err != nil {
|
||||
klog.Errorf("Failed to get gpuType err=%v", err)
|
||||
return values, err
|
||||
}
|
||||
values["GPU"] = map[string]interface{}{
|
||||
"Type": gpuType,
|
||||
"Type": h.app.GetSelectedGpuTypeValue(),
|
||||
"Cuda": os.Getenv("OLARES_SYSTEM_CUDA_VERSION"),
|
||||
}
|
||||
|
||||
values["gpu"] = gpuType
|
||||
values["gpu"] = h.app.GetSelectedGpuTypeValue()
|
||||
|
||||
if h.app.OIDC.Enabled {
|
||||
err = h.createOIDCClient(values, zone, h.app.Namespace)
|
||||
|
||||
@@ -16,7 +16,6 @@ import (
|
||||
apputils "github.com/beclab/Olares/framework/app-service/pkg/utils/app"
|
||||
|
||||
"github.com/pkg/errors"
|
||||
corev1 "k8s.io/api/core/v1"
|
||||
"k8s.io/klog/v2"
|
||||
ctrl "sigs.k8s.io/controller-runtime"
|
||||
"sigs.k8s.io/controller-runtime/pkg/client"
|
||||
@@ -176,19 +175,8 @@ func (p *DownloadingApp) exec(ctx context.Context) error {
|
||||
},
|
||||
}
|
||||
|
||||
var nodes corev1.NodeList
|
||||
err = p.client.List(ctx, &nodes, &client.ListOptions{})
|
||||
if err != nil {
|
||||
klog.Errorf("list node failed %v", err)
|
||||
return err
|
||||
}
|
||||
gpuType, err := utils.FindGpuTypeFromNodes(&nodes)
|
||||
if err != nil {
|
||||
klog.Errorf("get gpu type failed %v", gpuType)
|
||||
return err
|
||||
}
|
||||
values["GPU"] = map[string]interface{}{
|
||||
"Type": gpuType,
|
||||
"Type": appConfig.GetSelectedGpuTypeValue(),
|
||||
"Cuda": os.Getenv("OLARES_SYSTEM_CUDA_VERSION"),
|
||||
}
|
||||
|
||||
|
||||
@@ -22,7 +22,6 @@ import (
|
||||
|
||||
"github.com/pkg/errors"
|
||||
"helm.sh/helm/v3/pkg/action"
|
||||
corev1 "k8s.io/api/core/v1"
|
||||
"k8s.io/klog/v2"
|
||||
ctrl "sigs.k8s.io/controller-runtime"
|
||||
"sigs.k8s.io/controller-runtime/pkg/client"
|
||||
@@ -214,19 +213,8 @@ func (p *UpgradingApp) exec(ctx context.Context) error {
|
||||
"username": p.manager.Spec.AppOwner,
|
||||
},
|
||||
}
|
||||
var nodes corev1.NodeList
|
||||
err = p.client.List(ctx, &nodes, &client.ListOptions{})
|
||||
if err != nil {
|
||||
klog.Errorf("list node failed %v", err)
|
||||
return err
|
||||
}
|
||||
gpuType, err := utils.FindGpuTypeFromNodes(&nodes)
|
||||
if err != nil {
|
||||
klog.Errorf("get gpu type failed %v", gpuType)
|
||||
return err
|
||||
}
|
||||
values["GPU"] = map[string]interface{}{
|
||||
"Type": gpuType,
|
||||
"Type": appConfig.GetSelectedGpuTypeValue(),
|
||||
"Cuda": os.Getenv("OLARES_SYSTEM_CUDA_VERSION"),
|
||||
}
|
||||
|
||||
|
||||
@@ -78,20 +78,23 @@ const (
|
||||
SidecarInitContainerName = "olares-sidecar-init"
|
||||
EnvoyConfigWorkDirName = "envoy-config"
|
||||
|
||||
ByteTradeAuthor = "bytetrade.io"
|
||||
NvshareGPU = "nvshare.com/gpu"
|
||||
NvidiaGPU = "nvidia.com/gpu"
|
||||
VirtAiTechVGPU = "virtaitech.com/gpu"
|
||||
PatchOpAdd = "add"
|
||||
PatchOpReplace = "replace"
|
||||
EnvNvshareManagedMemory = "NVSHARE_MANAGED_MEMORY"
|
||||
ByteTradeAuthor = "bytetrade.io"
|
||||
PatchOpAdd = "add"
|
||||
PatchOpReplace = "replace"
|
||||
EnvGPUType = "GPU_TYPE"
|
||||
|
||||
// gpu resource keys
|
||||
NvidiaGPU = "nvidia.com/gpu"
|
||||
NvidiaGB10GPU = "nvidia.com/gb10"
|
||||
AMDAPU = "amd.com/apu"
|
||||
|
||||
AuthorizationLevelOfPublic = "public"
|
||||
AuthorizationLevelOfPrivate = "private"
|
||||
|
||||
DependencyTypeSystem = "system"
|
||||
DependencyTypeApp = "application"
|
||||
AppDataDirURL = "http://files-service.os-framework/api/resources/cache/%s/"
|
||||
AppCacheDirURL = "http://files-service.os-framework/api/resources/cache/%s/"
|
||||
AppDataDirURL = "http://files-service.os-framework/api/resources/drive/Data/"
|
||||
|
||||
UserSpaceDirKey = "userspace_hostpath"
|
||||
UserAppDataDirKey = "appcache_hostpath"
|
||||
|
||||
@@ -273,11 +273,7 @@ func (c *Creator) installSysApps(ctx context.Context, bflPod *corev1.Pod) error
|
||||
"arch": arch,
|
||||
}
|
||||
|
||||
gpuType, err := utils.FindGpuTypeFromNodes(&nodes)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
vals["gpu"] = gpuType
|
||||
vals["gpu"] = "none" // unused currently
|
||||
|
||||
userIndex, userSubnet, err := c.getUserSubnet(ctx)
|
||||
if err != nil {
|
||||
|
||||
@@ -16,12 +16,12 @@ import (
|
||||
corev1 "k8s.io/api/core/v1"
|
||||
|
||||
sysv1alpha1 "github.com/beclab/Olares/framework/app-service/api/sys.bytetrade.io/v1alpha1"
|
||||
"github.com/go-viper/mapstructure/v2"
|
||||
"sigs.k8s.io/controller-runtime/pkg/client"
|
||||
|
||||
"github.com/beclab/Olares/framework/app-service/api/app.bytetrade.io/v1alpha1"
|
||||
"github.com/beclab/Olares/framework/app-service/pkg/appcfg"
|
||||
"github.com/beclab/Olares/framework/app-service/pkg/constants"
|
||||
"github.com/beclab/Olares/framework/app-service/pkg/generated/clientset/versioned"
|
||||
"github.com/beclab/Olares/framework/app-service/pkg/users/userspace"
|
||||
"github.com/beclab/Olares/framework/app-service/pkg/utils"
|
||||
"github.com/beclab/Olares/framework/app-service/pkg/utils/files"
|
||||
@@ -553,7 +553,7 @@ func parseDestination(dest string) (string, string, error) {
|
||||
return alias, tokens[len(tokens)-1], nil
|
||||
}
|
||||
|
||||
func TryToGetAppdataDirFromDeployment(ctx context.Context, namespace, name, owner string) (appdirs []string, err error) {
|
||||
func TryToGetAppdataDirFromDeployment(ctx context.Context, namespace, name, owner string, appData bool) (appCacheDirs []string, appDataDirs []string, err error) {
|
||||
userspaceNs := utils.UserspaceName(owner)
|
||||
config, err := ctrl.GetConfig()
|
||||
if err != nil {
|
||||
@@ -567,7 +567,6 @@ func TryToGetAppdataDirFromDeployment(ctx context.Context, namespace, name, owne
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
appName := fmt.Sprintf("%s-%s", namespace, name)
|
||||
appCachePath := sts.GetAnnotations()["appcache_hostpath"]
|
||||
if len(appCachePath) == 0 {
|
||||
err = errors.New("empty appcache_hostpath")
|
||||
@@ -576,20 +575,23 @@ func TryToGetAppdataDirFromDeployment(ctx context.Context, namespace, name, owne
|
||||
if !strings.HasSuffix(appCachePath, "/") {
|
||||
appCachePath += "/"
|
||||
}
|
||||
dClient, err := versioned.NewForConfig(config)
|
||||
if err != nil {
|
||||
|
||||
userspacePath := sts.GetAnnotations()["userspace_hostpath"]
|
||||
if len(userspacePath) == 0 {
|
||||
err = errors.New("empty userspace_hostpath annotation")
|
||||
return
|
||||
}
|
||||
appCRD, err := dClient.AppV1alpha1().Applications().Get(ctx, appName, metav1.GetOptions{})
|
||||
if err != nil {
|
||||
return
|
||||
appDataPath := filepath.Join(userspacePath, "Data")
|
||||
if !strings.HasSuffix(appDataPath, "/") {
|
||||
appDataPath += "/"
|
||||
}
|
||||
deploymentName := appCRD.Spec.DeploymentName
|
||||
|
||||
deploymentName := name
|
||||
deployment, err := clientset.AppsV1().Deployments(namespace).
|
||||
Get(context.Background(), deploymentName, metav1.GetOptions{})
|
||||
if err != nil {
|
||||
if apierrors.IsNotFound(err) {
|
||||
return tryToGetAppdataDirFromSts(ctx, namespace, deploymentName, appCachePath)
|
||||
return tryToGetAppdataDirFromSts(ctx, namespace, deploymentName, appCachePath, appDataPath)
|
||||
}
|
||||
return
|
||||
}
|
||||
@@ -601,15 +603,31 @@ func TryToGetAppdataDirFromDeployment(ctx context.Context, namespace, name, owne
|
||||
if appDirSet.Has(appDir) {
|
||||
continue
|
||||
}
|
||||
appdirs = append(appdirs, appDir)
|
||||
appCacheDirs = append(appCacheDirs, appDir)
|
||||
appDirSet.Insert(appDir)
|
||||
}
|
||||
}
|
||||
}
|
||||
return appdirs, nil
|
||||
if appData {
|
||||
appDirSet := sets.NewString()
|
||||
|
||||
for _, v := range deployment.Spec.Template.Spec.Volumes {
|
||||
if v.HostPath != nil && strings.HasPrefix(v.HostPath.Path, appDataPath) && len(v.HostPath.Path) > len(appDataPath) {
|
||||
appDir := GetFirstSubDir(v.HostPath.Path, appDataPath)
|
||||
if appDir != "" {
|
||||
if appDirSet.Has(appDir) {
|
||||
continue
|
||||
}
|
||||
appDataDirs = append(appDataDirs, appDir)
|
||||
appDirSet.Insert(appDir)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return appCacheDirs, appDataDirs, nil
|
||||
}
|
||||
|
||||
func tryToGetAppdataDirFromSts(ctx context.Context, namespace, stsName, baseDir string) (appdirs []string, err error) {
|
||||
func tryToGetAppdataDirFromSts(ctx context.Context, namespace, stsName, appCacheDir, appDataDir string) (appCacheDirs []string, appDataDirs []string, err error) {
|
||||
config, err := ctrl.GetConfig()
|
||||
if err != nil {
|
||||
return
|
||||
@@ -626,18 +644,32 @@ func tryToGetAppdataDirFromSts(ctx context.Context, namespace, stsName, baseDir
|
||||
}
|
||||
appDirSet := sets.NewString()
|
||||
for _, v := range sts.Spec.Template.Spec.Volumes {
|
||||
if v.HostPath != nil && strings.HasPrefix(v.HostPath.Path, baseDir) && len(v.HostPath.Path) > len(baseDir) {
|
||||
appDir := GetFirstSubDir(v.HostPath.Path, baseDir)
|
||||
if v.HostPath != nil && strings.HasPrefix(v.HostPath.Path, appCacheDir) && len(v.HostPath.Path) > len(appCacheDir) {
|
||||
appDir := GetFirstSubDir(v.HostPath.Path, appCacheDir)
|
||||
if appDir != "" {
|
||||
if appDirSet.Has(appDir) {
|
||||
continue
|
||||
}
|
||||
appdirs = append(appdirs, appDir)
|
||||
appCacheDirs = append(appCacheDirs, appDir)
|
||||
appDirSet.Insert(appDir)
|
||||
}
|
||||
}
|
||||
}
|
||||
return appdirs, nil
|
||||
appDirSet = sets.NewString()
|
||||
|
||||
for _, v := range sts.Spec.Template.Spec.Volumes {
|
||||
if v.HostPath != nil && strings.HasPrefix(v.HostPath.Path, appDataDir) && len(v.HostPath.Path) > len(appDataDir) {
|
||||
appDir := GetFirstSubDir(v.HostPath.Path, appDataDir)
|
||||
if appDir != "" {
|
||||
if appDirSet.Has(appDir) {
|
||||
continue
|
||||
}
|
||||
appDataDirs = append(appDataDirs, appDir)
|
||||
appDirSet.Insert(appDir)
|
||||
}
|
||||
}
|
||||
}
|
||||
return appCacheDirs, appDataDirs, nil
|
||||
}
|
||||
|
||||
func GetFirstSubDir(fullPath, basePath string) string {
|
||||
@@ -674,6 +706,7 @@ type ConfigOptions struct {
|
||||
MarketSource string
|
||||
IsAdmin bool
|
||||
RawAppName string
|
||||
SelectedGpu string
|
||||
}
|
||||
|
||||
// GetAppConfig get app installation configuration from app store
|
||||
@@ -740,7 +773,7 @@ func getAppConfigFromRepo(ctx context.Context, options *ConfigOptions) (*appcfg.
|
||||
return getAppConfigFromConfigurationFile(options, chartPath)
|
||||
}
|
||||
|
||||
func toApplicationConfig(app, chart, rawAppName string, cfg *appcfg.AppConfiguration) (*appcfg.ApplicationConfig, string, error) {
|
||||
func toApplicationConfig(app, chart, rawAppName, selectedGpu string, cfg *appcfg.AppConfiguration) (*appcfg.ApplicationConfig, string, error) {
|
||||
var permission []appcfg.AppPermission
|
||||
if cfg.Permission.AppData {
|
||||
permission = append(permission, appcfg.AppDataRW)
|
||||
@@ -788,6 +821,57 @@ func toApplicationConfig(app, chart, rawAppName string, cfg *appcfg.AppConfigura
|
||||
return nil, chart, err
|
||||
}
|
||||
|
||||
// set supportedGpu to ["nvidia","nvidia-gb10"] by default
|
||||
if len(cfg.Spec.SupportedGpu) == 0 {
|
||||
cfg.Spec.SupportedGpu = []interface{}{utils.NvidiaCardType, utils.GB10ChipType}
|
||||
}
|
||||
|
||||
// try to get the special resource requirement for the selected GPU type
|
||||
if selectedGpu != "" {
|
||||
found := false
|
||||
for _, supportedGpu := range cfg.Spec.SupportedGpu {
|
||||
if str, ok := supportedGpu.(string); ok && str == selectedGpu {
|
||||
found = true
|
||||
break
|
||||
}
|
||||
|
||||
if supportedGpuResourceMap, ok := supportedGpu.(map[string]interface{}); ok {
|
||||
if resourceRequirement, ok := supportedGpuResourceMap[selectedGpu].(map[string]interface{}); ok {
|
||||
found = true
|
||||
var specialResource appcfg.SpecialResource
|
||||
err := mapstructure.Decode(resourceRequirement, &specialResource)
|
||||
if err != nil {
|
||||
return nil, chart, fmt.Errorf("failed to decode special resource for selected GPU type %s: %v", selectedGpu, err)
|
||||
}
|
||||
|
||||
for _, resSetter := range []struct {
|
||||
v **resource.Quantity
|
||||
s *string
|
||||
}{
|
||||
{v: &mem, s: specialResource.RequiredMemory},
|
||||
{v: &disk, s: specialResource.RequiredDisk},
|
||||
{v: &cpu, s: specialResource.RequiredCPU},
|
||||
{v: &gpu, s: specialResource.RequiredGPU},
|
||||
} {
|
||||
|
||||
if resSetter.s != nil && *resSetter.s != "" {
|
||||
*resSetter.v, err = valuePtr(resource.ParseQuantity(*resSetter.s))
|
||||
if err != nil {
|
||||
return nil, chart, fmt.Errorf("failed to parse special resource quantity %s: %v", *resSetter.s, err)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
break
|
||||
} // end if selected gpu's resource requirement found
|
||||
} // end if supportedGpu is map
|
||||
} // end for supportedGpu
|
||||
|
||||
if !found {
|
||||
return nil, chart, fmt.Errorf("selected GPU type %s is not supported", selectedGpu)
|
||||
}
|
||||
}
|
||||
|
||||
// transform from Policy to AppPolicy
|
||||
var policies []appcfg.AppPolicy
|
||||
for _, p := range cfg.Options.Policies {
|
||||
@@ -877,6 +961,7 @@ func toApplicationConfig(app, chart, rawAppName string, cfg *appcfg.AppConfigura
|
||||
PodsSelectors: podSelectors,
|
||||
HardwareRequirement: cfg.Spec.Hardware,
|
||||
SharedEntrances: cfg.SharedEntrances,
|
||||
SelectedGpuType: selectedGpu,
|
||||
}, chart, nil
|
||||
}
|
||||
|
||||
@@ -890,7 +975,7 @@ func getAppConfigFromConfigurationFile(opt *ConfigOptions, chartPath string) (*a
|
||||
return nil, chartPath, err
|
||||
}
|
||||
|
||||
return toApplicationConfig(opt.App, chartPath, opt.RawAppName, &cfg)
|
||||
return toApplicationConfig(opt.App, chartPath, opt.RawAppName, opt.SelectedGpu, &cfg)
|
||||
}
|
||||
|
||||
func checkVersionFormat(constraint string) error {
|
||||
|
||||
@@ -234,7 +234,9 @@ func CheckAppRequirement(token string, appConfig *appcfg.ApplicationConfig, op v
|
||||
return constants.CPU, constants.SystemCPUPressure, fmt.Errorf(constants.SystemCPUPressureMessage, op)
|
||||
}
|
||||
}
|
||||
if appConfig.Requirement.GPU != nil {
|
||||
|
||||
// only support nvidia gpu management by HAMi for now
|
||||
if appConfig.Requirement.GPU != nil && appConfig.GetSelectedGpuTypeValue() == utils.NvidiaCardType {
|
||||
if !appConfig.Requirement.GPU.IsZero() && metrics.GPU.Total <= 0 {
|
||||
return constants.GPU, constants.SystemGPUNotAvailable, fmt.Errorf(constants.SystemGPUNotAvailableMessage, op)
|
||||
|
||||
@@ -398,7 +400,9 @@ func GetClusterResource(token string) (*prometheus.ClusterMetrics, []string, err
|
||||
arches.Insert(n.Labels["kubernetes.io/arch"])
|
||||
if quantity, ok := n.Status.Capacity[constants.NvidiaGPU]; ok {
|
||||
total += quantity.AsApproximateFloat64()
|
||||
} else if quantity, ok = n.Status.Capacity[constants.VirtAiTechVGPU]; ok {
|
||||
} else if quantity, ok = n.Status.Capacity[constants.NvidiaGB10GPU]; ok {
|
||||
total += quantity.AsApproximateFloat64()
|
||||
} else if quantity, ok = n.Status.Capacity[constants.AMDAPU]; ok {
|
||||
total += quantity.AsApproximateFloat64()
|
||||
}
|
||||
}
|
||||
|
||||
12
framework/app-service/pkg/utils/gpu_types.go
Normal file
12
framework/app-service/pkg/utils/gpu_types.go
Normal file
@@ -0,0 +1,12 @@
|
||||
package utils
|
||||
|
||||
const (
|
||||
NodeGPUTypeLabel = "gpu.bytetrade.io/type"
|
||||
)
|
||||
|
||||
const (
|
||||
NvidiaCardType = "nvidia" // handled by HAMi
|
||||
AmdGpuCardType = "amd-gpu" //
|
||||
AmdApuCardType = "amd-apu" // AMD APU with integrated GPU, e.g. AI Max 395
|
||||
GB10ChipType = "nvidia-gb10" // NVIDIA GB10 Superchip & unified system memory
|
||||
)
|
||||
@@ -103,24 +103,37 @@ func GetAllNodesTunnelIPCIDRs() (cidrs []string) {
|
||||
return cidrs
|
||||
}
|
||||
|
||||
func FindGpuTypeFromNodes(nodes *corev1.NodeList) (string, error) {
|
||||
gpuType := "none"
|
||||
// func FindGpuTypeFromNodes(nodes *corev1.NodeList) (string, error) {
|
||||
// gpuType := "none"
|
||||
// if nodes == nil {
|
||||
// return gpuType, errors.New("empty node list")
|
||||
// }
|
||||
// for _, n := range nodes.Items {
|
||||
// if _, ok := n.Status.Capacity[constants.NvidiaGPU]; ok {
|
||||
// if _, ok = n.Status.Capacity[constants.NvshareGPU]; ok {
|
||||
// return "nvshare", nil
|
||||
|
||||
// }
|
||||
// gpuType = "nvidia"
|
||||
// }
|
||||
// if _, ok := n.Status.Capacity[constants.VirtAiTechVGPU]; ok {
|
||||
// return "virtaitech", nil
|
||||
// }
|
||||
// }
|
||||
// return gpuType, nil
|
||||
// }
|
||||
|
||||
func GetAllGpuTypesFromNodes(nodes *corev1.NodeList) (map[string]struct{}, error) {
|
||||
gpuTypes := make(map[string]struct{})
|
||||
if nodes == nil {
|
||||
return gpuType, errors.New("empty node list")
|
||||
return gpuTypes, errors.New("empty node list")
|
||||
}
|
||||
for _, n := range nodes.Items {
|
||||
if _, ok := n.Status.Capacity[constants.NvidiaGPU]; ok {
|
||||
if _, ok = n.Status.Capacity[constants.NvshareGPU]; ok {
|
||||
return "nvshare", nil
|
||||
|
||||
}
|
||||
gpuType = "nvidia"
|
||||
}
|
||||
if _, ok := n.Status.Capacity[constants.VirtAiTechVGPU]; ok {
|
||||
return "virtaitech", nil
|
||||
if typeLabel, ok := n.Labels[NodeGPUTypeLabel]; ok {
|
||||
gpuTypes[typeLabel] = struct{}{} // TODO: add driver version info
|
||||
}
|
||||
}
|
||||
return gpuType, nil
|
||||
return gpuTypes, nil
|
||||
}
|
||||
|
||||
func IsNodeReady(node *corev1.Node) bool {
|
||||
|
||||
@@ -30,7 +30,6 @@ import (
|
||||
admissionv1 "k8s.io/api/admission/v1"
|
||||
corev1 "k8s.io/api/core/v1"
|
||||
apierrors "k8s.io/apimachinery/pkg/api/errors"
|
||||
"k8s.io/apimachinery/pkg/api/resource"
|
||||
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
||||
"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
|
||||
"k8s.io/apimachinery/pkg/labels"
|
||||
@@ -544,16 +543,21 @@ type EnvKeyValue struct {
|
||||
}
|
||||
|
||||
// CreatePatchForDeployment adds gpu env for deployment and returns patch bytes.
|
||||
func CreatePatchForDeployment(tpl *corev1.PodTemplateSpec, namespace string, gpuRequired *resource.Quantity, typeKey string, envKeyValues []EnvKeyValue) ([]byte, error) {
|
||||
patches, err := addResourceLimits(tpl, namespace, gpuRequired, typeKey, envKeyValues)
|
||||
func CreatePatchForDeployment(tpl *corev1.PodTemplateSpec, typeKey string, envKeyValues []EnvKeyValue) ([]byte, error) {
|
||||
patches, err := addResourceLimits(tpl, typeKey, envKeyValues)
|
||||
if err != nil {
|
||||
return []byte{}, err
|
||||
}
|
||||
return json.Marshal(patches)
|
||||
}
|
||||
|
||||
func addResourceLimits(tpl *corev1.PodTemplateSpec, namespace string, gpuRequired *resource.Quantity, typeKey string, envKeyValues []EnvKeyValue) (patch []patchOp, err error) {
|
||||
if typeKey == constants.NvidiaGPU || typeKey == constants.NvshareGPU {
|
||||
func addResourceLimits(tpl *corev1.PodTemplateSpec, typeKey string, envKeyValues []EnvKeyValue) (patch []patchOp, err error) {
|
||||
if typeKey == "" {
|
||||
klog.Warning("No gpu type selected, skip adding resource limits")
|
||||
return patch, nil
|
||||
}
|
||||
|
||||
if typeKey == constants.NvidiaGPU || typeKey == constants.NvidiaGB10GPU {
|
||||
if tpl.Spec.RuntimeClassName != nil {
|
||||
patch = append(patch, patchOp{
|
||||
Op: constants.PatchOpReplace,
|
||||
@@ -584,7 +588,10 @@ func addResourceLimits(tpl *corev1.PodTemplateSpec, namespace string, gpuRequire
|
||||
t := make(map[string]map[string]string)
|
||||
t["limits"] = map[string]string{}
|
||||
for k, v := range container.Resources.Limits {
|
||||
if k.String() == constants.NvidiaGPU || k.String() == constants.NvshareGPU || k.String() == constants.VirtAiTechVGPU {
|
||||
if k.String() == constants.NvidiaGPU ||
|
||||
k.String() == constants.NvidiaGB10GPU ||
|
||||
k.String() == constants.AMDAPU {
|
||||
// unset all previous gpu limits
|
||||
continue
|
||||
}
|
||||
t["limits"][k.String()] = v.String()
|
||||
|
||||
@@ -431,7 +431,7 @@ spec:
|
||||
privileged: true
|
||||
containers:
|
||||
- name: authelia
|
||||
image: beclab/auth:0.2.45
|
||||
image: beclab/auth:0.2.46
|
||||
imagePullPolicy: IfNotPresent
|
||||
ports:
|
||||
- containerPort: 9091
|
||||
|
||||
@@ -29,7 +29,7 @@ spec:
|
||||
name: check-auth
|
||||
containers:
|
||||
- name: auth-front
|
||||
image: beclab/login:v1.7.4
|
||||
image: beclab/login:v1.8.5
|
||||
imagePullPolicy: IfNotPresent
|
||||
ports:
|
||||
- containerPort: 80
|
||||
|
||||
Reference in New Issue
Block a user