Compare commits

...

8 Commits

Author SHA1 Message Date
dkeven
92771e650c fix(cli): bind config item to the effective command 2026-02-03 16:06:07 +08:00
eball
7dd688c645 app-service: add support for selecting GPU types in application installation (#2470)
* fix: failed release upgrade

* fix: helm upgrade no longer uses the atomic param and allows upgrading a failed release

* feat: add icon field to nats event

* fix: v2 app stop

* fix: check k8s request before entering installing state

* fix: add spec ports

* set appservice image tag to 0.4.77

* feat: add support for selecting GPU types in application installation (#2458)

* fix: failed release upgrade

* fix: helm upgrade no longer uses the atomic param and allows upgrading a failed release

* feat: add clickhouse support

* appservice image tag to 0.4.76

* feat: add icon field to nats event

* chores: get all node gpu types

* feat: add support for selecting GPU types in application installation

* feat: enhance GPU type selection logic in application installation

* feat: replace hardcoded GPU type with constant for supported GPU selection

* feat: update app config methods to include selected GPU type and enhance validation for NVIDIA GPUs

* feat: update supported GPU handling to include default options and improve validation logic

* feat: update GPU resource handling to unset previous limits before setting new ones

* feat: refactor permission parsing to use exported function and update related calls

---------

Co-authored-by: hys <hysyeah@gmail.com>

* app-service: add support for selecting GPU types in application installation

---------

Co-authored-by: hys <hysyeah@gmail.com>
2026-02-03 13:02:54 +08:00
eball
7d12b792ea ci: bump version to 1.12.6 (#2471) 2026-02-03 13:02:39 +08:00
Power-One-2025
c37cb9e15d docs: fix a link issue (#2469) 2026-02-03 11:33:32 +08:00
Power-One-2025
c79c2fc253 fix a link issue 2026-02-03 11:22:14 +08:00
berg
32fe6513e6 desktop, settings, files, vault: fix multiple known issues (#2467)
feat: update login, system frontend, user service version
2026-02-02 23:05:05 +08:00
eball
12ec558b27 authelia: add user regulation for TOTP authentication attempts (#2466) 2026-02-02 21:37:46 +08:00
dkeven
375dfceacb fix(cli): unify config setting for release command (#2465) 2026-02-02 17:52:46 +08:00
49 changed files with 389 additions and 433 deletions

View File

@@ -75,7 +75,7 @@ jobs:
steps:
- id: generate
run: |
v=1.12.5-$(echo $RANDOM$RANDOM)
v=1.12.6-$(echo $RANDOM$RANDOM)
echo "version=$v" >> "$GITHUB_OUTPUT"
upload-cli:

View File

@@ -17,7 +17,7 @@ jobs:
steps:
- id: generate
run: |
v=1.12.5-$(date +"%Y%m%d")
v=1.12.6-$(date +"%Y%m%d")
echo "version=$v" >> "$GITHUB_OUTPUT"
release-id:

View File

@@ -317,7 +317,7 @@ spec:
chown -R 1000:1000 /uploadstemp && \
chown -R 1000:1000 /appdata
- name: olares-app-init
image: beclab/system-frontend:v1.8.4
image: beclab/system-frontend:v1.8.5
imagePullPolicy: IfNotPresent
command:
- /bin/sh
@@ -439,7 +439,7 @@ spec:
- name: NATS_SUBJECT_VAULT
value: os.vault.{{ .Values.bfl.username}}
- name: user-service
image: beclab/user-service:v0.0.84
image: beclab/user-service:v0.0.85
imagePullPolicy: IfNotPresent
ports:
- containerPort: 3000

View File

@@ -18,7 +18,7 @@ fi
if [[ x"$VERSION" == x"" ]]; then
if [[ "$LOCAL_RELEASE" == "1" ]]; then
ts=$(date +%Y%m%d%H%M%S)
export VERSION="1.12.5-$ts"
export VERSION="1.12.6-$ts"
echo "will build and use a local release of Olares with version: $VERSION"
echo ""
else
@@ -28,7 +28,7 @@ fi
if [[ "x${VERSION}" == "x" || "x${VERSION:3}" == "xVERSION__" ]]; then
echo "error: Olares version is unspecified, please set the VERSION env var and rerun this script."
echo "for example: VERSION=1.12.5-20241124 bash $0"
echo "for example: VERSION=1.12.6-20241124 bash $0"
exit 1
fi

View File

@@ -158,7 +158,7 @@ export VERSION="#__VERSION__"
if [[ "x${VERSION}" == "x" || "x${VERSION:3}" == "xVERSION__" ]]; then
echo "error: Olares version is unspecified, please set the VERSION env var and rerun this script."
echo "for example: VERSION=1.12.5-20241124 bash $0"
echo "for example: VERSION=1.12.6-20241124 bash $0"
exit 1
fi

View File

@@ -8,26 +8,27 @@ import (
"strings"
"time"
"github.com/beclab/Olares/cli/pkg/core/common"
"github.com/beclab/Olares/cli/cmd/config"
"github.com/beclab/Olares/cli/pkg/common"
corecommon "github.com/beclab/Olares/cli/pkg/core/common"
"github.com/beclab/Olares/cli/pkg/core/util"
"github.com/beclab/Olares/cli/pkg/release/builder"
"github.com/spf13/cobra"
"github.com/spf13/viper"
)
func NewCmdRelease() *cobra.Command {
var (
baseDir string
version string
cdn string
ignoreMissingImages bool
extract bool
)
cmd := &cobra.Command{
Use: "release",
Short: "Build release based on a local Olares repository",
Run: func(cmd *cobra.Command, args []string) {
baseDir := viper.GetString(common.FlagBaseDir)
version := viper.GetString(common.FlagVersion)
cdn := viper.GetString(common.FlagCDNService)
ignoreMissingImages := viper.GetBool(common.FlagIgnoreMissingImages)
extract := viper.GetBool(common.FlagExtract)
cwd, err := os.Getwd()
if err != nil {
fmt.Printf("failed to get current working directory: %s\n", err)
@@ -43,13 +44,13 @@ func NewCmdRelease() *cobra.Command {
fmt.Printf("failed to get current user: %s\n", err)
os.Exit(1)
}
baseDir = filepath.Join(usr.HomeDir, common.DefaultBaseDir)
baseDir = filepath.Join(usr.HomeDir, corecommon.DefaultBaseDir)
fmt.Printf("--base-dir unspecified, using: %s\n", baseDir)
time.Sleep(1 * time.Second)
}
if version == "" {
version = fmt.Sprintf("1.12.5-%s", time.Now().Format("20060102150405"))
version = fmt.Sprintf("1.12.6-%s", time.Now().Format("20060102150405"))
fmt.Printf("--version unspecified, using: %s\n", version)
time.Sleep(1 * time.Second)
}
@@ -75,11 +76,20 @@ func NewCmdRelease() *cobra.Command {
},
}
cmd.Flags().StringVarP(&baseDir, "base-dir", "b", "", "base directory of Olares, where this release will be extracted to as a new version if --extract/-e is not disabled, defaults to $HOME/"+common.DefaultBaseDir)
cmd.Flags().StringVarP(&version, "version", "v", "", "version of this release, defaults to 0.0.0-local-dev-{yyyymmddhhmmss}")
cmd.Flags().StringVar(&cdn, "cdn-service", common.DefaultOlaresCDNService, "CDN used for downloading checksums of dependencies and images")
cmd.Flags().BoolVar(&ignoreMissingImages, "ignore-missing-images", true, "ignore missing images when downloading cheksums from CDN, only disable this if no new image is added, or the build may fail because the image is not uploaded to the CDN yet")
cmd.Flags().BoolVarP(&extract, "extract", "e", true, "extract this release to --base-dir after build, this can be disabled if only the release file itself is needed")
flagSetter := config.NewFlagSetterFor(cmd)
config.AddBaseDirFlagBy(flagSetter)
config.AddVersionFlagBy(flagSetter)
config.AddCDNServiceFlagBy(flagSetter)
flagSetter.Add(common.FlagIgnoreMissingImages,
"",
true,
"ignore missing images when downloading checksums from CDN, only disable this if no new image is added, or the build may fail because the image is not uploaded to the CDN yet",
)
flagSetter.Add(common.FlagExtract,
"e",
true,
"extract this release to --base-dir after build, this can be disabled if only the release file itself is needed",
)
return cmd
}

View File

@@ -13,6 +13,7 @@ import (
"github.com/beclab/Olares/cli/cmd/ctl/user"
"github.com/beclab/Olares/cli/version"
"github.com/spf13/cobra"
"github.com/spf13/viper"
)
func NewDefaultCommand() *cobra.Command {
@@ -25,6 +26,11 @@ func NewDefaultCommand() *cobra.Command {
Short: "Olares Installer",
CompletionOptions: cobra.CompletionOptions{DisableDefaultCmd: true},
Version: version.VERSION,
PersistentPreRun: func(cmd *cobra.Command, args []string) {
viper.BindPFlags(cmd.InheritedFlags())
viper.BindPFlags(cmd.PersistentFlags())
viper.BindPFlags(cmd.Flags())
},
Run: func(cmd *cobra.Command, args []string) {
if showVendor {
fmt.Println(version.VENDOR)

View File

@@ -261,12 +261,14 @@ const (
FlagOSPassword = "os-password"
EnvLegacyEncryptedOSPassword = "TERMINUS_OS_PASSWORD"
FlagCDNService = "cdn-service"
FlagManifest = "manifest"
FlagURLOverride = "url-override"
FlagReleaseID = "release-id"
FlagKubeType = "kube-type"
FlagLegacyKubeType = "kube"
FlagCDNService = "cdn-service"
FlagExtract = "extract"
FlagIgnoreMissingImages = "ignore-missing-images"
FlagManifest = "manifest"
FlagURLOverride = "url-override"
FlagReleaseID = "release-id"
FlagKubeType = "kube-type"
FlagLegacyKubeType = "kube"
FlagEnableJuiceFS = "enable-juicefs"
FlagLegacyEnableJuiceFS = "with-juicefs"

View File

@@ -211,6 +211,9 @@ func NewArgument() *Argument {
arg.IsOlaresInContainer = os.Getenv(ENV_CONTAINER_MODE) == "oic"
si.IsOIC = arg.IsOlaresInContainer
// Ensure BaseDir is initialized before loading master.conf
// so master host config can be loaded from ${base-dir}/master.conf reliably.
arg.SetBaseDir(viper.GetString(FlagBaseDir))
arg.loadMasterHostConfig()
return arg
}

View File

@@ -18,7 +18,6 @@ func AddNodePipeline() error {
}
arg.SetOlaresVersion(viper.GetString(common.FlagVersion))
arg.SetBaseDir(viper.GetString(common.FlagBaseDir))
arg.SetConsoleLog("addnode.log", true)
if err := arg.MasterHostConfig.Validate(); err != nil {

View File

@@ -19,7 +19,6 @@ func ChangeIPPipeline() error {
var arg = common.NewArgument()
arg.SetOlaresVersion(terminusVersion)
arg.SetBaseDir(viper.GetString(common.FlagBaseDir))
arg.SetConsoleLog("changeip.log", true)
arg.SetKubeVersion(kubeType)
arg.SetMinikubeProfile(viper.GetString(common.FlagMiniKubeProfile))

View File

@@ -12,7 +12,6 @@ import (
func CheckDownloadInstallationPackage() error {
arg := common.NewArgument()
arg.SetOlaresVersion(viper.GetString(common.FlagVersion))
arg.SetBaseDir(viper.GetString(common.FlagBaseDir))
runtime, err := common.NewKubeRuntime(*arg)
if err != nil {

View File

@@ -13,7 +13,6 @@ import (
func DownloadInstallationPackage() error {
arg := common.NewArgument()
arg.SetBaseDir(viper.GetString(common.FlagBaseDir))
arg.SetOlaresVersion(viper.GetString(common.FlagVersion))
arg.SetOlaresCDNService(viper.GetString(common.FlagCDNService))

View File

@@ -13,7 +13,6 @@ import (
func DownloadInstallationWizard() error {
arg := common.NewArgument()
arg.SetOlaresVersion(viper.GetString(common.FlagVersion))
arg.SetBaseDir(viper.GetString(common.FlagBaseDir))
arg.SetOlaresCDNService(viper.GetString(common.FlagCDNService))
runtime, err := common.NewKubeRuntime(*arg)

View File

@@ -15,7 +15,6 @@ import (
func InstallGpuDrivers() error {
arg := common.NewArgument()
arg.SetOlaresVersion(viper.GetString(common.FlagVersion))
arg.SetBaseDir(viper.GetString(common.FlagBaseDir))
arg.SetConsoleLog("gpuinstall.log", true)
runtime, err := common.NewKubeRuntime(*arg)
if err != nil {

View File

@@ -20,7 +20,6 @@ func CliInstallTerminusPipeline() error {
}
arg := common.NewArgument()
arg.SetBaseDir(viper.GetString(common.FlagBaseDir))
arg.SetKubeVersion(viper.GetString(common.FlagKubeType))
arg.SetOlaresVersion(viper.GetString(common.FlagVersion))
arg.SetMinikubeProfile(viper.GetString(common.FlagMiniKubeProfile))

View File

@@ -8,7 +8,6 @@ import (
"github.com/beclab/Olares/cli/pkg/core/module"
"github.com/beclab/Olares/cli/pkg/core/pipeline"
"github.com/beclab/Olares/cli/pkg/terminus"
"github.com/spf13/viper"
)
func MasterInfoPipeline() error {
@@ -17,7 +16,6 @@ func MasterInfoPipeline() error {
fmt.Println("error: Only Linux nodes can be added to an Olares cluster!")
os.Exit(1)
}
arg.SetBaseDir(viper.GetString(common.FlagBaseDir))
arg.SetConsoleLog("masterinfo.log", true)
if err := arg.MasterHostConfig.Validate(); err != nil {

View File

@@ -11,7 +11,6 @@ import (
func StartPreCheckPipeline() error {
var arg = common.NewArgument()
arg.SetOlaresVersion(viper.GetString(common.FlagVersion))
arg.SetBaseDir(viper.GetString(common.FlagBaseDir))
arg.SetConsoleLog("precheck.log", true)
runtime, err := common.NewKubeRuntime(*arg)

View File

@@ -28,7 +28,6 @@ func PrepareSystemPipeline(components []string) error {
}
var arg = common.NewArgument()
arg.SetBaseDir(viper.GetString(common.FlagBaseDir))
arg.SetKubeVersion(viper.GetString(common.FlagKubeType))
arg.SetMinikubeProfile(viper.GetString(common.FlagMiniKubeProfile))
arg.SetOlaresVersion(viper.GetString(common.FlagVersion))

View File

@@ -18,7 +18,6 @@ func CliInstallStoragePipeline() error {
}
arg := common.NewArgument()
arg.SetBaseDir(viper.GetString(common.FlagBaseDir))
arg.SetOlaresVersion(viper.GetString(common.FlagVersion))
arg.SetStorage(getStorageConfig())

View File

@@ -20,7 +20,6 @@ func UninstallTerminusPipeline() error {
var arg = common.NewArgument()
arg.SetOlaresVersion(version)
arg.SetBaseDir(viper.GetString(common.FlagBaseDir))
arg.SetConsoleLog("uninstall.log", true)
arg.SetKubeVersion(kubeType)
arg.SetStorage(getStorageConfig())

View File

@@ -46,7 +46,6 @@ func UpgradeOlaresPipeline() error {
}
arg := common.NewArgument()
arg.SetBaseDir(viper.GetString(common.FlagBaseDir))
arg.SetOlaresVersion(viper.GetString(common.FlagVersion))
arg.SetConsoleLog("upgrade.log", true)
arg.SetKubeVersion(phase.GetKubeType())

View File

@@ -7,15 +7,15 @@ import (
"github.com/beclab/Olares/cli/pkg/core/task"
)
type upgrader_1_12_5_20260122 struct {
type upgrader_1_12_6_20260122 struct {
breakingUpgraderBase
}
func (u upgrader_1_12_5_20260122) Version() *semver.Version {
return semver.MustParse("1.12.3-20260122")
func (u upgrader_1_12_6_20260122) Version() *semver.Version {
return semver.MustParse("1.12.6-20260122")
}
func (u upgrader_1_12_5_20260122) UpgradeSystemComponents() []task.Interface {
func (u upgrader_1_12_6_20260122) UpgradeSystemComponents() []task.Interface {
pre := []task.Interface{
&task.LocalTask{
Name: "UpgradeL4BFLProxy",
@@ -28,5 +28,5 @@ func (u upgrader_1_12_5_20260122) UpgradeSystemComponents() []task.Interface {
}
func init() {
registerDailyUpgrader(upgrader_1_12_5_20260122{})
registerDailyUpgrader(upgrader_1_12_6_20260122{})
}

View File

@@ -429,7 +429,7 @@ const side = {
},
{
text: "本地访问 Olares",
link: "/manual/best-practices/local-access",
link: "/zh/manual/best-practices/local-access",
},
],
},

View File

@@ -170,7 +170,7 @@ spec:
priorityClassName: "system-cluster-critical"
containers:
- name: app-service
image: beclab/app-service:0.4.77
image: beclab/app-service:0.4.78
imagePullPolicy: IfNotPresent
ports:
- containerPort: 6755

View File

@@ -126,15 +126,16 @@ type UpgradeRequest struct {
// InstallRequest represents a request to install an application.
type InstallRequest struct {
Dev bool `json:"devMode"`
RepoURL string `json:"repoUrl"`
CfgURL string `json:"cfgUrl"`
Source AppSource `json:"source"`
Images []Image `json:"images"`
Envs []sysv1alpha1.AppEnvVar `json:"envs"`
RawAppName string `json:"rawAppName"`
Title string `json:"title"`
Entrances []EntranceClone `json:"entrances"`
Dev bool `json:"devMode"`
RepoURL string `json:"repoUrl"`
CfgURL string `json:"cfgUrl"`
Source AppSource `json:"source"`
Images []Image `json:"images"`
Envs []sysv1alpha1.AppEnvVar `json:"envs"`
RawAppName string `json:"rawAppName"`
Title string `json:"title"`
Entrances []EntranceClone `json:"entrances"`
SelectedGpuType string `json:"selectedGpuType"`
}
type Image struct {

View File

@@ -3,11 +3,14 @@ package apiserver
import (
"context"
"encoding/json"
"fmt"
"os"
"sort"
"strconv"
"strings"
"golang.org/x/exp/maps"
"github.com/beclab/Olares/framework/app-service/api/app.bytetrade.io/v1alpha1"
"github.com/beclab/Olares/framework/app-service/pkg/apiserver/api"
"github.com/beclab/Olares/framework/app-service/pkg/appcfg"
@@ -946,12 +949,37 @@ func (h *Handler) oamValues(req *restful.Request, resp *restful.Response) {
api.HandleError(resp, req, err)
return
}
gpuType, err := utils.FindGpuTypeFromNodes(&nodes)
gpuTypes, err := utils.GetAllGpuTypesFromNodes(&nodes)
if err != nil {
klog.Errorf("get gpu type failed %v", gpuType)
klog.Errorf("get gpu type failed %v", err)
api.HandleError(resp, req, err)
return
}
gpuType := "none"
selectedGpuType := req.QueryParameter("gputype")
if len(gpuTypes) > 0 {
if selectedGpuType != "" {
if _, ok := gpuTypes[selectedGpuType]; ok {
gpuType = selectedGpuType
} else {
err := fmt.Errorf("selected gpu type %s not found in cluster", selectedGpuType)
klog.Error(err)
api.HandleError(resp, req, err)
return
}
} else {
if len(gpuTypes) == 1 {
gpuType = maps.Keys(gpuTypes)[0]
} else {
err := fmt.Errorf("multiple gpu types found in cluster, please specify one")
klog.Error(err)
api.HandleError(resp, req, err)
return
}
}
}
values["GPU"] = map[string]interface{}{
"Type": gpuType,
"Cuda": os.Getenv("OLARES_SYSTEM_CUDA_VERSION"),

View File

@@ -1,174 +1,33 @@
package apiserver
import (
"fmt"
"sync"
"time"
"github.com/beclab/Olares/framework/app-service/pkg/apiserver/api"
"github.com/beclab/Olares/framework/app-service/pkg/client/clientset"
"github.com/beclab/Olares/framework/app-service/pkg/constants"
"github.com/beclab/Olares/framework/app-service/pkg/utils"
"golang.org/x/exp/maps"
"sigs.k8s.io/controller-runtime/pkg/client"
"github.com/emicklei/go-restful/v3"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/client-go/util/retry"
"k8s.io/klog/v2"
)
var running bool = false
var switchLock sync.Mutex
func (h *Handler) disableGpuManagedMemory(req *restful.Request, resp *restful.Response) {
if err := h.nvshareSwitch(req, false); err != nil {
api.HandleError(resp, req, &errors.StatusError{
ErrStatus: metav1.Status{Code: 400, Message: "operation failed, " + err.Error()},
})
func (h *Handler) getGpuTypes(req *restful.Request, resp *restful.Response) {
var nodes corev1.NodeList
err := h.ctrlClient.List(req.Request.Context(), &nodes, &client.ListOptions{})
if err != nil {
klog.Errorf("list node failed %v", err)
api.HandleError(resp, req, err)
return
}
resp.WriteAsJson(map[string]int{"code": 0})
}
func (h *Handler) enableGpuManagedMemory(req *restful.Request, resp *restful.Response) {
if err := h.nvshareSwitch(req, true); err != nil {
api.HandleError(resp, req, &errors.StatusError{
ErrStatus: metav1.Status{Code: 400, Message: "operation failed, " + err.Error()},
})
gpuTypes, err := utils.GetAllGpuTypesFromNodes(&nodes)
if err != nil {
klog.Errorf("get gpu type failed %v", err)
api.HandleError(resp, req, err)
return
}
resp.WriteAsJson(map[string]int{"code": 0})
}
func (h *Handler) nvshareSwitch(req *restful.Request, enable bool) error {
client := req.Attribute(constants.KubeSphereClientAttribute).(*clientset.ClientSet)
switchLock.Lock()
defer switchLock.Unlock()
if running {
return fmt.Errorf("last operation is still running")
}
deployments, err := client.KubeClient.Kubernetes().AppsV1().Deployments("").List(req.Request.Context(), metav1.ListOptions{})
if err != nil {
klog.Error("list deployment error, ", err)
return err
}
envValue := "0"
if enable {
envValue = "1"
}
for _, d := range deployments.Items {
shouldUpdate := false
for i, c := range d.Spec.Template.Spec.Containers {
found := false
for k := range c.Resources.Limits {
if k == constants.NvshareGPU {
found = true
break
}
}
if found {
// a gpu request container
addEnv := true
for n, env := range d.Spec.Template.Spec.Containers[i].Env {
if env.Name == constants.EnvNvshareManagedMemory {
addEnv = false
d.Spec.Template.Spec.Containers[i].Env[n].Value = envValue
break
}
}
if addEnv {
d.Spec.Template.Spec.Containers[i].Env =
append(d.Spec.Template.Spec.Containers[i].Env,
corev1.EnvVar{Name: constants.EnvNvshareManagedMemory, Value: envValue})
}
shouldUpdate = true
} // end found
} // end of container loop
if shouldUpdate {
err = retry.RetryOnConflict(retry.DefaultRetry, func() error {
deployment, err := client.KubeClient.Kubernetes().AppsV1().Deployments(d.Namespace).
Get(req.Request.Context(), d.Name, metav1.GetOptions{})
if err != nil {
return err
}
deployment.Spec.Template.Spec.Containers = d.Spec.Template.Spec.Containers
_, err = client.KubeClient.Kubernetes().AppsV1().Deployments(d.Namespace).
Update(req.Request.Context(), deployment, metav1.UpdateOptions{})
return err
})
if err != nil {
klog.Error("update deployment error, ", err, ", ", d.Name, ", ", d.Namespace)
return err
}
} // should update
} // end of deployment loop
// update terminus
err = retry.RetryOnConflict(retry.DefaultRetry, func() error {
terminus, err := utils.GetTerminus(req.Request.Context(), h.ctrlClient)
if err != nil {
return err
}
terminus.Spec.Settings[constants.EnvNvshareManagedMemory] = envValue
return h.ctrlClient.Update(req.Request.Context(), terminus)
})
if err != nil {
klog.Error("update terminus error, ", err)
return err
}
running = true
// delay 30s, assume the all pods will be reload in 30s.
delay := time.NewTimer(30 * time.Second)
go func() {
<-delay.C
switchLock.Lock()
defer switchLock.Unlock()
running = false
}()
return nil
}
func (h *Handler) getManagedMemoryValue(req *restful.Request, resp *restful.Response) {
terminus, err := utils.GetTerminus(req.Request.Context(), h.ctrlClient)
if err != nil {
klog.Error("get terminus value error, ", err)
api.HandleError(resp, req, &errors.StatusError{
ErrStatus: metav1.Status{Code: 400, Message: "get value error, " + err.Error()},
})
return
}
managed := true
if v, ok := terminus.Spec.Settings[constants.EnvNvshareManagedMemory]; ok && v == "0" {
managed = false
}
resp.WriteAsJson(&map[string]interface{}{
"managed_memory": managed,
"gpu_types": maps.Keys(gpuTypes),
},
)
}

View File

@@ -21,9 +21,12 @@ import (
"github.com/beclab/Olares/framework/app-service/pkg/utils"
apputils "github.com/beclab/Olares/framework/app-service/pkg/utils/app"
"github.com/beclab/Olares/framework/app-service/pkg/utils/config"
"golang.org/x/exp/maps"
"sigs.k8s.io/controller-runtime/pkg/client"
"github.com/emicklei/go-restful/v3"
"helm.sh/helm/v3/pkg/time"
corev1 "k8s.io/api/core/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
@@ -37,7 +40,7 @@ type depRequest struct {
type installHelperIntf interface {
getAdminUsers() (admin []string, isAdmin bool, err error)
getInstalledApps() (installed bool, app []*v1alpha1.Application, err error)
getAppConfig(adminUsers []string, marketSource string, isAdmin, appInstalled bool, installedApps []*v1alpha1.Application, chartVersion string) (err error)
getAppConfig(adminUsers []string, marketSource string, isAdmin, appInstalled bool, installedApps []*v1alpha1.Application, chartVersion, selectedGpuType string) (err error)
setAppConfig(req *api.InstallRequest, appName string)
validate(bool, []*v1alpha1.Application) error
setAppEnv(overrides []sysv1alpha1.AppEnvVar) error
@@ -105,6 +108,36 @@ func (h *Handler) install(req *restful.Request, resp *restful.Response) {
}
}
// check selected gpu type can be supported
// if selectedGpuType != "" , then check if the gpu type exists in cluster
// if selectedGpuType == "" , and only one gpu type exists in cluster, then use it
var nodes corev1.NodeList
err = h.ctrlClient.List(req.Request.Context(), &nodes, &client.ListOptions{})
if err != nil {
klog.Errorf("list node failed %v", err)
api.HandleError(resp, req, err)
return
}
gpuTypes, err := utils.GetAllGpuTypesFromNodes(&nodes)
if err != nil {
klog.Errorf("get gpu type failed %v", err)
api.HandleError(resp, req, err)
return
}
if insReq.SelectedGpuType != "" {
if _, ok := gpuTypes[insReq.SelectedGpuType]; !ok {
klog.Errorf("selected gpu type %s not found in cluster", insReq.SelectedGpuType)
api.HandleBadRequest(resp, req, fmt.Errorf("selected gpu type %s not found in cluster", insReq.SelectedGpuType))
return
}
} else {
if len(gpuTypes) == 1 {
insReq.SelectedGpuType = maps.Keys(gpuTypes)[0]
klog.Infof("only one gpu type %s found in cluster, use it as selected gpu type", insReq.SelectedGpuType)
}
}
apiVersion, appCfg, err := apputils.GetApiVersionFromAppConfig(req.Request.Context(), &apputils.ConfigOptions{
App: app,
RawAppName: rawAppName,
@@ -112,6 +145,7 @@ func (h *Handler) install(req *restful.Request, resp *restful.Response) {
RepoURL: insReq.RepoURL,
MarketSource: marketSource,
Version: chartVersion,
SelectedGpu: insReq.SelectedGpuType,
})
klog.Infof("chartVersion: %s", chartVersion)
if err != nil {
@@ -188,7 +222,7 @@ func (h *Handler) install(req *restful.Request, resp *restful.Response) {
return
}
err = helper.getAppConfig(adminUsers, marketSource, isAdmin, appInstalled, installedApps, chartVersion)
err = helper.getAppConfig(adminUsers, marketSource, isAdmin, appInstalled, installedApps, chartVersion, insReq.SelectedGpuType)
if err != nil {
klog.Errorf("Failed to get app config err=%v", err)
return
@@ -423,7 +457,7 @@ func (h *installHandlerHelper) getInstalledApps() (installed bool, app []*v1alph
return
}
func (h *installHandlerHelper) getAppConfig(adminUsers []string, marketSource string, isAdmin, appInstalled bool, installedApps []*v1alpha1.Application, chartVersion string) (err error) {
func (h *installHandlerHelper) getAppConfig(adminUsers []string, marketSource string, isAdmin, appInstalled bool, installedApps []*v1alpha1.Application, chartVersion, selectedGpuType string) (err error) {
var (
admin string
installAsAdmin bool
@@ -472,6 +506,7 @@ func (h *installHandlerHelper) getAppConfig(adminUsers []string, marketSource st
Admin: admin,
IsAdmin: installAsAdmin,
MarketSource: marketSource,
SelectedGpu: selectedGpuType,
})
if err != nil {
klog.Errorf("Failed to get appconfig err=%v", err)
@@ -685,7 +720,7 @@ func (h *installHandlerHelperV2) _validateClusterScope(isAdmin bool, installedAp
return nil
}
func (h *installHandlerHelperV2) getAppConfig(adminUsers []string, marketSource string, isAdmin, appInstalled bool, installedApps []*v1alpha1.Application, chartVersion string) (err error) {
func (h *installHandlerHelperV2) getAppConfig(adminUsers []string, marketSource string, isAdmin, appInstalled bool, installedApps []*v1alpha1.Application, chartVersion, selectedGpuType string) (err error) {
klog.Info("get app config for install handler v2")
var (
@@ -713,6 +748,7 @@ func (h *installHandlerHelperV2) getAppConfig(adminUsers []string, marketSource
Admin: admin,
MarketSource: marketSource,
IsAdmin: isAdmin,
SelectedGpu: selectedGpuType,
})
if err != nil {
klog.Errorf("Failed to get appconfig err=%v", err)

View File

@@ -13,6 +13,7 @@ import (
"github.com/beclab/Olares/framework/app-service/api/app.bytetrade.io/v1alpha1"
"github.com/beclab/Olares/framework/app-service/pkg/apiserver/api"
"github.com/beclab/Olares/framework/app-service/pkg/appcfg"
"github.com/beclab/Olares/framework/app-service/pkg/appinstaller"
"github.com/beclab/Olares/framework/app-service/pkg/appstate"
"github.com/beclab/Olares/framework/app-service/pkg/client/clientset"
"github.com/beclab/Olares/framework/app-service/pkg/constants"
@@ -520,6 +521,7 @@ type applicationPermission struct {
Permissions []permission `json:"permissions"`
}
// Deprecated
func (h *Handler) applicationPermissionList(req *restful.Request, resp *restful.Response) {
owner := req.Attribute(constants.UserContextAttribute).(string)
//token := req.HeaderParameter(constants.AuthorizationTokenKey)
@@ -572,46 +574,39 @@ func (h *Handler) applicationPermissionList(req *restful.Request, resp *restful.
func (h *Handler) getApplicationPermission(req *restful.Request, resp *restful.Response) {
app := req.PathParameter(ParamAppName)
owner := req.Attribute(constants.UserContextAttribute).(string)
client, err := dynamic.NewForConfig(h.kubeConfig)
name, err := apputils.FmtAppMgrName(app, owner, "")
if err != nil {
api.HandleError(resp, req, err)
return
}
var am v1alpha1.ApplicationManager
err = h.ctrlClient.Get(req.Request.Context(), types.NamespacedName{Name: name}, &am)
if err != nil {
api.HandleError(resp, req, err)
return
}
var appConfig appcfg.ApplicationConfig
err = am.GetAppConfig(&appConfig)
if err != nil {
klog.Errorf("Failed to get app config err=%v", err)
api.HandleError(resp, req, err)
return
}
var ret *applicationPermission
apClient := provider.NewApplicationPermissionRequest(client)
namespace := fmt.Sprintf("user-system-%s", owner)
aps, err := apClient.List(req.Request.Context(), namespace, metav1.ListOptions{})
if err != nil {
api.HandleError(resp, req, err)
return
}
for _, ap := range aps.Items {
if ap.Object == nil {
continue
}
appName, _, _ := unstructured.NestedString(ap.Object, "spec", "app")
if appName == app {
perms, _, _ := unstructured.NestedSlice(ap.Object, "spec", "permissions")
permissions := appinstaller.ParseAppPermission(appConfig.Permission)
for _, ap := range permissions {
if perms, ok := ap.([]appcfg.ProviderPermission); ok {
permissions := make([]permission, 0)
for _, p := range perms {
if perm, ok := p.(map[string]interface{}); ok {
ops := make([]string, 0)
for _, op := range perm["ops"].([]interface{}) {
if opStr, ok := op.(string); ok {
ops = append(ops, opStr)
}
}
permissions = append(permissions, permission{
DataType: perm["dataType"].(string),
Group: perm["group"].(string),
Version: perm["version"].(string),
Ops: ops,
})
}
permissions = append(permissions, permission{
DataType: p.ProviderName,
Group: p.AppName,
})
}
ret = &applicationPermission{
App: appName,
App: am.Spec.AppName,
Owner: owner,
Permissions: permissions,
}
@@ -642,6 +637,7 @@ type opApi struct {
URI string `json:"uri"`
}
// Deprecated
func (h *Handler) getProviderRegistry(req *restful.Request, resp *restful.Response) {
dataTypeReq := req.PathParameter(ParamDataType)
groupReq := req.PathParameter(ParamGroup)
@@ -708,56 +704,44 @@ func (h *Handler) getProviderRegistry(req *restful.Request, resp *restful.Respon
func (h *Handler) getApplicationProviderList(req *restful.Request, resp *restful.Response) {
owner := req.Attribute(constants.UserContextAttribute).(string)
app := req.PathParameter(ParamAppName)
client, err := dynamic.NewForConfig(h.kubeConfig)
name, err := apputils.FmtAppMgrName(app, owner, "")
if err != nil {
api.HandleError(resp, req, err)
return
}
var am v1alpha1.ApplicationManager
err = h.ctrlClient.Get(req.Request.Context(), types.NamespacedName{Name: name}, &am)
if err != nil {
api.HandleError(resp, req, err)
return
}
var appConfig appcfg.ApplicationConfig
err = am.GetAppConfig(&appConfig)
if err != nil {
klog.Errorf("Failed to get app config err=%v", err)
api.HandleError(resp, req, err)
return
}
ret := make([]providerRegistry, 0)
rClient := provider.NewRegistryRequest(client)
namespace := fmt.Sprintf("user-system-%s", owner)
prs, err := rClient.List(req.Request.Context(), namespace, metav1.ListOptions{})
if err != nil {
api.HandleError(resp, req, err)
return
}
for _, ap := range prs.Items {
if ap.Object == nil {
continue
}
deployment, _, _ := unstructured.NestedString(ap.Object, "spec", "deployment")
kind, _, _ := unstructured.NestedString(ap.Object, "spec", "kind")
if app == deployment && kind == "provider" {
dataType, _, _ := unstructured.NestedString(ap.Object, "spec", "dataType")
group, _, _ := unstructured.NestedString(ap.Object, "spec", "group")
description, _, _ := unstructured.NestedString(ap.Object, "spec", "description")
endpoint, _, _ := unstructured.NestedString(ap.Object, "spec", "endpoint")
ns, _, _ := unstructured.NestedString(ap.Object, "spec", "namespace")
version, _, _ := unstructured.NestedString(ap.Object, "spec", "version")
opApis := make([]opApi, 0)
opApiList, _, _ := unstructured.NestedSlice(ap.Object, "spec", "opApis")
for _, op := range opApiList {
if aop, ok := op.(map[string]interface{}); ok {
opApis = append(opApis, opApi{
Name: aop["name"].(string),
URI: aop["uri"].(string),
})
}
}
ret = append(ret, providerRegistry{
DataType: dataType,
Deployment: deployment,
Description: description,
Endpoint: endpoint,
Kind: kind,
Group: group,
Namespace: ns,
OpApis: opApis,
Version: version,
ns := am.Spec.AppNamespace
for _, ap := range appConfig.Provider {
dataType := ap.Name
endpoint := ap.Entrance
opApis := make([]opApi, 0)
for _, op := range ap.Paths {
opApis = append(opApis, opApi{
URI: op,
})
}
ret = append(ret, providerRegistry{
DataType: dataType,
Endpoint: endpoint,
Namespace: ns,
OpApis: opApis,
})
}
resp.WriteAsJson(ret)
}

View File

@@ -37,7 +37,6 @@ import (
"k8s.io/apimachinery/pkg/types"
"k8s.io/client-go/dynamic"
"k8s.io/klog/v2"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/webhook/admission"
)
@@ -308,36 +307,21 @@ func (h *Handler) gpuLimitMutate(ctx context.Context, req *admissionv1.Admission
return resp
}
GPUType, err := h.findNvidiaGpuFromNodes(ctx)
if err != nil && !errors.Is(err, api.ErrGPUNodeNotFound) {
return h.sidecarWebhook.AdmissionError(req.UID, err)
}
GPUType := appcfg.GetSelectedGpuTypeValue()
// no gpu found, no need to inject env, just return.
if GPUType == "" {
if GPUType == "none" || GPUType == "" {
return resp
}
terminus, err := utils.GetTerminus(ctx, h.ctrlClient)
if err != nil {
return h.sidecarWebhook.AdmissionError(req.UID, err)
}
nvshareManagedMemory := ""
if terminus.Spec.Settings != nil {
nvshareManagedMemory = terminus.Spec.Settings[constants.EnvNvshareManagedMemory]
envs := []webhook.EnvKeyValue{
{
Key: constants.EnvGPUType,
Value: GPUType,
},
}
envs := []webhook.EnvKeyValue{}
if nvshareManagedMemory != "" {
envs = append(envs, webhook.EnvKeyValue{
Key: constants.EnvNvshareManagedMemory,
Value: nvshareManagedMemory,
})
}
envs = append(envs, webhook.EnvKeyValue{Key: "NVSHARE_DEBUG", Value: "1"})
patchBytes, err := webhook.CreatePatchForDeployment(tpl, req.Namespace, gpuRequired, GPUType, envs)
patchBytes, err := webhook.CreatePatchForDeployment(tpl, h.getGPUResourceTypeKey(GPUType), envs)
if err != nil {
klog.Errorf("create patch error %v", err)
return h.sidecarWebhook.AdmissionError(req.UID, err)
@@ -347,33 +331,17 @@ func (h *Handler) gpuLimitMutate(ctx context.Context, req *admissionv1.Admission
return resp
}
func (h *Handler) findNvidiaGpuFromNodes(ctx context.Context) (string, error) {
var nodes corev1.NodeList
err := h.ctrlClient.List(ctx, &nodes, &client.ListOptions{})
if err != nil {
return "", err
func (h *Handler) getGPUResourceTypeKey(gpuType string) string {
switch gpuType {
case utils.NvidiaCardType:
return constants.NvidiaGPU
case utils.GB10ChipType:
return constants.NvidiaGB10GPU
case utils.AmdApuCardType:
return constants.AMDAPU
default:
return ""
}
// return nvshare gpu or virtaitech gpu in priority
gtype := ""
for _, n := range nodes.Items {
if _, ok := n.Status.Capacity[constants.NvidiaGPU]; ok {
if _, ok = n.Status.Capacity[constants.NvshareGPU]; ok {
return constants.NvshareGPU, nil
}
gtype = constants.NvidiaGPU
}
if _, ok := n.Status.Capacity[constants.VirtAiTechVGPU]; ok {
return constants.VirtAiTechVGPU, nil
}
}
if gtype != "" {
return gtype, nil
}
return "", api.ErrGPUNodeNotFound
}
func (h *Handler) providerRegistryValidate(req *restful.Request, resp *restful.Response) {

View File

@@ -340,7 +340,9 @@ func GetClusterResource(kubeConfig *rest.Config, token string) (*prometheus.Clus
arches.Insert(n.Labels["kubernetes.io/arch"])
if quantity, ok := n.Status.Capacity[constants.NvidiaGPU]; ok {
total += quantity.AsApproximateFloat64()
} else if quantity, ok = n.Status.Capacity[constants.VirtAiTechVGPU]; ok {
} else if quantity, ok = n.Status.Capacity[constants.NvidiaGB10GPU]; ok {
total += quantity.AsApproximateFloat64()
} else if quantity, ok = n.Status.Capacity[constants.AMDAPU]; ok {
total += quantity.AsApproximateFloat64()
}
}

View File

@@ -254,21 +254,9 @@ func addServiceToContainer(c *restful.Container, handler *Handler) error {
Param(ws.PathParameter(ParamEntranceName, "the name of a application entrance")).
Returns(http.StatusOK, "Success to set the application entrance policy", nil))
ws.Route(ws.POST("/gpu/disable/managed-memory").
To(handler.disableGpuManagedMemory).
Doc("disable nvshare's managed memory ").
Metadata(restfulspec.KeyOpenAPITags, MODULE_TAGS).
Returns(http.StatusOK, "Success to disable", nil))
ws.Route(ws.POST("/gpu/enable/managed-memory").
To(handler.enableGpuManagedMemory).
Doc("enable nvshare's managed memory ").
Metadata(restfulspec.KeyOpenAPITags, MODULE_TAGS).
Returns(http.StatusOK, "Success to enable", nil))
ws.Route(ws.GET("/gpu/managed-memory").
To(handler.getManagedMemoryValue).
Doc("get nvshare's managed memory enabled or not").
ws.Route(ws.GET("/gpu/types").
To(handler.getGpuTypes).
Doc("get all gpu types in the cluster").
Metadata(restfulspec.KeyOpenAPITags, MODULE_TAGS).
Returns(http.StatusOK, "Success to get ", &ResultResponse{}))

View File

@@ -56,14 +56,19 @@ type AppSpec struct {
Developer string `yaml:"developer" json:"developer"`
RequiredMemory string `yaml:"requiredMemory" json:"requiredMemory"`
RequiredDisk string `yaml:"requiredDisk" json:"requiredDisk"`
SupportClient SupportClient `yaml:"supportClient" json:"supportClient"`
RequiredGPU string `yaml:"requiredGpu" json:"requiredGpu"`
RequiredCPU string `yaml:"requiredCpu" json:"requiredCpu"`
LimitedMemory string `yaml:"limitedMemory" json:"limitedMemory"`
LimitedDisk string `yaml:"limitedDisk" json:"limitedDisk"`
LimitedGPU string `yaml:"limitedGPU" json:"limitedGPU"`
LimitedCPU string `yaml:"limitedCPU" json:"limitedCPU"`
SupportClient SupportClient `yaml:"supportClient" json:"supportClient"`
RunAsUser bool `yaml:"runAsUser" json:"runAsUser"`
RunAsInternal bool `yaml:"runAsInternal" json:"runAsInternal"`
PodGPUConsumePolicy string `yaml:"podGpuConsumePolicy" json:"podGpuConsumePolicy"`
SubCharts []Chart `yaml:"subCharts" json:"subCharts"`
Hardware Hardware `yaml:"hardware" json:"hardware"`
SupportedGpu []any `yaml:"supportedGpu,omitempty" json:"supportedGpu,omitempty"`
}
type Hardware struct {
@@ -188,6 +193,17 @@ type Provider struct {
Verbs []string `yaml:"verbs" json:"verbs"`
}
// SpecialResource holds optional resource overrides expressed as quantity
// strings. Every field is a pointer and tagged omitempty, so an absent key
// (nil) can be told apart from an explicitly empty value.
// NOTE(review): appears to be the per-GPU-type override decoded from a
// supportedGpu entry — confirm against the code that loads the app config.
type SpecialResource struct {
// Required* fields: requested amount of each resource.
RequiredMemory *string `yaml:"requiredMemory,omitempty" json:"requiredMemory,omitempty"`
RequiredDisk *string `yaml:"requiredDisk,omitempty" json:"requiredDisk,omitempty"`
RequiredGPU *string `yaml:"requiredGpu,omitempty" json:"requiredGpu,omitempty"`
RequiredCPU *string `yaml:"requiredCpu,omitempty" json:"requiredCpu,omitempty"`
// Limited* fields: upper limit of each resource. Note the tag casing
// differs from the Required* GPU/CPU tags ("limitedGPU" vs "requiredGpu").
LimitedMemory *string `yaml:"limitedMemory,omitempty" json:"limitedMemory,omitempty"`
LimitedDisk *string `yaml:"limitedDisk,omitempty" json:"limitedDisk,omitempty"`
LimitedGPU *string `yaml:"limitedGPU,omitempty" json:"limitedGPU,omitempty"`
LimitedCPU *string `yaml:"limitedCPU,omitempty" json:"limitedCPU,omitempty"`
}
func (c *Chart) Namespace(owner string) string {
if c.Shared {
return fmt.Sprintf("%s-%s", c.Name, "shared")

View File

@@ -100,6 +100,7 @@ type ApplicationConfig struct {
PodsSelectors []metav1.LabelSelector
HardwareRequirement Hardware
SharedEntrances []v1alpha1.Entrance
SelectedGpuType string
}
func (c *ApplicationConfig) IsMiddleware() bool {
@@ -159,6 +160,13 @@ func (c *ApplicationConfig) GenSharedEntranceURL(ctx context.Context) ([]v1alpha
return app.GenSharedEntranceURL(ctx)
}
// GetSelectedGpuTypeValue returns the GPU type selected for this application,
// or the literal "none" when no GPU type has been selected.
func (c *ApplicationConfig) GetSelectedGpuTypeValue() string {
	if selected := c.SelectedGpuType; selected != "" {
		return selected
	}
	return "none"
}
func (p *ProviderPermission) GetNamespace(ownerName string) string {
if p.Namespace != "" {
if p.Namespace == "user-space" || p.Namespace == "user-system" {

View File

@@ -752,7 +752,7 @@ func getApplicationPolicy(policies []appcfg.AppPolicy, entrances []appv1alpha1.E
return string(policyStr), nil
}
func parseAppPermission(data []appcfg.AppPermission) []appcfg.AppPermission {
func ParseAppPermission(data []appcfg.AppPermission) []appcfg.AppPermission {
permissions := make([]appcfg.AppPermission, 0)
for _, p := range data {
switch perm := p.(type) {

View File

@@ -78,7 +78,7 @@ func (h *HelmOps) Uninstall_(client kubernetes.Interface, actionConfig *action.C
return err
}
h.app.Permission = parseAppPermission(h.app.Permission)
h.app.Permission = ParseAppPermission(h.app.Permission)
var perm []appcfg.ProviderPermission
for _, p := range h.app.Permission {
if t, ok := p.([]appcfg.ProviderPermission); ok {

View File

@@ -50,7 +50,7 @@ func (h *HelmOps) SetValues() (values map[string]interface{}, err error) {
values["domain"] = entries
userspace := make(map[string]interface{})
h.app.Permission = parseAppPermission(h.app.Permission)
h.app.Permission = ParseAppPermission(h.app.Permission)
for _, p := range h.app.Permission {
switch perm := p.(type) {
case appcfg.AppDataPermission, appcfg.AppCachePermission, appcfg.UserDataPermission:
@@ -170,17 +170,12 @@ func (h *HelmOps) SetValues() (values map[string]interface{}, err error) {
values["cluster"] = map[string]interface{}{
"arch": arch,
}
gpuType, err := utils.FindGpuTypeFromNodes(nodes)
if err != nil {
klog.Errorf("Failed to get gpuType err=%v", err)
return values, err
}
values["GPU"] = map[string]interface{}{
"Type": gpuType,
"Type": h.app.GetSelectedGpuTypeValue(),
"Cuda": os.Getenv("OLARES_SYSTEM_CUDA_VERSION"),
}
values["gpu"] = gpuType
values["gpu"] = h.app.GetSelectedGpuTypeValue()
if h.app.OIDC.Enabled {
err = h.createOIDCClient(values, zone, h.app.Namespace)

View File

@@ -16,7 +16,6 @@ import (
apputils "github.com/beclab/Olares/framework/app-service/pkg/utils/app"
"github.com/pkg/errors"
corev1 "k8s.io/api/core/v1"
"k8s.io/klog/v2"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
@@ -176,19 +175,8 @@ func (p *DownloadingApp) exec(ctx context.Context) error {
},
}
var nodes corev1.NodeList
err = p.client.List(ctx, &nodes, &client.ListOptions{})
if err != nil {
klog.Errorf("list node failed %v", err)
return err
}
gpuType, err := utils.FindGpuTypeFromNodes(&nodes)
if err != nil {
klog.Errorf("get gpu type failed %v", gpuType)
return err
}
values["GPU"] = map[string]interface{}{
"Type": gpuType,
"Type": appConfig.GetSelectedGpuTypeValue(),
"Cuda": os.Getenv("OLARES_SYSTEM_CUDA_VERSION"),
}

View File

@@ -22,7 +22,6 @@ import (
"github.com/pkg/errors"
"helm.sh/helm/v3/pkg/action"
corev1 "k8s.io/api/core/v1"
"k8s.io/klog/v2"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
@@ -214,19 +213,8 @@ func (p *UpgradingApp) exec(ctx context.Context) error {
"username": p.manager.Spec.AppOwner,
},
}
var nodes corev1.NodeList
err = p.client.List(ctx, &nodes, &client.ListOptions{})
if err != nil {
klog.Errorf("list node failed %v", err)
return err
}
gpuType, err := utils.FindGpuTypeFromNodes(&nodes)
if err != nil {
klog.Errorf("get gpu type failed %v", gpuType)
return err
}
values["GPU"] = map[string]interface{}{
"Type": gpuType,
"Type": appConfig.GetSelectedGpuTypeValue(),
"Cuda": os.Getenv("OLARES_SYSTEM_CUDA_VERSION"),
}

View File

@@ -78,13 +78,15 @@ const (
SidecarInitContainerName = "olares-sidecar-init"
EnvoyConfigWorkDirName = "envoy-config"
ByteTradeAuthor = "bytetrade.io"
NvshareGPU = "nvshare.com/gpu"
NvidiaGPU = "nvidia.com/gpu"
VirtAiTechVGPU = "virtaitech.com/gpu"
PatchOpAdd = "add"
PatchOpReplace = "replace"
EnvNvshareManagedMemory = "NVSHARE_MANAGED_MEMORY"
ByteTradeAuthor = "bytetrade.io"
PatchOpAdd = "add"
PatchOpReplace = "replace"
EnvGPUType = "GPU_TYPE"
// gpu resource keys
NvidiaGPU = "nvidia.com/gpu"
NvidiaGB10GPU = "nvidia.com/gb10"
AMDAPU = "amd.com/apu"
AuthorizationLevelOfPublic = "public"
AuthorizationLevelOfPrivate = "private"

View File

@@ -273,11 +273,7 @@ func (c *Creator) installSysApps(ctx context.Context, bflPod *corev1.Pod) error
"arch": arch,
}
gpuType, err := utils.FindGpuTypeFromNodes(&nodes)
if err != nil {
return err
}
vals["gpu"] = gpuType
vals["gpu"] = "none" // unused currently
userIndex, userSubnet, err := c.getUserSubnet(ctx)
if err != nil {

View File

@@ -16,6 +16,7 @@ import (
corev1 "k8s.io/api/core/v1"
sysv1alpha1 "github.com/beclab/Olares/framework/app-service/api/sys.bytetrade.io/v1alpha1"
"github.com/go-viper/mapstructure/v2"
"sigs.k8s.io/controller-runtime/pkg/client"
"github.com/beclab/Olares/framework/app-service/api/app.bytetrade.io/v1alpha1"
@@ -674,6 +675,7 @@ type ConfigOptions struct {
MarketSource string
IsAdmin bool
RawAppName string
SelectedGpu string
}
// GetAppConfig get app installation configuration from app store
@@ -740,7 +742,7 @@ func getAppConfigFromRepo(ctx context.Context, options *ConfigOptions) (*appcfg.
return getAppConfigFromConfigurationFile(options, chartPath)
}
func toApplicationConfig(app, chart, rawAppName string, cfg *appcfg.AppConfiguration) (*appcfg.ApplicationConfig, string, error) {
func toApplicationConfig(app, chart, rawAppName, selectedGpu string, cfg *appcfg.AppConfiguration) (*appcfg.ApplicationConfig, string, error) {
var permission []appcfg.AppPermission
if cfg.Permission.AppData {
permission = append(permission, appcfg.AppDataRW)
@@ -788,6 +790,57 @@ func toApplicationConfig(app, chart, rawAppName string, cfg *appcfg.AppConfigura
return nil, chart, err
}
// set supportedGpu to ["nvidia","nvidia-gb10"] by default
if len(cfg.Spec.SupportedGpu) == 0 {
cfg.Spec.SupportedGpu = []interface{}{utils.NvidiaCardType, utils.GB10ChipType}
}
// try to get selected GPU type special resource requirement
if selectedGpu != "" {
found := false
for _, supportedGpu := range cfg.Spec.SupportedGpu {
if str, ok := supportedGpu.(string); ok && str == selectedGpu {
found = true
break
}
if supportedGpuResourceMap, ok := supportedGpu.(map[string]interface{}); ok {
if resourceRequirement, ok := supportedGpuResourceMap[selectedGpu].(map[string]interface{}); ok {
found = true
var specialResource appcfg.SpecialResource
err := mapstructure.Decode(resourceRequirement, &specialResource)
if err != nil {
return nil, chart, fmt.Errorf("failed to decode special resource for selected GPU type %s: %v", selectedGpu, err)
}
for _, resSetter := range []struct {
v **resource.Quantity
s *string
}{
{v: &mem, s: specialResource.RequiredMemory},
{v: &disk, s: specialResource.RequiredDisk},
{v: &cpu, s: specialResource.RequiredCPU},
{v: &gpu, s: specialResource.RequiredGPU},
} {
if resSetter.s != nil && *resSetter.s != "" {
*resSetter.v, err = valuePtr(resource.ParseQuantity(*resSetter.s))
if err != nil {
return nil, chart, fmt.Errorf("failed to parse special resource quantity %s: %v", *resSetter.s, err)
}
}
}
break
} // end if selected gpu's resource requirement found
} // end if supportedGpu is map
} // end for supportedGpu
if !found {
return nil, chart, fmt.Errorf("selected GPU type %s is not supported", selectedGpu)
}
}
// transform from Policy to AppPolicy
var policies []appcfg.AppPolicy
for _, p := range cfg.Options.Policies {
@@ -877,6 +930,7 @@ func toApplicationConfig(app, chart, rawAppName string, cfg *appcfg.AppConfigura
PodsSelectors: podSelectors,
HardwareRequirement: cfg.Spec.Hardware,
SharedEntrances: cfg.SharedEntrances,
SelectedGpuType: selectedGpu,
}, chart, nil
}
@@ -890,7 +944,7 @@ func getAppConfigFromConfigurationFile(opt *ConfigOptions, chartPath string) (*a
return nil, chartPath, err
}
return toApplicationConfig(opt.App, chartPath, opt.RawAppName, &cfg)
return toApplicationConfig(opt.App, chartPath, opt.RawAppName, opt.SelectedGpu, &cfg)
}
func checkVersionFormat(constraint string) error {

View File

@@ -234,7 +234,9 @@ func CheckAppRequirement(token string, appConfig *appcfg.ApplicationConfig, op v
return constants.CPU, constants.SystemCPUPressure, fmt.Errorf(constants.SystemCPUPressureMessage, op)
}
}
if appConfig.Requirement.GPU != nil {
// only support nvidia gpu management by HAMi for now
if appConfig.Requirement.GPU != nil && appConfig.GetSelectedGpuTypeValue() == utils.NvidiaCardType {
if !appConfig.Requirement.GPU.IsZero() && metrics.GPU.Total <= 0 {
return constants.GPU, constants.SystemGPUNotAvailable, fmt.Errorf(constants.SystemGPUNotAvailableMessage, op)
@@ -398,7 +400,9 @@ func GetClusterResource(token string) (*prometheus.ClusterMetrics, []string, err
arches.Insert(n.Labels["kubernetes.io/arch"])
if quantity, ok := n.Status.Capacity[constants.NvidiaGPU]; ok {
total += quantity.AsApproximateFloat64()
} else if quantity, ok = n.Status.Capacity[constants.VirtAiTechVGPU]; ok {
} else if quantity, ok = n.Status.Capacity[constants.NvidiaGB10GPU]; ok {
total += quantity.AsApproximateFloat64()
} else if quantity, ok = n.Status.Capacity[constants.AMDAPU]; ok {
total += quantity.AsApproximateFloat64()
}
}

View File

@@ -0,0 +1,12 @@
package utils
const (
// NodeGPUTypeLabel is the node label key whose value names the GPU type of a node.
NodeGPUTypeLabel = "gpu.bytetrade.io/type"
)
// GPU type identifiers.
const (
NvidiaCardType = "nvidia" // handled by HAMi
AmdGpuCardType = "amd-gpu" // NOTE(review): presumably a discrete AMD GPU — confirm
AmdApuCardType = "amd-apu" // AMD APU with integrated GPU, e.g. AI Max 395
GB10ChipType = "nvidia-gb10" // NVIDIA GB10 Superchip with unified system memory
)

View File

@@ -103,24 +103,37 @@ func GetAllNodesTunnelIPCIDRs() (cidrs []string) {
return cidrs
}
func FindGpuTypeFromNodes(nodes *corev1.NodeList) (string, error) {
gpuType := "none"
// func FindGpuTypeFromNodes(nodes *corev1.NodeList) (string, error) {
// gpuType := "none"
// if nodes == nil {
// return gpuType, errors.New("empty node list")
// }
// for _, n := range nodes.Items {
// if _, ok := n.Status.Capacity[constants.NvidiaGPU]; ok {
// if _, ok = n.Status.Capacity[constants.NvshareGPU]; ok {
// return "nvshare", nil
// }
// gpuType = "nvidia"
// }
// if _, ok := n.Status.Capacity[constants.VirtAiTechVGPU]; ok {
// return "virtaitech", nil
// }
// }
// return gpuType, nil
// }
func GetAllGpuTypesFromNodes(nodes *corev1.NodeList) (map[string]struct{}, error) {
gpuTypes := make(map[string]struct{})
if nodes == nil {
return gpuType, errors.New("empty node list")
return gpuTypes, errors.New("empty node list")
}
for _, n := range nodes.Items {
if _, ok := n.Status.Capacity[constants.NvidiaGPU]; ok {
if _, ok = n.Status.Capacity[constants.NvshareGPU]; ok {
return "nvshare", nil
}
gpuType = "nvidia"
}
if _, ok := n.Status.Capacity[constants.VirtAiTechVGPU]; ok {
return "virtaitech", nil
if typeLabel, ok := n.Labels[NodeGPUTypeLabel]; ok {
gpuTypes[typeLabel] = struct{}{} // TODO: add driver version info
}
}
return gpuType, nil
return gpuTypes, nil
}
func IsNodeReady(node *corev1.Node) bool {

View File

@@ -30,7 +30,6 @@ import (
admissionv1 "k8s.io/api/admission/v1"
corev1 "k8s.io/api/core/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
"k8s.io/apimachinery/pkg/labels"
@@ -544,16 +543,21 @@ type EnvKeyValue struct {
}
// CreatePatchForDeployment add gpu env for deployment and returns patch bytes.
func CreatePatchForDeployment(tpl *corev1.PodTemplateSpec, namespace string, gpuRequired *resource.Quantity, typeKey string, envKeyValues []EnvKeyValue) ([]byte, error) {
patches, err := addResourceLimits(tpl, namespace, gpuRequired, typeKey, envKeyValues)
func CreatePatchForDeployment(tpl *corev1.PodTemplateSpec, typeKey string, envKeyValues []EnvKeyValue) ([]byte, error) {
	// Build the patch operations first; serialization only happens on success.
	ops, limitErr := addResourceLimits(tpl, typeKey, envKeyValues)
	if limitErr == nil {
		return json.Marshal(ops)
	}
	// Keep the non-nil empty slice callers received before on failure.
	return []byte{}, limitErr
}
func addResourceLimits(tpl *corev1.PodTemplateSpec, namespace string, gpuRequired *resource.Quantity, typeKey string, envKeyValues []EnvKeyValue) (patch []patchOp, err error) {
if typeKey == constants.NvidiaGPU || typeKey == constants.NvshareGPU {
func addResourceLimits(tpl *corev1.PodTemplateSpec, typeKey string, envKeyValues []EnvKeyValue) (patch []patchOp, err error) {
if typeKey == "" {
klog.Warning("No gpu type selected, skip adding resource limits")
return patch, nil
}
if typeKey == constants.NvidiaGPU || typeKey == constants.NvidiaGB10GPU {
if tpl.Spec.RuntimeClassName != nil {
patch = append(patch, patchOp{
Op: constants.PatchOpReplace,
@@ -584,7 +588,10 @@ func addResourceLimits(tpl *corev1.PodTemplateSpec, namespace string, gpuRequire
t := make(map[string]map[string]string)
t["limits"] = map[string]string{}
for k, v := range container.Resources.Limits {
if k.String() == constants.NvidiaGPU || k.String() == constants.NvshareGPU || k.String() == constants.VirtAiTechVGPU {
if k.String() == constants.NvidiaGPU ||
k.String() == constants.NvidiaGB10GPU ||
k.String() == constants.AMDAPU {
// unset all previous gpu limits
continue
}
t["limits"][k.String()] = v.String()

View File

@@ -431,7 +431,7 @@ spec:
privileged: true
containers:
- name: authelia
image: beclab/auth:0.2.45
image: beclab/auth:0.2.46
imagePullPolicy: IfNotPresent
ports:
- containerPort: 9091

View File

@@ -29,7 +29,7 @@ spec:
name: check-auth
containers:
- name: auth-front
image: beclab/login:v1.7.4
image: beclab/login:v1.8.5
imagePullPolicy: IfNotPresent
ports:
- containerPort: 80