Compare commits

...

41 Commits

Author SHA1 Message Date
eball
679bf8b662 Merge branch 'main' into cli/feat/install_on_spark
* main:
  app-service: support injecting gpu memory and container selection (#2581)
  docs: revamp the "Advanced" page (previously "Developer") (#2534)
  fix: conditionally install storage for juicefs (#2579)
  docs: add troubleshooting guide for missing apps in Market (#2574)
  feat(olares-app): update olares-app version to v1.9.6 (#2573)
  docs: update installation method of drivers on windows (#2566)
  docs: update the initialization steps for OpenClaw tutorial (#2567)
  docs: add troubleshooting guide for memory not released after stopping apps (#2565)
  docs: updates for releasing resources and uninstalling shared apps (#2568)
  feat(olares-app): update olares-app version to v1.9.5 (#2563)
  docs: update method for installing drivers on windows (#2564)
  docs: fix space nav display, extract use-case/developer sidebars, add note in space docs (#2562)
  docs: update custom domain tutorial screenshots and align copy with latest UI (#2559)
  cli: upgrade l4-bfl-proxy to v0.3.11 (#2557)
  l4-bfl-proxy: skip nginx reload if configuration has not changed (#2556)
  docs: update content related to reference app (#2530)
  Use the Robot font to match the theme of the rest of the applications
  bfl: add sync urls to master node (#2540)
2026-02-28 11:11:50 +08:00
dkeven
51137311ac feat(gpu): update HAMi to v2.6.11 2026-02-27 16:55:49 +08:00
eball
69080d5ba3 feat(gpu): remove GB10 device plugin installation and related checks 2026-02-27 14:48:31 +08:00
eball
80798daaf3 feat(gpu): remove GPU type specification from DaemonSet and values.yaml 2026-02-27 14:04:30 +08:00
eball
8270785303 feat(gpu): remove chip type handling from GPU label updates 2026-02-27 11:11:34 +08:00
dkeven
c94e4300c7 feat(gpu): supports auto binding GPU to app 2026-02-26 21:07:12 +08:00
eball
e27dd6bb5c feat: add chip type support for AMD and NVIDIA GPUs in node label updates 2026-02-26 19:12:31 +08:00
dkeven
c00eec3efd feat: update hami version to v2.6.11-compatible-arm 2026-02-13 11:20:04 +08:00
eball
5855b8ad0a Merge branch 'main' into cli/feat/install_on_spark
* main:
  appservice: stop app if it is hami cause unschedule no wait (#2533)
  fix(cli): ignore finished pods in readiness check (#2528)
  docs: add SMB account management to Settings (#2526)
  docs: resolve comments on managing apps (#2523)
2026-02-13 11:13:54 +08:00
eball
0bb131aa76 feat: enhance storage device detection with USB serial properties 2026-02-12 11:49:07 +08:00
eball
ffaedaf889 Merge branch 'main' into cli/feat/install_on_spark
* main:
  olares-app: update version to v1.9.3 (#2524)
  docs: add skills and plugins management for OpenClaw (#2521)
  olares-app: update version to v1.9.2 (#2520)
2026-02-12 11:09:37 +08:00
eball
1aa3e2dc89 Merge branch 'cli/feat/install_on_spark' of github.com:beclab/olares into cli/feat/install_on_spark
* 'cli/feat/install_on_spark' of github.com:beclab/olares:
  fix: amd gpu check (#2522)
2026-02-12 11:08:31 +08:00
hysyeah
476aee0e8e fix: amd gpu check (#2522) 2026-02-11 17:04:09 +08:00
eball
45428fd29f fix: update klauspost/cpuid to v2.3.0 2026-02-11 14:50:22 +08:00
eball
0edae500ef Merge branch 'cli/feat/install_on_spark' of github.com:beclab/olares into cli/feat/install_on_spark
* 'cli/feat/install_on_spark' of github.com:beclab/olares:
  feat(gpu): update hami version to v2.6.10-compatible for spark
2026-02-11 11:33:58 +08:00
eball
8f3744fb0f fix: remove gb10 device plugin checking 2026-02-11 11:33:30 +08:00
dkeven
1c971d8dac feat(gpu): update hami version to v2.6.10-compatible for spark 2026-02-11 00:07:06 +08:00
eball
9dd0a2715b Merge branch 'main' into cli/feat/install_on_spark
* main:
  authelia: add auth type param to user regulation (#2518)
  docs: update instructions per latest operations (#2517)
  olares-app: update version to v1.9.1 (#2515)
  docs: fix sunshine address for .local domain and formatting for olares one docs (#2512)
  bfl: remove deprecated ingress mode handling from NginxController  (#2511)
  appservice: handle case for system applications without configuration in provider list (#2509)
  docs: add OpenClaw tutorial (#2506)
  docs: batch add docs for one (#2457)
  settings, market, files, vault, desktop: fix some ui bugs (#2503)
  feat(cli): add more lines to default journalctl limit (#2502)
  app-service: handle case for system apps without configuration in permission API  (#2499)
  fix(cli): seperate dmesg args for dmesg logs (#2497)
  docs: add FAQs about activation and login (#2481)
  docs: add middleware data access and integration guides (#2444)
  docs: add docs for distributing olares apps (#2484)
2026-02-10 20:46:10 +08:00
eball
b378a09a8c refactor: remove GB10 chip type check from GPU info update 2026-02-10 20:45:50 +08:00
hys
06b3c4474a feat: amd device plugin and container toolkit install 2026-02-10 15:03:07 +08:00
eball
5bd7561fcc Merge commit '5109ad001c62c59e049c36b6e3ee9c1a59e3c96e' into cli/feat/install_on_spark
* commit '5109ad001c62c59e049c36b6e3ee9c1a59e3c96e':
  Modify release-daemon.yaml for arm64 support
2026-02-05 14:24:59 +08:00
eball
3f9e38f14c feat: update ARM64 package sources in release workflow for improved compatibility 2026-02-05 13:56:20 +08:00
eball
9d59d566c5 feat: enhance ARM64 support by adding architecture-specific package installations 2026-02-05 13:54:26 +08:00
eball
989f54059e feat: streamline ARM64 cross-compilation setup in release workflow 2026-02-05 13:48:56 +08:00
eball
89041683a6 feat: enhance ARM64 multi-architecture support in release workflow 2026-02-05 13:41:58 +08:00
eball
b79f36deb8 feat: update multi-arch setup for ARM64 in release workflow 2026-02-05 13:37:26 +08:00
eball
4beb69747d feat: enhance multi-architecture support for ARM64 in release workflow 2026-02-05 13:31:32 +08:00
eball
0776188aa8 feat: enable CGO for building on ARM architecture and adjust build constraints for Linux 2026-02-05 13:19:30 +08:00
eball
90408a9f1f Merge branch 'main' into cli/feat/install_on_spark
* main:
  fix: seafile trim commit_id for syncing and change psql ccnet init (#2495)
  backup: sync systemEnv default remote url (#2492)
  download-server: nats message publish modify (#2489)
  fix(cli): clear master host config when uninstalling (#2488)
  market, settings: support optional data deletion and fix bugs. (#2486)
  fix(cli): do not override upgrade target version by config file (#2483)
  app-service: feat app uninstall delete data (#2480)
  fix(cli): bind config item to the effective command (#2474)
  feat: support more scheme update to env crs (#2473)
2026-02-05 13:00:10 +08:00
eball
997c9d4142 Merge branch 'main' into cli/feat/install_on_spark
* main:
  app-service: add support for selecting GPU types in application installation (#2470)
  ci: bump version to 1.12.6 (#2471)
  fix a link issue
  desktop, settings, files, vault: fix multiple known issues (#2467)
  authelia: add user regulation for TOTP authentication attempts (#2466)
2026-02-03 14:23:31 +08:00
eball
0fa4f298a9 fix(gpu): update pod selector for hami-device-plugin based on GB10 chip detection
fix(deploy): bump app-service image version to 0.4.78
2026-02-03 11:30:48 +08:00
eball
4004acf8d6 Merge commit '375dfceacbdae1430a27929525cbabd9b5991d38' into cli/feat/install_on_spark
* commit '375dfceacbdae1430a27929525cbabd9b5991d38':
  fix(cli): unify config setting for release command (#2465)
2026-02-02 17:53:07 +08:00
eball
9c0b2d4b62 Merge branch 'main' into cli/feat/install_on_spark
* main:
  fix(cli): set node port range in minikube to allow smb service (#2460)
  settings, user service: update wallpaper style (#2463)
  bfl: enhance user login background handling with style support (#2464)
  feat: search upgrade to v0.1.6 (#2459)
2026-02-02 15:00:05 +08:00
eball
7037a7afa0 Merge branch 'cli/feat/install_on_spark' of github.com:beclab/olares into cli/feat/install_on_spark
* 'cli/feat/install_on_spark' of github.com:beclab/olares:
  feat: add nvidia device plugin for gb10
  feat(connector): enhance GB10 chip detection with environment variable support
  feat(amdgpu): refactor AMD GPU detection and support for GB10 chip and APU
  feat(gpu): enhance DGX Spark support and update GPU type handling
  settings: add settings new version and update provider api (#2456)
  refactor(cli): unify config of command line options and envs (#2453)
  appservice: v2 app stop (#2455)
  download-server:add download err category && modify aria2 max concurrent (#2445)
  docs: add storage expansion via CLI (#2409)
  cli: upgrade l4-bfl-proxy to v0.3.10 (#2442)
  l4: skip invalid expose port  (#2441)
  appservice: add clickhouse support (#2440)
  daemon: change pcap open timeout to 1 millisecond to prevent close hang (#2439)
  tapr: add clickhouse support  (#2437)
  feat(gpu): supports dynamic detection of hot plugged-in GPUs (#2435)
  docs/update/olares-space-storage-info

# Conflicts:
#	cli/pkg/common/kube_runtime.go
2026-02-02 14:58:20 +08:00
dkeven
5159ecb753 feat: add nvidia device plugin for gb10 2026-01-29 15:19:57 +08:00
eball
979cd37ce1 feat(connector): enhance GB10 chip detection with environment variable support 2026-01-29 15:18:58 +08:00
eball
f33a1f1a00 feat(amdgpu): refactor AMD GPU detection and support for GB10 chip and APU 2026-01-29 15:18:58 +08:00
eball
dee6474fda feat(gpu): enhance DGX Spark support and update GPU type handling 2026-01-29 15:18:55 +08:00
eball
1102751eb0 feat(connector): enhance GB10 chip detection with environment variable support 2026-01-28 11:10:18 +08:00
eball
9194c0c04c feat(amdgpu): refactor AMD GPU detection and support for GB10 chip and APU 2026-01-26 16:48:40 +08:00
eball
884f4f5585 feat(gpu): enhance DGX Spark support and update GPU type handling 2026-01-22 20:03:34 +08:00
28 changed files with 746 additions and 123 deletions

121
cli/pkg/amdgpu/module.go Normal file
View File

@@ -0,0 +1,121 @@
package amdgpu
import (
"time"
"github.com/beclab/Olares/cli/pkg/common"
"github.com/beclab/Olares/cli/pkg/core/prepare"
"github.com/beclab/Olares/cli/pkg/core/task"
)
// InstallAmdContainerToolkitModule installs AMD container toolkit on supported Ubuntu if ROCm is installed.
// It wires three sequential remote tasks (configure APT source, install the
// package, generate/validate the CDI spec) against the master hosts.
type InstallAmdContainerToolkitModule struct {
common.KubeModule
Skip bool // conditional execution based on ROCm detection
// SkipRocmCheck, when true, drops the RocmInstalled prepare gate so the
// toolkit is installed even if ROCm cannot be detected up front.
SkipRocmCheck bool
}
// IsSkip reports whether the whole module should be skipped (set by the caller).
func (m *InstallAmdContainerToolkitModule) IsSkip() bool {
return m.Skip
}
// Init names the module and builds its task pipeline. When skipped, no tasks
// are registered at all.
func (m *InstallAmdContainerToolkitModule) Init() {
m.Name = "InstallAmdContainerToolkit"
if m.IsSkip() {
return
}
// Shared prepare gate: unless explicitly disabled, every task only runs
// when ROCm is detected on the host.
prepareCollection := prepare.PrepareCollection{}
if !m.SkipRocmCheck {
prepareCollection = append(prepareCollection, new(RocmInstalled))
}
updateAmdSource := &task.RemoteTask{
Name: "UpdateAmdContainerToolkitSource",
Hosts: m.Runtime.GetHostsByRole(common.Master),
Action: new(UpdateAmdContainerToolkitSource),
Prepare: &prepareCollection,
Parallel: false,
Retry: 1,
}
installAmdContainerToolkit := &task.RemoteTask{
Name: "InstallAmdContainerToolkit",
Hosts: m.Runtime.GetHostsByRole(common.Master),
Prepare: &prepareCollection,
Action: new(InstallAmdContainerToolkit),
Parallel: false,
Retry: 1,
}
generateAndValidateCDI := &task.RemoteTask{
Name: "GenerateAndValidateAmdCDI",
Hosts: m.Runtime.GetHostsByRole(common.Master),
Prepare: &prepareCollection,
Action: new(GenerateAndValidateAmdCDI),
Parallel: false,
Retry: 1,
}
// Order matters: the APT source must exist before install, and the
// toolkit binary (amd-ctk) must exist before CDI generation.
m.Tasks = []task.Interface{
updateAmdSource,
installAmdContainerToolkit,
generateAndValidateCDI,
}
}
// InstallAmdPluginModule installs AMD GPU device plugin on Kubernetes.
// It labels the node with AMD GPU info, applies the device-plugin DaemonSet,
// and then polls until the plugin pod reports Running.
type InstallAmdPluginModule struct {
common.KubeModule
Skip bool // conditional execution based on GPU enablement
}
// IsSkip reports whether the whole module should be skipped (set by the caller).
func (m *InstallAmdPluginModule) IsSkip() bool {
return m.Skip
}
// Init names the module and builds its task pipeline. All tasks target the
// master hosts and are gated to run on the first master only.
func (m *InstallAmdPluginModule) Init() {
m.Name = "InstallAmdPlugin"
// update node with AMD GPU labels
updateNode := &task.RemoteTask{
Name: "UpdateNodeAmdGPUInfo",
Hosts: m.Runtime.GetHostsByRole(common.Master),
Prepare: &prepare.PrepareCollection{
new(common.OnlyFirstMaster),
},
Action: new(UpdateNodeAmdGPUInfo),
Parallel: false,
Retry: 1,
}
installPlugin := &task.RemoteTask{
Name: "InstallAmdPlugin",
Hosts: m.Runtime.GetHostsByRole(common.Master),
Prepare: &prepare.PrepareCollection{
new(common.OnlyFirstMaster),
},
Action: new(InstallAmdPlugin),
Parallel: false,
Retry: 1,
}
// Status polling is additionally gated on ROCm being present, and retries
// up to 50 times with a 10s delay (~8 minutes) to let the DaemonSet start.
checkGpuState := &task.RemoteTask{
Name: "CheckAmdGPUState",
Hosts: m.Runtime.GetHostsByRole(common.Master),
Prepare: &prepare.PrepareCollection{
new(common.OnlyFirstMaster),
new(RocmInstalled),
},
Action: new(CheckAmdGpuStatus),
Parallel: false,
Retry: 50,
Delay: 10 * time.Second,
}
m.Tasks = []task.Interface{
updateNode,
installPlugin,
checkGpuState,
}
}

View File

@@ -0,0 +1,56 @@
package amdgpu
import (
"github.com/beclab/Olares/cli/pkg/bootstrap/precheck"
"github.com/beclab/Olares/cli/pkg/common"
"github.com/beclab/Olares/cli/pkg/core/connector"
"github.com/beclab/Olares/cli/pkg/core/logger"
)
// RocmInstalled checks if AMD ROCm is installed on the system.
// It never fails the pipeline: any error from the version probe is logged at
// debug level and treated as "not installed" (false, nil).
type RocmInstalled struct {
common.KubePrepare
}
// PreCheck returns true only when a ROCm version file is present and parses.
func (p *RocmInstalled) PreCheck(runtime connector.Runtime) (bool, error) {
rocmV, err := connector.RocmVersion()
if err != nil {
// Probe errors (including "file not found") mean ROCm is absent.
logger.Debugf("ROCm version check error: %v", err)
return false, nil
}
if rocmV == nil {
return false, nil
}
logger.Infof("Detected ROCm version: %s", rocmV.Original())
return true, nil
}
// RocmNotInstalled checks if AMD ROCm is NOT installed on the system.
// It simply negates RocmInstalled's verdict.
type RocmNotInstalled struct {
common.KubePrepare
RocmInstalled
}
// PreCheck returns the inverse of RocmInstalled.PreCheck.
func (p *RocmNotInstalled) PreCheck(runtime connector.Runtime) (bool, error) {
installed, err := p.RocmInstalled.PreCheck(runtime)
if err != nil {
// Unreachable today: RocmInstalled.PreCheck never returns a non-nil
// error, but keep the propagation for safety.
return false, err
}
return !installed, nil
}
// ContainerdInstalled checks if containerd is installed on the system.
type ContainerdInstalled struct {
common.KubePrepare
}
// PreCheck returns true when containerd appears to be installed.
// NOTE(review): this relies on ConflictingContainerdCheck.Check returning an
// error precisely when containerd IS present (an intentionally inverted
// "conflict" check) — confirm that contract before changing this logic.
func (p *ContainerdInstalled) PreCheck(runtime connector.Runtime) (bool, error) {
containerdCheck := precheck.ConflictingContainerdCheck{}
if err := containerdCheck.Check(runtime); err != nil {
return true, nil
}
logger.Info("containerd is not installed, ignore task")
return false, nil
}

View File

@@ -1,17 +1,20 @@
package amdgpu
import (
"context"
"fmt"
"os/exec"
"path"
"path/filepath"
"github.com/beclab/Olares/cli/pkg/clientset"
"github.com/beclab/Olares/cli/pkg/common"
cc "github.com/beclab/Olares/cli/pkg/core/common"
"github.com/beclab/Olares/cli/pkg/core/connector"
"github.com/beclab/Olares/cli/pkg/core/logger"
"github.com/beclab/Olares/cli/pkg/core/task"
"github.com/beclab/Olares/cli/pkg/utils"
"github.com/beclab/Olares/cli/pkg/core/util"
"github.com/beclab/Olares/cli/pkg/gpu"
"github.com/Masterminds/semver/v3"
"github.com/pkg/errors"
@@ -26,8 +29,8 @@ func (m *InstallAmdRocmModule) Init() {
m.Name = "InstallAMDGPU"
installAmd := &task.RemoteTask{
Name: "InstallAmdRocm",
Hosts: m.Runtime.GetHostsByRole(common.Master),
Name: "InstallAmdRocm",
Hosts: m.Runtime.GetHostsByRole(common.Master),
Action: &InstallAmdRocm{
// no manifest needed
},
@@ -51,7 +54,7 @@ func (t *InstallAmdRocm) Execute(runtime connector.Runtime) error {
return nil
}
amdGPUExists, err := utils.HasAmdIGPU(runtime)
amdGPUExists, err := connector.HasAmdAPUOrGPU(runtime)
if err != nil {
return err
}
@@ -59,7 +62,7 @@ func (t *InstallAmdRocm) Execute(runtime connector.Runtime) error {
if !amdGPUExists {
return nil
}
rocmV, _ := utils.RocmVersion()
rocmV, _ := connector.RocmVersion()
min := semver.MustParse("7.1.1")
if rocmV != nil && rocmV.LessThan(min) {
return fmt.Errorf("detected ROCm version %s, which is lower than required %s; please uninstall existing ROCm/AMDGPU components before installation with command: olares-cli amdgpu uninstall", rocmV.Original(), min.Original())
@@ -131,3 +134,163 @@ func (t *AmdgpuUninstallAction) Execute(runtime connector.Runtime) error {
logger.Warn("Warning: Please reboot your machine after uninstall to fully remove ROCm components.")
return nil
}
// UpdateAmdContainerToolkitSource configures the AMD container toolkit APT repository.
// It installs the download prerequisites, imports the ROCm GPG key into
// /etc/apt/keyrings, and writes the amd-container-toolkit source list for the
// detected Ubuntu release. Only Ubuntu 22.04 and 24.04 are supported.
type UpdateAmdContainerToolkitSource struct {
common.KubeAction
}
// Execute runs the repository setup on the remote host via sudo shell commands.
func (t *UpdateAmdContainerToolkitSource) Execute(runtime connector.Runtime) error {
// Install prerequisites
if _, err := runtime.GetRunner().SudoCmd("apt update && apt install -y wget gnupg2", false, true); err != nil {
return errors.Wrap(errors.WithStack(err), "failed to install prerequisites for AMD container toolkit")
}
if _, err := runtime.GetRunner().SudoCmd("install -d -m 0755 /etc/apt/keyrings", false, true); err != nil {
return errors.Wrap(errors.WithStack(err), "failed to create /etc/apt/keyrings directory")
}
// The container-toolkit repo is signed with the same key as the ROCm repo.
cmd := "wget https://repo.radeon.com/rocm/rocm.gpg.key -O - | gpg --dearmor | tee /etc/apt/keyrings/rocm.gpg > /dev/null"
if _, err := runtime.GetRunner().SudoCmd(cmd, false, true); err != nil {
return errors.Wrap(errors.WithStack(err), "failed to download and install AMD ROCm GPG key")
}
// Map the Ubuntu release to its APT codename; bail out on anything else.
si := runtime.GetSystemInfo()
var ubuntuCodename string
if si.IsUbuntuVersionEqual(connector.Ubuntu2404) {
ubuntuCodename = "noble"
} else if si.IsUbuntuVersionEqual(connector.Ubuntu2204) {
ubuntuCodename = "jammy"
} else {
return fmt.Errorf("unsupported Ubuntu version for AMD container toolkit")
}
aptSourceLine := fmt.Sprintf("deb [signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/amd-container-toolkit/apt/ %s main", ubuntuCodename)
cmd = fmt.Sprintf("echo '%s' > /etc/apt/sources.list.d/amd-container-toolkit.list", aptSourceLine)
if _, err := runtime.GetRunner().SudoCmd(cmd, false, true); err != nil {
return errors.Wrap(errors.WithStack(err), "failed to add AMD container toolkit APT source")
}
logger.Infof("AMD container toolkit repository configured successfully")
return nil
}
// InstallAmdContainerToolkit installs the AMD container toolkit package.
// It assumes UpdateAmdContainerToolkitSource has already configured the APT repo.
type InstallAmdContainerToolkit struct {
common.KubeAction
}
// Execute refreshes the APT index and installs amd-container-toolkit via sudo.
func (t *InstallAmdContainerToolkit) Execute(runtime connector.Runtime) error {
logger.Infof("Installing AMD container toolkit...")
if _, err := runtime.GetRunner().SudoCmd("apt update && apt install -y amd-container-toolkit", false, true); err != nil {
return errors.Wrap(errors.WithStack(err), "failed to install AMD container toolkit")
}
logger.Infof("AMD container toolkit installed successfully")
return nil
}
// GenerateAndValidateAmdCDI generates and validates the AMD CDI spec.
// It uses amd-ctk (installed by InstallAmdContainerToolkit) to write
// /etc/cdi/amd.json and then verifies the spec is well-formed.
type GenerateAndValidateAmdCDI struct {
common.KubeAction
}
// Execute creates /etc/cdi, generates the CDI spec, and validates it.
func (t *GenerateAndValidateAmdCDI) Execute(runtime connector.Runtime) error {
// Ensure /etc/cdi directory exists
if _, err := runtime.GetRunner().SudoCmd("install -d -m 0755 /etc/cdi", false, true); err != nil {
return errors.Wrap(errors.WithStack(err), "failed to create /etc/cdi directory")
}
// Generate CDI spec
logger.Infof("Generating AMD CDI spec...")
if _, err := runtime.GetRunner().SudoCmd("amd-ctk cdi generate --output=/etc/cdi/amd.json", false, true); err != nil {
return errors.Wrap(errors.WithStack(err), "failed to generate AMD CDI spec")
}
// Validate CDI spec
logger.Infof("Validating AMD CDI spec...")
if _, err := runtime.GetRunner().SudoCmd("amd-ctk cdi validate --path=/etc/cdi/amd.json", false, true); err != nil {
return errors.Wrap(errors.WithStack(err), "failed to validate AMD CDI spec")
}
logger.Infof("AMD CDI spec generated and validated successfully")
return nil
}
// UpdateNodeAmdGPUInfo updates Kubernetes node labels with AMD GPU information.
// It is a no-op (returning nil) when no AMD GPU/APU is detected or when ROCm
// is not installed — those conditions are informational, not errors.
type UpdateNodeAmdGPUInfo struct {
common.KubeAction
}
// Execute detects AMD hardware and ROCm, then labels the node accordingly.
func (u *UpdateNodeAmdGPUInfo) Execute(runtime connector.Runtime) error {
client, err := clientset.NewKubeClient()
if err != nil {
return errors.Wrap(errors.WithStack(err), "kubeclient create error")
}
// Check if AMD GPU/APU exists
amdGPUExists, err := connector.HasAmdAPUOrGPU(runtime)
if err != nil {
return err
}
if !amdGPUExists {
logger.Info("AMD GPU/APU is not detected")
return nil
}
// Get ROCm version
rocmV, err := connector.RocmVersion()
if err != nil || rocmV == nil {
logger.Info("ROCm is not installed")
return nil
}
rocmVersion := rocmV.Original()
// Determine GPU type (APU vs discrete GPU)
gpuType := gpu.AmdGpuCardType
if runtime.GetSystemInfo().IsAmdApu() {
gpuType = gpu.AmdApuCardType
}
// Label the node: ROCm version as the driver version plus the AMD card
// type; the cuda and supported labels are intentionally left unset (nil).
return gpu.UpdateNodeGpuLabel(context.Background(), client.Kubernetes(), &rocmVersion, nil, nil, &gpuType)
}
// InstallAmdPlugin installs the AMD GPU device plugin DaemonSet.
type InstallAmdPlugin struct {
common.KubeAction
}
// Execute applies the bundled amdgpu-device-plugin manifest with kubectl.
func (t *InstallAmdPlugin) Execute(runtime connector.Runtime) error {
// NOTE(review): the AMD manifest is read from the ".../gpu/nvidia/"
// directory — verify this path is intentional and not a copy-paste of the
// NVIDIA plugin location.
amdPluginPath := path.Join(runtime.GetInstallerDir(), "wizard/config/gpu/nvidia/amdgpu-device-plugin.yaml")
_, err := runtime.GetRunner().SudoCmd(fmt.Sprintf("kubectl apply -f %s", amdPluginPath), false, true)
if err != nil {
return errors.Wrap(errors.WithStack(err), "failed to apply AMD GPU device plugin")
}
logger.Infof("AMD GPU device plugin installed successfully")
return nil
}
// CheckAmdGpuStatus checks if the AMD GPU device plugin pod is running.
// It is intended to be retried by the enclosing task until it returns nil.
type CheckAmdGpuStatus struct {
common.KubeAction
}
// Execute queries the plugin pod phase via kubectl and succeeds only on "Running".
func (t *CheckAmdGpuStatus) Execute(runtime connector.Runtime) error {
kubectlpath, err := util.GetCommand(common.CommandKubectl)
if err != nil {
return fmt.Errorf("kubectl not found")
}
// Check AMD device plugin pod status using the label from amdgpu-device-plugin.yaml
selector := "name=amdgpu-dp-ds"
// NOTE(review): the jsonpath prints all matching pod phases space-separated,
// so the equality test below assumes exactly one plugin pod; with multiple
// pods (e.g. multi-node DaemonSet) this would never match — confirm.
cmd := fmt.Sprintf("%s get pod -n kube-system -l '%s' -o jsonpath='{.items[*].status.phase}'", kubectlpath, selector)
rphase, _ := runtime.GetRunner().SudoCmd(cmd, false, false)
if rphase == "Running" {
logger.Infof("AMD GPU device plugin is running")
return nil
}
return fmt.Errorf("AMD GPU device plugin state is not Running (current: %s)", rphase)
}

View File

@@ -59,7 +59,7 @@ func (t *PatchTask) Execute(runtime connector.Runtime) error {
pre_reqs = pre_reqs + " network-manager "
}
pre_reqs += " conntrack socat apache2-utils net-tools make gcc bison flex tree unzip "
pre_reqs += " conntrack socat apache2-utils net-tools make gcc bison flex tree unzip lshw"
var systemInfo = runtime.GetSystemInfo()
var platformFamily = systemInfo.GetOsPlatformFamily()

View File

@@ -338,7 +338,9 @@ func (c *CudaChecker) Name() string {
}
func (c *CudaChecker) Check(runtime connector.Runtime) error {
if !runtime.GetSystemInfo().IsLinux() {
if !runtime.GetSystemInfo().IsLinux() ||
// Skip check on NVIDIA DGX Spark systems, which have their own GPU management
runtime.GetSystemInfo().IsGB10Chip() {
return nil
}
@@ -388,17 +390,17 @@ func (r *RocmChecker) Check(runtime connector.Runtime) error {
return nil
}
// detect AMD GPU presence
amdGPUExists, err := utils.HasAmdIGPU(runtime)
// detect AMD APU/GPU presence
amdGPUExists, err := connector.HasAmdAPUOrGPU(runtime)
if err != nil {
return err
}
// no AMD GPU found, no need to check rocm
// no AMD APU/GPU found, no need to check rocm
if !amdGPUExists {
return nil
}
curV, err := utils.RocmVersion()
curV, err := connector.RocmVersion()
if err != nil && !os.IsNotExist(err) {
return err
}

View File

@@ -210,6 +210,7 @@ func NewArgument() *Argument {
arg.IsCloudInstance, _ = strconv.ParseBool(os.Getenv(ENV_TERMINUS_IS_CLOUD_VERSION))
arg.IsOlaresInContainer = os.Getenv(ENV_CONTAINER_MODE) == "oic"
si.IsOIC = arg.IsOlaresInContainer
si.ProductName = arg.GetProductName()
// Ensure BaseDir is initialized before loading master.conf
// so master host config can be loaded from ${base-dir}/master.conf reliably.
@@ -415,6 +416,57 @@ func (a *Argument) SetSwapConfig(config SwapConfig) {
a.Swappiness = config.Swappiness
}
// SetMasterHostOverride selectively overrides the master connection settings
// from config: only non-zero fields of config replace the current values, so
// an empty config leaves the Argument untouched (except for the node-name
// fallback below).
func (a *Argument) SetMasterHostOverride(config MasterHostConfig) {
if config.MasterHost != "" {
a.MasterHost = config.MasterHost
}
if config.MasterNodeName != "" {
a.MasterNodeName = config.MasterNodeName
}
// set a dummy name to bypass validity checks
// as it will be overridden later when the node name is fetched
if a.MasterNodeName == "" {
a.MasterNodeName = "master"
}
if config.MasterSSHPassword != "" {
a.MasterSSHPassword = config.MasterSSHPassword
}
if config.MasterSSHUser != "" {
a.MasterSSHUser = config.MasterSSHUser
}
if config.MasterSSHPort != 0 {
a.MasterSSHPort = config.MasterSSHPort
}
if config.MasterSSHPrivateKeyPath != "" {
a.MasterSSHPrivateKeyPath = config.MasterSSHPrivateKeyPath
}
}
// LoadMasterHostConfigIfAny reads ${BaseDir}/master.conf into
// a.MasterHostConfig if the file exists. A missing file is not an error;
// BaseDir must be set before calling.
func (a *Argument) LoadMasterHostConfigIfAny() error {
if a.BaseDir == "" {
return errors.New("basedir unset")
}
content, err := os.ReadFile(filepath.Join(a.BaseDir, MasterHostConfigFile))
if os.IsNotExist(err) {
return nil
}
if err != nil {
return err
}
// NOTE(review): passing a.MasterHostConfig directly assumes the field is a
// pointer; if it is a value type, json.Unmarshal will return an
// InvalidUnmarshalError — confirm the field's declaration.
return json.Unmarshal(content, a.MasterHostConfig)
}
// GetProductName returns the hardware product name from the DMI table
// (/sys/class/dmi/id/product_name), trimmed of whitespace. On hosts without
// DMI (containers, some VMs, non-Linux) the read fails; the error is printed
// to stdout and an empty string is returned.
func (a *Argument) GetProductName() string {
data, err := os.ReadFile("/sys/class/dmi/id/product_name")
if err != nil {
fmt.Printf("\nCannot get product name on this device, %s\n", err)
return ""
}
return strings.TrimSpace(string(data))
}
func NewKubeRuntime(arg Argument) (*KubeRuntime, error) {
loader := NewLoader(arg)
cluster, err := loader.Load()

View File

@@ -98,4 +98,6 @@ const (
const (
ZfsSnapshotter = "/var/lib/containerd/io.containerd.snapshotter.v1.zfs"
ENV_GB10_CHIP = "GB10_CHIP" // for building images for NVIDIA GB10 Superchip systems
)

View File

@@ -0,0 +1,117 @@
package connector
import (
"fmt"
"os"
"os/exec"
"strings"
"github.com/Masterminds/semver/v3"
)
// hasAmdAPU reports whether the host CPU is one of the known AMD Ryzen AI
// APU models (CPUs bundling an AMD NPU/iGPU). It matches the CPU model name,
// probing `lscpu` first and falling back to /proc/cpuinfo. cmdExec abstracts
// command execution so callers can run locally or through a remote runner;
// its error is propagated as-is.
func hasAmdAPU(cmdExec func(s string) (string, error)) (bool, error) {
	// Detect by CPU model names that bundle AMD AI NPU/graphics
	targets := []string{
		"AMD Ryzen AI Max+ 395",
		"AMD Ryzen AI Max 390",
		"AMD Ryzen AI Max 385",
		"AMD Ryzen AI 9 HX 375",
		"AMD Ryzen AI 9 HX 370",
		"AMD Ryzen AI 9 365",
	}
	// matches reports whether the probed model string contains any of the
	// target models, case-insensitively. Factored out so both probes share
	// one implementation instead of the previous duplicated loops.
	matches := func(model string) bool {
		lo := strings.ToLower(strings.TrimSpace(model))
		for _, t := range targets {
			if strings.Contains(lo, strings.ToLower(t)) {
				return true
			}
		}
		return false
	}
	// try lscpu first: extract 'Model name' field
	out, err := cmdExec("lscpu 2>/dev/null | awk -F': *' '/^Model name/{print $2; exit}' || true")
	if err != nil {
		return false, err
	}
	if out != "" && matches(out) {
		return true, nil
	}
	// fallback to /proc/cpuinfo
	out, err = cmdExec("awk -F': *' '/^model name/{print $2; exit}' /proc/cpuinfo 2>/dev/null || true")
	if err != nil {
		return false, err
	}
	if out != "" && matches(out) {
		return true, nil
	}
	return false, nil
}
// hasAmdAPUOrGPU reports whether any AMD display device (PCI vendor 0x1002)
// is present. It runs two probes in order — lspci, then lshw — and returns
// true as soon as one produces output. cmdExec abstracts command execution;
// its error aborts the detection.
func hasAmdAPUOrGPU(cmdExec func(s string) (string, error)) (bool, error) {
	// Each probe ends with `|| true` so "no match" is an empty success,
	// not a command failure.
	probes := []string{
		"lspci -d '1002:' 2>/dev/null | grep 'AMD' || true",
		"lshw -c display -numeric -disable network 2>/dev/null | grep 'vendor: .* \\[1002\\]' || true",
	}
	for _, probe := range probes {
		out, err := cmdExec(probe)
		if err != nil {
			return false, err
		}
		if out != "" {
			return true, nil
		}
	}
	return false, nil
}
// HasAmdAPU detects a known AMD APU on the remote host managed by execRuntime,
// running the probe commands through the runner with sudo.
func HasAmdAPU(execRuntime Runtime) (bool, error) {
return hasAmdAPU(func(s string) (string, error) {
return execRuntime.GetRunner().SudoCmd(s, false, false)
})
}
// HasAmdAPULocal detects a known AMD APU on the local machine by executing the
// probe commands with `sh -c`.
func HasAmdAPULocal() (bool, error) {
return hasAmdAPU(func(s string) (string, error) {
out, err := exec.Command("sh", "-c", s).Output()
if err != nil {
return "", err
}
return string(out), nil
})
}
// HasAmdAPUOrGPULocal detects any AMD display device (APU or discrete GPU) on
// the local machine.
func HasAmdAPUOrGPULocal() (bool, error) {
return hasAmdAPUOrGPU(func(s string) (string, error) {
out, err := exec.Command("sh", "-c", s).Output()
if err != nil {
return "", err
}
return string(out), nil
})
}
// HasAmdAPUOrGPU detects any AMD display device (APU or discrete GPU) on the
// remote host managed by execRuntime.
func HasAmdAPUOrGPU(execRuntime Runtime) (bool, error) {
return hasAmdAPUOrGPU(func(s string) (string, error) {
return execRuntime.GetRunner().SudoCmd(s, false, false)
})
}
// RocmVersion returns the installed ROCm version, parsed as semver from
// /opt/rocm/.info/version. When ROCm is not installed the returned error
// satisfies os.IsNotExist, which callers use to distinguish "absent" from
// "broken"; a malformed version file yields a parse error.
func RocmVersion() (*semver.Version, error) {
	const rocmVersionFile = "/opt/rocm/.info/version"
	data, err := os.ReadFile(rocmVersionFile)
	if err != nil {
		// Covers both the not-installed case (os.IsNotExist) and genuine
		// read failures; the previous explicit IsNotExist branch returned
		// the identical value and has been folded into this one.
		return nil, err
	}
	curStr := strings.TrimSpace(string(data))
	cur, err := semver.NewVersion(curStr)
	if err != nil {
		return nil, fmt.Errorf("invalid rocm version: %s", curStr)
	}
	return cur, nil
}

View File

@@ -76,6 +76,10 @@ type Systems interface {
IsPveOrPveLxc() bool
IsRaspbian() bool
IsLinux() bool
IsGB10Chip() bool
IsAmdApu() bool
IsAmdGPU() bool
IsAmdGPUOrAPU() bool
IsUbuntu() bool
IsDebian() bool
@@ -111,16 +115,18 @@ type Systems interface {
}
type SystemInfo struct {
HostInfo *HostInfo `json:"host"`
CpuInfo *CpuInfo `json:"cpu"`
DiskInfo *DiskInfo `json:"disk"`
MemoryInfo *MemoryInfo `json:"memory"`
FsInfo *FileSystemInfo `json:"filesystem"`
CgroupInfo *CgroupInfo `json:"cgroup,omitempty"`
LocalIp string `json:"local_ip"`
NatGateway string `json:"nat_gateway"`
PkgManager string `json:"pkg_manager"`
IsOIC bool `json:"is_oic,omitempty"`
HostInfo *HostInfo `json:"host"`
CpuInfo *CpuInfo `json:"cpu"`
DiskInfo *DiskInfo `json:"disk"`
MemoryInfo *MemoryInfo `json:"memory"`
FsInfo *FileSystemInfo `json:"filesystem"`
CgroupInfo *CgroupInfo `json:"cgroup,omitempty"`
LocalIp string `json:"local_ip"`
NatGateway string `json:"nat_gateway"`
PkgManager string `json:"pkg_manager"`
IsOIC bool `json:"is_oic,omitempty"`
ProductName string `json:"product_name,omitempty"`
HasAmdGPU bool `json:"has_amd_gpu,omitempty"`
}
func (s *SystemInfo) IsSupport() error {
@@ -235,6 +241,22 @@ func (s *SystemInfo) IsLinux() bool {
return s.HostInfo.OsType == common.Linux
}
// IsGB10Chip reports whether the CPU was detected as an NVIDIA GB10 Superchip
// (DGX Spark class system).
func (s *SystemInfo) IsGB10Chip() bool {
return s.CpuInfo.IsGB10Chip
}
// IsAmdApu reports whether the CPU is a known AMD Ryzen AI APU model.
func (s *SystemInfo) IsAmdApu() bool {
return s.CpuInfo.HasAmdAPU
}
// IsAmdGPU reports whether a discrete AMD GPU (non-APU) was detected.
func (s *SystemInfo) IsAmdGPU() bool {
return s.HasAmdGPU
}
// IsAmdGPUOrAPU reports whether any AMD graphics hardware — APU or discrete
// GPU — was detected.
func (s *SystemInfo) IsAmdGPUOrAPU() bool {
return s.CpuInfo.HasAmdAPU || s.HasAmdGPU
}
func (s *SystemInfo) IsUbuntu() bool {
return s.HostInfo.OsPlatformFamily == common.Ubuntu
}
@@ -322,6 +344,12 @@ func GetSystemInfo() *SystemInfo {
si.MemoryInfo = getMem()
si.FsInfo = getFs()
hasAmdGPU, err := getAmdGPU()
if err != nil {
panic(errors.Wrap(err, "failed to get amd apu/gpu"))
}
si.HasAmdGPU = hasAmdGPU
localIP, err := util.GetLocalIP()
if err != nil {
panic(errors.Wrap(err, "failed to get local ip"))
@@ -437,6 +465,28 @@ type CpuInfo struct {
CpuModel string `json:"cpu_model"`
CpuLogicalCount int `json:"cpu_logical_count"`
CpuPhysicalCount int `json:"cpu_physical_count"`
IsGB10Chip bool `json:"is_gb10_chip,omitempty"`
HasAmdAPU bool `json:"has_amd_apu,omitempty"`
}
// Not considering the case where AMD GPU and AMD APU coexist.
// getAmdGPU reports whether a discrete AMD GPU is present: some AMD display
// device was detected locally, and the CPU is not one of the known APU models.
func getAmdGPU() (bool, error) {
	anyAmd, err := HasAmdAPUOrGPULocal()
	if err != nil {
		fmt.Printf("Error checking AMD APU/GPU: %v\n", err)
		return false, err
	}
	isApu, err := HasAmdAPULocal()
	if err != nil {
		fmt.Printf("Error checking AMD APU: %v\n", err)
		return false, err
	}
	// Discrete GPU == an AMD display device that is not an APU.
	return anyAmd && !isApu, nil
}
func getCpu() *CpuInfo {
@@ -452,10 +502,36 @@ func getCpu() *CpuInfo {
cpuModel = cpuInfo[0].ModelName
}
// check if is GB10 chip
isGB10Chip := false
// In Linux systems, it is recognized via lspci as "NVIDIA Corporation Device 2e12 (rev a1)
// or NVIDIA Corporation GB20B [GB10] (rev a1)
cmd := exec.Command("sh", "-c", "lspci | grep -i vga | egrep 'GB10|2e12'")
output, err := cmd.Output()
if err == nil && strings.TrimSpace(string(output)) != "" {
isGB10Chip = true
} else {
fmt.Printf("Error checking GB10 chip: %v\n", err)
gb10env := os.Getenv(common.ENV_GB10_CHIP)
if gb10env == "1" || strings.EqualFold(gb10env, "true") {
isGB10Chip = true
}
}
// check if it has amd igpu
hasAmdAPU, err := HasAmdAPULocal()
if err != nil {
fmt.Printf("Error checking AMD iGPU: %v\n", err)
hasAmdAPU = false
}
return &CpuInfo{
CpuModel: cpuModel,
CpuLogicalCount: cpuLogicalCount,
CpuPhysicalCount: cpuPhysicalCount,
IsGB10Chip: isGB10Chip,
HasAmdAPU: hasAmdAPU,
}
}

View File

@@ -37,6 +37,11 @@ type CudaInstalled struct {
}
func (p *CudaInstalled) PreCheck(runtime connector.Runtime) (bool, error) {
if runtime.GetSystemInfo().IsGB10Chip() {
logger.Debug("Assume DGX Spark or GB10 OEM system has CUDA installed")
return true, nil
}
st, err := utils.GetNvidiaStatus(runtime)
if err != nil {
return false, err
@@ -50,17 +55,15 @@ func (p *CudaInstalled) PreCheck(runtime connector.Runtime) (bool, error) {
type CudaNotInstalled struct {
common.KubePrepare
CudaInstalled
}
func (p *CudaNotInstalled) PreCheck(runtime connector.Runtime) (bool, error) {
st, err := utils.GetNvidiaStatus(runtime)
installed, err := p.CudaInstalled.PreCheck(runtime)
if err != nil {
return false, err
}
if st == nil || !st.Installed {
return true, nil
}
return false, nil
return !installed, nil
}
type CurrentNodeInK8s struct {

View File

@@ -325,7 +325,8 @@ func (t *CheckGpuStatus) Execute(runtime connector.Runtime) error {
return fmt.Errorf("kubectl not found")
}
cmd := fmt.Sprintf("%s get pod -n kube-system -l 'app.kubernetes.io/component=hami-device-plugin' -o jsonpath='{.items[*].status.phase}'", kubectlpath)
selector := "app.kubernetes.io/component=hami-device-plugin"
cmd := fmt.Sprintf("%s get pod -n kube-system -l '%s' -o jsonpath='{.items[*].status.phase}'", kubectlpath, selector)
rphase, _ := runtime.GetRunner().SudoCmd(cmd, false, false)
if rphase == "Running" {
@@ -363,7 +364,16 @@ func (u *UpdateNodeGPUInfo) Execute(runtime connector.Runtime) error {
driverVersion = st.LibraryVersion
}
return UpdateNodeGpuLabel(context.Background(), client.Kubernetes(), &driverVersion, &st.CudaVersion, &supported)
// TODO:
gpuType := NvidiaCardType
switch {
case runtime.GetSystemInfo().IsAmdApu():
gpuType = AmdApuCardType
case runtime.GetSystemInfo().IsGB10Chip():
gpuType = GB10ChipType
}
return UpdateNodeGpuLabel(context.Background(), client.Kubernetes(), &driverVersion, &st.CudaVersion, &supported, &gpuType)
}
type RemoveNodeLabels struct {
@@ -376,12 +386,12 @@ func (u *RemoveNodeLabels) Execute(runtime connector.Runtime) error {
return errors.Wrap(errors.WithStack(err), "kubeclient create error")
}
return UpdateNodeGpuLabel(context.Background(), client.Kubernetes(), nil, nil, nil)
return UpdateNodeGpuLabel(context.Background(), client.Kubernetes(), nil, nil, nil, nil)
}
// update k8s node labels gpu.bytetrade.io/driver and gpu.bytetrade.io/cuda.
// if labels are not exists, create it.
func UpdateNodeGpuLabel(ctx context.Context, client kubernetes.Interface, driver, cuda *string, supported *string) error {
func UpdateNodeGpuLabel(ctx context.Context, client kubernetes.Interface, driver, cuda *string, supported *string, gpuType *string) error {
// get node name from hostname
nodeName, err := os.Hostname()
if err != nil {
@@ -408,6 +418,7 @@ func UpdateNodeGpuLabel(ctx context.Context, client kubernetes.Interface, driver
{GpuDriverLabel, driver},
{GpuCudaLabel, cuda},
{GpuCudaSupportedLabel, supported},
{GpuType, gpuType},
} {
old, ok := labels[label.key]
switch {

View File

@@ -8,4 +8,13 @@ var (
GpuDriverLabel = GpuLabelGroup + "/driver"
GpuCudaLabel = GpuLabelGroup + "/cuda"
GpuCudaSupportedLabel = GpuLabelGroup + "/cuda-supported"
GpuType = GpuLabelGroup + "/type"
)
const (
NvidiaCardType = "nvidia" // handling by HAMi
AmdGpuCardType = "amd-gpu" //
AmdApuCardType = "amd-apu" // AMD APU with integrated GPU , AI Max 395 etc.
GB10ChipType = "nvidia-gb10" // NVIDIA GB10 Superchip & unified system memory
StrixHaloChipType = "strix-halo" // AMD Strix Halo GPU & unified system memory
)

View File

@@ -1,6 +1,7 @@
package cluster
import (
"github.com/beclab/Olares/cli/pkg/amdgpu"
"github.com/beclab/Olares/cli/pkg/common"
"github.com/beclab/Olares/cli/pkg/core/module"
"github.com/beclab/Olares/cli/pkg/gpu"
@@ -58,6 +59,12 @@ func (l *linuxInstallPhaseBuilder) installGpuPlugin() phase {
return []module.Module{
&gpu.RestartK3sServiceModule{Skip: !(l.runtime.Arg.Kubetype == common.K3s)},
&gpu.InstallPluginModule{Skip: skipGpuPlugin},
&amdgpu.InstallAmdPluginModule{Skip: func() bool {
if l.runtime.GetSystemInfo().IsAmdGPUOrAPU() {
return false
}
return true
}()},
}
}

View File

@@ -83,6 +83,13 @@ func (l *linuxPhaseBuilder) build() []module.Module {
addModule(gpuModuleBuilder(func() []module.Module {
return []module.Module{
&amdgpu.InstallAmdRocmModule{},
&amdgpu.InstallAmdContainerToolkitModule{Skip: func() bool {
if l.runtime.GetSystemInfo().IsAmdGPUOrAPU() {
return false
}
return true
}(),
},
&gpu.InstallDriversModule{
ManifestModule: manifest.ManifestModule{
Manifest: l.manifestMap,

View File

@@ -9,7 +9,6 @@ import (
"github.com/beclab/Olares/cli/pkg/core/connector"
"github.com/beclab/Olares/cli/pkg/core/logger"
"github.com/beclab/Olares/cli/pkg/core/task"
"github.com/beclab/Olares/cli/pkg/utils"
)
type WelcomeMessage struct {
@@ -73,7 +72,7 @@ func (t *WelcomeMessage) Execute(runtime connector.Runtime) error {
// If AMD GPU on Ubuntu 22.04/24.04, print warning about reboot for ROCm
if si := runtime.GetSystemInfo(); si.IsUbuntu() && (si.IsUbuntuVersionEqual(connector.Ubuntu2204) || si.IsUbuntuVersionEqual(connector.Ubuntu2404)) {
if hasAmd, _ := utils.HasAmdIGPU(runtime); hasAmd {
if hasAmd, _ := connector.HasAmdAPUOrGPU(runtime); hasAmd {
logger.Warnf("\x1b[31mWarning: To enable ROCm, please reboot your machine after activation.\x1b[0m")
fmt.Println()
}

View File

@@ -380,7 +380,7 @@ func (a *upgradeGPUDriverIfNeeded) Execute(runtime connector.Runtime) error {
if err != nil {
return errors.Wrap(errors.WithStack(err), "kubeclient create error")
}
err = gpu.UpdateNodeGpuLabel(context.Background(), client.Kubernetes(), &targetDriverVersionStr, ptr.To(common.CurrentVerifiedCudaVersion), ptr.To("true"))
err = gpu.UpdateNodeGpuLabel(context.Background(), client.Kubernetes(), &targetDriverVersionStr, ptr.To(common.CurrentVerifiedCudaVersion), ptr.To("true"), ptr.To(gpu.NvidiaCardType))
if err != nil {
return err
}

View File

@@ -1,67 +0,0 @@
package utils
import (
"fmt"
"os"
"strings"
"github.com/Masterminds/semver/v3"
"github.com/beclab/Olares/cli/pkg/core/connector"
)
func HasAmdIGPU(execRuntime connector.Runtime) (bool, error) {
// Detect by CPU model names that bundle AMD AI NPU/graphics
targets := []string{
"AMD Ryzen AI Max+ 395",
"AMD Ryzen AI Max 390",
"AMD Ryzen AI Max 385",
"AMD Ryzen AI 9 HX 375",
"AMD Ryzen AI 9 HX 370",
"AMD Ryzen AI 9 365",
}
// try lscpu first: extract 'Model name' field
out, err := execRuntime.GetRunner().SudoCmd("lscpu 2>/dev/null | awk -F': *' '/^Model name/{print $2; exit}' || true", false, false)
if err != nil {
return false, err
}
if out != "" {
lo := strings.ToLower(strings.TrimSpace(out))
for _, t := range targets {
if strings.Contains(lo, strings.ToLower(t)) {
return true, nil
}
}
}
// fallback to /proc/cpuinfo
out, err = execRuntime.GetRunner().SudoCmd("awk -F': *' '/^model name/{print $2; exit}' /proc/cpuinfo 2>/dev/null || true", false, false)
if err != nil {
return false, err
}
if out != "" {
lo := strings.ToLower(strings.TrimSpace(out))
for _, t := range targets {
if strings.Contains(lo, strings.ToLower(t)) {
return true, nil
}
}
}
return false, nil
}
func RocmVersion() (*semver.Version, error) {
const rocmVersionFile = "/opt/rocm/.info/version"
data, err := os.ReadFile(rocmVersionFile)
if err != nil {
// no ROCm installed, nothing to check
if os.IsNotExist(err) {
return nil, err
}
return nil, err
}
curStr := strings.TrimSpace(string(data))
cur, err := semver.NewVersion(curStr)
if err != nil {
return nil, fmt.Errorf("invalid rocm version: %s", curStr)
}
return cur, nil
}

View File

@@ -1,7 +1,7 @@
project_name: olaresd
builds:
- env:
- CGO_ENABLED=0
- CGO_ENABLED=1
# - CC=aarch64-linux-gnu-gcc
# - CXX=aarch64-linux-gnu-g++
main: ./cmd/terminusd/main.go
@@ -17,6 +17,12 @@ builds:
goamd64: v1
env:
- CGO_ENABLED=1
- goarch: arm64
goos: linux
env:
- CGO_ENABLED=1
- CC=aarch64-linux-gnu-gcc
- CXX=aarch64-linux-gnu-g++
tags:
containers_image_openpgp
ldflags:

View File

@@ -20,6 +20,9 @@ build: fmt vet ;$(info $(M)...Begin to build terminusd.) @
build-linux: fmt vet ;$(info $(M)...Begin to build terminusd (linux version).) @
CGO_ENABLED=1 GOOS=linux GOARCH=amd64 go build -o bin/olaresd cmd/terminusd/main.go
build-arm: fmt vet ;$(info $(M)...Begin to build terminusd (linux version).) @
CGO_ENABLED=1 GOOS=linux GOARCH=arm64 go build -o bin/olaresd cmd/terminusd/main.go
build-linux-in-docker:
docker run -it --platform linux/amd64 --rm \
-v $(current_dir):/olaresd \
@@ -27,3 +30,11 @@ build-linux-in-docker:
-e DEBIAN_FRONTEND=noninteractive \
golang:1.24.11 \
sh -c "apt-get -y update; apt-get -y install libudev-dev libpcap-dev; make build-linux"
build-arm-in-docker:
docker run -it --platform linux/arm64 --rm \
-v $(current_dir):/olaresd \
-w /olaresd \
-e DEBIAN_FRONTEND=noninteractive \
golang:1.24.11 \
sh -c "apt-get -y update; apt-get -y install libudev-dev libpcap-dev; make build-arm"

View File

@@ -31,7 +31,7 @@ require (
github.com/jaypipes/ghw v0.13.0
github.com/jochenvg/go-udev v0.0.0-20171110120927-d6b62d56d37b
github.com/joho/godotenv v1.5.1
github.com/klauspost/cpuid/v2 v2.2.8
github.com/klauspost/cpuid/v2 v2.3.0
github.com/labstack/echo/v4 v4.0.0-00010101000000-000000000000
github.com/libp2p/go-netroute v0.2.2
github.com/mackerelio/go-osstat v0.2.5

View File

@@ -205,8 +205,8 @@ github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI
github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck=
github.com/klauspost/compress v1.18.0 h1:c/Cqfb0r+Yi+JtIEq73FWXVkRonBlf0CRNYc8Zttxdo=
github.com/klauspost/compress v1.18.0/go.mod h1:2Pp+KzxcywXVXMr50+X0Q/Lsb43OQHYWRCY2AiWywWQ=
github.com/klauspost/cpuid/v2 v2.2.8 h1:+StwCXwm9PdpiEkPyzBXIy+M9KUb4ODm0Zarf1kS5BM=
github.com/klauspost/cpuid/v2 v2.2.8/go.mod h1:Lcz8mBdAVJIBVzewtcLocK12l3Y+JytZYpaMropDUws=
github.com/klauspost/cpuid/v2 v2.3.0 h1:S4CRMLnYUhGeDFDqkGriYKdfoFlDnMtqTiI/sFzhA9Y=
github.com/klauspost/cpuid/v2 v2.3.0/go.mod h1:hqwkgyIinND0mEev00jJYCxPNVRVXFQeu1XKlok6oO0=
github.com/konsorten/go-windows-terminal-sequences v1.0.1/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ=
github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE=
github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk=
@@ -471,7 +471,6 @@ golang.org/x/sys v0.0.0-20211025201205-69cdffdb9359/go.mod h1:oPkhp1MJrh7nUepCBc
golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20220908164124-27713097b956/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.2.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.10.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.38.0 h1:3yZWxaJjBmCWXqhN1qh02AkOnCQ1poK6oF+a7xWL6Gc=

View File

@@ -1,5 +1,5 @@
//go:build !(linux && amd64)
// +build !linux !amd64
//go:build !linux
// +build !linux
package intranet

View File

@@ -1,5 +1,5 @@
//go:build linux && amd64
// +build linux,amd64
//go:build linux
// +build linux
package intranet

View File

@@ -42,7 +42,7 @@ func detectdStorageDevices(ctx context.Context, bus string) (usbDevs []storageDe
for _, d := range ds {
if d.Properties()["ID_BUS"] == bus {
usbs = append(usbs, d)
} else if d.Properties()["ID_BUS"] == "ata" &&
} else if (d.Properties()["ID_BUS"] == "ata" || d.Properties()["ID_BUS"] == "scsi") &&
d.Properties()["ID_USB_TYPE"] == "disk" &&
bus == "usb" {
usbs = append(usbs, d)
@@ -97,14 +97,18 @@ func detectdStorageDevices(ctx context.Context, bus string) (usbDevs []storageDe
idSerial := device.Properties()["ID_SERIAL"]
idSerialShort := device.Properties()["ID_SERIAL_SHORT"]
idUsbSerial := device.Properties()["ID_USB_SERIAL"]
idUsbSerialShort := device.Properties()["ID_USB_SERIAL_SHORT"]
partUUID := device.Properties()["ID_PART_ENTRY_UUID"]
usbDevs = append(usbDevs, storageDevice{
DevPath: devPath,
Vender: vender,
IDSerial: idSerial,
IDSerialShort: idSerialShort,
PartitionUUID: partUUID,
DevPath: devPath,
Vender: vender,
IDSerial: idSerial,
IDSerialShort: idSerialShort,
IDUsbSerial: idUsbSerial,
IDUsbSerialShort: idUsbSerialShort,
PartitionUUID: partUUID,
})
}
@@ -199,7 +203,10 @@ func MountedHddPath(ctx context.Context) ([]string, error) {
func FilterBySerial(serial string) func(dev storageDevice) bool {
return func(dev storageDevice) bool {
return strings.HasSuffix(serial, dev.IDSerial) || strings.HasSuffix(serial, dev.IDSerialShort)
return strings.HasSuffix(serial, dev.IDSerial) ||
strings.HasSuffix(serial, dev.IDSerialShort) ||
strings.HasSuffix(serial, dev.IDUsbSerial) ||
strings.HasSuffix(serial, dev.IDUsbSerialShort)
}
}

View File

@@ -3,11 +3,13 @@ package utils
import "strings"
type storageDevice struct {
DevPath string
Vender string
IDSerial string
IDSerialShort string
PartitionUUID string
DevPath string
Vender string
IDSerial string
IDSerialShort string
IDUsbSerial string
IDUsbSerialShort string
PartitionUUID string
}
type mountedPath struct {

View File

@@ -4,7 +4,7 @@ nameOverride: ""
fullnameOverride: ""
namespaceOverride: ""
imagePullSecrets: []
version: "v2.6.10"
version: "v2.6.11"
# Nvidia GPU Parameters
resourceName: "nvidia.com/gpu"

View File

@@ -0,0 +1,40 @@
apiVersion: apps/v1
kind: DaemonSet
metadata:
name: amdgpu-device-plugin
namespace: kube-system
spec:
selector:
matchLabels:
name: amdgpu-dp-ds
template:
metadata:
labels:
name: amdgpu-dp-ds
spec:
restartPolicy: Always
nodeSelector:
kubernetes.io/arch: amd64
priorityClassName: system-node-critical
tolerations:
- key: CriticalAddonsOnly
operator: Exists
containers:
- image: rocm/k8s-device-plugin
name: amdgpu-dp-cntr
securityContext:
privileged: true
capabilities:
drop: ["ALL"]
volumeMounts:
- name: dp
mountPath: /var/lib/kubelet/device-plugins
- name: sys
mountPath: /sys
volumes:
- name: dp
hostPath:
path: /var/lib/kubelet/device-plugins
- name: sys
hostPath:
path: /sys

View File

@@ -3,7 +3,7 @@ target: prebuilt
output:
containers:
-
name: beclab/hami:v2.6.10
name: beclab/hami:v2.6.11
-
name: beclab/hami-webui-fe-oss:v1.0.8
-