Compare commits
41 Commits
module-l4-
...
cli/feat/i
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
679bf8b662 | ||
|
|
51137311ac | ||
|
|
69080d5ba3 | ||
|
|
80798daaf3 | ||
|
|
8270785303 | ||
|
|
c94e4300c7 | ||
|
|
e27dd6bb5c | ||
|
|
c00eec3efd | ||
|
|
5855b8ad0a | ||
|
|
0bb131aa76 | ||
|
|
ffaedaf889 | ||
|
|
1aa3e2dc89 | ||
|
|
476aee0e8e | ||
|
|
45428fd29f | ||
|
|
0edae500ef | ||
|
|
8f3744fb0f | ||
|
|
1c971d8dac | ||
|
|
9dd0a2715b | ||
|
|
b378a09a8c | ||
|
|
06b3c4474a | ||
|
|
5bd7561fcc | ||
|
|
3f9e38f14c | ||
|
|
9d59d566c5 | ||
|
|
989f54059e | ||
|
|
89041683a6 | ||
|
|
b79f36deb8 | ||
|
|
4beb69747d | ||
|
|
0776188aa8 | ||
|
|
90408a9f1f | ||
|
|
997c9d4142 | ||
|
|
0fa4f298a9 | ||
|
|
4004acf8d6 | ||
|
|
9c0b2d4b62 | ||
|
|
7037a7afa0 | ||
|
|
5159ecb753 | ||
|
|
979cd37ce1 | ||
|
|
f33a1f1a00 | ||
|
|
dee6474fda | ||
|
|
1102751eb0 | ||
|
|
9194c0c04c | ||
|
|
884f4f5585 |
121
cli/pkg/amdgpu/module.go
Normal file
121
cli/pkg/amdgpu/module.go
Normal file
@@ -0,0 +1,121 @@
|
||||
package amdgpu
|
||||
|
||||
import (
|
||||
"time"
|
||||
|
||||
"github.com/beclab/Olares/cli/pkg/common"
|
||||
"github.com/beclab/Olares/cli/pkg/core/prepare"
|
||||
"github.com/beclab/Olares/cli/pkg/core/task"
|
||||
)
|
||||
|
||||
// InstallAmdContainerToolkitModule installs AMD container toolkit on supported Ubuntu if ROCm is installed.
|
||||
type InstallAmdContainerToolkitModule struct {
|
||||
common.KubeModule
|
||||
Skip bool // conditional execution based on ROCm detection
|
||||
SkipRocmCheck bool
|
||||
}
|
||||
|
||||
func (m *InstallAmdContainerToolkitModule) IsSkip() bool {
|
||||
return m.Skip
|
||||
}
|
||||
|
||||
func (m *InstallAmdContainerToolkitModule) Init() {
|
||||
m.Name = "InstallAmdContainerToolkit"
|
||||
if m.IsSkip() {
|
||||
return
|
||||
}
|
||||
|
||||
prepareCollection := prepare.PrepareCollection{}
|
||||
if !m.SkipRocmCheck {
|
||||
prepareCollection = append(prepareCollection, new(RocmInstalled))
|
||||
}
|
||||
|
||||
updateAmdSource := &task.RemoteTask{
|
||||
Name: "UpdateAmdContainerToolkitSource",
|
||||
Hosts: m.Runtime.GetHostsByRole(common.Master),
|
||||
Action: new(UpdateAmdContainerToolkitSource),
|
||||
Prepare: &prepareCollection,
|
||||
Parallel: false,
|
||||
Retry: 1,
|
||||
}
|
||||
|
||||
installAmdContainerToolkit := &task.RemoteTask{
|
||||
Name: "InstallAmdContainerToolkit",
|
||||
Hosts: m.Runtime.GetHostsByRole(common.Master),
|
||||
Prepare: &prepareCollection,
|
||||
Action: new(InstallAmdContainerToolkit),
|
||||
Parallel: false,
|
||||
Retry: 1,
|
||||
}
|
||||
|
||||
generateAndValidateCDI := &task.RemoteTask{
|
||||
Name: "GenerateAndValidateAmdCDI",
|
||||
Hosts: m.Runtime.GetHostsByRole(common.Master),
|
||||
Prepare: &prepareCollection,
|
||||
Action: new(GenerateAndValidateAmdCDI),
|
||||
Parallel: false,
|
||||
Retry: 1,
|
||||
}
|
||||
|
||||
m.Tasks = []task.Interface{
|
||||
updateAmdSource,
|
||||
installAmdContainerToolkit,
|
||||
generateAndValidateCDI,
|
||||
}
|
||||
}
|
||||
|
||||
// InstallAmdPluginModule installs AMD GPU device plugin on Kubernetes.
|
||||
type InstallAmdPluginModule struct {
|
||||
common.KubeModule
|
||||
Skip bool // conditional execution based on GPU enablement
|
||||
}
|
||||
|
||||
func (m *InstallAmdPluginModule) IsSkip() bool {
|
||||
return m.Skip
|
||||
}
|
||||
|
||||
func (m *InstallAmdPluginModule) Init() {
|
||||
m.Name = "InstallAmdPlugin"
|
||||
|
||||
// update node with AMD GPU labels
|
||||
updateNode := &task.RemoteTask{
|
||||
Name: "UpdateNodeAmdGPUInfo",
|
||||
Hosts: m.Runtime.GetHostsByRole(common.Master),
|
||||
Prepare: &prepare.PrepareCollection{
|
||||
new(common.OnlyFirstMaster),
|
||||
},
|
||||
Action: new(UpdateNodeAmdGPUInfo),
|
||||
Parallel: false,
|
||||
Retry: 1,
|
||||
}
|
||||
|
||||
installPlugin := &task.RemoteTask{
|
||||
Name: "InstallAmdPlugin",
|
||||
Hosts: m.Runtime.GetHostsByRole(common.Master),
|
||||
Prepare: &prepare.PrepareCollection{
|
||||
new(common.OnlyFirstMaster),
|
||||
},
|
||||
Action: new(InstallAmdPlugin),
|
||||
Parallel: false,
|
||||
Retry: 1,
|
||||
}
|
||||
|
||||
checkGpuState := &task.RemoteTask{
|
||||
Name: "CheckAmdGPUState",
|
||||
Hosts: m.Runtime.GetHostsByRole(common.Master),
|
||||
Prepare: &prepare.PrepareCollection{
|
||||
new(common.OnlyFirstMaster),
|
||||
new(RocmInstalled),
|
||||
},
|
||||
Action: new(CheckAmdGpuStatus),
|
||||
Parallel: false,
|
||||
Retry: 50,
|
||||
Delay: 10 * time.Second,
|
||||
}
|
||||
|
||||
m.Tasks = []task.Interface{
|
||||
updateNode,
|
||||
installPlugin,
|
||||
checkGpuState,
|
||||
}
|
||||
}
|
||||
56
cli/pkg/amdgpu/prepares.go
Normal file
56
cli/pkg/amdgpu/prepares.go
Normal file
@@ -0,0 +1,56 @@
|
||||
package amdgpu
|
||||
|
||||
import (
|
||||
"github.com/beclab/Olares/cli/pkg/bootstrap/precheck"
|
||||
"github.com/beclab/Olares/cli/pkg/common"
|
||||
"github.com/beclab/Olares/cli/pkg/core/connector"
|
||||
"github.com/beclab/Olares/cli/pkg/core/logger"
|
||||
)
|
||||
|
||||
// RocmInstalled checks if AMD ROCm is installed on the system.
|
||||
type RocmInstalled struct {
|
||||
common.KubePrepare
|
||||
}
|
||||
|
||||
func (p *RocmInstalled) PreCheck(runtime connector.Runtime) (bool, error) {
|
||||
rocmV, err := connector.RocmVersion()
|
||||
if err != nil {
|
||||
logger.Debugf("ROCm version check error: %v", err)
|
||||
return false, nil
|
||||
}
|
||||
if rocmV == nil {
|
||||
return false, nil
|
||||
}
|
||||
|
||||
logger.Infof("Detected ROCm version: %s", rocmV.Original())
|
||||
return true, nil
|
||||
}
|
||||
|
||||
// RocmNotInstalled checks if AMD ROCm is NOT installed on the system.
|
||||
type RocmNotInstalled struct {
|
||||
common.KubePrepare
|
||||
RocmInstalled
|
||||
}
|
||||
|
||||
func (p *RocmNotInstalled) PreCheck(runtime connector.Runtime) (bool, error) {
|
||||
installed, err := p.RocmInstalled.PreCheck(runtime)
|
||||
if err != nil {
|
||||
return false, err
|
||||
}
|
||||
return !installed, nil
|
||||
}
|
||||
|
||||
// ContainerdInstalled checks if containerd is installed on the system.
|
||||
type ContainerdInstalled struct {
|
||||
common.KubePrepare
|
||||
}
|
||||
|
||||
func (p *ContainerdInstalled) PreCheck(runtime connector.Runtime) (bool, error) {
|
||||
containerdCheck := precheck.ConflictingContainerdCheck{}
|
||||
if err := containerdCheck.Check(runtime); err != nil {
|
||||
return true, nil
|
||||
}
|
||||
|
||||
logger.Info("containerd is not installed, ignore task")
|
||||
return false, nil
|
||||
}
|
||||
@@ -1,17 +1,20 @@
|
||||
package amdgpu
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"os/exec"
|
||||
"path"
|
||||
"path/filepath"
|
||||
|
||||
"github.com/beclab/Olares/cli/pkg/clientset"
|
||||
"github.com/beclab/Olares/cli/pkg/common"
|
||||
cc "github.com/beclab/Olares/cli/pkg/core/common"
|
||||
"github.com/beclab/Olares/cli/pkg/core/connector"
|
||||
"github.com/beclab/Olares/cli/pkg/core/logger"
|
||||
"github.com/beclab/Olares/cli/pkg/core/task"
|
||||
"github.com/beclab/Olares/cli/pkg/utils"
|
||||
"github.com/beclab/Olares/cli/pkg/core/util"
|
||||
"github.com/beclab/Olares/cli/pkg/gpu"
|
||||
|
||||
"github.com/Masterminds/semver/v3"
|
||||
"github.com/pkg/errors"
|
||||
@@ -26,8 +29,8 @@ func (m *InstallAmdRocmModule) Init() {
|
||||
m.Name = "InstallAMDGPU"
|
||||
|
||||
installAmd := &task.RemoteTask{
|
||||
Name: "InstallAmdRocm",
|
||||
Hosts: m.Runtime.GetHostsByRole(common.Master),
|
||||
Name: "InstallAmdRocm",
|
||||
Hosts: m.Runtime.GetHostsByRole(common.Master),
|
||||
Action: &InstallAmdRocm{
|
||||
// no manifest needed
|
||||
},
|
||||
@@ -51,7 +54,7 @@ func (t *InstallAmdRocm) Execute(runtime connector.Runtime) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
amdGPUExists, err := utils.HasAmdIGPU(runtime)
|
||||
amdGPUExists, err := connector.HasAmdAPUOrGPU(runtime)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
@@ -59,7 +62,7 @@ func (t *InstallAmdRocm) Execute(runtime connector.Runtime) error {
|
||||
if !amdGPUExists {
|
||||
return nil
|
||||
}
|
||||
rocmV, _ := utils.RocmVersion()
|
||||
rocmV, _ := connector.RocmVersion()
|
||||
min := semver.MustParse("7.1.1")
|
||||
if rocmV != nil && rocmV.LessThan(min) {
|
||||
return fmt.Errorf("detected ROCm version %s, which is lower than required %s; please uninstall existing ROCm/AMDGPU components before installation with command: olares-cli amdgpu uninstall", rocmV.Original(), min.Original())
|
||||
@@ -131,3 +134,163 @@ func (t *AmdgpuUninstallAction) Execute(runtime connector.Runtime) error {
|
||||
logger.Warn("Warning: Please reboot your machine after uninstall to fully remove ROCm components.")
|
||||
return nil
|
||||
}
|
||||
|
||||
// UpdateAmdContainerToolkitSource configures the AMD container toolkit APT repository.
|
||||
type UpdateAmdContainerToolkitSource struct {
|
||||
common.KubeAction
|
||||
}
|
||||
|
||||
func (t *UpdateAmdContainerToolkitSource) Execute(runtime connector.Runtime) error {
|
||||
// Install prerequisites
|
||||
if _, err := runtime.GetRunner().SudoCmd("apt update && apt install -y wget gnupg2", false, true); err != nil {
|
||||
return errors.Wrap(errors.WithStack(err), "failed to install prerequisites for AMD container toolkit")
|
||||
}
|
||||
|
||||
if _, err := runtime.GetRunner().SudoCmd("install -d -m 0755 /etc/apt/keyrings", false, true); err != nil {
|
||||
return errors.Wrap(errors.WithStack(err), "failed to create /etc/apt/keyrings directory")
|
||||
}
|
||||
|
||||
cmd := "wget https://repo.radeon.com/rocm/rocm.gpg.key -O - | gpg --dearmor | tee /etc/apt/keyrings/rocm.gpg > /dev/null"
|
||||
if _, err := runtime.GetRunner().SudoCmd(cmd, false, true); err != nil {
|
||||
return errors.Wrap(errors.WithStack(err), "failed to download and install AMD ROCm GPG key")
|
||||
}
|
||||
|
||||
si := runtime.GetSystemInfo()
|
||||
var ubuntuCodename string
|
||||
if si.IsUbuntuVersionEqual(connector.Ubuntu2404) {
|
||||
ubuntuCodename = "noble"
|
||||
} else if si.IsUbuntuVersionEqual(connector.Ubuntu2204) {
|
||||
ubuntuCodename = "jammy"
|
||||
} else {
|
||||
return fmt.Errorf("unsupported Ubuntu version for AMD container toolkit")
|
||||
}
|
||||
|
||||
aptSourceLine := fmt.Sprintf("deb [signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/amd-container-toolkit/apt/ %s main", ubuntuCodename)
|
||||
cmd = fmt.Sprintf("echo '%s' > /etc/apt/sources.list.d/amd-container-toolkit.list", aptSourceLine)
|
||||
if _, err := runtime.GetRunner().SudoCmd(cmd, false, true); err != nil {
|
||||
return errors.Wrap(errors.WithStack(err), "failed to add AMD container toolkit APT source")
|
||||
}
|
||||
|
||||
logger.Infof("AMD container toolkit repository configured successfully")
|
||||
return nil
|
||||
}
|
||||
|
||||
// InstallAmdContainerToolkit installs the AMD container toolkit package.
|
||||
type InstallAmdContainerToolkit struct {
|
||||
common.KubeAction
|
||||
}
|
||||
|
||||
func (t *InstallAmdContainerToolkit) Execute(runtime connector.Runtime) error {
|
||||
logger.Infof("Installing AMD container toolkit...")
|
||||
if _, err := runtime.GetRunner().SudoCmd("apt update && apt install -y amd-container-toolkit", false, true); err != nil {
|
||||
return errors.Wrap(errors.WithStack(err), "failed to install AMD container toolkit")
|
||||
}
|
||||
logger.Infof("AMD container toolkit installed successfully")
|
||||
return nil
|
||||
}
|
||||
|
||||
// GenerateAndValidateAmdCDI generates and validates the AMD CDI spec.
|
||||
type GenerateAndValidateAmdCDI struct {
|
||||
common.KubeAction
|
||||
}
|
||||
|
||||
func (t *GenerateAndValidateAmdCDI) Execute(runtime connector.Runtime) error {
|
||||
// Ensure /etc/cdi directory exists
|
||||
if _, err := runtime.GetRunner().SudoCmd("install -d -m 0755 /etc/cdi", false, true); err != nil {
|
||||
return errors.Wrap(errors.WithStack(err), "failed to create /etc/cdi directory")
|
||||
}
|
||||
|
||||
// Generate CDI spec
|
||||
logger.Infof("Generating AMD CDI spec...")
|
||||
if _, err := runtime.GetRunner().SudoCmd("amd-ctk cdi generate --output=/etc/cdi/amd.json", false, true); err != nil {
|
||||
return errors.Wrap(errors.WithStack(err), "failed to generate AMD CDI spec")
|
||||
}
|
||||
|
||||
// Validate CDI spec
|
||||
logger.Infof("Validating AMD CDI spec...")
|
||||
if _, err := runtime.GetRunner().SudoCmd("amd-ctk cdi validate --path=/etc/cdi/amd.json", false, true); err != nil {
|
||||
return errors.Wrap(errors.WithStack(err), "failed to validate AMD CDI spec")
|
||||
}
|
||||
|
||||
logger.Infof("AMD CDI spec generated and validated successfully")
|
||||
return nil
|
||||
}
|
||||
|
||||
// UpdateNodeAmdGPUInfo updates Kubernetes node labels with AMD GPU information.
|
||||
type UpdateNodeAmdGPUInfo struct {
|
||||
common.KubeAction
|
||||
}
|
||||
|
||||
func (u *UpdateNodeAmdGPUInfo) Execute(runtime connector.Runtime) error {
|
||||
client, err := clientset.NewKubeClient()
|
||||
if err != nil {
|
||||
return errors.Wrap(errors.WithStack(err), "kubeclient create error")
|
||||
}
|
||||
|
||||
// Check if AMD GPU/APU exists
|
||||
amdGPUExists, err := connector.HasAmdAPUOrGPU(runtime)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if !amdGPUExists {
|
||||
logger.Info("AMD GPU/APU is not detected")
|
||||
return nil
|
||||
}
|
||||
|
||||
// Get ROCm version
|
||||
rocmV, err := connector.RocmVersion()
|
||||
if err != nil || rocmV == nil {
|
||||
logger.Info("ROCm is not installed")
|
||||
return nil
|
||||
}
|
||||
|
||||
rocmVersion := rocmV.Original()
|
||||
|
||||
// Determine GPU type (APU vs discrete GPU)
|
||||
gpuType := gpu.AmdGpuCardType
|
||||
if runtime.GetSystemInfo().IsAmdApu() {
|
||||
gpuType = gpu.AmdApuCardType
|
||||
}
|
||||
|
||||
// Use ROCm version as both driver and "cuda" version for AMD
|
||||
return gpu.UpdateNodeGpuLabel(context.Background(), client.Kubernetes(), &rocmVersion, nil, nil, &gpuType)
|
||||
}
|
||||
|
||||
// InstallAmdPlugin installs the AMD GPU device plugin DaemonSet.
|
||||
type InstallAmdPlugin struct {
|
||||
common.KubeAction
|
||||
}
|
||||
|
||||
func (t *InstallAmdPlugin) Execute(runtime connector.Runtime) error {
|
||||
amdPluginPath := path.Join(runtime.GetInstallerDir(), "wizard/config/gpu/nvidia/amdgpu-device-plugin.yaml")
|
||||
_, err := runtime.GetRunner().SudoCmd(fmt.Sprintf("kubectl apply -f %s", amdPluginPath), false, true)
|
||||
if err != nil {
|
||||
return errors.Wrap(errors.WithStack(err), "failed to apply AMD GPU device plugin")
|
||||
}
|
||||
|
||||
logger.Infof("AMD GPU device plugin installed successfully")
|
||||
return nil
|
||||
}
|
||||
|
||||
// CheckAmdGpuStatus checks if the AMD GPU device plugin pod is running.
|
||||
type CheckAmdGpuStatus struct {
|
||||
common.KubeAction
|
||||
}
|
||||
|
||||
func (t *CheckAmdGpuStatus) Execute(runtime connector.Runtime) error {
|
||||
kubectlpath, err := util.GetCommand(common.CommandKubectl)
|
||||
if err != nil {
|
||||
return fmt.Errorf("kubectl not found")
|
||||
}
|
||||
|
||||
// Check AMD device plugin pod status using the label from amdgpu-device-plugin.yaml
|
||||
selector := "name=amdgpu-dp-ds"
|
||||
cmd := fmt.Sprintf("%s get pod -n kube-system -l '%s' -o jsonpath='{.items[*].status.phase}'", kubectlpath, selector)
|
||||
|
||||
rphase, _ := runtime.GetRunner().SudoCmd(cmd, false, false)
|
||||
if rphase == "Running" {
|
||||
logger.Infof("AMD GPU device plugin is running")
|
||||
return nil
|
||||
}
|
||||
return fmt.Errorf("AMD GPU device plugin state is not Running (current: %s)", rphase)
|
||||
}
|
||||
|
||||
@@ -59,7 +59,7 @@ func (t *PatchTask) Execute(runtime connector.Runtime) error {
|
||||
pre_reqs = pre_reqs + " network-manager "
|
||||
}
|
||||
|
||||
pre_reqs += " conntrack socat apache2-utils net-tools make gcc bison flex tree unzip "
|
||||
pre_reqs += " conntrack socat apache2-utils net-tools make gcc bison flex tree unzip lshw"
|
||||
|
||||
var systemInfo = runtime.GetSystemInfo()
|
||||
var platformFamily = systemInfo.GetOsPlatformFamily()
|
||||
|
||||
@@ -338,7 +338,9 @@ func (c *CudaChecker) Name() string {
|
||||
}
|
||||
|
||||
func (c *CudaChecker) Check(runtime connector.Runtime) error {
|
||||
if !runtime.GetSystemInfo().IsLinux() {
|
||||
if !runtime.GetSystemInfo().IsLinux() ||
|
||||
// Skip check on NVIDIA DGX Spark systems, which have their own GPU management
|
||||
runtime.GetSystemInfo().IsGB10Chip() {
|
||||
return nil
|
||||
}
|
||||
|
||||
@@ -388,17 +390,17 @@ func (r *RocmChecker) Check(runtime connector.Runtime) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
// detect AMD GPU presence
|
||||
amdGPUExists, err := utils.HasAmdIGPU(runtime)
|
||||
// detect AMD APU/GPU presence
|
||||
amdGPUExists, err := connector.HasAmdAPUOrGPU(runtime)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
// no AMD GPU found, no need to check rocm
|
||||
// no AMD APU/GPU found, no need to check rocm
|
||||
if !amdGPUExists {
|
||||
return nil
|
||||
}
|
||||
|
||||
curV, err := utils.RocmVersion()
|
||||
curV, err := connector.RocmVersion()
|
||||
if err != nil && !os.IsNotExist(err) {
|
||||
return err
|
||||
}
|
||||
|
||||
@@ -210,6 +210,7 @@ func NewArgument() *Argument {
|
||||
arg.IsCloudInstance, _ = strconv.ParseBool(os.Getenv(ENV_TERMINUS_IS_CLOUD_VERSION))
|
||||
arg.IsOlaresInContainer = os.Getenv(ENV_CONTAINER_MODE) == "oic"
|
||||
si.IsOIC = arg.IsOlaresInContainer
|
||||
si.ProductName = arg.GetProductName()
|
||||
|
||||
// Ensure BaseDir is initialized before loading master.conf
|
||||
// so master host config can be loaded from ${base-dir}/master.conf reliably.
|
||||
@@ -415,6 +416,57 @@ func (a *Argument) SetSwapConfig(config SwapConfig) {
|
||||
a.Swappiness = config.Swappiness
|
||||
}
|
||||
|
||||
func (a *Argument) SetMasterHostOverride(config MasterHostConfig) {
|
||||
if config.MasterHost != "" {
|
||||
a.MasterHost = config.MasterHost
|
||||
}
|
||||
if config.MasterNodeName != "" {
|
||||
a.MasterNodeName = config.MasterNodeName
|
||||
}
|
||||
|
||||
// set a dummy name to bypass validity checks
|
||||
// as it will be overridden later when the node name is fetched
|
||||
if a.MasterNodeName == "" {
|
||||
a.MasterNodeName = "master"
|
||||
}
|
||||
if config.MasterSSHPassword != "" {
|
||||
a.MasterSSHPassword = config.MasterSSHPassword
|
||||
}
|
||||
if config.MasterSSHUser != "" {
|
||||
a.MasterSSHUser = config.MasterSSHUser
|
||||
}
|
||||
if config.MasterSSHPort != 0 {
|
||||
a.MasterSSHPort = config.MasterSSHPort
|
||||
}
|
||||
if config.MasterSSHPrivateKeyPath != "" {
|
||||
a.MasterSSHPrivateKeyPath = config.MasterSSHPrivateKeyPath
|
||||
}
|
||||
}
|
||||
|
||||
func (a *Argument) LoadMasterHostConfigIfAny() error {
|
||||
if a.BaseDir == "" {
|
||||
return errors.New("basedir unset")
|
||||
}
|
||||
content, err := os.ReadFile(filepath.Join(a.BaseDir, MasterHostConfigFile))
|
||||
if os.IsNotExist(err) {
|
||||
return nil
|
||||
}
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
return json.Unmarshal(content, a.MasterHostConfig)
|
||||
}
|
||||
|
||||
func (a *Argument) GetProductName() string {
|
||||
data, err := os.ReadFile("/sys/class/dmi/id/product_name")
|
||||
if err != nil {
|
||||
fmt.Printf("\nCannot get product name on this device, %s\n", err)
|
||||
return ""
|
||||
}
|
||||
|
||||
return strings.TrimSpace(string(data))
|
||||
}
|
||||
|
||||
func NewKubeRuntime(arg Argument) (*KubeRuntime, error) {
|
||||
loader := NewLoader(arg)
|
||||
cluster, err := loader.Load()
|
||||
|
||||
@@ -98,4 +98,6 @@ const (
|
||||
|
||||
const (
|
||||
ZfsSnapshotter = "/var/lib/containerd/io.containerd.snapshotter.v1.zfs"
|
||||
|
||||
ENV_GB10_CHIP = "GB10_CHIP" // for building images for NVIDIA GB10 Superchip systems
|
||||
)
|
||||
|
||||
117
cli/pkg/core/connector/amdgpu.go
Normal file
117
cli/pkg/core/connector/amdgpu.go
Normal file
@@ -0,0 +1,117 @@
|
||||
package connector
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"os/exec"
|
||||
"strings"
|
||||
|
||||
"github.com/Masterminds/semver/v3"
|
||||
)
|
||||
|
||||
// hasAmdAPU reports whether the CPU model name matches one of the known AMD
// Ryzen AI models, comparing case-insensitively by substring.
//
// cmdExec runs a shell snippet and returns its output. The model name is
// read from lscpu first; /proc/cpuinfo is consulted only when that probe
// does not produce a match. An error from cmdExec aborts the detection and
// is returned as-is.
func hasAmdAPU(cmdExec func(s string) (string, error)) (bool, error) {
	// CPU model names that bundle AMD AI NPU/graphics.
	apuModels := []string{
		"AMD Ryzen AI Max+ 395",
		"AMD Ryzen AI Max 390",
		"AMD Ryzen AI Max 385",
		"AMD Ryzen AI 9 HX 375",
		"AMD Ryzen AI 9 HX 370",
		"AMD Ryzen AI 9 365",
	}

	// Probes in priority order; each prints the first model name it finds.
	probes := []string{
		"lscpu 2>/dev/null | awk -F': *' '/^Model name/{print $2; exit}' || true",
		"awk -F': *' '/^model name/{print $2; exit}' /proc/cpuinfo 2>/dev/null || true",
	}

	for _, probe := range probes {
		out, err := cmdExec(probe)
		if err != nil {
			return false, err
		}
		if out == "" {
			continue
		}
		model := strings.ToLower(strings.TrimSpace(out))
		for _, name := range apuModels {
			if strings.Contains(model, strings.ToLower(name)) {
				return true, nil
			}
		}
	}
	return false, nil
}
|
||||
|
||||
// hasAmdAPUOrGPU reports whether a device with PCI vendor ID 1002 is
// visible. cmdExec runs a shell snippet and returns its output.
//
// lspci is queried first and lshw second; the first probe that prints
// anything decides the answer. An error from cmdExec is returned as-is.
func hasAmdAPUOrGPU(cmdExec func(s string) (string, error)) (bool, error) {
	probes := [...]string{
		"lspci -d '1002:' 2>/dev/null | grep 'AMD' || true",
		"lshw -c display -numeric -disable network 2>/dev/null | grep 'vendor: .* \\[1002\\]' || true",
	}

	for _, probe := range probes {
		found, err := cmdExec(probe)
		if err != nil {
			return false, err
		}
		if found != "" {
			return true, nil
		}
	}
	return false, nil
}
|
||||
|
||||
func HasAmdAPU(execRuntime Runtime) (bool, error) {
|
||||
return hasAmdAPU(func(s string) (string, error) {
|
||||
return execRuntime.GetRunner().SudoCmd(s, false, false)
|
||||
})
|
||||
}
|
||||
|
||||
func HasAmdAPULocal() (bool, error) {
|
||||
return hasAmdAPU(func(s string) (string, error) {
|
||||
out, err := exec.Command("sh", "-c", s).Output()
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
return string(out), nil
|
||||
})
|
||||
}
|
||||
|
||||
func HasAmdAPUOrGPULocal() (bool, error) {
|
||||
return hasAmdAPUOrGPU(func(s string) (string, error) {
|
||||
out, err := exec.Command("sh", "-c", s).Output()
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
return string(out), nil
|
||||
})
|
||||
}
|
||||
|
||||
func HasAmdAPUOrGPU(execRuntime Runtime) (bool, error) {
|
||||
return hasAmdAPUOrGPU(func(s string) (string, error) {
|
||||
return execRuntime.GetRunner().SudoCmd(s, false, false)
|
||||
})
|
||||
}
|
||||
|
||||
func RocmVersion() (*semver.Version, error) {
|
||||
const rocmVersionFile = "/opt/rocm/.info/version"
|
||||
data, err := os.ReadFile(rocmVersionFile)
|
||||
if err != nil {
|
||||
// no ROCm installed, nothing to check
|
||||
if os.IsNotExist(err) {
|
||||
return nil, err
|
||||
}
|
||||
return nil, err
|
||||
}
|
||||
curStr := strings.TrimSpace(string(data))
|
||||
cur, err := semver.NewVersion(curStr)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("invalid rocm version: %s", curStr)
|
||||
}
|
||||
return cur, nil
|
||||
}
|
||||
@@ -76,6 +76,10 @@ type Systems interface {
|
||||
IsPveOrPveLxc() bool
|
||||
IsRaspbian() bool
|
||||
IsLinux() bool
|
||||
IsGB10Chip() bool
|
||||
IsAmdApu() bool
|
||||
IsAmdGPU() bool
|
||||
IsAmdGPUOrAPU() bool
|
||||
|
||||
IsUbuntu() bool
|
||||
IsDebian() bool
|
||||
@@ -111,16 +115,18 @@ type Systems interface {
|
||||
}
|
||||
|
||||
type SystemInfo struct {
|
||||
HostInfo *HostInfo `json:"host"`
|
||||
CpuInfo *CpuInfo `json:"cpu"`
|
||||
DiskInfo *DiskInfo `json:"disk"`
|
||||
MemoryInfo *MemoryInfo `json:"memory"`
|
||||
FsInfo *FileSystemInfo `json:"filesystem"`
|
||||
CgroupInfo *CgroupInfo `json:"cgroup,omitempty"`
|
||||
LocalIp string `json:"local_ip"`
|
||||
NatGateway string `json:"nat_gateway"`
|
||||
PkgManager string `json:"pkg_manager"`
|
||||
IsOIC bool `json:"is_oic,omitempty"`
|
||||
HostInfo *HostInfo `json:"host"`
|
||||
CpuInfo *CpuInfo `json:"cpu"`
|
||||
DiskInfo *DiskInfo `json:"disk"`
|
||||
MemoryInfo *MemoryInfo `json:"memory"`
|
||||
FsInfo *FileSystemInfo `json:"filesystem"`
|
||||
CgroupInfo *CgroupInfo `json:"cgroup,omitempty"`
|
||||
LocalIp string `json:"local_ip"`
|
||||
NatGateway string `json:"nat_gateway"`
|
||||
PkgManager string `json:"pkg_manager"`
|
||||
IsOIC bool `json:"is_oic,omitempty"`
|
||||
ProductName string `json:"product_name,omitempty"`
|
||||
HasAmdGPU bool `json:"has_amd_gpu,omitempty"`
|
||||
}
|
||||
|
||||
func (s *SystemInfo) IsSupport() error {
|
||||
@@ -235,6 +241,22 @@ func (s *SystemInfo) IsLinux() bool {
|
||||
return s.HostInfo.OsType == common.Linux
|
||||
}
|
||||
|
||||
func (s *SystemInfo) IsGB10Chip() bool {
|
||||
return s.CpuInfo.IsGB10Chip
|
||||
}
|
||||
|
||||
func (s *SystemInfo) IsAmdApu() bool {
|
||||
return s.CpuInfo.HasAmdAPU
|
||||
}
|
||||
|
||||
func (s *SystemInfo) IsAmdGPU() bool {
|
||||
return s.HasAmdGPU
|
||||
}
|
||||
|
||||
func (s *SystemInfo) IsAmdGPUOrAPU() bool {
|
||||
return s.CpuInfo.HasAmdAPU || s.HasAmdGPU
|
||||
}
|
||||
|
||||
func (s *SystemInfo) IsUbuntu() bool {
|
||||
return s.HostInfo.OsPlatformFamily == common.Ubuntu
|
||||
}
|
||||
@@ -322,6 +344,12 @@ func GetSystemInfo() *SystemInfo {
|
||||
si.MemoryInfo = getMem()
|
||||
si.FsInfo = getFs()
|
||||
|
||||
hasAmdGPU, err := getAmdGPU()
|
||||
if err != nil {
|
||||
panic(errors.Wrap(err, "failed to get amd apu/gpu"))
|
||||
}
|
||||
si.HasAmdGPU = hasAmdGPU
|
||||
|
||||
localIP, err := util.GetLocalIP()
|
||||
if err != nil {
|
||||
panic(errors.Wrap(err, "failed to get local ip"))
|
||||
@@ -437,6 +465,28 @@ type CpuInfo struct {
|
||||
CpuModel string `json:"cpu_model"`
|
||||
CpuLogicalCount int `json:"cpu_logical_count"`
|
||||
CpuPhysicalCount int `json:"cpu_physical_count"`
|
||||
IsGB10Chip bool `json:"is_gb10_chip,omitempty"`
|
||||
HasAmdAPU bool `json:"has_amd_apu,omitempty"`
|
||||
}
|
||||
|
||||
// Not considering the case where AMD GPU and AMD APU coexist.
|
||||
func getAmdGPU() (bool, error) {
|
||||
APUOrGPUExists, err := HasAmdAPUOrGPULocal()
|
||||
if err != nil {
|
||||
fmt.Printf("Error checking AMD APU/GPU: %v\n", err)
|
||||
return false, err
|
||||
}
|
||||
|
||||
hasAmdAPU, err := HasAmdAPULocal()
|
||||
if err != nil {
|
||||
fmt.Printf("Error checking AMD APU: %v\n", err)
|
||||
return false, err
|
||||
}
|
||||
|
||||
if APUOrGPUExists && !hasAmdAPU {
|
||||
return true, nil
|
||||
}
|
||||
return false, nil
|
||||
}
|
||||
|
||||
func getCpu() *CpuInfo {
|
||||
@@ -452,10 +502,36 @@ func getCpu() *CpuInfo {
|
||||
cpuModel = cpuInfo[0].ModelName
|
||||
}
|
||||
|
||||
// check if is GB10 chip
|
||||
isGB10Chip := false
|
||||
|
||||
// On Linux systems, lspci reports it as "NVIDIA Corporation Device 2e12 (rev a1)"
|
||||
// or NVIDIA Corporation GB20B [GB10] (rev a1)
|
||||
cmd := exec.Command("sh", "-c", "lspci | grep -i vga | egrep 'GB10|2e12'")
|
||||
output, err := cmd.Output()
|
||||
if err == nil && strings.TrimSpace(string(output)) != "" {
|
||||
isGB10Chip = true
|
||||
} else {
|
||||
fmt.Printf("Error checking GB10 chip: %v\n", err)
|
||||
gb10env := os.Getenv(common.ENV_GB10_CHIP)
|
||||
if gb10env == "1" || strings.EqualFold(gb10env, "true") {
|
||||
isGB10Chip = true
|
||||
}
|
||||
}
|
||||
|
||||
// check if it has amd igpu
|
||||
hasAmdAPU, err := HasAmdAPULocal()
|
||||
if err != nil {
|
||||
fmt.Printf("Error checking AMD iGPU: %v\n", err)
|
||||
hasAmdAPU = false
|
||||
}
|
||||
|
||||
return &CpuInfo{
|
||||
CpuModel: cpuModel,
|
||||
CpuLogicalCount: cpuLogicalCount,
|
||||
CpuPhysicalCount: cpuPhysicalCount,
|
||||
IsGB10Chip: isGB10Chip,
|
||||
HasAmdAPU: hasAmdAPU,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -37,6 +37,11 @@ type CudaInstalled struct {
|
||||
}
|
||||
|
||||
func (p *CudaInstalled) PreCheck(runtime connector.Runtime) (bool, error) {
|
||||
if runtime.GetSystemInfo().IsGB10Chip() {
|
||||
logger.Debug("Assume DGX Spark or GB10 OEM system has CUDA installed")
|
||||
return true, nil
|
||||
}
|
||||
|
||||
st, err := utils.GetNvidiaStatus(runtime)
|
||||
if err != nil {
|
||||
return false, err
|
||||
@@ -50,17 +55,15 @@ func (p *CudaInstalled) PreCheck(runtime connector.Runtime) (bool, error) {
|
||||
|
||||
type CudaNotInstalled struct {
|
||||
common.KubePrepare
|
||||
CudaInstalled
|
||||
}
|
||||
|
||||
func (p *CudaNotInstalled) PreCheck(runtime connector.Runtime) (bool, error) {
|
||||
st, err := utils.GetNvidiaStatus(runtime)
|
||||
installed, err := p.CudaInstalled.PreCheck(runtime)
|
||||
if err != nil {
|
||||
return false, err
|
||||
}
|
||||
if st == nil || !st.Installed {
|
||||
return true, nil
|
||||
}
|
||||
return false, nil
|
||||
return !installed, nil
|
||||
}
|
||||
|
||||
type CurrentNodeInK8s struct {
|
||||
|
||||
@@ -325,7 +325,8 @@ func (t *CheckGpuStatus) Execute(runtime connector.Runtime) error {
|
||||
return fmt.Errorf("kubectl not found")
|
||||
}
|
||||
|
||||
cmd := fmt.Sprintf("%s get pod -n kube-system -l 'app.kubernetes.io/component=hami-device-plugin' -o jsonpath='{.items[*].status.phase}'", kubectlpath)
|
||||
selector := "app.kubernetes.io/component=hami-device-plugin"
|
||||
cmd := fmt.Sprintf("%s get pod -n kube-system -l '%s' -o jsonpath='{.items[*].status.phase}'", kubectlpath, selector)
|
||||
|
||||
rphase, _ := runtime.GetRunner().SudoCmd(cmd, false, false)
|
||||
if rphase == "Running" {
|
||||
@@ -363,7 +364,16 @@ func (u *UpdateNodeGPUInfo) Execute(runtime connector.Runtime) error {
|
||||
driverVersion = st.LibraryVersion
|
||||
}
|
||||
|
||||
return UpdateNodeGpuLabel(context.Background(), client.Kubernetes(), &driverVersion, &st.CudaVersion, &supported)
|
||||
// TODO:
|
||||
gpuType := NvidiaCardType
|
||||
switch {
|
||||
case runtime.GetSystemInfo().IsAmdApu():
|
||||
gpuType = AmdApuCardType
|
||||
case runtime.GetSystemInfo().IsGB10Chip():
|
||||
gpuType = GB10ChipType
|
||||
}
|
||||
|
||||
return UpdateNodeGpuLabel(context.Background(), client.Kubernetes(), &driverVersion, &st.CudaVersion, &supported, &gpuType)
|
||||
}
|
||||
|
||||
type RemoveNodeLabels struct {
|
||||
@@ -376,12 +386,12 @@ func (u *RemoveNodeLabels) Execute(runtime connector.Runtime) error {
|
||||
return errors.Wrap(errors.WithStack(err), "kubeclient create error")
|
||||
}
|
||||
|
||||
return UpdateNodeGpuLabel(context.Background(), client.Kubernetes(), nil, nil, nil)
|
||||
return UpdateNodeGpuLabel(context.Background(), client.Kubernetes(), nil, nil, nil, nil)
|
||||
}
|
||||
|
||||
// update k8s node labels gpu.bytetrade.io/driver and gpu.bytetrade.io/cuda.
|
||||
// if the labels do not exist, create them.
|
||||
func UpdateNodeGpuLabel(ctx context.Context, client kubernetes.Interface, driver, cuda *string, supported *string) error {
|
||||
func UpdateNodeGpuLabel(ctx context.Context, client kubernetes.Interface, driver, cuda *string, supported *string, gpuType *string) error {
|
||||
// get node name from hostname
|
||||
nodeName, err := os.Hostname()
|
||||
if err != nil {
|
||||
@@ -408,6 +418,7 @@ func UpdateNodeGpuLabel(ctx context.Context, client kubernetes.Interface, driver
|
||||
{GpuDriverLabel, driver},
|
||||
{GpuCudaLabel, cuda},
|
||||
{GpuCudaSupportedLabel, supported},
|
||||
{GpuType, gpuType},
|
||||
} {
|
||||
old, ok := labels[label.key]
|
||||
switch {
|
||||
|
||||
@@ -8,4 +8,13 @@ var (
|
||||
GpuDriverLabel = GpuLabelGroup + "/driver"
|
||||
GpuCudaLabel = GpuLabelGroup + "/cuda"
|
||||
GpuCudaSupportedLabel = GpuLabelGroup + "/cuda-supported"
|
||||
GpuType = GpuLabelGroup + "/type"
|
||||
)
|
||||
|
||||
// Values written to the GPU type node label.
const (
	// NvidiaCardType marks an NVIDIA card; these are handled by HAMi.
	NvidiaCardType = "nvidia"
	// AmdGpuCardType marks an AMD GPU.
	AmdGpuCardType = "amd-gpu"
	// AmdApuCardType marks an AMD APU with integrated GPU (AI Max 395 etc.).
	AmdApuCardType = "amd-apu"
	// GB10ChipType marks an NVIDIA GB10 Superchip with unified system memory.
	GB10ChipType = "nvidia-gb10"
	// StrixHaloChipType marks an AMD Strix Halo GPU with unified system memory.
	StrixHaloChipType = "strix-halo"
)
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
package cluster
|
||||
|
||||
import (
|
||||
"github.com/beclab/Olares/cli/pkg/amdgpu"
|
||||
"github.com/beclab/Olares/cli/pkg/common"
|
||||
"github.com/beclab/Olares/cli/pkg/core/module"
|
||||
"github.com/beclab/Olares/cli/pkg/gpu"
|
||||
@@ -58,6 +59,12 @@ func (l *linuxInstallPhaseBuilder) installGpuPlugin() phase {
|
||||
return []module.Module{
|
||||
&gpu.RestartK3sServiceModule{Skip: !(l.runtime.Arg.Kubetype == common.K3s)},
|
||||
&gpu.InstallPluginModule{Skip: skipGpuPlugin},
|
||||
&amdgpu.InstallAmdPluginModule{Skip: func() bool {
|
||||
if l.runtime.GetSystemInfo().IsAmdGPUOrAPU() {
|
||||
return false
|
||||
}
|
||||
return true
|
||||
}()},
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -83,6 +83,13 @@ func (l *linuxPhaseBuilder) build() []module.Module {
|
||||
addModule(gpuModuleBuilder(func() []module.Module {
|
||||
return []module.Module{
|
||||
&amdgpu.InstallAmdRocmModule{},
|
||||
&amdgpu.InstallAmdContainerToolkitModule{Skip: func() bool {
|
||||
if l.runtime.GetSystemInfo().IsAmdGPUOrAPU() {
|
||||
return false
|
||||
}
|
||||
return true
|
||||
}(),
|
||||
},
|
||||
&gpu.InstallDriversModule{
|
||||
ManifestModule: manifest.ManifestModule{
|
||||
Manifest: l.manifestMap,
|
||||
|
||||
@@ -9,7 +9,6 @@ import (
|
||||
"github.com/beclab/Olares/cli/pkg/core/connector"
|
||||
"github.com/beclab/Olares/cli/pkg/core/logger"
|
||||
"github.com/beclab/Olares/cli/pkg/core/task"
|
||||
"github.com/beclab/Olares/cli/pkg/utils"
|
||||
)
|
||||
|
||||
type WelcomeMessage struct {
|
||||
@@ -73,7 +72,7 @@ func (t *WelcomeMessage) Execute(runtime connector.Runtime) error {
|
||||
|
||||
// If AMD GPU on Ubuntu 22.04/24.04, print warning about reboot for ROCm
|
||||
if si := runtime.GetSystemInfo(); si.IsUbuntu() && (si.IsUbuntuVersionEqual(connector.Ubuntu2204) || si.IsUbuntuVersionEqual(connector.Ubuntu2404)) {
|
||||
if hasAmd, _ := utils.HasAmdIGPU(runtime); hasAmd {
|
||||
if hasAmd, _ := connector.HasAmdAPUOrGPU(runtime); hasAmd {
|
||||
logger.Warnf("\x1b[31mWarning: To enable ROCm, please reboot your machine after activation.\x1b[0m")
|
||||
fmt.Println()
|
||||
}
|
||||
|
||||
@@ -380,7 +380,7 @@ func (a *upgradeGPUDriverIfNeeded) Execute(runtime connector.Runtime) error {
|
||||
if err != nil {
|
||||
return errors.Wrap(errors.WithStack(err), "kubeclient create error")
|
||||
}
|
||||
err = gpu.UpdateNodeGpuLabel(context.Background(), client.Kubernetes(), &targetDriverVersionStr, ptr.To(common.CurrentVerifiedCudaVersion), ptr.To("true"))
|
||||
err = gpu.UpdateNodeGpuLabel(context.Background(), client.Kubernetes(), &targetDriverVersionStr, ptr.To(common.CurrentVerifiedCudaVersion), ptr.To("true"), ptr.To(gpu.NvidiaCardType))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
@@ -1,67 +0,0 @@
|
||||
package utils
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"strings"
|
||||
|
||||
"github.com/Masterminds/semver/v3"
|
||||
"github.com/beclab/Olares/cli/pkg/core/connector"
|
||||
)
|
||||
|
||||
func HasAmdIGPU(execRuntime connector.Runtime) (bool, error) {
|
||||
// Detect by CPU model names that bundle AMD AI NPU/graphics
|
||||
targets := []string{
|
||||
"AMD Ryzen AI Max+ 395",
|
||||
"AMD Ryzen AI Max 390",
|
||||
"AMD Ryzen AI Max 385",
|
||||
"AMD Ryzen AI 9 HX 375",
|
||||
"AMD Ryzen AI 9 HX 370",
|
||||
"AMD Ryzen AI 9 365",
|
||||
}
|
||||
// try lscpu first: extract 'Model name' field
|
||||
out, err := execRuntime.GetRunner().SudoCmd("lscpu 2>/dev/null | awk -F': *' '/^Model name/{print $2; exit}' || true", false, false)
|
||||
if err != nil {
|
||||
return false, err
|
||||
}
|
||||
if out != "" {
|
||||
lo := strings.ToLower(strings.TrimSpace(out))
|
||||
for _, t := range targets {
|
||||
if strings.Contains(lo, strings.ToLower(t)) {
|
||||
return true, nil
|
||||
}
|
||||
}
|
||||
}
|
||||
// fallback to /proc/cpuinfo
|
||||
out, err = execRuntime.GetRunner().SudoCmd("awk -F': *' '/^model name/{print $2; exit}' /proc/cpuinfo 2>/dev/null || true", false, false)
|
||||
if err != nil {
|
||||
return false, err
|
||||
}
|
||||
if out != "" {
|
||||
lo := strings.ToLower(strings.TrimSpace(out))
|
||||
for _, t := range targets {
|
||||
if strings.Contains(lo, strings.ToLower(t)) {
|
||||
return true, nil
|
||||
}
|
||||
}
|
||||
}
|
||||
return false, nil
|
||||
}
|
||||
|
||||
func RocmVersion() (*semver.Version, error) {
|
||||
const rocmVersionFile = "/opt/rocm/.info/version"
|
||||
data, err := os.ReadFile(rocmVersionFile)
|
||||
if err != nil {
|
||||
// no ROCm installed, nothing to check
|
||||
if os.IsNotExist(err) {
|
||||
return nil, err
|
||||
}
|
||||
return nil, err
|
||||
}
|
||||
curStr := strings.TrimSpace(string(data))
|
||||
cur, err := semver.NewVersion(curStr)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("invalid rocm version: %s", curStr)
|
||||
}
|
||||
return cur, nil
|
||||
}
|
||||
@@ -1,7 +1,7 @@
|
||||
project_name: olaresd
|
||||
builds:
|
||||
- env:
|
||||
- CGO_ENABLED=0
|
||||
- CGO_ENABLED=1
|
||||
# - CC=aarch64-linux-gnu-gcc
|
||||
# - CXX=aarch64-linux-gnu-g++
|
||||
main: ./cmd/terminusd/main.go
|
||||
@@ -17,6 +17,12 @@ builds:
|
||||
goamd64: v1
|
||||
env:
|
||||
- CGO_ENABLED=1
|
||||
- goarch: arm64
|
||||
goos: linux
|
||||
env:
|
||||
- CGO_ENABLED=1
|
||||
- CC=aarch64-linux-gnu-gcc
|
||||
- CXX=aarch64-linux-gnu-g++
|
||||
tags:
|
||||
containers_image_openpgp
|
||||
ldflags:
|
||||
|
||||
@@ -20,6 +20,9 @@ build: fmt vet ;$(info $(M)...Begin to build terminusd.) @
|
||||
build-linux: fmt vet ;$(info $(M)...Begin to build terminusd (linux version).) @
|
||||
CGO_ENABLED=1 GOOS=linux GOARCH=amd64 go build -o bin/olaresd cmd/terminusd/main.go
|
||||
|
||||
build-arm: fmt vet ;$(info $(M)...Begin to build terminusd (linux version).) @
|
||||
CGO_ENABLED=1 GOOS=linux GOARCH=arm64 go build -o bin/olaresd cmd/terminusd/main.go
|
||||
|
||||
build-linux-in-docker:
|
||||
docker run -it --platform linux/amd64 --rm \
|
||||
-v $(current_dir):/olaresd \
|
||||
@@ -27,3 +30,11 @@ build-linux-in-docker:
|
||||
-e DEBIAN_FRONTEND=noninteractive \
|
||||
golang:1.24.11 \
|
||||
sh -c "apt-get -y update; apt-get -y install libudev-dev libpcap-dev; make build-linux"
|
||||
|
||||
build-arm-in-docker:
|
||||
docker run -it --platform linux/arm64 --rm \
|
||||
-v $(current_dir):/olaresd \
|
||||
-w /olaresd \
|
||||
-e DEBIAN_FRONTEND=noninteractive \
|
||||
golang:1.24.11 \
|
||||
sh -c "apt-get -y update; apt-get -y install libudev-dev libpcap-dev; make build-arm"
|
||||
|
||||
@@ -31,7 +31,7 @@ require (
|
||||
github.com/jaypipes/ghw v0.13.0
|
||||
github.com/jochenvg/go-udev v0.0.0-20171110120927-d6b62d56d37b
|
||||
github.com/joho/godotenv v1.5.1
|
||||
github.com/klauspost/cpuid/v2 v2.2.8
|
||||
github.com/klauspost/cpuid/v2 v2.3.0
|
||||
github.com/labstack/echo/v4 v4.0.0-00010101000000-000000000000
|
||||
github.com/libp2p/go-netroute v0.2.2
|
||||
github.com/mackerelio/go-osstat v0.2.5
|
||||
|
||||
@@ -205,8 +205,8 @@ github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI
|
||||
github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck=
|
||||
github.com/klauspost/compress v1.18.0 h1:c/Cqfb0r+Yi+JtIEq73FWXVkRonBlf0CRNYc8Zttxdo=
|
||||
github.com/klauspost/compress v1.18.0/go.mod h1:2Pp+KzxcywXVXMr50+X0Q/Lsb43OQHYWRCY2AiWywWQ=
|
||||
github.com/klauspost/cpuid/v2 v2.2.8 h1:+StwCXwm9PdpiEkPyzBXIy+M9KUb4ODm0Zarf1kS5BM=
|
||||
github.com/klauspost/cpuid/v2 v2.2.8/go.mod h1:Lcz8mBdAVJIBVzewtcLocK12l3Y+JytZYpaMropDUws=
|
||||
github.com/klauspost/cpuid/v2 v2.3.0 h1:S4CRMLnYUhGeDFDqkGriYKdfoFlDnMtqTiI/sFzhA9Y=
|
||||
github.com/klauspost/cpuid/v2 v2.3.0/go.mod h1:hqwkgyIinND0mEev00jJYCxPNVRVXFQeu1XKlok6oO0=
|
||||
github.com/konsorten/go-windows-terminal-sequences v1.0.1/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ=
|
||||
github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE=
|
||||
github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk=
|
||||
@@ -471,7 +471,6 @@ golang.org/x/sys v0.0.0-20211025201205-69cdffdb9359/go.mod h1:oPkhp1MJrh7nUepCBc
|
||||
golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
golang.org/x/sys v0.0.0-20220908164124-27713097b956/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
golang.org/x/sys v0.2.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
golang.org/x/sys v0.10.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
golang.org/x/sys v0.38.0 h1:3yZWxaJjBmCWXqhN1qh02AkOnCQ1poK6oF+a7xWL6Gc=
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
//go:build !(linux && amd64)
|
||||
// +build !linux !amd64
|
||||
//go:build !linux
|
||||
// +build !linux
|
||||
|
||||
package intranet
|
||||
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
//go:build linux && amd64
|
||||
// +build linux,amd64
|
||||
//go:build linux
|
||||
// +build linux
|
||||
|
||||
package intranet
|
||||
|
||||
|
||||
@@ -42,7 +42,7 @@ func detectdStorageDevices(ctx context.Context, bus string) (usbDevs []storageDe
|
||||
for _, d := range ds {
|
||||
if d.Properties()["ID_BUS"] == bus {
|
||||
usbs = append(usbs, d)
|
||||
} else if d.Properties()["ID_BUS"] == "ata" &&
|
||||
} else if (d.Properties()["ID_BUS"] == "ata" || d.Properties()["ID_BUS"] == "scsi") &&
|
||||
d.Properties()["ID_USB_TYPE"] == "disk" &&
|
||||
bus == "usb" {
|
||||
usbs = append(usbs, d)
|
||||
@@ -97,14 +97,18 @@ func detectdStorageDevices(ctx context.Context, bus string) (usbDevs []storageDe
|
||||
|
||||
idSerial := device.Properties()["ID_SERIAL"]
|
||||
idSerialShort := device.Properties()["ID_SERIAL_SHORT"]
|
||||
idUsbSerial := device.Properties()["ID_USB_SERIAL"]
|
||||
idUsbSerialShort := device.Properties()["ID_USB_SERIAL_SHORT"]
|
||||
partUUID := device.Properties()["ID_PART_ENTRY_UUID"]
|
||||
|
||||
usbDevs = append(usbDevs, storageDevice{
|
||||
DevPath: devPath,
|
||||
Vender: vender,
|
||||
IDSerial: idSerial,
|
||||
IDSerialShort: idSerialShort,
|
||||
PartitionUUID: partUUID,
|
||||
DevPath: devPath,
|
||||
Vender: vender,
|
||||
IDSerial: idSerial,
|
||||
IDSerialShort: idSerialShort,
|
||||
IDUsbSerial: idUsbSerial,
|
||||
IDUsbSerialShort: idUsbSerialShort,
|
||||
PartitionUUID: partUUID,
|
||||
})
|
||||
}
|
||||
|
||||
@@ -199,7 +203,10 @@ func MountedHddPath(ctx context.Context) ([]string, error) {
|
||||
|
||||
func FilterBySerial(serial string) func(dev storageDevice) bool {
|
||||
return func(dev storageDevice) bool {
|
||||
return strings.HasSuffix(serial, dev.IDSerial) || strings.HasSuffix(serial, dev.IDSerialShort)
|
||||
return strings.HasSuffix(serial, dev.IDSerial) ||
|
||||
strings.HasSuffix(serial, dev.IDSerialShort) ||
|
||||
strings.HasSuffix(serial, dev.IDUsbSerial) ||
|
||||
strings.HasSuffix(serial, dev.IDUsbSerialShort)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -3,11 +3,13 @@ package utils
|
||||
import "strings"
|
||||
|
||||
// storageDevice describes one storage device entry collected by
// detectdStorageDevices.
type storageDevice struct {
	DevPath string
	Vender  string

	// Serial identifiers, copied from the device properties of the same
	// name. Any of them may be empty when the property is not set.
	IDSerial         string // "ID_SERIAL"
	IDSerialShort    string // "ID_SERIAL_SHORT"
	IDUsbSerial      string // "ID_USB_SERIAL"
	IDUsbSerialShort string // "ID_USB_SERIAL_SHORT"

	PartitionUUID string // "ID_PART_ENTRY_UUID"
}
|
||||
|
||||
type mountedPath struct {
|
||||
|
||||
@@ -4,7 +4,7 @@ nameOverride: ""
|
||||
fullnameOverride: ""
|
||||
namespaceOverride: ""
|
||||
imagePullSecrets: []
|
||||
version: "v2.6.10"
|
||||
version: "v2.6.11"
|
||||
|
||||
# Nvidia GPU Parameters
|
||||
resourceName: "nvidia.com/gpu"
|
||||
|
||||
@@ -0,0 +1,40 @@
|
||||
apiVersion: apps/v1
|
||||
kind: DaemonSet
|
||||
metadata:
|
||||
name: amdgpu-device-plugin
|
||||
namespace: kube-system
|
||||
spec:
|
||||
selector:
|
||||
matchLabels:
|
||||
name: amdgpu-dp-ds
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
name: amdgpu-dp-ds
|
||||
spec:
|
||||
restartPolicy: Always
|
||||
nodeSelector:
|
||||
kubernetes.io/arch: amd64
|
||||
priorityClassName: system-node-critical
|
||||
tolerations:
|
||||
- key: CriticalAddonsOnly
|
||||
operator: Exists
|
||||
containers:
|
||||
- image: rocm/k8s-device-plugin
|
||||
name: amdgpu-dp-cntr
|
||||
securityContext:
|
||||
privileged: true
|
||||
capabilities:
|
||||
drop: ["ALL"]
|
||||
volumeMounts:
|
||||
- name: dp
|
||||
mountPath: /var/lib/kubelet/device-plugins
|
||||
- name: sys
|
||||
mountPath: /sys
|
||||
volumes:
|
||||
- name: dp
|
||||
hostPath:
|
||||
path: /var/lib/kubelet/device-plugins
|
||||
- name: sys
|
||||
hostPath:
|
||||
path: /sys
|
||||
@@ -3,7 +3,7 @@ target: prebuilt
|
||||
output:
|
||||
containers:
|
||||
-
|
||||
name: beclab/hami:v2.6.10
|
||||
name: beclab/hami:v2.6.11
|
||||
-
|
||||
name: beclab/hami-webui-fe-oss:v1.0.8
|
||||
-
|
||||
|
||||
Reference in New Issue
Block a user