Compare commits

...

1 Commits

Author SHA1 Message Date
hys
a4f6045a90 cli: feat amdgpu install 2026-01-21 14:59:13 +08:00
13 changed files with 444 additions and 6 deletions

View File

@@ -0,0 +1,21 @@
package amdgpu
import (
"log"
"github.com/beclab/Olares/cli/pkg/pipelines"
"github.com/spf13/cobra"
)
// NewCmdAmdGpuInstall builds the "install" subcommand. Running it starts the
// AmdGpuInstall pipeline; a pipeline error is logged and terminates the
// process through log.Fatalf.
func NewCmdAmdGpuInstall() *cobra.Command {
	run := func(_ *cobra.Command, _ []string) {
		err := pipelines.AmdGpuInstall()
		if err == nil {
			return
		}
		log.Fatalf("error: %v", err)
	}

	return &cobra.Command{
		Use:   "install",
		Short: "Install AMD ROCm stack via amdgpu-install",
		Run:   run,
	}
}

View File

@@ -0,0 +1,16 @@
package amdgpu
import "github.com/spf13/cobra"
// NewCmdAmdGpu builds the "amdgpu" parent command and attaches its install,
// uninstall and status subcommands, in that order.
func NewCmdAmdGpu() *cobra.Command {
	root := &cobra.Command{
		Use:   "amdgpu",
		Short: "Manage AMD GPU ROCm stack",
	}
	root.AddCommand(
		NewCmdAmdGpuInstall(),
		NewCmdAmdGpuUninstall(),
		NewCmdAmdGpuStatus(),
	)
	return root
}

View File

@@ -0,0 +1,21 @@
package amdgpu
import (
"log"
"github.com/beclab/Olares/cli/pkg/pipelines"
"github.com/spf13/cobra"
)
// NewCmdAmdGpuStatus builds the "status" subcommand. Running it calls
// pipelines.AmdGpuStatus; an error is logged and terminates the process
// through log.Fatalf.
func NewCmdAmdGpuStatus() *cobra.Command {
	statusCmd := new(cobra.Command)
	statusCmd.Use = "status"
	statusCmd.Short = "Show AMD GPU driver and ROCm status"
	statusCmd.Run = func(_ *cobra.Command, _ []string) {
		err := pipelines.AmdGpuStatus()
		if err != nil {
			log.Fatalf("error: %v", err)
		}
	}
	return statusCmd
}

View File

@@ -0,0 +1,21 @@
package amdgpu
import (
"log"
"github.com/beclab/Olares/cli/pkg/pipelines"
"github.com/spf13/cobra"
)
// NewCmdAmdGpuUninstall builds the "uninstall" subcommand. Running it starts
// the AmdGpuUninstall pipeline; a pipeline error is logged and terminates the
// process through log.Fatalf.
func NewCmdAmdGpuUninstall() *cobra.Command {
	uninstall := func(_ *cobra.Command, _ []string) {
		if failure := pipelines.AmdGpuUninstall(); failure != nil {
			log.Fatalf("error: %v", failure)
		}
	}

	uninstallCmd := &cobra.Command{
		Use:   "uninstall",
		Short: "Uninstall AMD ROCm stack via amdgpu-install",
	}
	uninstallCmd.Run = uninstall
	return uninstallCmd
}

View File

@@ -1,6 +1,7 @@
package ctl
import (
"github.com/beclab/Olares/cli/cmd/ctl/amdgpu"
"github.com/beclab/Olares/cli/cmd/ctl/disk"
"github.com/beclab/Olares/cli/cmd/ctl/gpu"
"github.com/beclab/Olares/cli/cmd/ctl/node"
@@ -33,6 +34,7 @@ func NewDefaultCommand() *cobra.Command {
cmds.AddCommand(os.NewOSCommands()...)
cmds.AddCommand(node.NewNodeCommand())
cmds.AddCommand(gpu.NewCmdGpu())
cmds.AddCommand(amdgpu.NewCmdAmdGpu())
cmds.AddCommand(user.NewUserCommand())
cmds.AddCommand(disk.NewDiskCommand())

133
cli/pkg/amdgpu/tasks.go Normal file
View File

@@ -0,0 +1,133 @@
package amdgpu
import (
"fmt"
"os/exec"
"path"
"path/filepath"
"github.com/beclab/Olares/cli/pkg/common"
cc "github.com/beclab/Olares/cli/pkg/core/common"
"github.com/beclab/Olares/cli/pkg/core/connector"
"github.com/beclab/Olares/cli/pkg/core/logger"
"github.com/beclab/Olares/cli/pkg/core/task"
"github.com/beclab/Olares/cli/pkg/utils"
"github.com/Masterminds/semver/v3"
"github.com/pkg/errors"
)
// InstallAmdRocmModule is the pipeline module that schedules the
// InstallAmdRocm action on the hosts holding the master role.
type InstallAmdRocmModule struct {
	common.KubeModule
}

// Init names the module and registers its single, non-parallel remote task.
func (m *InstallAmdRocmModule) Init() {
	m.Name = "InstallAMDGPU"
	m.Tasks = []task.Interface{
		&task.RemoteTask{
			Name:     "InstallAmdRocm",
			Hosts:    m.Runtime.GetHostsByRole(common.Master),
			Action:   new(InstallAmdRocm), // the action needs no manifest
			Parallel: false,
			Retry:    1,
		},
	}
}
// InstallAmdRocm installs ROCm using amdgpu-install on Ubuntu 22.04/24.04 for AMD GPUs.
type InstallAmdRocm struct {
	common.KubeAction
}

// Execute installs the ROCm stack through amdgpu-install.
//
// It returns nil without installing anything when the host is not Ubuntu
// 22.04/24.04, when utils.HasAmdIGPU reports no supported AMD GPU, or when a
// ROCm release equal to or newer than 7.1.1 is already present. An older ROCm
// release is reported as an error so the user uninstalls it first.
func (t *InstallAmdRocm) Execute(runtime connector.Runtime) error {
	si := runtime.GetSystemInfo()
	if !si.IsLinux() || !si.IsUbuntu() || !(si.IsUbuntuVersionEqual(connector.Ubuntu2204) || si.IsUbuntuVersionEqual(connector.Ubuntu2404)) {
		return nil
	}

	amdGPUExists, err := utils.HasAmdIGPU(runtime)
	if err != nil {
		return err
	}
	// skip rocm install
	if !amdGPUExists {
		return nil
	}

	// The error is ignored on purpose: any failure to read or parse the
	// version leaves rocmV nil, which is handled like "ROCm not installed"
	// and lets the installation proceed.
	rocmV, _ := utils.RocmVersion()
	required := semver.MustParse("7.1.1")
	if rocmV != nil {
		switch {
		case rocmV.LessThan(required):
			return fmt.Errorf("detected ROCm version %s, which is lower than required %s; please uninstall existing ROCm/AMDGPU components before installation with command: olares-cli amdgpu uninstall", rocmV.Original(), required.Original())
		case rocmV.GreaterThan(required):
			// Both format verbs need their arguments; without them the log
			// line would print %!s(MISSING) instead of the versions.
			logger.Warnf("Warning: detected ROCm version %s greater than maximum tested version %s", rocmV.Original(), required.Original())
			return nil
		case rocmV.Equal(required):
			logger.Infof("detected ROCm version %s, skip rocm install...", required.Original())
			return nil
		}
	}

	// ensure python3-setuptools and python3-wheel
	// The package index refresh is best-effort; a real problem surfaces in
	// the install command below.
	_, _ = runtime.GetRunner().SudoCmd("apt-get update", false, true)
	checkPkgs := "dpkg -s python3-setuptools python3-wheel >/dev/null 2>&1 || DEBIAN_FRONTEND=noninteractive apt-get install -y python3-setuptools python3-wheel"
	if _, err := runtime.GetRunner().SudoCmd(checkPkgs, false, true); err != nil {
		return errors.Wrap(errors.WithStack(err), "failed to install python3-setuptools and python3-wheel")
	}

	// ensure amdgpu-install exists
	// NOTE(review): exec.LookPath inspects the PATH of the machine running
	// the CLI, while the commands below go through the runtime runner;
	// confirm both always refer to the same host.
	if _, err := exec.LookPath("amdgpu-install"); err != nil {
		var debURL string
		if si.IsUbuntuVersionEqual(connector.Ubuntu2404) {
			debURL = "https://repo.radeon.com/amdgpu-install/7.1.1/ubuntu/noble/amdgpu-install_7.1.1.70101-1_all.deb"
		} else {
			debURL = "https://repo.radeon.com/amdgpu-install/7.1.1/ubuntu/jammy/amdgpu-install_7.1.1.70101-1_all.deb"
		}
		tmpDeb := path.Join(runtime.GetBaseDir(), cc.PackageCacheDir, "gpu", "amdgpu-install_7.1.1.70101-1_all.deb")
		if _, err := runtime.GetRunner().SudoCmd(fmt.Sprintf("install -d -m 0755 %s", filepath.Dir(tmpDeb)), false, true); err != nil {
			return err
		}
		cmd := fmt.Sprintf("sh -c 'wget -O %s %s'", tmpDeb, debURL)
		if _, err := runtime.GetRunner().SudoCmd(cmd, false, true); err != nil {
			return errors.Wrap(errors.WithStack(err), "failed to download amdgpu-install deb")
		}
		if _, err := runtime.GetRunner().SudoCmd(fmt.Sprintf("DEBIAN_FRONTEND=noninteractive apt-get install -y %s", tmpDeb), false, true); err != nil {
			return errors.Wrap(errors.WithStack(err), "failed to install amdgpu-install deb")
		}
	}

	// run installer for rocm usecase
	if _, err := runtime.GetRunner().SudoCmd("amdgpu-install -y --usecase=rocm", false, true); err != nil {
		return errors.Wrap(errors.WithStack(err), "failed to install AMD ROCm via amdgpu-install")
	}

	fmt.Println()
	logger.Warn("Warning: To enable ROCm, please reboot your machine after installation.")
	return nil
}
// AmdgpuInstallAction runs the amdgpu-install ROCm use case without any of
// the platform or version checks.
type AmdgpuInstallAction struct {
	common.KubeAction
}

// Execute runs "amdgpu-install -y --usecase=rocm" through the runner's
// SudoCmd and wraps any failure with a descriptive message.
func (t *AmdgpuInstallAction) Execute(runtime connector.Runtime) error {
	const installCmd = "amdgpu-install -y --usecase=rocm"

	_, runErr := runtime.GetRunner().SudoCmd(installCmd, false, true)
	if runErr == nil {
		return nil
	}
	return errors.Wrap(errors.WithStack(runErr), "failed to install AMD ROCm via amdgpu-install")
}
// AmdgpuUninstallAction removes the ROCm stack using amdgpu-install.
type AmdgpuUninstallAction struct {
	common.KubeAction
}

// Execute runs "amdgpu-install --uninstall -y" through the runner's SudoCmd.
// On success it prints a blank line and logs a reminder to reboot; on failure
// it returns the wrapped error.
func (t *AmdgpuUninstallAction) Execute(runtime connector.Runtime) error {
	const uninstallCmd = "amdgpu-install --uninstall -y"

	_, runErr := runtime.GetRunner().SudoCmd(uninstallCmd, false, true)
	if runErr != nil {
		return errors.Wrap(errors.WithStack(runErr), "failed to uninstall AMD ROCm via amdgpu-install")
	}

	fmt.Println()
	logger.Warn("Warning: Please reboot your machine after uninstall to fully remove ROCm components.")
	return nil
}

View File

@@ -81,6 +81,7 @@ func (m *RunPrechecksModule) Init() {
new(NvidiaCardArchChecker),
new(NouveauChecker),
new(CudaChecker),
new(RocmChecker),
}
runPreChecks := &task.LocalTask{
Name: "RunPrechecks",

View File

@@ -372,6 +372,48 @@ func (c *CudaChecker) Check(runtime connector.Runtime) error {
return nil
}
// RocmChecker is the precheck that validates the installed AMD ROCm version
// for AMD GPUs on Ubuntu 22.04/24.04 only.
type RocmChecker struct{}

// Name returns the label identifying this checker.
func (r *RocmChecker) Name() string {
	const checkerName = "ROCm"
	return checkerName
}
// Check fails when an installed ROCm release is older than 7.1.1.
//
// It passes (returns nil) when the host is not Ubuntu 22.04/24.04 on Linux,
// when utils.HasAmdIGPU finds no supported AMD GPU, or when the ROCm version
// file does not exist. Any other error from the GPU probe or the version
// lookup is returned as is.
func (r *RocmChecker) Check(runtime connector.Runtime) error {
	si := runtime.GetSystemInfo()
	if !si.IsLinux() {
		return nil
	}

	supportedUbuntu := si.IsUbuntu() &&
		(si.IsUbuntuVersionEqual(connector.Ubuntu2204) || si.IsUbuntuVersionEqual(connector.Ubuntu2404))
	if !supportedUbuntu {
		return nil
	}

	// Without a supported AMD GPU there is nothing to validate.
	hasGPU, err := utils.HasAmdIGPU(runtime)
	if err != nil {
		return err
	}
	if !hasGPU {
		return nil
	}

	installed, err := utils.RocmVersion()
	switch {
	case err == nil:
		// version available, compare it below
	case os.IsNotExist(err):
		// ROCm is not installed, so there is no version to reject
		return nil
	default:
		return err
	}

	required := semver.MustParse("7.1.1")
	if !installed.LessThan(required) {
		return nil
	}
	return fmt.Errorf("detected ROCm version %s, which is lower than required %s; please uninstall existing ROCm/AMDGPU components before installation with command: olares-cli amdgpu uninstall", installed.Original(), required.Original())
}
//////////////////////////////////////////////
// precheck - task

View File

@@ -51,10 +51,12 @@ func (d DebianVersion) String() string {
}
const (
Ubuntu20 UbuntuVersion = "20."
Ubuntu22 UbuntuVersion = "22."
Ubuntu24 UbuntuVersion = "24."
Ubuntu25 UbuntuVersion = "25."
Ubuntu20 UbuntuVersion = "20."
Ubuntu22 UbuntuVersion = "22."
Ubuntu24 UbuntuVersion = "24."
Ubuntu25 UbuntuVersion = "25."
Ubuntu2204 UbuntuVersion = "22.04"
Ubuntu2404 UbuntuVersion = "24.04"
Debian9 DebianVersion = "9"
Debian10 DebianVersion = "10"

View File

@@ -3,8 +3,7 @@ package system
import (
"strings"
"github.com/beclab/Olares/cli/pkg/gpu"
"github.com/beclab/Olares/cli/pkg/amdgpu"
"github.com/beclab/Olares/cli/pkg/bootstrap/os"
"github.com/beclab/Olares/cli/pkg/bootstrap/patch"
"github.com/beclab/Olares/cli/pkg/bootstrap/precheck"
@@ -12,6 +11,7 @@ import (
"github.com/beclab/Olares/cli/pkg/container"
"github.com/beclab/Olares/cli/pkg/core/module"
"github.com/beclab/Olares/cli/pkg/daemon"
"github.com/beclab/Olares/cli/pkg/gpu"
"github.com/beclab/Olares/cli/pkg/images"
"github.com/beclab/Olares/cli/pkg/k3s"
"github.com/beclab/Olares/cli/pkg/manifest"
@@ -82,6 +82,7 @@ func (l *linuxPhaseBuilder) build() []module.Module {
addModule(&terminus.WriteReleaseFileModule{}).
addModule(gpuModuleBuilder(func() []module.Module {
return []module.Module{
&amdgpu.InstallAmdRocmModule{},
&gpu.InstallDriversModule{
ManifestModule: manifest.ManifestModule{
Manifest: l.manifestMap,

101
cli/pkg/pipelines/amdgpu.go Normal file
View File

@@ -0,0 +1,101 @@
package pipelines
import (
"strings"
"github.com/beclab/Olares/cli/pkg/amdgpu"
"github.com/beclab/Olares/cli/pkg/common"
"github.com/beclab/Olares/cli/pkg/core/action"
"github.com/beclab/Olares/cli/pkg/core/connector"
"github.com/beclab/Olares/cli/pkg/core/logger"
"github.com/beclab/Olares/cli/pkg/core/module"
"github.com/beclab/Olares/cli/pkg/core/pipeline"
"github.com/beclab/Olares/cli/pkg/core/task"
)
// singleTaskModule adapts one action into a module that holds exactly one
// local task; name is used for both the module and the task.
type singleTaskModule struct {
	common.KubeModule
	name string
	act  action.Action
}

// Init names the module and installs the single local task wrapping act.
func (m *singleTaskModule) Init() {
	only := &task.LocalTask{Name: m.name, Action: m.act}

	m.Name = m.name
	m.Tasks = []task.Interface{only}
}
// AmdGpuInstall creates an all-in-one runtime with console logging to
// "amdgpuinstall.log" and starts a pipeline made of the InstallAmdRocmModule.
// It returns the runtime creation error or the result of the pipeline.
func AmdGpuInstall() error {
	installArg := common.NewArgument()
	installArg.SetConsoleLog("amdgpuinstall.log", true)

	rt, err := common.NewKubeRuntime(common.AllInOne, *installArg)
	if err != nil {
		return err
	}

	modules := []module.Module{
		&amdgpu.InstallAmdRocmModule{},
	}
	installPipeline := &pipeline.Pipeline{
		Name:    "InstallAMDGPUDrivers",
		Runtime: rt,
		Modules: modules,
	}
	return installPipeline.Start()
}
// AmdGpuUninstall creates an all-in-one runtime with console logging to
// "amdgpuuninstall.log" and starts a pipeline whose only module runs the
// AmdgpuUninstallAction as a local task. It returns the runtime creation
// error or the result of the pipeline.
func AmdGpuUninstall() error {
	uninstallArg := common.NewArgument()
	uninstallArg.SetConsoleLog("amdgpuuninstall.log", true)

	rt, err := common.NewKubeRuntime(common.AllInOne, *uninstallArg)
	if err != nil {
		return err
	}

	uninstallModule := &singleTaskModule{
		name: "AmdgpuUninstall",
		act:  new(amdgpu.AmdgpuUninstallAction),
	}
	uninstallPipeline := &pipeline.Pipeline{
		Name:    "UninstallAMDGPUDrivers",
		Runtime: rt,
		Modules: []module.Module{uninstallModule},
	}
	return uninstallPipeline.Start()
}
// AmdGpuStatus logs the detected AMD GPU model, the amdgpu kernel module
// version and the installed ROCm version.
//
// The three probes are best-effort: each shell command ends in "|| true" and
// its error is ignored, so a missing tool or file is reported as "not
// detected" / "unknown" / "not installed" instead of failing the command.
// The only error returned is a failure to create the runtime.
func AmdGpuStatus() error {
	arg := common.NewArgument()
	runtime, err := common.NewKubeRuntime(common.AllInOne, *arg)
	if err != nil {
		return err
	}

	// Replace the runner with one bound to the local host, described by the
	// OS type and architecture taken from the runtime's system info.
	runtime.SetRunner(
		&connector.Runner{
			Host: &connector.BaseHost{
				Name: common.LocalHost,
				Arch: runtime.GetSystemInfo().GetOsArch(),
				Os:   runtime.GetSystemInfo().GetOsType(),
			},
		},
	)

	// Errors are deliberately dropped: an empty result already selects the
	// fallback message below.
	amdModel, _ := runtime.GetRunner().SudoCmd("lspci | grep -iE 'VGA|3D|Display' | grep -iE 'AMD|ATI' | head -1 || true", false, false)
	drvVer, _ := runtime.GetRunner().SudoCmd("modinfo amdgpu 2>/dev/null | awk -F': ' '/^version:/{print $2}' || true", false, false)
	rocmVer, _ := runtime.GetRunner().SudoCmd("cat /opt/rocm/.info/version 2>/dev/null || true", false, false)

	if model := strings.TrimSpace(amdModel); model != "" {
		logger.Infof("AMD GPU: %s", model)
	} else {
		logger.Info("AMD GPU: not detected")
	}

	if driver := strings.TrimSpace(drvVer); driver != "" {
		// Same "AMDGPU driver version:" label as the fallback line, so both
		// outcomes read consistently.
		logger.Infof("AMDGPU driver version: %s", driver)
	} else {
		logger.Info("AMDGPU driver version: unknown")
	}

	if rocm := strings.TrimSpace(rocmVer); rocm != "" {
		logger.Infof("ROCm version: %s", rocm)
	} else {
		logger.Info("ROCm version: not installed")
	}
	return nil
}

View File

@@ -9,6 +9,7 @@ import (
"github.com/beclab/Olares/cli/pkg/core/connector"
"github.com/beclab/Olares/cli/pkg/core/logger"
"github.com/beclab/Olares/cli/pkg/core/task"
"github.com/beclab/Olares/cli/pkg/utils"
)
type WelcomeMessage struct {
@@ -68,6 +69,15 @@ func (t *WelcomeMessage) Execute(runtime connector.Runtime) error {
logger.Infof("Username: %s", t.KubeConf.Arg.User.UserName)
logger.Infof("Password: %s", t.KubeConf.Arg.User.Password)
fmt.Printf("\n------------------------------------------------\n\n\n\n\n")
fmt.Println()
// If AMD GPU on Ubuntu 22.04/24.04, print warning about reboot for ROCm
if si := runtime.GetSystemInfo(); si.IsUbuntu() && (si.IsUbuntuVersionEqual(connector.Ubuntu2204) || si.IsUbuntuVersionEqual(connector.Ubuntu2404)) {
if hasAmd, _ := utils.HasAmdIGPU(runtime); hasAmd {
logger.Warnf("\x1b[31mWarning: To enable ROCm, please reboot your machine after activation.\x1b[0m")
fmt.Println()
}
}
return nil
}

67
cli/pkg/utils/amdgpu.go Normal file
View File

@@ -0,0 +1,67 @@
package utils
import (
"fmt"
"os"
"strings"
"github.com/Masterminds/semver/v3"
"github.com/beclab/Olares/cli/pkg/core/connector"
)
// HasAmdIGPU reports whether the CPU model name of the host matches one of a
// fixed list of AMD Ryzen AI processors. The match is a case-insensitive
// substring test.
//
// The model name is read with lscpu first and from /proc/cpuinfo second; the
// second probe runs only when the first one produced no match. An error from
// either command is returned immediately.
func HasAmdIGPU(execRuntime connector.Runtime) (bool, error) {
	// CPU models that bundle AMD AI NPU/graphics.
	knownModels := [...]string{
		"AMD Ryzen AI Max+ 395",
		"AMD Ryzen AI Max 390",
		"AMD Ryzen AI Max 385",
		"AMD Ryzen AI 9 HX 375",
		"AMD Ryzen AI 9 HX 370",
		"AMD Ryzen AI 9 365",
	}

	// Probes in priority order: lscpu's 'Model name' field, then the first
	// 'model name' entry of /proc/cpuinfo as a fallback.
	probes := [...]string{
		"lscpu 2>/dev/null | awk -F': *' '/^Model name/{print $2; exit}' || true",
		"awk -F': *' '/^model name/{print $2; exit}' /proc/cpuinfo 2>/dev/null || true",
	}

	for _, probe := range probes {
		raw, err := execRuntime.GetRunner().SudoCmd(probe, false, false)
		if err != nil {
			return false, err
		}
		if raw == "" {
			continue
		}
		model := strings.ToLower(strings.TrimSpace(raw))
		for _, known := range knownModels {
			if strings.Contains(model, strings.ToLower(known)) {
				return true, nil
			}
		}
	}
	return false, nil
}
// RocmVersion reads and parses the installed ROCm version from
// /opt/rocm/.info/version.
//
// A failure to read the file is returned unwrapped, so callers can detect
// "ROCm not installed" with os.IsNotExist. Content that is not a valid
// semantic version yields an error that wraps the parser's error.
func RocmVersion() (*semver.Version, error) {
	const rocmVersionFile = "/opt/rocm/.info/version"

	data, err := os.ReadFile(rocmVersionFile)
	if err != nil {
		// Returned as is on purpose: os.IsNotExist does not look through
		// fmt.Errorf wrapping, so adding context here would hide the
		// "not installed" case from callers.
		return nil, err
	}

	curStr := strings.TrimSpace(string(data))
	cur, err := semver.NewVersion(curStr)
	if err != nil {
		return nil, fmt.Errorf("invalid rocm version %q: %w", curStr, err)
	}
	return cur, nil
}