Compare commits
7 Commits
module-bfl
...
module-l4-
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
738198350f | ||
|
|
5de7ea01bc | ||
|
|
ff7765bcb9 | ||
|
|
c98c855099 | ||
|
|
4ae552f33f | ||
|
|
f2aad6d9f6 | ||
|
|
02bc4fafd5 |
@@ -317,7 +317,7 @@ spec:
|
||||
chown -R 1000:1000 /uploadstemp && \
|
||||
chown -R 1000:1000 /appdata
|
||||
- name: olares-app-init
|
||||
image: beclab/system-frontend:v1.7.7
|
||||
image: beclab/system-frontend:v1.8.2
|
||||
imagePullPolicy: IfNotPresent
|
||||
command:
|
||||
- /bin/sh
|
||||
|
||||
21
cli/cmd/ctl/amdgpu/install.go
Normal file
21
cli/cmd/ctl/amdgpu/install.go
Normal file
@@ -0,0 +1,21 @@
|
||||
package amdgpu
|
||||
|
||||
import (
|
||||
"log"
|
||||
|
||||
"github.com/beclab/Olares/cli/pkg/pipelines"
|
||||
"github.com/spf13/cobra"
|
||||
)
|
||||
|
||||
func NewCmdAmdGpuInstall() *cobra.Command {
|
||||
cmd := &cobra.Command{
|
||||
Use: "install",
|
||||
Short: "Install AMD ROCm stack via amdgpu-install",
|
||||
Run: func(cmd *cobra.Command, args []string) {
|
||||
if err := pipelines.AmdGpuInstall(); err != nil {
|
||||
log.Fatalf("error: %v", err)
|
||||
}
|
||||
},
|
||||
}
|
||||
return cmd
|
||||
}
|
||||
16
cli/cmd/ctl/amdgpu/root.go
Normal file
16
cli/cmd/ctl/amdgpu/root.go
Normal file
@@ -0,0 +1,16 @@
|
||||
package amdgpu
|
||||
|
||||
import "github.com/spf13/cobra"
|
||||
|
||||
func NewCmdAmdGpu() *cobra.Command {
|
||||
cmd := &cobra.Command{
|
||||
Use: "amdgpu",
|
||||
Short: "Manage AMD GPU ROCm stack",
|
||||
}
|
||||
cmd.AddCommand(NewCmdAmdGpuInstall())
|
||||
cmd.AddCommand(NewCmdAmdGpuUninstall())
|
||||
cmd.AddCommand(NewCmdAmdGpuStatus())
|
||||
return cmd
|
||||
}
|
||||
|
||||
|
||||
21
cli/cmd/ctl/amdgpu/status.go
Normal file
21
cli/cmd/ctl/amdgpu/status.go
Normal file
@@ -0,0 +1,21 @@
|
||||
package amdgpu
|
||||
|
||||
import (
|
||||
"log"
|
||||
|
||||
"github.com/beclab/Olares/cli/pkg/pipelines"
|
||||
"github.com/spf13/cobra"
|
||||
)
|
||||
|
||||
func NewCmdAmdGpuStatus() *cobra.Command {
|
||||
cmd := &cobra.Command{
|
||||
Use: "status",
|
||||
Short: "Show AMD GPU driver and ROCm status",
|
||||
Run: func(cmd *cobra.Command, args []string) {
|
||||
if err := pipelines.AmdGpuStatus(); err != nil {
|
||||
log.Fatalf("error: %v", err)
|
||||
}
|
||||
},
|
||||
}
|
||||
return cmd
|
||||
}
|
||||
21
cli/cmd/ctl/amdgpu/uninstall.go
Normal file
21
cli/cmd/ctl/amdgpu/uninstall.go
Normal file
@@ -0,0 +1,21 @@
|
||||
package amdgpu
|
||||
|
||||
import (
|
||||
"log"
|
||||
|
||||
"github.com/beclab/Olares/cli/pkg/pipelines"
|
||||
"github.com/spf13/cobra"
|
||||
)
|
||||
|
||||
func NewCmdAmdGpuUninstall() *cobra.Command {
|
||||
cmd := &cobra.Command{
|
||||
Use: "uninstall",
|
||||
Short: "Uninstall AMD ROCm stack via amdgpu-install",
|
||||
Run: func(cmd *cobra.Command, args []string) {
|
||||
if err := pipelines.AmdGpuUninstall(); err != nil {
|
||||
log.Fatalf("error: %v", err)
|
||||
}
|
||||
},
|
||||
}
|
||||
return cmd
|
||||
}
|
||||
@@ -3,6 +3,7 @@ package os
|
||||
import (
|
||||
"archive/tar"
|
||||
"compress/gzip"
|
||||
"context"
|
||||
"fmt"
|
||||
"io"
|
||||
"log"
|
||||
@@ -13,6 +14,9 @@ import (
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
corev1 "k8s.io/api/core/v1"
|
||||
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
||||
"k8s.io/client-go/kubernetes"
|
||||
"k8s.io/client-go/rest"
|
||||
"k8s.io/client-go/tools/clientcmd"
|
||||
ctrl "sigs.k8s.io/controller-runtime"
|
||||
@@ -276,7 +280,7 @@ func collectSystemdLogs(tw *tar.Writer, options *LogCollectOptions) error {
|
||||
}
|
||||
|
||||
func collectDmesgLogs(tw *tar.Writer, options *LogCollectOptions) error {
|
||||
cmd := exec.Command("dmesg")
|
||||
cmd := exec.Command("dmesg -T")
|
||||
output, err := cmd.Output()
|
||||
if err != nil {
|
||||
return err
|
||||
@@ -399,6 +403,126 @@ func collectKubernetesLogs(tw *tar.Writer, options *LogCollectOptions) error {
|
||||
}
|
||||
}
|
||||
|
||||
if err := collectNginxLogsFromLabeledPods(tw); err != nil {
|
||||
if !options.IgnoreKubeErrors {
|
||||
return fmt.Errorf("failed to collect nginx logs from labeled pods: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func collectNginxLogsFromLabeledPods(tw *tar.Writer) error {
|
||||
if _, err := util.GetCommand("kubectl"); err != nil {
|
||||
fmt.Printf("warning: kubectl not found, skipping collecting nginx logs from labeled pods\n")
|
||||
return nil
|
||||
}
|
||||
|
||||
cfg, err := ctrl.GetConfig()
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to get kubeconfig: %v", err)
|
||||
}
|
||||
clientset, err := kubernetes.NewForConfig(cfg)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to create kube client: %v", err)
|
||||
}
|
||||
|
||||
type selectorSpec struct {
|
||||
LabelSelector string
|
||||
ContainerName string
|
||||
}
|
||||
selectors := []selectorSpec{
|
||||
{LabelSelector: "app=l4-bfl-proxy", ContainerName: ""},
|
||||
{LabelSelector: "tier=bfl", ContainerName: "ingress"},
|
||||
}
|
||||
|
||||
type targetPod struct {
|
||||
Namespace string
|
||||
Name string
|
||||
ContainerName string
|
||||
}
|
||||
var targets []targetPod
|
||||
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
|
||||
defer cancel()
|
||||
|
||||
for _, sel := range selectors {
|
||||
podList, err := clientset.CoreV1().Pods(corev1.NamespaceAll).List(ctx, metav1.ListOptions{LabelSelector: sel.LabelSelector})
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to list pods by label %q: %v", sel.LabelSelector, err)
|
||||
}
|
||||
for _, pod := range podList.Items {
|
||||
targets = append(targets, targetPod{
|
||||
Namespace: pod.Namespace,
|
||||
Name: pod.Name,
|
||||
ContainerName: sel.ContainerName,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
if len(targets) == 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
// simplest approach: use kubectl cp (it already implements copy via tar over exec)
|
||||
tempDir, err := os.MkdirTemp("", "olares-nginx-logs-*")
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to create temp directory for nginx logs: %v", err)
|
||||
}
|
||||
defer os.RemoveAll(tempDir)
|
||||
|
||||
files := []string{"/var/log/nginx/access.log", "/var/log/nginx/error.log"}
|
||||
for _, target := range targets {
|
||||
for _, remotePath := range files {
|
||||
base := filepath.Base(remotePath)
|
||||
archivePath := filepath.Join("nginx", target.Namespace, target.Name, base)
|
||||
|
||||
dest := filepath.Join(tempDir, fmt.Sprintf("%s__%s__%s", target.Namespace, target.Name, base))
|
||||
|
||||
err := kubectlCopyFile(target.Namespace, target.Name, target.ContainerName, remotePath, dest)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to kubectl cp %s/%s:%s: %v", target.Namespace, target.Name, remotePath, err)
|
||||
}
|
||||
|
||||
fi, err := os.Stat(dest)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to stat copied nginx log %s: %v", dest, err)
|
||||
}
|
||||
|
||||
f, err := os.Open(dest)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to open copied nginx log %s: %v", dest, err)
|
||||
}
|
||||
defer f.Close()
|
||||
|
||||
header := &tar.Header{
|
||||
Name: archivePath,
|
||||
Mode: 0644,
|
||||
Size: fi.Size(),
|
||||
ModTime: time.Now(),
|
||||
}
|
||||
if err := tw.WriteHeader(header); err != nil {
|
||||
return fmt.Errorf("failed to write header for %s: %v", archivePath, err)
|
||||
}
|
||||
if _, err := io.CopyN(tw, f, header.Size); err != nil {
|
||||
return fmt.Errorf("failed to write data for %s: %v", archivePath, err)
|
||||
}
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func kubectlCopyFile(namespace, pod, container, remotePath, destPath string) error {
|
||||
args := []string{"-n", namespace, "cp"}
|
||||
if container != "" {
|
||||
args = append(args, "-c", container)
|
||||
}
|
||||
args = append(args, fmt.Sprintf("%s:%s", pod, remotePath), destPath)
|
||||
|
||||
cmd := exec.Command("kubectl", args...)
|
||||
if out, err := cmd.CombinedOutput(); err != nil {
|
||||
return fmt.Errorf("kubectl %s failed: %v, output: %s", strings.Join(args, " "), err, strings.TrimSpace(string(out)))
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
package ctl
|
||||
|
||||
import (
|
||||
"github.com/beclab/Olares/cli/cmd/ctl/amdgpu"
|
||||
"github.com/beclab/Olares/cli/cmd/ctl/disk"
|
||||
"github.com/beclab/Olares/cli/cmd/ctl/gpu"
|
||||
"github.com/beclab/Olares/cli/cmd/ctl/node"
|
||||
@@ -33,6 +34,7 @@ func NewDefaultCommand() *cobra.Command {
|
||||
cmds.AddCommand(os.NewOSCommands()...)
|
||||
cmds.AddCommand(node.NewNodeCommand())
|
||||
cmds.AddCommand(gpu.NewCmdGpu())
|
||||
cmds.AddCommand(amdgpu.NewCmdAmdGpu())
|
||||
cmds.AddCommand(user.NewUserCommand())
|
||||
cmds.AddCommand(disk.NewDiskCommand())
|
||||
|
||||
|
||||
133
cli/pkg/amdgpu/tasks.go
Normal file
133
cli/pkg/amdgpu/tasks.go
Normal file
@@ -0,0 +1,133 @@
|
||||
package amdgpu
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os/exec"
|
||||
"path"
|
||||
"path/filepath"
|
||||
|
||||
"github.com/beclab/Olares/cli/pkg/common"
|
||||
cc "github.com/beclab/Olares/cli/pkg/core/common"
|
||||
"github.com/beclab/Olares/cli/pkg/core/connector"
|
||||
"github.com/beclab/Olares/cli/pkg/core/logger"
|
||||
"github.com/beclab/Olares/cli/pkg/core/task"
|
||||
"github.com/beclab/Olares/cli/pkg/utils"
|
||||
|
||||
"github.com/Masterminds/semver/v3"
|
||||
"github.com/pkg/errors"
|
||||
)
|
||||
|
||||
// InstallAmdRocmModule installs AMD ROCm stack on supported Ubuntu if AMD GPU is present.
|
||||
type InstallAmdRocmModule struct {
|
||||
common.KubeModule
|
||||
}
|
||||
|
||||
func (m *InstallAmdRocmModule) Init() {
|
||||
m.Name = "InstallAMDGPU"
|
||||
|
||||
installAmd := &task.RemoteTask{
|
||||
Name: "InstallAmdRocm",
|
||||
Hosts: m.Runtime.GetHostsByRole(common.Master),
|
||||
Action: &InstallAmdRocm{
|
||||
// no manifest needed
|
||||
},
|
||||
Parallel: false,
|
||||
Retry: 1,
|
||||
}
|
||||
|
||||
m.Tasks = []task.Interface{
|
||||
installAmd,
|
||||
}
|
||||
}
|
||||
|
||||
// InstallAmdRocm installs ROCm using amdgpu-install on Ubuntu 22.04/24.04 for AMD GPUs.
|
||||
type InstallAmdRocm struct {
|
||||
common.KubeAction
|
||||
}
|
||||
|
||||
func (t *InstallAmdRocm) Execute(runtime connector.Runtime) error {
|
||||
si := runtime.GetSystemInfo()
|
||||
if !si.IsLinux() || !si.IsUbuntu() || !(si.IsUbuntuVersionEqual(connector.Ubuntu2204) || si.IsUbuntuVersionEqual(connector.Ubuntu2404)) {
|
||||
return nil
|
||||
}
|
||||
|
||||
amdGPUExists, err := utils.HasAmdIGPU(runtime)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
// skip rocm install
|
||||
if !amdGPUExists {
|
||||
return nil
|
||||
}
|
||||
rocmV, _ := utils.RocmVersion()
|
||||
min := semver.MustParse("7.1.1")
|
||||
if rocmV != nil && rocmV.LessThan(min) {
|
||||
return fmt.Errorf("detected ROCm version %s, which is lower than required %s; please uninstall existing ROCm/AMDGPU components before installation with command: olares-cli amdgpu uninstall", rocmV.Original(), min.Original())
|
||||
}
|
||||
if rocmV != nil && rocmV.GreaterThan(min) {
|
||||
logger.Warnf("Warning: detected ROCm version %s great than maximum tested version %s")
|
||||
return nil
|
||||
}
|
||||
if rocmV != nil && rocmV.Equal(min) {
|
||||
logger.Infof("detected ROCm version %s, skip rocm install...", min.Original())
|
||||
return nil
|
||||
}
|
||||
|
||||
// ensure python3-setuptools and python3-wheel
|
||||
_, _ = runtime.GetRunner().SudoCmd("apt-get update", false, true)
|
||||
checkPkgs := "dpkg -s python3-setuptools python3-wheel >/dev/null 2>&1 || DEBIAN_FRONTEND=noninteractive apt-get install -y python3-setuptools python3-wheel"
|
||||
if _, err := runtime.GetRunner().SudoCmd(checkPkgs, false, true); err != nil {
|
||||
return errors.Wrap(errors.WithStack(err), "failed to install python3-setuptools and python3-wheel")
|
||||
}
|
||||
// ensure amdgpu-install exists
|
||||
if _, err := exec.LookPath("amdgpu-install"); err != nil {
|
||||
var debURL string
|
||||
if si.IsUbuntuVersionEqual(connector.Ubuntu2404) {
|
||||
debURL = "https://repo.radeon.com/amdgpu-install/7.1.1/ubuntu/noble/amdgpu-install_7.1.1.70101-1_all.deb"
|
||||
} else {
|
||||
debURL = "https://repo.radeon.com/amdgpu-install/7.1.1/ubuntu/jammy/amdgpu-install_7.1.1.70101-1_all.deb"
|
||||
}
|
||||
tmpDeb := path.Join(runtime.GetBaseDir(), cc.PackageCacheDir, "gpu", "amdgpu-install_7.1.1.70101-1_all.deb")
|
||||
if _, err := runtime.GetRunner().SudoCmd(fmt.Sprintf("install -d -m 0755 %s", filepath.Dir(tmpDeb)), false, true); err != nil {
|
||||
return err
|
||||
}
|
||||
cmd := fmt.Sprintf("sh -c 'wget -O %s %s'", tmpDeb, debURL)
|
||||
if _, err := runtime.GetRunner().SudoCmd(cmd, false, true); err != nil {
|
||||
return errors.Wrap(errors.WithStack(err), "failed to download amdgpu-install deb")
|
||||
}
|
||||
if _, err := runtime.GetRunner().SudoCmd(fmt.Sprintf("DEBIAN_FRONTEND=noninteractive apt-get install -y %s", tmpDeb), false, true); err != nil {
|
||||
return errors.Wrap(errors.WithStack(err), "failed to install amdgpu-install deb")
|
||||
}
|
||||
}
|
||||
// run installer for rocm usecase
|
||||
if _, err := runtime.GetRunner().SudoCmd("amdgpu-install -y --usecase=rocm", false, true); err != nil {
|
||||
return errors.Wrap(errors.WithStack(err), "failed to install AMD ROCm via amdgpu-install")
|
||||
}
|
||||
fmt.Println()
|
||||
logger.Warn("Warning: To enable ROCm, please reboot your machine after installation.")
|
||||
return nil
|
||||
}
|
||||
|
||||
type AmdgpuInstallAction struct {
|
||||
common.KubeAction
|
||||
}
|
||||
|
||||
func (t *AmdgpuInstallAction) Execute(runtime connector.Runtime) error {
|
||||
if _, err := runtime.GetRunner().SudoCmd("amdgpu-install -y --usecase=rocm", false, true); err != nil {
|
||||
return errors.Wrap(errors.WithStack(err), "failed to install AMD ROCm via amdgpu-install")
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
type AmdgpuUninstallAction struct {
|
||||
common.KubeAction
|
||||
}
|
||||
|
||||
func (t *AmdgpuUninstallAction) Execute(runtime connector.Runtime) error {
|
||||
if _, err := runtime.GetRunner().SudoCmd("amdgpu-install --uninstall -y", false, true); err != nil {
|
||||
return errors.Wrap(errors.WithStack(err), "failed to uninstall AMD ROCm via amdgpu-install")
|
||||
}
|
||||
fmt.Println()
|
||||
logger.Warn("Warning: Please reboot your machine after uninstall to fully remove ROCm components.")
|
||||
return nil
|
||||
}
|
||||
@@ -81,6 +81,7 @@ func (m *RunPrechecksModule) Init() {
|
||||
new(NvidiaCardArchChecker),
|
||||
new(NouveauChecker),
|
||||
new(CudaChecker),
|
||||
new(RocmChecker),
|
||||
}
|
||||
runPreChecks := &task.LocalTask{
|
||||
Name: "RunPrechecks",
|
||||
|
||||
@@ -372,6 +372,48 @@ func (c *CudaChecker) Check(runtime connector.Runtime) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
// RocmChecker checks AMD ROCm version for AMD GPU on Ubuntu 22.04/24.04 only.
|
||||
type RocmChecker struct{}
|
||||
|
||||
func (r *RocmChecker) Name() string {
|
||||
return "ROCm"
|
||||
}
|
||||
|
||||
func (r *RocmChecker) Check(runtime connector.Runtime) error {
|
||||
if !runtime.GetSystemInfo().IsLinux() {
|
||||
return nil
|
||||
}
|
||||
si := runtime.GetSystemInfo()
|
||||
if !si.IsUbuntu() || !(si.IsUbuntuVersionEqual(connector.Ubuntu2204) || si.IsUbuntuVersionEqual(connector.Ubuntu2404)) {
|
||||
return nil
|
||||
}
|
||||
|
||||
// detect AMD GPU presence
|
||||
amdGPUExists, err := utils.HasAmdIGPU(runtime)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
// no AMD GPU found, no need to check rocm
|
||||
if !amdGPUExists {
|
||||
return nil
|
||||
}
|
||||
|
||||
curV, err := utils.RocmVersion()
|
||||
if err != nil && !os.IsNotExist(err) {
|
||||
return err
|
||||
}
|
||||
|
||||
if os.IsNotExist(err) {
|
||||
return nil
|
||||
}
|
||||
|
||||
min := semver.MustParse("7.1.1")
|
||||
if curV.LessThan(min) {
|
||||
return fmt.Errorf("detected ROCm version %s, which is lower than required %s; please uninstall existing ROCm/AMDGPU components before installation with command: olares-cli amdgpu uninstall", curV.Original(), min.Original())
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////
|
||||
// precheck - task
|
||||
|
||||
|
||||
@@ -51,10 +51,12 @@ func (d DebianVersion) String() string {
|
||||
}
|
||||
|
||||
const (
|
||||
Ubuntu20 UbuntuVersion = "20."
|
||||
Ubuntu22 UbuntuVersion = "22."
|
||||
Ubuntu24 UbuntuVersion = "24."
|
||||
Ubuntu25 UbuntuVersion = "25."
|
||||
Ubuntu20 UbuntuVersion = "20."
|
||||
Ubuntu22 UbuntuVersion = "22."
|
||||
Ubuntu24 UbuntuVersion = "24."
|
||||
Ubuntu25 UbuntuVersion = "25."
|
||||
Ubuntu2204 UbuntuVersion = "22.04"
|
||||
Ubuntu2404 UbuntuVersion = "24.04"
|
||||
|
||||
Debian9 DebianVersion = "9"
|
||||
Debian10 DebianVersion = "10"
|
||||
|
||||
@@ -3,8 +3,7 @@ package system
|
||||
import (
|
||||
"strings"
|
||||
|
||||
"github.com/beclab/Olares/cli/pkg/gpu"
|
||||
|
||||
"github.com/beclab/Olares/cli/pkg/amdgpu"
|
||||
"github.com/beclab/Olares/cli/pkg/bootstrap/os"
|
||||
"github.com/beclab/Olares/cli/pkg/bootstrap/patch"
|
||||
"github.com/beclab/Olares/cli/pkg/bootstrap/precheck"
|
||||
@@ -12,6 +11,7 @@ import (
|
||||
"github.com/beclab/Olares/cli/pkg/container"
|
||||
"github.com/beclab/Olares/cli/pkg/core/module"
|
||||
"github.com/beclab/Olares/cli/pkg/daemon"
|
||||
"github.com/beclab/Olares/cli/pkg/gpu"
|
||||
"github.com/beclab/Olares/cli/pkg/images"
|
||||
"github.com/beclab/Olares/cli/pkg/k3s"
|
||||
"github.com/beclab/Olares/cli/pkg/manifest"
|
||||
@@ -82,6 +82,7 @@ func (l *linuxPhaseBuilder) build() []module.Module {
|
||||
addModule(&terminus.WriteReleaseFileModule{}).
|
||||
addModule(gpuModuleBuilder(func() []module.Module {
|
||||
return []module.Module{
|
||||
&amdgpu.InstallAmdRocmModule{},
|
||||
&gpu.InstallDriversModule{
|
||||
ManifestModule: manifest.ManifestModule{
|
||||
Manifest: l.manifestMap,
|
||||
|
||||
101
cli/pkg/pipelines/amdgpu.go
Normal file
101
cli/pkg/pipelines/amdgpu.go
Normal file
@@ -0,0 +1,101 @@
|
||||
package pipelines
|
||||
|
||||
import (
|
||||
"strings"
|
||||
|
||||
"github.com/beclab/Olares/cli/pkg/amdgpu"
|
||||
"github.com/beclab/Olares/cli/pkg/common"
|
||||
"github.com/beclab/Olares/cli/pkg/core/action"
|
||||
"github.com/beclab/Olares/cli/pkg/core/connector"
|
||||
"github.com/beclab/Olares/cli/pkg/core/logger"
|
||||
"github.com/beclab/Olares/cli/pkg/core/module"
|
||||
"github.com/beclab/Olares/cli/pkg/core/pipeline"
|
||||
"github.com/beclab/Olares/cli/pkg/core/task"
|
||||
)
|
||||
|
||||
type singleTaskModule struct {
|
||||
common.KubeModule
|
||||
name string
|
||||
act action.Action
|
||||
}
|
||||
|
||||
func (m *singleTaskModule) Init() {
|
||||
m.Name = m.name
|
||||
m.Tasks = []task.Interface{
|
||||
&task.LocalTask{
|
||||
Name: m.name,
|
||||
Action: m.act,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
func AmdGpuInstall() error {
|
||||
arg := common.NewArgument()
|
||||
arg.SetConsoleLog("amdgpuinstall.log", true)
|
||||
runtime, err := common.NewKubeRuntime(common.AllInOne, *arg)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
p := &pipeline.Pipeline{
|
||||
Name: "InstallAMDGPUDrivers",
|
||||
Runtime: runtime,
|
||||
Modules: []module.Module{
|
||||
&amdgpu.InstallAmdRocmModule{},
|
||||
},
|
||||
}
|
||||
return p.Start()
|
||||
}
|
||||
|
||||
func AmdGpuUninstall() error {
|
||||
arg := common.NewArgument()
|
||||
arg.SetConsoleLog("amdgpuuninstall.log", true)
|
||||
runtime, err := common.NewKubeRuntime(common.AllInOne, *arg)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
p := &pipeline.Pipeline{
|
||||
Name: "UninstallAMDGPUDrivers",
|
||||
Runtime: runtime,
|
||||
Modules: []module.Module{
|
||||
&singleTaskModule{name: "AmdgpuUninstall", act: new(amdgpu.AmdgpuUninstallAction)},
|
||||
},
|
||||
}
|
||||
return p.Start()
|
||||
}
|
||||
|
||||
func AmdGpuStatus() error {
|
||||
arg := common.NewArgument()
|
||||
runtime, err := common.NewKubeRuntime(common.AllInOne, *arg)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
runtime.SetRunner(
|
||||
&connector.Runner{
|
||||
Host: &connector.BaseHost{
|
||||
Name: common.LocalHost,
|
||||
Arch: runtime.GetSystemInfo().GetOsArch(),
|
||||
Os: runtime.GetSystemInfo().GetOsType(),
|
||||
},
|
||||
},
|
||||
)
|
||||
amdModel, _ := runtime.GetRunner().SudoCmd("lspci | grep -iE 'VGA|3D|Display' | grep -iE 'AMD|ATI' | head -1 || true", false, false)
|
||||
drvVer, _ := runtime.GetRunner().SudoCmd("modinfo amdgpu 2>/dev/null | awk -F': ' '/^version:/{print $2}' || true", false, false)
|
||||
rocmVer, _ := runtime.GetRunner().SudoCmd("cat /opt/rocm/.info/version 2>/dev/null || true", false, false)
|
||||
|
||||
if strings.TrimSpace(amdModel) != "" {
|
||||
logger.Infof("AMD GPU: %s", strings.TrimSpace(amdModel))
|
||||
} else {
|
||||
logger.Info("AMD GPU: not detected")
|
||||
}
|
||||
if strings.TrimSpace(drvVer) != "" {
|
||||
logger.Infof("AMDGPU driver %s", strings.TrimSpace(drvVer))
|
||||
} else {
|
||||
logger.Info("AMDGPU driver version: unknown")
|
||||
}
|
||||
if strings.TrimSpace(rocmVer) != "" {
|
||||
logger.Infof("ROCm version: %s", strings.TrimSpace(rocmVer))
|
||||
} else {
|
||||
logger.Info("ROCm version: not installed")
|
||||
}
|
||||
return nil
|
||||
}
|
||||
@@ -9,6 +9,7 @@ import (
|
||||
"github.com/beclab/Olares/cli/pkg/core/connector"
|
||||
"github.com/beclab/Olares/cli/pkg/core/logger"
|
||||
"github.com/beclab/Olares/cli/pkg/core/task"
|
||||
"github.com/beclab/Olares/cli/pkg/utils"
|
||||
)
|
||||
|
||||
type WelcomeMessage struct {
|
||||
@@ -68,6 +69,15 @@ func (t *WelcomeMessage) Execute(runtime connector.Runtime) error {
|
||||
logger.Infof("Username: %s", t.KubeConf.Arg.User.UserName)
|
||||
logger.Infof("Password: %s", t.KubeConf.Arg.User.Password)
|
||||
fmt.Printf("\n------------------------------------------------\n\n\n\n\n")
|
||||
fmt.Println()
|
||||
|
||||
// If AMD GPU on Ubuntu 22.04/24.04, print warning about reboot for ROCm
|
||||
if si := runtime.GetSystemInfo(); si.IsUbuntu() && (si.IsUbuntuVersionEqual(connector.Ubuntu2204) || si.IsUbuntuVersionEqual(connector.Ubuntu2404)) {
|
||||
if hasAmd, _ := utils.HasAmdIGPU(runtime); hasAmd {
|
||||
logger.Warnf("\x1b[31mWarning: To enable ROCm, please reboot your machine after activation.\x1b[0m")
|
||||
fmt.Println()
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
67
cli/pkg/utils/amdgpu.go
Normal file
67
cli/pkg/utils/amdgpu.go
Normal file
@@ -0,0 +1,67 @@
|
||||
package utils
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"strings"
|
||||
|
||||
"github.com/Masterminds/semver/v3"
|
||||
"github.com/beclab/Olares/cli/pkg/core/connector"
|
||||
)
|
||||
|
||||
func HasAmdIGPU(execRuntime connector.Runtime) (bool, error) {
|
||||
// Detect by CPU model names that bundle AMD AI NPU/graphics
|
||||
targets := []string{
|
||||
"AMD Ryzen AI Max+ 395",
|
||||
"AMD Ryzen AI Max 390",
|
||||
"AMD Ryzen AI Max 385",
|
||||
"AMD Ryzen AI 9 HX 375",
|
||||
"AMD Ryzen AI 9 HX 370",
|
||||
"AMD Ryzen AI 9 365",
|
||||
}
|
||||
// try lscpu first: extract 'Model name' field
|
||||
out, err := execRuntime.GetRunner().SudoCmd("lscpu 2>/dev/null | awk -F': *' '/^Model name/{print $2; exit}' || true", false, false)
|
||||
if err != nil {
|
||||
return false, err
|
||||
}
|
||||
if out != "" {
|
||||
lo := strings.ToLower(strings.TrimSpace(out))
|
||||
for _, t := range targets {
|
||||
if strings.Contains(lo, strings.ToLower(t)) {
|
||||
return true, nil
|
||||
}
|
||||
}
|
||||
}
|
||||
// fallback to /proc/cpuinfo
|
||||
out, err = execRuntime.GetRunner().SudoCmd("awk -F': *' '/^model name/{print $2; exit}' /proc/cpuinfo 2>/dev/null || true", false, false)
|
||||
if err != nil {
|
||||
return false, err
|
||||
}
|
||||
if out != "" {
|
||||
lo := strings.ToLower(strings.TrimSpace(out))
|
||||
for _, t := range targets {
|
||||
if strings.Contains(lo, strings.ToLower(t)) {
|
||||
return true, nil
|
||||
}
|
||||
}
|
||||
}
|
||||
return false, nil
|
||||
}
|
||||
|
||||
func RocmVersion() (*semver.Version, error) {
|
||||
const rocmVersionFile = "/opt/rocm/.info/version"
|
||||
data, err := os.ReadFile(rocmVersionFile)
|
||||
if err != nil {
|
||||
// no ROCm installed, nothing to check
|
||||
if os.IsNotExist(err) {
|
||||
return nil, err
|
||||
}
|
||||
return nil, err
|
||||
}
|
||||
curStr := strings.TrimSpace(string(data))
|
||||
cur, err := semver.NewVersion(curStr)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("invalid rocm version: %s", curStr)
|
||||
}
|
||||
return cur, nil
|
||||
}
|
||||
@@ -266,7 +266,7 @@ spec:
|
||||
|
||||
containers:
|
||||
- name: api
|
||||
image: beclab/bfl:v0.4.38
|
||||
image: beclab/bfl:v0.4.39
|
||||
imagePullPolicy: IfNotPresent
|
||||
securityContext:
|
||||
runAsUser: 1000
|
||||
|
||||
@@ -240,6 +240,7 @@ func (c *Client) getAppListFromData(apps []map[string]interface{}) ([]*AppInfo,
|
||||
res = append(res, &AppInfo{
|
||||
ID: genAppID(appSpec),
|
||||
Name: stringOrEmpty(appSpec["name"]),
|
||||
RawAppName: stringOrEmpty(appSpec["rawAppName"]),
|
||||
Namespace: stringOrEmpty(appSpec["namespace"]),
|
||||
DeploymentName: stringOrEmpty(appSpec["deployment"]),
|
||||
Owner: stringOrEmpty(appSpec["owner"]),
|
||||
|
||||
@@ -11,6 +11,7 @@ import (
|
||||
type AppInfo struct {
|
||||
ID string `json:"id"`
|
||||
Name string `json:"name"`
|
||||
RawAppName string `json:"rawAppName"`
|
||||
Namespace string `json:"namespace"`
|
||||
DeploymentName string `json:"deployment"`
|
||||
Owner string `json:"owner"`
|
||||
|
||||
@@ -146,6 +146,8 @@ type Server struct {
|
||||
ngxCmd *nginx.Command
|
||||
|
||||
ngxTmpl *template.Template
|
||||
|
||||
prevNgxConf []byte
|
||||
}
|
||||
|
||||
func (s *Server) init() error {
|
||||
@@ -906,6 +908,9 @@ func (s *Server) generateStreamServers() ([]StreamServer, error) {
|
||||
if bflHost == "" {
|
||||
return nil, fmt.Errorf("can not find bfl service for user=%s", app.Spec.Owner)
|
||||
}
|
||||
if p.ExposePort < 1 || p.ExposePort > 65535 {
|
||||
continue
|
||||
}
|
||||
server := StreamServer{
|
||||
Protocol: p.Protocol,
|
||||
Port: p.ExposePort,
|
||||
@@ -929,6 +934,13 @@ func (s *Server) renderAndReload() error {
|
||||
return err
|
||||
}
|
||||
nginxConfig := buf.Bytes()
|
||||
|
||||
if bytes.Equal(nginxConfig, s.prevNgxConf) {
|
||||
klog.Infof("nginx config not changed, skip reload")
|
||||
return nil
|
||||
}
|
||||
s.prevNgxConf = nginxConfig
|
||||
|
||||
err = s.testTemplate(nginxConfig)
|
||||
if err != nil {
|
||||
return err
|
||||
|
||||
Reference in New Issue
Block a user