406 lines
12 KiB
Go
406 lines
12 KiB
Go
package upgrade
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"os"
|
|
"os/exec"
|
|
"path"
|
|
"path/filepath"
|
|
"strings"
|
|
"time"
|
|
|
|
"github.com/Masterminds/semver/v3"
|
|
"github.com/beclab/Olares/cli/pkg/bootstrap/precheck"
|
|
"github.com/beclab/Olares/cli/pkg/clientset"
|
|
"github.com/beclab/Olares/cli/pkg/common"
|
|
"github.com/beclab/Olares/cli/pkg/container"
|
|
cc "github.com/beclab/Olares/cli/pkg/core/common"
|
|
"github.com/beclab/Olares/cli/pkg/core/connector"
|
|
"github.com/beclab/Olares/cli/pkg/core/logger"
|
|
"github.com/beclab/Olares/cli/pkg/core/task"
|
|
"github.com/beclab/Olares/cli/pkg/core/util"
|
|
"github.com/beclab/Olares/cli/pkg/gpu"
|
|
"github.com/beclab/Olares/cli/pkg/k3s"
|
|
k3stemplates "github.com/beclab/Olares/cli/pkg/k3s/templates"
|
|
"github.com/beclab/Olares/cli/pkg/kubernetes"
|
|
"github.com/beclab/Olares/cli/pkg/kubesphere"
|
|
"github.com/beclab/Olares/cli/pkg/kubesphere/plugins"
|
|
"github.com/beclab/Olares/cli/pkg/manifest"
|
|
"github.com/beclab/Olares/cli/pkg/phase"
|
|
"github.com/beclab/Olares/cli/pkg/terminus"
|
|
"github.com/beclab/Olares/cli/pkg/utils"
|
|
"github.com/pkg/errors"
|
|
"k8s.io/utils/ptr"
|
|
)
|
|
|
|
// cacheRebootNeeded is the pipeline-cache key under which
// upgradeGPUDriverIfNeeded stores a bool that rebootIfNeeded later reads to
// decide whether the host should be rebooted.
const cacheRebootNeeded = "reboot.needed"
|
|
|
|
type upgradeContainerdAction struct {
|
|
common.KubeAction
|
|
}
|
|
|
|
func (u *upgradeContainerdAction) Execute(runtime connector.Runtime) error {
|
|
m, err := manifest.ReadAll(u.KubeConf.Arg.Manifest)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
action := &container.SyncContainerd{
|
|
ManifestAction: manifest.ManifestAction{
|
|
Manifest: m,
|
|
BaseDir: runtime.GetBaseDir(),
|
|
},
|
|
}
|
|
return action.Execute(runtime)
|
|
}
|
|
|
|
func upgradeContainerd() []task.Interface {
|
|
return []task.Interface{
|
|
&task.LocalTask{
|
|
Name: "UpgradeContainerd",
|
|
Action: new(upgradeContainerdAction),
|
|
},
|
|
&task.LocalTask{
|
|
Name: "RestartContainerd",
|
|
Action: new(container.RestartContainerd),
|
|
},
|
|
}
|
|
}
|
|
|
|
func upgradeKSCore() []task.Interface {
|
|
return []task.Interface{
|
|
&task.LocalTask{
|
|
Name: "CopyEmbeddedKSManifests",
|
|
Action: new(plugins.CopyEmbedFiles),
|
|
},
|
|
&task.LocalTask{
|
|
Name: "UpgradeKSCore",
|
|
Action: new(plugins.CreateKsCore),
|
|
Retry: 10,
|
|
Delay: 10 * time.Second,
|
|
},
|
|
&task.LocalTask{
|
|
Name: "CheckKSCoreRunning",
|
|
Action: new(kubesphere.Check),
|
|
Retry: 20,
|
|
Delay: 10 * time.Second,
|
|
},
|
|
}
|
|
}
|
|
|
|
func upgradePrometheusServiceMonitorKubelet() []task.Interface {
|
|
return []task.Interface{
|
|
// prometheus kubelet ServiceMonitor
|
|
&task.LocalTask{
|
|
Name: "ApplyKubeletServiceMonitor",
|
|
Action: new(applyKubeletServiceMonitorAction),
|
|
Retry: 5,
|
|
Delay: 5 * time.Second,
|
|
},
|
|
}
|
|
}
|
|
|
|
func upgradeKsConfig() []task.Interface {
|
|
return []task.Interface{
|
|
&task.LocalTask{
|
|
Name: "CopyEmbeddedKSManifests",
|
|
Action: new(plugins.CopyEmbedFiles),
|
|
},
|
|
&task.LocalTask{
|
|
Name: "ApplyKsConfigManifests",
|
|
Action: new(plugins.ApplyKsConfigManifests),
|
|
Retry: 5,
|
|
Delay: 5 * time.Second,
|
|
},
|
|
}
|
|
}
|
|
|
|
// applyKubeletServiceMonitorAction applies embedded prometheus kubelet ServiceMonitor
|
|
type applyKubeletServiceMonitorAction struct {
|
|
common.KubeAction
|
|
}
|
|
|
|
func (a *applyKubeletServiceMonitorAction) Execute(runtime connector.Runtime) error {
|
|
kubectlpath, err := util.GetCommand(common.CommandKubectl)
|
|
if err != nil {
|
|
return errors.Wrap(errors.WithStack(err), "kubectl not found")
|
|
}
|
|
manifest := path.Join(runtime.GetInstallerDir(), cc.BuildFilesCacheDir, cc.BuildDir, "prometheus", "kubernetes", "kubernetes-serviceMonitorKubelet.yaml")
|
|
if _, err := runtime.GetRunner().SudoCmd(fmt.Sprintf("%s apply -f %s", kubectlpath, manifest), false, true); err != nil {
|
|
return errors.Wrap(errors.WithStack(err), "apply kubelet ServiceMonitor failed")
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// applyNodeExporterAction applies embedded node-exporter
|
|
type applyNodeExporterAction struct {
|
|
common.KubeAction
|
|
}
|
|
|
|
func (a *applyNodeExporterAction) Execute(runtime connector.Runtime) error {
|
|
kubectlpath, err := util.GetCommand(common.CommandKubectl)
|
|
if err != nil {
|
|
return errors.Wrap(errors.WithStack(err), "kubectl not found")
|
|
}
|
|
manifest := path.Join(runtime.GetInstallerDir(), cc.BuildFilesCacheDir, cc.BuildDir, "prometheus", "node-exporter", "node-exporter-daemonset.yaml")
|
|
if _, err := runtime.GetRunner().SudoCmd(fmt.Sprintf("%s apply -f %s", kubectlpath, manifest), false, true); err != nil {
|
|
return errors.Wrap(errors.WithStack(err), "apply node-exporter failed")
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func upgradeNodeExporter() []task.Interface {
|
|
return []task.Interface{
|
|
&task.LocalTask{
|
|
Name: "CopyEmbeddedKSManifests",
|
|
Action: new(plugins.CopyEmbedFiles),
|
|
},
|
|
&task.LocalTask{
|
|
Name: "applyNodeExporterManifests",
|
|
Action: new(applyNodeExporterAction),
|
|
},
|
|
}
|
|
}
|
|
|
|
func regenerateKubeFiles() []task.Interface {
|
|
var tasks []task.Interface
|
|
kubeType := phase.GetKubeType()
|
|
if kubeType == common.K3s {
|
|
tasks = append(tasks,
|
|
&task.LocalTask{
|
|
Name: "RegenerateK3sService",
|
|
Action: new(k3s.GenerateK3sService),
|
|
},
|
|
&task.LocalTask{
|
|
Name: "RestartK3sService",
|
|
Action: &terminus.SystemctlCommand{
|
|
Command: "restart",
|
|
UnitNames: []string{k3stemplates.K3sService.Name()},
|
|
DaemonReloadPreExec: true,
|
|
},
|
|
},
|
|
)
|
|
} else {
|
|
tasks = append(tasks,
|
|
&task.LocalTask{
|
|
Name: "RegenerateKubeadmConfig",
|
|
Action: &kubernetes.GenerateKubeadmConfig{
|
|
IsInitConfiguration: true,
|
|
},
|
|
},
|
|
&task.LocalTask{
|
|
Name: "RegenerateK8sFilesWithKubeadm",
|
|
Action: new(terminus.RegenerateFilesForK8s),
|
|
},
|
|
)
|
|
}
|
|
|
|
tasks = append(tasks,
|
|
&task.LocalTask{
|
|
Name: "WaitForKubeAPIServerUp",
|
|
Action: new(precheck.GetKubernetesNodesStatus),
|
|
Retry: 10,
|
|
Delay: 10,
|
|
},
|
|
)
|
|
return tasks
|
|
}
|
|
|
|
type upgradeL4BFLProxy struct {
|
|
common.KubeAction
|
|
Tag string
|
|
}
|
|
|
|
func (u *upgradeL4BFLProxy) Execute(runtime connector.Runtime) error {
|
|
if _, err := runtime.GetRunner().SudoCmd(fmt.Sprintf(
|
|
"/usr/local/bin/kubectl set image deployment/l4-bfl-proxy proxy=beclab/l4-bfl-proxy:%s -n os-network", u.Tag), false, true); err != nil {
|
|
return errors.Wrapf(errors.WithStack(err), "failed to upgrade L4 network proxy to version %s", u.Tag)
|
|
}
|
|
|
|
logger.Infof("L4 upgrade to version %s completed successfully", u.Tag)
|
|
return nil
|
|
}
|
|
|
|
type upgradeGPUDriverIfNeeded struct {
|
|
common.KubeAction
|
|
}
|
|
|
|
// fixProcModprobePath fixes the /proc/sys/kernel/modprobe path issue that can cause
|
|
// nvidia-installer to fail with error:
|
|
// "The path to the `modprobe` utility reported by '/proc/sys/kernel/modprobe', ”, differs from
|
|
// the path determined by `nvidia-installer`, `/bin/kmod`, and does not appear to point to a
|
|
// valid `modprobe` binary."
|
|
//
|
|
// This function checks if /proc/sys/kernel/modprobe is empty or invalid, and if so,
|
|
// writes a valid modprobe path to it.
|
|
func fixProcModprobePath() {
|
|
const procModprobePath = "/proc/sys/kernel/modprobe"
|
|
|
|
modprobePaths := []string{
|
|
"/sbin/modprobe",
|
|
"/usr/sbin/modprobe",
|
|
"/bin/modprobe",
|
|
"/usr/bin/modprobe",
|
|
}
|
|
|
|
data, err := os.ReadFile(procModprobePath)
|
|
if err != nil {
|
|
logger.Warnf("failed to read %s: %v", procModprobePath, err)
|
|
}
|
|
currentPath := strings.TrimSpace(string(data))
|
|
|
|
// Check if current path is valid (non-empty and executable)
|
|
if currentPath != "" {
|
|
if util.IsExecutable(currentPath) {
|
|
logger.Debugf("%s already contains valid path: %s", procModprobePath, currentPath)
|
|
return
|
|
}
|
|
// in case it's a symlink that resolves to a valid executable
|
|
if resolved, err := filepath.EvalSymlinks(currentPath); err == nil && resolved != "" {
|
|
if util.IsExecutable(resolved) {
|
|
logger.Debugf("%s contains symlink %s -> %s which is valid", procModprobePath, currentPath, resolved)
|
|
return
|
|
}
|
|
}
|
|
logger.Warnf("%s contains invalid path: '%s', attempting to fix", procModprobePath, currentPath)
|
|
} else {
|
|
logger.Warnf("%s is empty, attempting to fix", procModprobePath)
|
|
}
|
|
|
|
if lookPath, err := exec.LookPath("modprobe"); err == nil && lookPath != "" {
|
|
modprobePaths = append([]string{lookPath}, modprobePaths...)
|
|
}
|
|
|
|
for _, modprobePath := range modprobePaths {
|
|
if !util.IsExecutable(modprobePath) {
|
|
continue
|
|
}
|
|
|
|
if err := os.WriteFile(procModprobePath, []byte(modprobePath), 0644); err != nil {
|
|
logger.Warnf("failed to write %s to %s: %v", modprobePath, procModprobePath, err)
|
|
continue
|
|
}
|
|
|
|
logger.Infof("successfully fixed %s: set to %s", procModprobePath, modprobePath)
|
|
return
|
|
}
|
|
|
|
// If we get here, we couldn't fix it, but we log a warning and continue
|
|
// The nvidia-installer might still work, or it might fail, but we don't want to block the upgrade
|
|
logger.Warnf("could not fix %s, nvidia-installer may fail; continuing anyway", procModprobePath)
|
|
}
|
|
|
|
func (a *upgradeGPUDriverIfNeeded) Execute(runtime connector.Runtime) error {
|
|
sys := runtime.GetSystemInfo()
|
|
if sys.IsWsl() {
|
|
return nil
|
|
}
|
|
if !(sys.IsUbuntu() || sys.IsDebian()) {
|
|
return nil
|
|
}
|
|
|
|
model, _, err := utils.DetectNvidiaModelAndArch(runtime)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
if strings.TrimSpace(model) == "" {
|
|
return nil
|
|
}
|
|
|
|
m, err := manifest.ReadAll(a.KubeConf.Arg.Manifest)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
item, err := m.Get("cuda-driver")
|
|
if err != nil {
|
|
return err
|
|
}
|
|
var targetDriverVersionStr string
|
|
if parts := strings.Split(item.Filename, "-"); len(parts) >= 3 {
|
|
targetDriverVersionStr = strings.TrimSuffix(parts[len(parts)-1], ".run")
|
|
}
|
|
if targetDriverVersionStr == "" {
|
|
return fmt.Errorf("failed to parse target CUDA driver version from %s", item.Filename)
|
|
}
|
|
targetVersion, err := semver.NewVersion(targetDriverVersionStr)
|
|
if err != nil {
|
|
return fmt.Errorf("invalid target driver version '%s': %v", targetDriverVersionStr, err)
|
|
}
|
|
|
|
var needUpgrade bool
|
|
|
|
status, derr := utils.GetNvidiaStatus(runtime)
|
|
// for now, consider it as not installed if error occurs
|
|
// and continue to upgrade
|
|
if derr != nil {
|
|
logger.Warnf("failed to detect NVIDIA driver status, assuming upgrade is needed: %v", derr)
|
|
needUpgrade = true
|
|
}
|
|
|
|
if status != nil && status.Installed {
|
|
currentStr := status.DriverVersion
|
|
if status.Mismatch && status.LibraryVersion != "" {
|
|
currentStr = status.LibraryVersion
|
|
}
|
|
if v, perr := semver.NewVersion(currentStr); perr == nil {
|
|
needUpgrade = targetVersion.GreaterThan(v)
|
|
} else {
|
|
// cannot parse current version, assume upgrade needed
|
|
needUpgrade = true
|
|
}
|
|
} else {
|
|
needUpgrade = true
|
|
}
|
|
|
|
changed := false
|
|
if needUpgrade {
|
|
// if apt-installed, uninstall apt nvidia packages but keep toolkit
|
|
if status != nil && status.InstallMethod != utils.GPUDriverInstallMethodRunfile {
|
|
if err := new(gpu.UninstallNvidiaDrivers).Execute(runtime); err != nil {
|
|
return err
|
|
}
|
|
}
|
|
_, _ = runtime.GetRunner().SudoCmd("apt-get update", false, true)
|
|
if _, err := runtime.GetRunner().SudoCmd("DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends dkms build-essential linux-headers-$(uname -r)", false, true); err != nil {
|
|
return errors.Wrap(errors.WithStack(err), "failed to install kernel build dependencies for NVIDIA runfile")
|
|
}
|
|
|
|
fixProcModprobePath()
|
|
|
|
// install runfile
|
|
runfile := item.FilePath(runtime.GetBaseDir())
|
|
if _, err := runtime.GetRunner().SudoCmd(fmt.Sprintf("chmod +x %s", runfile), false, true); err != nil {
|
|
return errors.Wrap(errors.WithStack(err), "failed to chmod +x runfile")
|
|
}
|
|
cmd := fmt.Sprintf("sh %s -z --no-x-check --allow-installation-with-running-driver --no-check-for-alternate-installs --dkms --rebuild-initramfs -s", runfile)
|
|
if _, err := runtime.GetRunner().SudoCmd(cmd, false, true); err != nil {
|
|
return errors.Wrap(errors.WithStack(err), "failed to install NVIDIA driver via runfile")
|
|
}
|
|
client, err := clientset.NewKubeClient()
|
|
if err != nil {
|
|
return errors.Wrap(errors.WithStack(err), "kubeclient create error")
|
|
}
|
|
err = gpu.UpdateNodeGpuLabel(context.Background(), client.Kubernetes(), &targetDriverVersionStr, ptr.To(common.CurrentVerifiedCudaVersion), ptr.To("true"))
|
|
if err != nil {
|
|
return err
|
|
}
|
|
changed = true
|
|
}
|
|
|
|
needReboot := changed || (status != nil && status.Mismatch)
|
|
a.PipelineCache.Set(cacheRebootNeeded, needReboot)
|
|
return nil
|
|
}
|
|
|
|
type rebootIfNeeded struct {
|
|
common.KubeAction
|
|
}
|
|
|
|
func (r *rebootIfNeeded) Execute(runtime connector.Runtime) error {
|
|
val, ok := r.PipelineCache.GetMustBool(cacheRebootNeeded)
|
|
if ok && val {
|
|
_, _ = runtime.GetRunner().SudoCmd("reboot now", false, false)
|
|
}
|
|
return nil
|
|
}
|