335 lines
9.6 KiB
Go
335 lines
9.6 KiB
Go
package upgrade
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"path"
|
|
"strings"
|
|
"time"
|
|
|
|
"github.com/Masterminds/semver/v3"
|
|
"github.com/beclab/Olares/cli/pkg/bootstrap/precheck"
|
|
"github.com/beclab/Olares/cli/pkg/clientset"
|
|
"github.com/beclab/Olares/cli/pkg/common"
|
|
"github.com/beclab/Olares/cli/pkg/container"
|
|
cc "github.com/beclab/Olares/cli/pkg/core/common"
|
|
"github.com/beclab/Olares/cli/pkg/core/connector"
|
|
"github.com/beclab/Olares/cli/pkg/core/logger"
|
|
"github.com/beclab/Olares/cli/pkg/core/task"
|
|
"github.com/beclab/Olares/cli/pkg/core/util"
|
|
"github.com/beclab/Olares/cli/pkg/gpu"
|
|
"github.com/beclab/Olares/cli/pkg/k3s"
|
|
k3stemplates "github.com/beclab/Olares/cli/pkg/k3s/templates"
|
|
"github.com/beclab/Olares/cli/pkg/kubernetes"
|
|
"github.com/beclab/Olares/cli/pkg/kubesphere"
|
|
"github.com/beclab/Olares/cli/pkg/kubesphere/plugins"
|
|
"github.com/beclab/Olares/cli/pkg/manifest"
|
|
"github.com/beclab/Olares/cli/pkg/phase"
|
|
"github.com/beclab/Olares/cli/pkg/terminus"
|
|
"github.com/beclab/Olares/cli/pkg/utils"
|
|
"github.com/pkg/errors"
|
|
"k8s.io/utils/ptr"
|
|
)
|
|
|
|
const cacheRebootNeeded = "reboot.needed"
|
|
|
|
type upgradeContainerdAction struct {
|
|
common.KubeAction
|
|
}
|
|
|
|
func (u *upgradeContainerdAction) Execute(runtime connector.Runtime) error {
|
|
m, err := manifest.ReadAll(u.KubeConf.Arg.Manifest)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
action := &container.SyncContainerd{
|
|
ManifestAction: manifest.ManifestAction{
|
|
Manifest: m,
|
|
BaseDir: runtime.GetBaseDir(),
|
|
},
|
|
}
|
|
return action.Execute(runtime)
|
|
}
|
|
|
|
func upgradeContainerd() []task.Interface {
|
|
return []task.Interface{
|
|
&task.LocalTask{
|
|
Name: "UpgradeContainerd",
|
|
Action: new(upgradeContainerdAction),
|
|
},
|
|
&task.LocalTask{
|
|
Name: "RestartContainerd",
|
|
Action: new(container.RestartContainerd),
|
|
},
|
|
}
|
|
}
|
|
|
|
func upgradeKSCore() []task.Interface {
|
|
return []task.Interface{
|
|
&task.LocalTask{
|
|
Name: "CopyEmbeddedKSManifests",
|
|
Action: new(plugins.CopyEmbedFiles),
|
|
},
|
|
&task.LocalTask{
|
|
Name: "UpgradeKSCore",
|
|
Action: new(plugins.CreateKsCore),
|
|
Retry: 10,
|
|
Delay: 10 * time.Second,
|
|
},
|
|
&task.LocalTask{
|
|
Name: "CheckKSCoreRunning",
|
|
Action: new(kubesphere.Check),
|
|
Retry: 20,
|
|
Delay: 10 * time.Second,
|
|
},
|
|
}
|
|
}
|
|
|
|
func upgradePrometheusServiceMonitorKubelet() []task.Interface {
|
|
return []task.Interface{
|
|
// prometheus kubelet ServiceMonitor
|
|
&task.LocalTask{
|
|
Name: "ApplyKubeletServiceMonitor",
|
|
Action: new(applyKubeletServiceMonitorAction),
|
|
Retry: 5,
|
|
Delay: 5 * time.Second,
|
|
},
|
|
}
|
|
}
|
|
|
|
func upgradeKsConfig() []task.Interface {
|
|
return []task.Interface{
|
|
&task.LocalTask{
|
|
Name: "CopyEmbeddedKSManifests",
|
|
Action: new(plugins.CopyEmbedFiles),
|
|
},
|
|
&task.LocalTask{
|
|
Name: "ApplyKsConfigManifests",
|
|
Action: new(plugins.ApplyKsConfigManifests),
|
|
Retry: 5,
|
|
Delay: 5 * time.Second,
|
|
},
|
|
}
|
|
}
|
|
|
|
// applyKubeletServiceMonitorAction applies embedded prometheus kubelet ServiceMonitor
|
|
type applyKubeletServiceMonitorAction struct {
|
|
common.KubeAction
|
|
}
|
|
|
|
func (a *applyKubeletServiceMonitorAction) Execute(runtime connector.Runtime) error {
|
|
kubectlpath, err := util.GetCommand(common.CommandKubectl)
|
|
if err != nil {
|
|
return errors.Wrap(errors.WithStack(err), "kubectl not found")
|
|
}
|
|
manifest := path.Join(runtime.GetInstallerDir(), cc.BuildFilesCacheDir, cc.BuildDir, "prometheus", "kubernetes", "kubernetes-serviceMonitorKubelet.yaml")
|
|
if _, err := runtime.GetRunner().SudoCmd(fmt.Sprintf("%s apply -f %s", kubectlpath, manifest), false, true); err != nil {
|
|
return errors.Wrap(errors.WithStack(err), "apply kubelet ServiceMonitor failed")
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// applyNodeExporterAction applies embedded node-exporter
|
|
type applyNodeExporterAction struct {
|
|
common.KubeAction
|
|
}
|
|
|
|
func (a *applyNodeExporterAction) Execute(runtime connector.Runtime) error {
|
|
kubectlpath, err := util.GetCommand(common.CommandKubectl)
|
|
if err != nil {
|
|
return errors.Wrap(errors.WithStack(err), "kubectl not found")
|
|
}
|
|
manifest := path.Join(runtime.GetInstallerDir(), cc.BuildFilesCacheDir, cc.BuildDir, "prometheus", "node-exporter", "node-exporter-daemonset.yaml")
|
|
if _, err := runtime.GetRunner().SudoCmd(fmt.Sprintf("%s apply -f %s", kubectlpath, manifest), false, true); err != nil {
|
|
return errors.Wrap(errors.WithStack(err), "apply node-exporter failed")
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func upgradeNodeExporter() []task.Interface {
|
|
return []task.Interface{
|
|
&task.LocalTask{
|
|
Name: "CopyEmbeddedKSManifests",
|
|
Action: new(plugins.CopyEmbedFiles),
|
|
},
|
|
&task.LocalTask{
|
|
Name: "applyNodeExporterManifests",
|
|
Action: new(applyNodeExporterAction),
|
|
},
|
|
}
|
|
}
|
|
|
|
func regenerateKubeFiles() []task.Interface {
|
|
var tasks []task.Interface
|
|
kubeType := phase.GetKubeType()
|
|
if kubeType == common.K3s {
|
|
tasks = append(tasks,
|
|
&task.LocalTask{
|
|
Name: "RegenerateK3sService",
|
|
Action: new(k3s.GenerateK3sService),
|
|
},
|
|
&task.LocalTask{
|
|
Name: "RestartK3sService",
|
|
Action: &terminus.SystemctlCommand{
|
|
Command: "restart",
|
|
UnitNames: []string{k3stemplates.K3sService.Name()},
|
|
DaemonReloadPreExec: true,
|
|
},
|
|
},
|
|
)
|
|
} else {
|
|
tasks = append(tasks,
|
|
&task.LocalTask{
|
|
Name: "RegenerateKubeadmConfig",
|
|
Action: &kubernetes.GenerateKubeadmConfig{
|
|
IsInitConfiguration: true,
|
|
},
|
|
},
|
|
&task.LocalTask{
|
|
Name: "RegenerateK8sFilesWithKubeadm",
|
|
Action: new(terminus.RegenerateFilesForK8s),
|
|
},
|
|
)
|
|
}
|
|
|
|
tasks = append(tasks,
|
|
&task.LocalTask{
|
|
Name: "WaitForKubeAPIServerUp",
|
|
Action: new(precheck.GetKubernetesNodesStatus),
|
|
Retry: 10,
|
|
Delay: 10,
|
|
},
|
|
)
|
|
return tasks
|
|
}
|
|
|
|
type upgradeL4BFLProxy struct {
|
|
common.KubeAction
|
|
Tag string
|
|
}
|
|
|
|
func (u *upgradeL4BFLProxy) Execute(runtime connector.Runtime) error {
|
|
if _, err := runtime.GetRunner().SudoCmd(fmt.Sprintf(
|
|
"/usr/local/bin/kubectl set image deployment/l4-bfl-proxy proxy=beclab/l4-bfl-proxy:%s -n os-network", u.Tag), false, true); err != nil {
|
|
return errors.Wrapf(errors.WithStack(err), "failed to upgrade L4 network proxy to version %s", u.Tag)
|
|
}
|
|
|
|
logger.Infof("L4 upgrade to version %s completed successfully", u.Tag)
|
|
return nil
|
|
}
|
|
|
|
type upgradeGPUDriverIfNeeded struct {
|
|
common.KubeAction
|
|
}
|
|
|
|
func (a *upgradeGPUDriverIfNeeded) Execute(runtime connector.Runtime) error {
|
|
sys := runtime.GetSystemInfo()
|
|
if sys.IsWsl() {
|
|
return nil
|
|
}
|
|
if !(sys.IsUbuntu() || sys.IsDebian()) {
|
|
return nil
|
|
}
|
|
|
|
model, _, err := utils.DetectNvidiaModelAndArch(runtime)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
if strings.TrimSpace(model) == "" {
|
|
return nil
|
|
}
|
|
|
|
m, err := manifest.ReadAll(a.KubeConf.Arg.Manifest)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
item, err := m.Get("cuda-driver")
|
|
if err != nil {
|
|
return err
|
|
}
|
|
var targetDriverVersionStr string
|
|
if parts := strings.Split(item.Filename, "-"); len(parts) >= 3 {
|
|
targetDriverVersionStr = strings.TrimSuffix(parts[len(parts)-1], ".run")
|
|
}
|
|
if targetDriverVersionStr == "" {
|
|
return fmt.Errorf("failed to parse target CUDA driver version from %s", item.Filename)
|
|
}
|
|
targetVersion, err := semver.NewVersion(targetDriverVersionStr)
|
|
if err != nil {
|
|
return fmt.Errorf("invalid target driver version '%s': %v", targetDriverVersionStr, err)
|
|
}
|
|
|
|
var needUpgrade bool
|
|
|
|
status, derr := utils.GetNvidiaStatus(runtime)
|
|
// for now, consider it as not installed if error occurs
|
|
// and continue to upgrade
|
|
if derr != nil {
|
|
logger.Warnf("failed to detect NVIDIA driver status, assuming upgrade is needed: %v", derr)
|
|
needUpgrade = true
|
|
}
|
|
|
|
if status != nil && status.Installed {
|
|
currentStr := status.DriverVersion
|
|
if status.Mismatch && status.LibraryVersion != "" {
|
|
currentStr = status.LibraryVersion
|
|
}
|
|
if v, perr := semver.NewVersion(currentStr); perr == nil {
|
|
needUpgrade = targetVersion.GreaterThan(v)
|
|
} else {
|
|
// cannot parse current version, assume upgrade needed
|
|
needUpgrade = true
|
|
}
|
|
} else {
|
|
needUpgrade = true
|
|
}
|
|
|
|
changed := false
|
|
if needUpgrade {
|
|
// if apt-installed, uninstall apt nvidia packages but keep toolkit
|
|
if status != nil && status.InstallMethod != utils.GPUDriverInstallMethodRunfile {
|
|
if err := new(gpu.UninstallNvidiaDrivers).Execute(runtime); err != nil {
|
|
return err
|
|
}
|
|
}
|
|
_, _ = runtime.GetRunner().SudoCmd("apt-get update", false, true)
|
|
if _, err := runtime.GetRunner().SudoCmd("DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends dkms build-essential linux-headers-$(uname -r)", false, true); err != nil {
|
|
return errors.Wrap(errors.WithStack(err), "failed to install kernel build dependencies for NVIDIA runfile")
|
|
}
|
|
// install runfile
|
|
runfile := item.FilePath(runtime.GetBaseDir())
|
|
if _, err := runtime.GetRunner().SudoCmd(fmt.Sprintf("chmod +x %s", runfile), false, true); err != nil {
|
|
return errors.Wrap(errors.WithStack(err), "failed to chmod +x runfile")
|
|
}
|
|
cmd := fmt.Sprintf("sh %s -z --no-x-check --allow-installation-with-running-driver --no-check-for-alternate-installs --dkms --rebuild-initramfs -s", runfile)
|
|
if _, err := runtime.GetRunner().SudoCmd(cmd, false, true); err != nil {
|
|
return errors.Wrap(errors.WithStack(err), "failed to install NVIDIA driver via runfile")
|
|
}
|
|
client, err := clientset.NewKubeClient()
|
|
if err != nil {
|
|
return errors.Wrap(errors.WithStack(err), "kubeclient create error")
|
|
}
|
|
err = gpu.UpdateNodeGpuLabel(context.Background(), client.Kubernetes(), &targetDriverVersionStr, ptr.To(common.CurrentVerifiedCudaVersion), ptr.To("true"))
|
|
if err != nil {
|
|
return err
|
|
}
|
|
changed = true
|
|
}
|
|
|
|
needReboot := changed || (status != nil && status.Mismatch)
|
|
a.PipelineCache.Set(cacheRebootNeeded, needReboot)
|
|
return nil
|
|
}
|
|
|
|
type rebootIfNeeded struct {
|
|
common.KubeAction
|
|
}
|
|
|
|
func (r *rebootIfNeeded) Execute(runtime connector.Runtime) error {
|
|
val, ok := r.PipelineCache.GetMustBool(cacheRebootNeeded)
|
|
if ok && val {
|
|
_, _ = runtime.GetRunner().SudoCmd("reboot now", false, false)
|
|
}
|
|
return nil
|
|
}
|