package main import ( "context" "encoding/json" "fmt" "io/ioutil" "log" "net/http" "os" "strconv" "strings" "time" "golang.org/x/crypto/ssh" v1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/client-go/kubernetes" "k8s.io/client-go/rest" metrics "k8s.io/metrics/pkg/client/clientset/versioned" "gopkg.in/yaml.v3" ) // Node and NodePool structs type Node struct { Name string `yaml:"name" json:"name"` IP string `yaml:"ip" json:"ip"` Status string `yaml:"status" json:"status"` CPU int `yaml:"cpu" json:"cpu"` Memory int `yaml:"memory" json:"memory"` Role string `yaml:"role" json:"role"` Cluster string `yaml:"cluster" json:"cluster"` LastActive string `yaml:"last_active" json:"last_active"` Pods int `yaml:"pods,omitempty" json:"pods,omitempty"` Temperature float64 `yaml:"temperature,omitempty" json:"temperature,omitempty"` } type NodePool struct { Nodes []Node `yaml:"nodes" json:"nodes"` } // mustIntEnv func mustIntEnv(name string, def int) int { if val := os.Getenv(name); val != "" { if i, err := strconv.Atoi(val); err == nil { return i } } return def } // loadPool func loadPool(file string) (*NodePool, error) { data, err := ioutil.ReadFile(file) if err != nil { return nil, err } var pool NodePool err = yaml.Unmarshal(data, &pool) if err != nil { return nil, err } return &pool, nil } // savePool func savePool(file string, pool *NodePool) error { data, err := yaml.Marshal(pool) if err != nil { return err } return ioutil.WriteFile(file, data, 0644) } // initNodePool func initNodePool(clientset *kubernetes.Clientset, poolFile string) (*NodePool, error) { ctx := context.Background() pool, err := loadPool(poolFile) if err != nil { log.Println("Failed to load existing node pool, starting fresh:", err) pool = &NodePool{} } nodes, err := clientset.CoreV1().Nodes().List(ctx, metav1.ListOptions{}) if err != nil { return nil, fmt.Errorf("failed to list nodes: %w", err) } poolMap := map[string]*Node{} for i := range pool.Nodes { poolMap[pool.Nodes[i].Name] = &pool.Nodes[i] } updatedNodes := []Node{} for _, n := range nodes.Items { ip := nodeIP(&n) status := "online" role := "worker" if isControlPlane(&n) { role = "microk8s-controlplane" } node := Node{ Name: n.Name, IP: ip, Status: status, Role: role, Cluster: os.Getenv("CLUSTER_NAME"), CPU: 0, Memory: 0, LastActive: time.Now().Format(time.RFC3339), } if oldNode, ok := poolMap[n.Name]; ok { node.Status = oldNode.Status node.LastActive = oldNode.LastActive node.CPU = oldNode.CPU node.Memory = oldNode.Memory node.Cluster = oldNode.Cluster node.Role = oldNode.Role } updatedNodes = append(updatedNodes, node) } pool.Nodes = updatedNodes savePool(poolFile, pool) log.Printf("Initialized node pool with %d nodes", len(pool.Nodes)) return pool, nil } // runSSH func runSSH(host, user, pass, cmd string) (string, error) { config := &ssh.ClientConfig{ User: user, Auth: []ssh.AuthMethod{ssh.Password(pass)}, HostKeyCallback: ssh.InsecureIgnoreHostKey(), Timeout: 10 * time.Second, } client, err := ssh.Dial("tcp", fmt.Sprintf("%s:22", host), config) if err != nil { return "", err } defer client.Close() session, err := client.NewSession() if err != nil { return "", err } defer session.Close() out, err := session.CombinedOutput(cmd) return string(out), err } // isControlPlane func isControlPlane(n *v1.Node) bool { if _, ok := n.Labels["node.kubernetes.io/microk8s-controlplane"]; ok { return true } if _, ok := n.Labels["node-role.kubernetes.io/control-plane"]; ok { return true } return false } // nodeIP func nodeIP(n *v1.Node) string { for _, addr := range n.Status.Addresses { if addr.Type == v1.NodeInternalIP { return addr.Address } } return "" } // updatePerNodeUtilization func updatePerNodeUtilization(cs *kubernetes.Clientset, ms *metrics.Clientset, pool *NodePool, sshUser, sshPass string) error { ctx := context.Background() nodes, err := cs.CoreV1().Nodes().List(ctx, metav1.ListOptions{}) if err != nil { return err } metricsList, err := ms.MetricsV1beta1().NodeMetricses().List(ctx, metav1.ListOptions{}) if err != nil { return err } capCPU := map[string]int64{} capMem := map[string]int64{} nodeMap := map[string]*v1.Node{} for i, n := range nodes.Items { capCPU[n.Name] = n.Status.Capacity.Cpu().MilliValue() capMem[n.Name] = n.Status.Capacity.Memory().Value() nodeMap[n.Name] = &nodes.Items[i] } usageCPU := map[string]int64{} usageMem := map[string]int64{} for _, m := range metricsList.Items { usageCPU[m.Name] = m.Usage.Cpu().MilliValue() usageMem[m.Name] = m.Usage.Memory().Value() } pods, err := cs.CoreV1().Pods("").List(ctx, metav1.ListOptions{}) if err != nil { return err } podCount := map[string]int{} for _, p := range pods.Items { if p.Spec.NodeName != "" { podCount[p.Spec.NodeName]++ } } for i := range pool.Nodes { n := &pool.Nodes[i] name := n.Name cpuCap, ok1 := capCPU[name] memCap, ok2 := capMem[name] cpuUse, ok3 := usageCPU[name] memUse, ok4 := usageMem[name] if ok1 && ok2 && ok3 && ok4 && cpuCap > 0 && memCap > 0 { n.CPU = int((cpuUse * 100) / cpuCap) n.Memory = int((memUse * 100) / memCap) } n.Pods = podCount[name] if kn, ok := nodeMap[name]; ok { n.Temperature = getNodeTemp(kn, sshUser, sshPass) } } return nil } // getNodeTemp func getNodeTemp(node *v1.Node, sshUser, sshPass string) float64 { ip := nodeIP(node) out, err := runSSH(ip, sshUser, sshPass, "cat /proc/device-tree/model") if err != nil { return 0 } hw := strings.ToUpper(strings.TrimSpace(out)) var cmd string switch { case strings.Contains(hw, "RASPBERRY"): cmd = "vcgencmd measure_temp | egrep -o '[0-9]+\\.[0-9]+'" case strings.Contains(hw, "ODROID"): cmd = "awk '{printf \"%3.1f\", $1/1000}' /sys/class/thermal/thermal_zone0/temp" default: return 0 } tempStr, err := runSSH(ip, sshUser, sshPass, cmd) if err != nil { return 0 } t, err := strconv.ParseFloat(strings.TrimSpace(tempStr), 64) if err != nil { return 0 } return t } // ... other functions: ensureControlPlanes, clusterUtilization, deactivateOneWorkerSafe, activateOneWorker etc. // (exactly as in your last working version) // startWebGUI func startWebGUI(poolFile string) { http.HandleFunc("/status", func(w http.ResponseWriter, r *http.Request) { log.Println("Serving /status") pool, err := loadPool(poolFile) if err != nil { log.Println("Cannot load node pool:", err) http.Error(w, "cannot load node pool", http.StatusInternalServerError) return } w.Header().Set("Content-Type", "application/json") json.NewEncoder(w).Encode(pool) }) fs := http.FileServer(http.Dir("/app/web")) http.Handle("/", fs) go func() { log.Println("Web GUI running at :8080") if err := http.ListenAndServe(":8080", nil); err != nil { log.Fatal(err) } }() } // main func main() { poolFile := os.Getenv("NODE_LIST_FILE") sshUser := os.Getenv("NODE_SSH_USER") sshPass := os.Getenv("NODE_SSH_PASS") minCPU := mustIntEnv("MIN_CPU", 20) maxCPU := mustIntEnv("MAX_CPU", 80) minMem := mustIntEnv("MIN_MEM", 30) maxMem := mustIntEnv("MAX_MEM", 80) desiredCP := mustIntEnv("DESIRED_CONTROL_PLANES", 3) waitSec := mustIntEnv("DEACTIVATE_WAIT_SEC", 120) clusterName := os.Getenv("CLUSTER_NAME") if clusterName == "" { log.Fatal("CLUSTER_NAME must be set") } log.Println("=== startup configuration ===") log.Printf("NODE_LIST_FILE=%q", poolFile) log.Printf("NODE_SSH_USER=%q", sshUser) log.Printf("MIN_CPU=%d", minCPU) log.Printf("MAX_CPU=%d", maxCPU) log.Printf("MIN_MEM=%d", minMem) log.Printf("MAX_MEM=%d", maxMem) log.Printf("DESIRED_CONTROL_PLANES=%d", desiredCP) log.Printf("DEACTIVATE_WAIT_SEC=%d", waitSec) log.Printf("CLUSTER_NAME=%q", clusterName) log.Println("=============================") startWebGUI(poolFile) config, err := rest.InClusterConfig() if err != nil { log.Fatal(err) } clientset, err := kubernetes.NewForConfig(config) if err != nil { log.Fatal(err) } _, err = initNodePool(clientset, poolFile) if err != nil { log.Fatal("Failed to initialize node pool:", err) } for { pool, err := loadPool(poolFile) if err != nil { log.Println("cannot load node pool:", err) time.Sleep(30 * time.Second) continue } metricsClient, err := metrics.NewForConfig(config) if err != nil { log.Fatal(err) } ensureControlPlanes(clientset, pool, poolFile, sshUser, sshPass, clusterName, desiredCP) clusterCPU, clusterMem, err := clusterUtilization(clientset, metricsClient) if err != nil { log.Println("failed to compute utilization:", err) time.Sleep(30 * time.Second) continue } log.Printf("Cluster CPU: %d%%, Mem: %d%%", clusterCPU, clusterMem) if clusterCPU < minCPU && clusterMem < minMem { log.Println("Below minimum thresholds, attempting to deactivate one worker node...") deactivateOneWorkerSafe(clientset, pool, poolFile, sshUser, sshPass, clusterName, waitSec) } if clusterCPU > maxCPU || clusterMem > maxMem { log.Println("Above maximum thresholds, attempting to activate a free node from the node-pool...") activateOneWorker(clientset, pool, poolFile, sshUser, sshPass, clusterName) } err = updatePerNodeUtilization(clientset, metricsClient, pool, sshUser, sshPass) if err != nil { log.Println("failed to update per-node utilization:", err) } if err := savePool(poolFile, pool); err != nil { log.Println("failed to save node pool:", err) } time.Sleep(30 * time.Second) } }