package main import ( "context" "encoding/json" "fmt" "log" "net/http" "os" "strconv" "sync" "time" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/client-go/kubernetes" "k8s.io/client-go/rest" "k8s.io/apimachinery/pkg/types" metricsv "k8s.io/metrics/pkg/client/clientset/versioned" ) var ( CPUOverloadThreshold int64 MemoryOverloadThreshold int64 CPUUnderloadThreshold int64 MemoryUnderloadThreshold int64 PodEvictionInterval time.Duration PodAnimationDuration time.Duration PodRespawnTimeout time.Duration ) func getEnvInt64(name string, defaultVal int64) int64 { valStr := os.Getenv(name) if valStr == "" { return defaultVal } val, err := strconv.ParseInt(valStr, 10, 64) if err != nil { log.Printf("Invalid %s: %s, using default %d", name, valStr, defaultVal) return defaultVal } return val } func getEnvSecondsDuration(name string, defaultSec int64) time.Duration { valStr := os.Getenv(name) if valStr == "" { return time.Duration(defaultSec) * time.Second } sec, err := strconv.ParseInt(valStr, 10, 64) if err != nil { log.Printf("Invalid %s: %s, using default %d seconds", name, valStr, defaultSec) return time.Duration(defaultSec) * time.Second } return time.Duration(sec) * time.Second } func init() { CPUOverloadThreshold = getEnvInt64("CPU_OVERLOAD_THRESHOLD", 80) MemoryOverloadThreshold = getEnvInt64("MEMORY_OVERLOAD_THRESHOLD", 80) CPUUnderloadThreshold = getEnvInt64("CPU_UNDERLOAD_THRESHOLD", 50) MemoryUnderloadThreshold = getEnvInt64("MEMORY_UNDERLOAD_THRESHOLD", 50) PodEvictionInterval = getEnvSecondsDuration("POD_EVICTION_INTERVAL", 30) PodAnimationDuration = getEnvSecondsDuration("POD_ANIMATION_DURATION", 2) PodRespawnTimeout = getEnvSecondsDuration("POD_RESPAWN_TIMEOUT", 60) log.Println("──────────── CONFIG ────────────") log.Printf("CPU_OVERLOAD_THRESHOLD = %d%%", CPUOverloadThreshold) log.Printf("MEMORY_OVERLOAD_THRESHOLD = %d%%", MemoryOverloadThreshold) log.Printf("CPU_UNDERLOAD_THRESHOLD = %d%%", CPUUnderloadThreshold) log.Printf("MEMORY_UNDERLOAD_THRESHOLD = %d%%", MemoryUnderloadThreshold) log.Printf("POD_EVICTION_INTERVAL = %v", PodEvictionInterval) log.Printf("POD_ANIMATION_DURATION = %v", PodAnimationDuration) log.Printf("POD_RESPAWN_TIMEOUT = %v", PodRespawnTimeout) log.Println("───────────────────────────────") } type PodMoveEvent struct { Time time.Time `json:"time"` Namespace string `json:"namespace"` Pod string `json:"pod"` FromNode string `json:"fromNode"` ToNode string `json:"toNode"` Reason string `json:"reason"` } type EventBus struct { mu sync.Mutex history []PodMoveEvent clients map[chan PodMoveEvent]struct{} } func NewEventBus() *EventBus { return &EventBus{ history: make([]PodMoveEvent, 0, 100), clients: make(map[chan PodMoveEvent]struct{}), } } func (b *EventBus) Emit(evt PodMoveEvent) { b.mu.Lock() defer b.mu.Unlock() if len(b.history) == 100 { b.history = b.history[1:] } b.history = append(b.history, evt) for ch := range b.clients { select { case ch <- evt: default: } } } func (b *EventBus) History() []PodMoveEvent { b.mu.Lock() defer b.mu.Unlock() out := make([]PodMoveEvent, len(b.history)) copy(out, b.history) return out } func (b *EventBus) Subscribe() chan PodMoveEvent { b.mu.Lock() defer b.mu.Unlock() ch := make(chan PodMoveEvent, 10) b.clients[ch] = struct{}{} return ch } func (b *EventBus) Unsubscribe(ch chan PodMoveEvent) { b.mu.Lock() defer b.mu.Unlock() delete(b.clients, ch) close(ch) } func main() { config, err := rest.InClusterConfig() if err != nil { log.Fatal(err) } clientset, err := kubernetes.NewForConfig(config) if err != nil { log.Fatal(err) } metricsClient, err := metricsv.NewForConfig(config) if err != nil { log.Fatal(err) } eventBus := NewEventBus() go startWebUI(eventBus, clientset) for { balanceNodes(clientset, metricsClient, eventBus) time.Sleep(PodEvictionInterval) } } func startWebUI(bus *EventBus, clientset *kubernetes.Clientset) { mux := http.NewServeMux() mux.Handle("/", http.FileServer(http.Dir("/web"))) mux.HandleFunc("/api/events", func(w http.ResponseWriter, _ *http.Request) { w.Header().Set("Content-Type", "application/json") _ = json.NewEncoder(w).Encode(bus.History()) }) mux.HandleFunc("/api/stream", func(w http.ResponseWriter, r *http.Request) { w.Header().Set("Content-Type", "text/event-stream") w.Header().Set("Cache-Control", "no-cache") flusher, ok := w.(http.Flusher) if !ok { http.Error(w, "streaming unsupported", 500) return } ch := bus.Subscribe() defer bus.Unsubscribe(ch) for { select { case evt := <-ch: data, _ := json.Marshal(evt) fmt.Fprintf(w, "data: %s\n\n", data) flusher.Flush() case <-r.Context().Done(): return } } }) log.Println("WebUI listening on :8080") mux.HandleFunc("/api/nodes", func(w http.ResponseWriter, r *http.Request) { nodes, err := clientset.CoreV1().Nodes().List(context.Background(), metav1.ListOptions{}) if err != nil { http.Error(w, "failed to list nodes", http.StatusInternalServerError) return } names := []string{} for _, n := range nodes.Items { names = append(names, n.Name) } w.Header().Set("Content-Type", "application/json") _ = json.NewEncoder(w).Encode(names) }) mux.HandleFunc("/api/pods", func(w http.ResponseWriter, r *http.Request) { ctx := context.Background() pods, err := clientset.CoreV1().Pods("").List(ctx, metav1.ListOptions{}) if err != nil { http.Error(w, "failed to list pods", http.StatusInternalServerError) return } // group pods by node podsByNode := map[string][]string{} for _, pod := range pods.Items { if pod.Spec.NodeName == "" { continue } if _, ok := podsByNode[pod.Spec.NodeName]; !ok { podsByNode[pod.Spec.NodeName] = []string{} } podsByNode[pod.Spec.NodeName] = append(podsByNode[pod.Spec.NodeName], pod.Name) } w.Header().Set("Content-Type", "application/json") _ = json.NewEncoder(w).Encode(podsByNode) }) mux.HandleFunc("/api/config", func(w http.ResponseWriter, _ *http.Request) { w.Header().Set("Content-Type", "application/json") cfg := map[string]interface{}{ "animationDurationMs": int64(2000), // number "version": "1.1", // string "clusterName": os.Getenv("CLUSTER_NAME"), // string } _ = json.NewEncoder(w).Encode(cfg) }) _ = http.ListenAndServe(":8080", mux) } func balanceNodes(clientset *kubernetes.Clientset, metricsClient *metricsv.Clientset, bus *EventBus) { ctx := context.Background() // --- get node metrics --- nodeMetricsList, err := metricsClient.MetricsV1beta1().NodeMetricses().List(ctx, metav1.ListOptions{}) if err != nil { log.Println("Failed to get node metrics:", err) return } nodeUsage := make(map[string]map[string]int64) for _, m := range nodeMetricsList.Items { nodeUsage[m.Name] = map[string]int64{ "cpu": m.Usage.Cpu().MilliValue(), "mem": m.Usage.Memory().Value() / (1024 * 1024), } } // --- get node capacities --- nodes, err := clientset.CoreV1().Nodes().List(ctx, metav1.ListOptions{}) if err != nil { log.Println("Failed to list nodes:", err) return } nodeCapacity := make(map[string]map[string]int64) for _, n := range nodes.Items { nodeCapacity[n.Name] = map[string]int64{ "cpu": n.Status.Capacity.Cpu().MilliValue(), "mem": n.Status.Capacity.Memory().Value() / (1024 * 1024), } } // --- find overloaded and underloaded nodes --- var overloaded, underloaded []string for node, usage := range nodeUsage { cpuPercent := usage["cpu"] * 100 / nodeCapacity[node]["cpu"] // memPercent := usage["mem"] * 100 / nodeCapacity[node]["mem"] // if cpuPercent > CPUOverloadThreshold || memPercent > MemoryOverloadThreshold { if cpuPercent > CPUOverloadThreshold { overloaded = append(overloaded, node) // } else if cpuPercent < CPUUnderloadThreshold && memPercent < MemoryUnderloadThreshold { } else if cpuPercent < CPUUnderloadThreshold { underloaded = append(underloaded, node) } } fmt.Println("Overloaded nodes:", overloaded) fmt.Println("Underloaded nodes:", underloaded) if len(overloaded) == 0 || len(underloaded) == 0 { return } // --- move pods from overloaded to underloaded --- for _, node := range overloaded { pods, err := clientset.CoreV1().Pods("").List(ctx, metav1.ListOptions{ FieldSelector: fmt.Sprintf("spec.nodeName=%s", node), }) if err != nil { log.Println("Failed to list pods for node", node, err) continue } for _, pod := range pods.Items { if pod.Namespace == "kube-system" || isDaemonSet(&pod) { continue } // Skip pods not in Running state if pod.Status.Phase != corev1.PodRunning { continue } // --- choose target before deletion --- targetNode := pickTargetNode(underloaded, nodeUsage, nodeCapacity) if targetNode == "" { // fallback: just skip this pod continue } fmt.Printf("Rescheduling pod %s/%s from %s to another node\n", pod.Namespace, pod.Name, node) // --- cordon the node --- if err := cordonNode(clientset, node); err != nil { log.Println("Failed to cordon node", node, err) continue } fmt.Println("Succesfully cordoned node", node) // --- delete pod immediately --- grace := int64(0) err = clientset.CoreV1().Pods(pod.Namespace).Delete(ctx, pod.Name, metav1.DeleteOptions{ GracePeriodSeconds: &grace, }) if err != nil { log.Println("Failed to delete pod", pod.Name, err) _ = uncordonNode(clientset, node) continue } fmt.Println("Deleted pod", pod.Name) // --- wait for new pod from controller --- var ownerUID types.UID if len(pod.OwnerReferences) > 0 { ownerUID = pod.OwnerReferences[0].UID } var newPod corev1.Pod for { podsList, err := clientset.CoreV1(). Pods(pod.Namespace). List(ctx, metav1.ListOptions{}) if err != nil { log.Println("Failed to list pods:", err) time.Sleep(2 * time.Second) continue } found := false for _, p := range podsList.Items { if p.Status.Phase != corev1.PodRunning { continue } for _, ref := range p.OwnerReferences { if ref.UID == ownerUID { newPod = p found = true break } } if found { break } } if found { log.Println("Replacement pod found") break } log.Println("Waiting for replacement pod to be Running...") time.Sleep(2 * time.Second) } // --- uncordon the node now that pod has moved --- if err := uncordonNode(clientset, node); err != nil { log.Println("Failed to uncordon node", node, err) } fmt.Println("Succesfuly uncordoned node", node) // --- emit event for GUI --- bus.Emit(PodMoveEvent{ Time: time.Now(), Namespace: pod.Namespace, Pod: pod.Name, FromNode: node, ToNode: targetNode, Reason: "node-utilization-imbalance", }) // --- update node usage safely --- if nodeUsage[node] != nil { nodeUsage[node]["cpu"] -= estimatePodCPU(&pod) nodeUsage[node]["mem"] -= estimatePodMem(&pod) } if nodeUsage[newPod.Spec.NodeName] == nil { nodeUsage[newPod.Spec.NodeName] = map[string]int64{"cpu": 0, "mem": 0} } nodeUsage[newPod.Spec.NodeName]["cpu"] += estimatePodCPU(&pod) nodeUsage[newPod.Spec.NodeName]["mem"] += estimatePodMem(&pod) // only move one pod per invocation return } } } // // ─── HELPERS ──────────────────────────────────────────────────────────────────── // func mustEnv(key, def string) string { if val := os.Getenv(key); val != "" { return val } return def } func cordonNode(clientset *kubernetes.Clientset, nodeName string) error { ctx := context.Background() for i := 0; i < 5; i++ { // retry up to 5 times node, err := clientset.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{}) if err != nil { return err } if node.Spec.Unschedulable { return nil // already cordoned } node.Spec.Unschedulable = true _, err = clientset.CoreV1().Nodes().Update(ctx, node, metav1.UpdateOptions{}) if err == nil { return nil } log.Println("Cordoning failed, retrying:", err) time.Sleep(500 * time.Millisecond) } return fmt.Errorf("failed to cordon node %s after retries", nodeName) } func uncordonNode(clientset *kubernetes.Clientset, nodeName string) error { ctx := context.Background() for i := 0; i < 5; i++ { node, err := clientset.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{}) if err != nil { return err } if !node.Spec.Unschedulable { return nil // already uncordoned } node.Spec.Unschedulable = false _, err = clientset.CoreV1().Nodes().Update(ctx, node, metav1.UpdateOptions{}) if err == nil { return nil } log.Println("Uncordoning failed, retrying:", err) time.Sleep(500 * time.Millisecond) } return fmt.Errorf("failed to uncordon node %s after retries", nodeName) } func isDaemonSet(pod *corev1.Pod) bool { for _, owner := range pod.OwnerReferences { if owner.Kind == "DaemonSet" { return true } } return false } func pickTargetNode(nodes []string, usage map[string]map[string]int64, capacity map[string]map[string]int64) string { var bestNode string bestLoad := int64(1 << 62) for _, n := range nodes { cpuPercent := usage[n]["cpu"] * 100 / capacity[n]["cpu"] memPercent := usage[n]["mem"] * 100 / capacity[n]["mem"] load := cpuPercent + memPercent if load < bestLoad { bestLoad = load bestNode = n } } return bestNode } func estimatePodCPU(pod *corev1.Pod) int64 { var cpu int64 for _, c := range pod.Spec.Containers { if q, ok := c.Resources.Requests[corev1.ResourceCPU]; ok { cpu += q.MilliValue() } } return cpu } func estimatePodMem(pod *corev1.Pod) int64 { var mem int64 for _, c := range pod.Spec.Containers { if q, ok := c.Resources.Requests[corev1.ResourceMemory]; ok { mem += q.Value() / (1024 * 1024) } } return mem }