Files
kubernetes/prod/node-balancer/main.8
T
2026-05-31 16:07:30 +02:00

490 lines
14 KiB
Plaintext

package main
import (
"context"
"embed"
"encoding/json"
"fmt"
"log"
"net/http"
"os"
"strconv"
"sync"
"time"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/client-go/kubernetes"
"k8s.io/client-go/rest"
"k8s.io/apimachinery/pkg/types"
metricsv "k8s.io/metrics/pkg/client/clientset/versioned"
)
// webFS holds the files matched by the web/* pattern, embedded into the
// binary at build time. startWebUI serves it over HTTP at "/".
//
//go:embed web/*
var webFS embed.FS
// Balancer tunables. All of them are populated from environment variables in
// init; the thresholds are compared against integer utilization percentages
// (usage * 100 / capacity) computed in balanceNodes.
var (
// CPUOverloadThreshold: a node whose CPU percentage is above this value is
// treated as overloaded (env CPU_OVERLOAD_THRESHOLD, default 80).
CPUOverloadThreshold int64
// MemoryOverloadThreshold: a node whose memory percentage is above this
// value is treated as overloaded (env MEMORY_OVERLOAD_THRESHOLD, default 80).
MemoryOverloadThreshold int64
// CPUUnderloadThreshold: a node is underloaded only when its CPU percentage
// is below this value and its memory percentage is below
// MemoryUnderloadThreshold (env CPU_UNDERLOAD_THRESHOLD, default 50).
CPUUnderloadThreshold int64
// MemoryUnderloadThreshold: memory counterpart of CPUUnderloadThreshold
// (env MEMORY_UNDERLOAD_THRESHOLD, default 50).
MemoryUnderloadThreshold int64
// PodEvictionInterval is the pause between two balanceNodes passes in main
// (env POD_EVICTION_INTERVAL, in seconds, default 120).
PodEvictionInterval time.Duration
// PodAnimationDuration is reported to the web UI by /api/config as
// animationDurationMs (env POD_ANIMATION_DURATION, in seconds, default 2).
PodAnimationDuration time.Duration
)
// getEnvInt64 reads the environment variable name as a base-10 int64.
//
// It returns defaultVal when the variable is unset or empty. When the value
// is present but does not parse, the problem is logged and defaultVal is
// returned as well.
func getEnvInt64(name string, defaultVal int64) int64 {
	raw := os.Getenv(name)
	if raw != "" {
		parsed, err := strconv.ParseInt(raw, 10, 64)
		if err == nil {
			return parsed
		}
		log.Printf("Invalid %s: %s, using default %d", name, raw, defaultVal)
	}
	return defaultVal
}
// getEnvSecondsDuration reads the environment variable name as a whole number
// of seconds and returns it as a time.Duration.
//
// The default (defaultSec seconds) is returned when the variable is unset or
// empty. It is also returned, after logging, when the value does not parse,
// is negative, or is too large to be expressed as a time.Duration. Rejecting
// negative values matters because a negative duration makes time.Sleep return
// immediately, which would turn a periodic loop into a busy loop.
func getEnvSecondsDuration(name string, defaultSec int64) time.Duration {
	// Largest second count whose conversion to nanoseconds still fits int64.
	const maxSeconds = int64((1<<63 - 1) / time.Second)

	fallback := time.Duration(defaultSec) * time.Second
	valStr := os.Getenv(name)
	if valStr == "" {
		return fallback
	}
	sec, err := strconv.ParseInt(valStr, 10, 64)
	if err != nil || sec < 0 || sec > maxSeconds {
		log.Printf("Invalid %s: %s, using default %d seconds", name, valStr, defaultSec)
		return fallback
	}
	return time.Duration(sec) * time.Second
}
func init() {
CPUOverloadThreshold = getEnvInt64("CPU_OVERLOAD_THRESHOLD", 80)
MemoryOverloadThreshold = getEnvInt64("MEMORY_OVERLOAD_THRESHOLD", 80)
CPUUnderloadThreshold = getEnvInt64("CPU_UNDERLOAD_THRESHOLD", 50)
MemoryUnderloadThreshold = getEnvInt64("MEMORY_UNDERLOAD_THRESHOLD", 50)
PodEvictionInterval = getEnvSecondsDuration("POD_EVICTION_INTERVAL", 120) // default 2 min
PodAnimationDuration = getEnvSecondsDuration("POD_ANIMATION_DURATION", 2) // default 2 sec
}
// PodMoveEvent describes one pod relocation performed by balanceNodes. It is
// kept in the EventBus history and sent to web UI clients as JSON.
type PodMoveEvent struct {
// Time is when the event was emitted.
Time time.Time `json:"time"`
// Namespace is the namespace of the pod that was moved.
Namespace string `json:"namespace"`
// Pod is the name of the replacement pod, not of the deleted one.
Pod string `json:"pod"`
// FromNode is the overloaded node the original pod was removed from.
FromNode string `json:"fromNode"`
// ToNode is the node the replacement pod was observed on.
ToNode string `json:"toNode"`
// Reason is a short machine-readable cause for the move.
Reason string `json:"reason"`
}
// EventBus keeps a bounded history of PodMoveEvents and fans new events out
// to subscriber channels. Create one with NewEventBus.
type EventBus struct {
// mu is held by every method while it touches history or clients.
mu sync.Mutex
// history holds the most recent events, oldest first; Emit caps it at 100.
history []PodMoveEvent
// clients is the set of channels handed out by Subscribe.
clients map[chan PodMoveEvent]struct{}
}
// NewEventBus returns an EventBus with no subscribers and an empty history
// that has room for 100 events pre-allocated.
func NewEventBus() *EventBus {
	bus := new(EventBus)
	bus.history = make([]PodMoveEvent, 0, 100)
	bus.clients = map[chan PodMoveEvent]struct{}{}
	return bus
}
// Emit appends evt to the history and offers it to every subscriber.
//
// Once the history holds 100 events the oldest one is dropped to make room.
// Delivery never blocks: a subscriber whose channel buffer is full simply
// misses the event.
func (b *EventBus) Emit(evt PodMoveEvent) {
	b.mu.Lock()
	defer b.mu.Unlock()

	kept := b.history
	if len(kept) == 100 {
		kept = kept[1:]
	}
	b.history = append(kept, evt)

	for subscriber := range b.clients {
		select {
		case subscriber <- evt:
		default:
			// Subscriber is not keeping up; skip it rather than stall Emit.
		}
	}
}
// History returns a copy of the stored events, oldest first. The result is
// never nil and does not alias the bus's internal slice, so callers may keep
// or modify it freely.
func (b *EventBus) History() []PodMoveEvent {
	b.mu.Lock()
	defer b.mu.Unlock()

	snapshot := make([]PodMoveEvent, 0, len(b.history))
	return append(snapshot, b.history...)
}
// Subscribe registers and returns a new channel, buffered for 10 events, that
// receives every event passed to Emit from now on. Release it with
// Unsubscribe when done.
func (b *EventBus) Subscribe() chan PodMoveEvent {
	sub := make(chan PodMoveEvent, 10)

	b.mu.Lock()
	b.clients[sub] = struct{}{}
	b.mu.Unlock()

	return sub
}
// Unsubscribe removes ch from the bus and closes it.
//
// Calling it again with the same channel, or with a channel that was never
// returned by Subscribe, is a no-op. Without that check a second call would
// panic on closing an already-closed channel, and an unrelated channel would
// be closed behind its owner's back.
func (b *EventBus) Unsubscribe(ch chan PodMoveEvent) {
	b.mu.Lock()
	defer b.mu.Unlock()

	if _, registered := b.clients[ch]; !registered {
		return
	}
	delete(b.clients, ch)
	// Safe to close under the lock: Emit only sends while holding b.mu and
	// only to channels still present in b.clients.
	close(ch)
}
// main builds the in-cluster Kubernetes and metrics clients, starts the web
// UI in the background, and then runs balanceNodes forever, pausing for
// PodEvictionInterval after each pass. Any failure while building a client
// terminates the process.
func main() {
	cfg, err := rest.InClusterConfig()
	if err != nil {
		log.Fatal(err)
	}

	kube, err := kubernetes.NewForConfig(cfg)
	if err != nil {
		log.Fatal(err)
	}

	metrics, err := metricsv.NewForConfig(cfg)
	if err != nil {
		log.Fatal(err)
	}

	bus := NewEventBus()
	go startWebUI(bus, kube)

	// The post statement runs after every pass, so the first pass starts
	// immediately and each later one follows a PodEvictionInterval pause.
	for ; ; time.Sleep(PodEvictionInterval) {
		balanceNodes(kube, metrics, bus)
	}
}
// startWebUI serves the embedded web UI and its JSON API on :8080 and blocks
// until the HTTP server stops; the reason it stopped is logged.
//
// Routes:
//
//	/            files from webFS
//	/api/events  JSON array with the event history held by bus
//	/api/stream  server-sent events: one "data:" frame per new PodMoveEvent
//	/api/nodes   JSON array of node names
//	/api/pods    JSON object mapping node name to the pods scheduled on it
//	/api/config  JSON object with animationDurationMs
//
// Kubernetes calls made on behalf of a request use that request's context, so
// they are abandoned when the client disconnects.
//
// NOTE(review): webFS is embedded with the pattern web/*, and embed keeps the
// directory prefix, so the UI files are presumably reachable under /web/
// rather than at the root. Confirm that this is the intended layout.
func startWebUI(bus *EventBus, clientset *kubernetes.Clientset) {
	// writeJSON sends payload as an application/json response body.
	writeJSON := func(w http.ResponseWriter, payload interface{}) {
		w.Header().Set("Content-Type", "application/json")
		if err := json.NewEncoder(w).Encode(payload); err != nil {
			log.Println("Failed to write JSON response:", err)
		}
	}

	mux := http.NewServeMux()
	mux.Handle("/", http.FileServer(http.FS(webFS)))

	mux.HandleFunc("/api/events", func(w http.ResponseWriter, _ *http.Request) {
		writeJSON(w, bus.History())
	})

	mux.HandleFunc("/api/stream", func(w http.ResponseWriter, r *http.Request) {
		w.Header().Set("Content-Type", "text/event-stream")
		w.Header().Set("Cache-Control", "no-cache")
		flusher, ok := w.(http.Flusher)
		if !ok {
			http.Error(w, "streaming unsupported", http.StatusInternalServerError)
			return
		}
		// Push the response headers out now so the client sees the stream
		// open even if the first event is minutes away.
		flusher.Flush()

		ch := bus.Subscribe()
		defer bus.Unsubscribe(ch)
		for {
			select {
			case evt := <-ch:
				data, err := json.Marshal(evt)
				if err != nil {
					log.Println("Failed to encode event for stream:", err)
					continue
				}
				if _, err := fmt.Fprintf(w, "data: %s\n\n", data); err != nil {
					// The connection is gone; stop streaming to it.
					return
				}
				flusher.Flush()
			case <-r.Context().Done():
				return
			}
		}
	})

	mux.HandleFunc("/api/nodes", func(w http.ResponseWriter, r *http.Request) {
		nodes, err := clientset.CoreV1().Nodes().List(r.Context(), metav1.ListOptions{})
		if err != nil {
			http.Error(w, "failed to list nodes", http.StatusInternalServerError)
			return
		}
		// Non-nil even when empty so the response is [] rather than null.
		names := make([]string, 0, len(nodes.Items))
		for _, n := range nodes.Items {
			names = append(names, n.Name)
		}
		writeJSON(w, names)
	})

	mux.HandleFunc("/api/pods", func(w http.ResponseWriter, r *http.Request) {
		pods, err := clientset.CoreV1().Pods("").List(r.Context(), metav1.ListOptions{})
		if err != nil {
			http.Error(w, "failed to list pods", http.StatusInternalServerError)
			return
		}
		// group pods by node, leaving out pods that are not scheduled yet
		podsByNode := map[string][]string{}
		for _, pod := range pods.Items {
			if pod.Spec.NodeName == "" {
				continue
			}
			podsByNode[pod.Spec.NodeName] = append(podsByNode[pod.Spec.NodeName], pod.Name)
		}
		writeJSON(w, podsByNode)
	})

	mux.HandleFunc("/api/config", func(w http.ResponseWriter, _ *http.Request) {
		writeJSON(w, map[string]int64{
			"animationDurationMs": PodAnimationDuration.Milliseconds(),
		})
	})

	srv := &http.Server{
		Addr:    ":8080",
		Handler: mux,
		// Bound only the header read. A write timeout would cut off the
		// long-lived /api/stream responses.
		ReadHeaderTimeout: 10 * time.Second,
	}
	log.Println("WebUI listening on :8080")
	// ListenAndServe always returns a non-nil error. The UI is auxiliary, so
	// report it instead of taking the balancer down.
	err := srv.ListenAndServe()
	log.Println("WebUI server stopped:", err)
}
// replacementPodTimeout bounds how long balanceNodes waits for a replacement
// pod to reach Running before it gives up and uncordons the source node.
const replacementPodTimeout = 5 * time.Minute

// balanceNodes performs one balancing pass and moves at most one pod.
//
// It reads node usage from the metrics API and node capacity from the core
// API, and classifies nodes as overloaded (CPU or memory percentage above the
// overload thresholds) or underloaded (both below the underload thresholds).
// If both groups are non-empty it takes the first eligible pod on an
// overloaded node, cordons that node, deletes the pod with a zero grace
// period, waits for the pod's owner to bring up a replacement, uncordons the
// node, and emits a PodMoveEvent on bus.
//
// A pod is eligible when it is Running, is outside kube-system, is not owned
// by a DaemonSet, and has at least one owner reference. Pods without an owner
// are skipped because nothing would recreate them after deletion.
//
// The replacement's placement is left to the scheduler. The target computed
// with pickTargetNode only appears in the log line; the emitted event reports
// the node the replacement was actually observed on.
//
// Errors are logged, not returned. Nodes that have metrics but no usable
// capacity figures are skipped.
func balanceNodes(clientset *kubernetes.Clientset, metricsClient *metricsv.Clientset, bus *EventBus) {
	ctx := context.Background()

	// --- get node metrics ---
	nodeMetricsList, err := metricsClient.MetricsV1beta1().NodeMetricses().List(ctx, metav1.ListOptions{})
	if err != nil {
		log.Println("Failed to get node metrics:", err)
		return
	}
	// cpu is in millicores, mem in MiB; capacities below use the same units.
	nodeUsage := make(map[string]map[string]int64, len(nodeMetricsList.Items))
	for _, m := range nodeMetricsList.Items {
		nodeUsage[m.Name] = map[string]int64{
			"cpu": m.Usage.Cpu().MilliValue(),
			"mem": m.Usage.Memory().Value() / (1024 * 1024),
		}
	}

	// --- get node capacities ---
	nodes, err := clientset.CoreV1().Nodes().List(ctx, metav1.ListOptions{})
	if err != nil {
		log.Println("Failed to list nodes:", err)
		return
	}
	nodeCapacity := make(map[string]map[string]int64, len(nodes.Items))
	for _, n := range nodes.Items {
		nodeCapacity[n.Name] = map[string]int64{
			"cpu": n.Status.Capacity.Cpu().MilliValue(),
			"mem": n.Status.Capacity.Memory().Value() / (1024 * 1024),
		}
	}

	// --- find overloaded and underloaded nodes ---
	var overloaded, underloaded []string
	for node, usage := range nodeUsage {
		capacity := nodeCapacity[node]
		// The two lists come from separate API calls, so a node can have
		// metrics without a capacity entry. Dividing by its zero capacity
		// would panic.
		if capacity["cpu"] <= 0 || capacity["mem"] <= 0 {
			log.Println("Skipping node without usable capacity data:", node)
			continue
		}
		cpuPercent := usage["cpu"] * 100 / capacity["cpu"]
		memPercent := usage["mem"] * 100 / capacity["mem"]
		if cpuPercent > CPUOverloadThreshold || memPercent > MemoryOverloadThreshold {
			overloaded = append(overloaded, node)
		} else if cpuPercent < CPUUnderloadThreshold && memPercent < MemoryUnderloadThreshold {
			underloaded = append(underloaded, node)
		}
	}
	fmt.Println("Overloaded nodes:", overloaded)
	fmt.Println("Underloaded nodes:", underloaded)
	if len(overloaded) == 0 || len(underloaded) == 0 {
		return
	}

	// --- move pods from overloaded to underloaded ---
	for _, node := range overloaded {
		pods, err := clientset.CoreV1().Pods("").List(ctx, metav1.ListOptions{
			FieldSelector: fmt.Sprintf("spec.nodeName=%s", node),
		})
		if err != nil {
			log.Println("Failed to list pods for node", node, err)
			continue
		}
		for i := range pods.Items {
			pod := &pods.Items[i]
			if pod.Namespace == "kube-system" || isDaemonSet(pod) {
				continue
			}
			// Skip pods not in Running state
			if pod.Status.Phase != corev1.PodRunning {
				continue
			}
			// A pod without an owner is not recreated after deletion, so
			// deleting it would lose the workload instead of moving it.
			if len(pod.OwnerReferences) == 0 {
				continue
			}
			ownerUID := pod.OwnerReferences[0].UID

			// --- choose target before deletion ---
			targetNode := pickTargetNode(underloaded, nodeUsage, nodeCapacity)
			if targetNode == "" {
				// fallback: just skip this pod
				continue
			}
			fmt.Printf("Rescheduling pod %s/%s from %s to %s\n", pod.Namespace, pod.Name, node, targetNode)

			// --- remember which pods already exist ---
			// Sibling replicas of the same owner are already Running; only a
			// pod whose UID is not in this set can be the replacement.
			existing, err := clientset.CoreV1().Pods(pod.Namespace).List(ctx, metav1.ListOptions{})
			if err != nil {
				log.Println("Failed to list pods:", err)
				continue
			}
			known := make(map[types.UID]struct{}, len(existing.Items))
			for j := range existing.Items {
				known[existing.Items[j].UID] = struct{}{}
			}
			known[pod.UID] = struct{}{}

			// --- cordon the node ---
			if err := cordonNode(clientset, node); err != nil {
				log.Println("Failed to cordon node", node, err)
				continue
			}

			// --- delete pod immediately ---
			grace := int64(0)
			err = clientset.CoreV1().Pods(pod.Namespace).Delete(ctx, pod.Name, metav1.DeleteOptions{
				GracePeriodSeconds: &grace,
			})
			if err != nil {
				log.Println("Failed to delete pod", pod.Name, err)
				_ = uncordonNode(clientset, node) // best effort; the delete failure is already logged
				continue
			}

			// --- wait for new pod from the owner, but not forever ---
			waitCtx, cancel := context.WithTimeout(ctx, replacementPodTimeout)
			newPod, err := waitForReplacementPod(waitCtx, clientset, pod.Namespace, ownerUID, known)
			cancel()

			// --- uncordon the node whether or not a replacement showed up ---
			if uerr := uncordonNode(clientset, node); uerr != nil {
				log.Println("Failed to uncordon node", node, uerr)
			}
			if err != nil {
				log.Println("Gave up waiting for replacement of pod", pod.Name, err)
				return
			}

			// --- update node usage safely ---
			if nodeUsage[node] != nil {
				nodeUsage[node]["cpu"] -= estimatePodCPU(pod)
				nodeUsage[node]["mem"] -= estimatePodMem(pod)
			}
			if nodeUsage[newPod.Spec.NodeName] == nil {
				nodeUsage[newPod.Spec.NodeName] = map[string]int64{"cpu": 0, "mem": 0}
			}
			nodeUsage[newPod.Spec.NodeName]["cpu"] += estimatePodCPU(pod)
			nodeUsage[newPod.Spec.NodeName]["mem"] += estimatePodMem(pod)

			// --- emit event for GUI ---
			bus.Emit(PodMoveEvent{
				Time:      time.Now(),
				Namespace: pod.Namespace,
				Pod:       newPod.Name,
				FromNode:  node,
				ToNode:    newPod.Spec.NodeName,
				Reason:    "node-utilization-imbalance",
			})

			// only move one pod per invocation
			return
		}
	}
}

// waitForReplacementPod polls namespace every two seconds until it finds a
// Running pod that has an owner reference with ownerUID and whose own UID is
// not in known, and returns that pod. It returns ctx's error once ctx is done.
// List failures are logged and retried on the next tick.
func waitForReplacementPod(ctx context.Context, clientset *kubernetes.Clientset, namespace string, ownerUID types.UID, known map[types.UID]struct{}) (*corev1.Pod, error) {
	ticker := time.NewTicker(2 * time.Second)
	defer ticker.Stop()

	for {
		podsList, err := clientset.CoreV1().Pods(namespace).List(ctx, metav1.ListOptions{})
		if err != nil {
			log.Println("Failed to list pods:", err)
		} else {
			for i := range podsList.Items {
				p := &podsList.Items[i]
				if _, seen := known[p.UID]; seen {
					continue
				}
				if p.Status.Phase != corev1.PodRunning {
					continue
				}
				for _, ref := range p.OwnerReferences {
					if ref.UID == ownerUID {
						return p, nil
					}
				}
			}
			fmt.Println("Waiting for replacement pod to be Running...")
		}

		select {
		case <-ctx.Done():
			return nil, ctx.Err()
		case <-ticker.C:
		}
	}
}
//
// ─── HELPERS ────────────────────────────────────────────────────────────────────
//
// cordonNode marks the node called nodeName as unschedulable. A node that is
// already unschedulable is left as it is and no update is sent. It returns
// the error from fetching or updating the node, if any.
func cordonNode(clientset *kubernetes.Clientset, nodeName string) error {
	nodesAPI := clientset.CoreV1().Nodes()
	ctx := context.Background()

	current, err := nodesAPI.Get(ctx, nodeName, metav1.GetOptions{})
	if err != nil {
		return err
	}
	if !current.Spec.Unschedulable {
		current.Spec.Unschedulable = true
		_, err = nodesAPI.Update(ctx, current, metav1.UpdateOptions{})
	}
	return err
}
// uncordonNode marks the node called nodeName as schedulable again. Like
// cordonNode, it sends no update when the node is already in the requested
// state. It returns the error from fetching or updating the node, if any.
func uncordonNode(clientset *kubernetes.Clientset, nodeName string) error {
	ctx := context.Background()
	node, err := clientset.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{})
	if err != nil {
		return err
	}
	if !node.Spec.Unschedulable {
		// Already schedulable: skip the write and the conflict it could hit.
		return nil
	}
	node.Spec.Unschedulable = false
	_, err = clientset.CoreV1().Nodes().Update(ctx, node, metav1.UpdateOptions{})
	return err
}
// isDaemonSet reports whether any owner reference of pod has kind "DaemonSet".
func isDaemonSet(pod *corev1.Pod) bool {
	refs := pod.OwnerReferences
	for i := 0; i < len(refs); i++ {
		if refs[i].Kind == "DaemonSet" {
			return true
		}
	}
	return false
}
// pickTargetNode returns the entry of nodes with the lowest combined load,
// where load is the CPU percentage plus the memory percentage computed from
// usage and capacity (both keyed by node name, then by "cpu" and "mem").
//
// Candidates with no capacity entry, or with a non-positive CPU or memory
// capacity, are skipped; dividing by such a capacity would panic. A candidate
// with no usage entry counts as idle. On a tie the earliest candidate wins.
// The result is "" when nodes is empty or no candidate is usable.
func pickTargetNode(nodes []string, usage map[string]map[string]int64, capacity map[string]map[string]int64) string {
	var bestNode string
	bestLoad := int64(1 << 62)
	for _, n := range nodes {
		cpuCap, memCap := capacity[n]["cpu"], capacity[n]["mem"]
		if cpuCap <= 0 || memCap <= 0 {
			continue
		}
		cpuPercent := usage[n]["cpu"] * 100 / cpuCap
		memPercent := usage[n]["mem"] * 100 / memCap
		if load := cpuPercent + memPercent; load < bestLoad {
			bestLoad = load
			bestNode = n
		}
	}
	return bestNode
}
// estimatePodCPU returns the sum, in millicores, of the CPU requests declared
// by pod's containers. Containers without a CPU request contribute nothing.
func estimatePodCPU(pod *corev1.Pod) int64 {
	total := int64(0)
	containers := pod.Spec.Containers
	for i := range containers {
		request, declared := containers[i].Resources.Requests[corev1.ResourceCPU]
		if !declared {
			continue
		}
		total += request.MilliValue()
	}
	return total
}
// estimatePodMem returns the sum, in MiB, of the memory requests declared by
// pod's containers. Each request is rounded down to whole MiB before it is
// added; containers without a memory request contribute nothing.
func estimatePodMem(pod *corev1.Pod) int64 {
	const bytesPerMiB = 1024 * 1024

	total := int64(0)
	containers := pod.Spec.Containers
	for i := range containers {
		request, declared := containers[i].Resources.Requests[corev1.ResourceMemory]
		if !declared {
			continue
		}
		total += request.Value() / bytesPerMiB
	}
	return total
}