563 lines
15 KiB
Go
563 lines
15 KiB
Go
package main
|
|
|
|
import (
|
|
"context"
|
|
"encoding/json"
|
|
"fmt"
|
|
"log"
|
|
"net/http"
|
|
"os"
|
|
"strconv"
|
|
"sync"
|
|
"time"
|
|
corev1 "k8s.io/api/core/v1"
|
|
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
|
"k8s.io/client-go/kubernetes"
|
|
"k8s.io/client-go/rest"
|
|
"k8s.io/apimachinery/pkg/types"
|
|
metricsv "k8s.io/metrics/pkg/client/clientset/versioned"
|
|
)
|
|
|
|
// Utilization thresholds, expressed as whole percentages. They are populated
// from the environment in init.
var (
	// CPUOverloadThreshold is the CPU percentage above which a node is
	// treated as overloaded.
	CPUOverloadThreshold int64
	// MemoryOverloadThreshold is the configured memory overload percentage.
	MemoryOverloadThreshold int64
	// CPUUnderloadThreshold is the CPU percentage below which a node is
	// treated as underloaded.
	CPUUnderloadThreshold int64
	// MemoryUnderloadThreshold is the configured memory underload percentage.
	MemoryUnderloadThreshold int64
)

// Timing settings. They are populated from the environment in init.
var (
	// PodEvictionInterval is the pause between two balancing passes.
	PodEvictionInterval time.Duration
	// PodAnimationDuration is the configured duration of the pod animation.
	PodAnimationDuration time.Duration
	// PodRespawnTimeout bounds how long a pass waits for a replacement pod.
	PodRespawnTimeout time.Duration
)
|
|
|
|
// getEnvInt64 reads the environment variable name as a base-10 int64.
// It returns defaultVal when the variable is unset or empty, and also (after
// logging the rejected value) when it cannot be parsed.
func getEnvInt64(name string, defaultVal int64) int64 {
	raw := os.Getenv(name)
	if raw == "" {
		return defaultVal
	}

	parsed, parseErr := strconv.ParseInt(raw, 10, 64)
	if parseErr == nil {
		return parsed
	}

	log.Printf("Invalid %s: %s, using default %d", name, raw, defaultVal)
	return defaultVal
}
|
|
// getEnvSecondsDuration reads the environment variable name as a whole number
// of seconds and returns it as a time.Duration.
//
// The default of defaultSec seconds is returned when the variable is unset or
// empty. It is also returned, after logging the rejected value, when the
// value is not a base-10 integer, is negative, or is too large to be
// represented as a time.Duration.
func getEnvSecondsDuration(name string, defaultSec int64) time.Duration {
	// Largest whole number of seconds that fits in a time.Duration; anything
	// above this would silently wrap around when multiplied by time.Second.
	const maxSec = int64(1<<63-1) / int64(time.Second)

	fallback := time.Duration(defaultSec) * time.Second

	valStr := os.Getenv(name)
	if valStr == "" {
		return fallback
	}

	sec, err := strconv.ParseInt(valStr, 10, 64)
	// A negative interval would make time.Sleep return immediately and a
	// negative timeout would expire at once, so treat it like a parse error.
	if err != nil || sec < 0 || sec > maxSec {
		log.Printf("Invalid %s: %s, using default %d seconds", name, valStr, defaultSec)
		return fallback
	}
	return time.Duration(sec) * time.Second
}
|
|
|
|
func init() {
|
|
CPUOverloadThreshold = getEnvInt64("CPU_OVERLOAD_THRESHOLD", 80)
|
|
MemoryOverloadThreshold = getEnvInt64("MEMORY_OVERLOAD_THRESHOLD", 80)
|
|
CPUUnderloadThreshold = getEnvInt64("CPU_UNDERLOAD_THRESHOLD", 50)
|
|
MemoryUnderloadThreshold = getEnvInt64("MEMORY_UNDERLOAD_THRESHOLD", 50)
|
|
|
|
PodEvictionInterval = getEnvSecondsDuration("POD_EVICTION_INTERVAL", 30)
|
|
PodAnimationDuration = getEnvSecondsDuration("POD_ANIMATION_DURATION", 2)
|
|
PodRespawnTimeout = getEnvSecondsDuration("POD_RESPAWN_TIMEOUT", 60)
|
|
|
|
log.Println("──────────── CONFIG ────────────")
|
|
log.Printf("CPU_OVERLOAD_THRESHOLD = %d%%", CPUOverloadThreshold)
|
|
log.Printf("MEMORY_OVERLOAD_THRESHOLD = %d%%", MemoryOverloadThreshold)
|
|
log.Printf("CPU_UNDERLOAD_THRESHOLD = %d%%", CPUUnderloadThreshold)
|
|
log.Printf("MEMORY_UNDERLOAD_THRESHOLD = %d%%", MemoryUnderloadThreshold)
|
|
log.Printf("POD_EVICTION_INTERVAL = %v", PodEvictionInterval)
|
|
log.Printf("POD_ANIMATION_DURATION = %v", PodAnimationDuration)
|
|
log.Printf("POD_RESPAWN_TIMEOUT = %v", PodRespawnTimeout)
|
|
log.Println("───────────────────────────────")
|
|
}
|
|
|
|
// PodMoveEvent is the record published on the EventBus and served as JSON by
// the web UI whenever the balancer relocates a pod.
type PodMoveEvent struct {
	// Time is when the event was emitted.
	Time time.Time `json:"time"`
	// Namespace and Pod identify the pod that was moved.
	Namespace string `json:"namespace"`
	Pod       string `json:"pod"`
	// FromNode is the node the pod was removed from; ToNode is the
	// destination node reported for the move.
	FromNode string `json:"fromNode"`
	ToNode   string `json:"toNode"`
	// Reason is a short machine-readable explanation for the move.
	Reason string `json:"reason"`
}
|
|
|
|
type EventBus struct {
|
|
mu sync.Mutex
|
|
history []PodMoveEvent
|
|
clients map[chan PodMoveEvent]struct{}
|
|
}
|
|
|
|
func NewEventBus() *EventBus {
|
|
return &EventBus{
|
|
history: make([]PodMoveEvent, 0, 100),
|
|
clients: make(map[chan PodMoveEvent]struct{}),
|
|
}
|
|
}
|
|
|
|
func (b *EventBus) Emit(evt PodMoveEvent) {
|
|
b.mu.Lock()
|
|
defer b.mu.Unlock()
|
|
|
|
if len(b.history) == 100 {
|
|
b.history = b.history[1:]
|
|
}
|
|
b.history = append(b.history, evt)
|
|
|
|
for ch := range b.clients {
|
|
select {
|
|
case ch <- evt:
|
|
default:
|
|
}
|
|
}
|
|
}
|
|
|
|
func (b *EventBus) History() []PodMoveEvent {
|
|
b.mu.Lock()
|
|
defer b.mu.Unlock()
|
|
out := make([]PodMoveEvent, len(b.history))
|
|
copy(out, b.history)
|
|
return out
|
|
}
|
|
|
|
func (b *EventBus) Subscribe() chan PodMoveEvent {
|
|
b.mu.Lock()
|
|
defer b.mu.Unlock()
|
|
ch := make(chan PodMoveEvent, 10)
|
|
b.clients[ch] = struct{}{}
|
|
return ch
|
|
}
|
|
|
|
func (b *EventBus) Unsubscribe(ch chan PodMoveEvent) {
|
|
b.mu.Lock()
|
|
defer b.mu.Unlock()
|
|
delete(b.clients, ch)
|
|
close(ch)
|
|
}
|
|
|
|
func main() {
|
|
config, err := rest.InClusterConfig()
|
|
if err != nil {
|
|
log.Fatal(err)
|
|
}
|
|
|
|
clientset, err := kubernetes.NewForConfig(config)
|
|
if err != nil {
|
|
log.Fatal(err)
|
|
}
|
|
|
|
metricsClient, err := metricsv.NewForConfig(config)
|
|
if err != nil {
|
|
log.Fatal(err)
|
|
}
|
|
|
|
eventBus := NewEventBus()
|
|
go startWebUI(eventBus, clientset)
|
|
for {
|
|
balanceNodes(clientset, metricsClient, eventBus)
|
|
time.Sleep(PodEvictionInterval)
|
|
}
|
|
}
|
|
|
|
func startWebUI(bus *EventBus, clientset *kubernetes.Clientset) {
|
|
mux := http.NewServeMux()
|
|
|
|
mux.Handle("/", http.FileServer(http.Dir("/web")))
|
|
mux.HandleFunc("/api/events", func(w http.ResponseWriter, _ *http.Request) {
|
|
w.Header().Set("Content-Type", "application/json")
|
|
_ = json.NewEncoder(w).Encode(bus.History())
|
|
})
|
|
|
|
mux.HandleFunc("/api/stream", func(w http.ResponseWriter, r *http.Request) {
|
|
w.Header().Set("Content-Type", "text/event-stream")
|
|
w.Header().Set("Cache-Control", "no-cache")
|
|
|
|
flusher, ok := w.(http.Flusher)
|
|
if !ok {
|
|
http.Error(w, "streaming unsupported", 500)
|
|
return
|
|
}
|
|
|
|
ch := bus.Subscribe()
|
|
defer bus.Unsubscribe(ch)
|
|
|
|
for {
|
|
select {
|
|
case evt := <-ch:
|
|
data, _ := json.Marshal(evt)
|
|
fmt.Fprintf(w, "data: %s\n\n", data)
|
|
flusher.Flush()
|
|
case <-r.Context().Done():
|
|
return
|
|
}
|
|
}
|
|
})
|
|
|
|
log.Println("WebUI listening on :8080")
|
|
mux.HandleFunc("/api/nodes", func(w http.ResponseWriter, r *http.Request) {
|
|
nodes, err := clientset.CoreV1().Nodes().List(context.Background(), metav1.ListOptions{})
|
|
if err != nil {
|
|
http.Error(w, "failed to list nodes", http.StatusInternalServerError)
|
|
return
|
|
}
|
|
names := []string{}
|
|
for _, n := range nodes.Items {
|
|
names = append(names, n.Name)
|
|
}
|
|
w.Header().Set("Content-Type", "application/json")
|
|
_ = json.NewEncoder(w).Encode(names)
|
|
})
|
|
|
|
|
|
mux.HandleFunc("/api/pods", func(w http.ResponseWriter, r *http.Request) {
|
|
ctx := context.Background()
|
|
pods, err := clientset.CoreV1().Pods("").List(ctx, metav1.ListOptions{})
|
|
if err != nil {
|
|
http.Error(w, "failed to list pods", http.StatusInternalServerError)
|
|
return
|
|
}
|
|
|
|
// group pods by node
|
|
podsByNode := map[string][]string{}
|
|
for _, pod := range pods.Items {
|
|
if pod.Spec.NodeName == "" {
|
|
continue
|
|
}
|
|
if _, ok := podsByNode[pod.Spec.NodeName]; !ok {
|
|
podsByNode[pod.Spec.NodeName] = []string{}
|
|
}
|
|
podsByNode[pod.Spec.NodeName] = append(podsByNode[pod.Spec.NodeName], pod.Name)
|
|
}
|
|
|
|
w.Header().Set("Content-Type", "application/json")
|
|
_ = json.NewEncoder(w).Encode(podsByNode)
|
|
})
|
|
|
|
|
|
|
|
mux.HandleFunc("/api/config", func(w http.ResponseWriter, _ *http.Request) {
|
|
w.Header().Set("Content-Type", "application/json")
|
|
|
|
cfg := map[string]interface{}{
|
|
"animationDurationMs": int64(2000), // number
|
|
"version": "1.1", // string
|
|
"clusterName": os.Getenv("CLUSTER_NAME"), // string
|
|
}
|
|
|
|
_ = json.NewEncoder(w).Encode(cfg)
|
|
})
|
|
|
|
_ = http.ListenAndServe(":8080", mux)
|
|
|
|
}
|
|
|
|
func balanceNodes(clientset *kubernetes.Clientset, metricsClient *metricsv.Clientset, bus *EventBus) {
|
|
ctx := context.Background()
|
|
|
|
// --- get node metrics ---
|
|
nodeMetricsList, err := metricsClient.MetricsV1beta1().
|
|
NodeMetricses().
|
|
List(ctx, metav1.ListOptions{})
|
|
if err != nil {
|
|
log.Println("Failed to get node metrics:", err)
|
|
return
|
|
}
|
|
|
|
nodeUsage := make(map[string]map[string]int64)
|
|
for _, m := range nodeMetricsList.Items {
|
|
nodeUsage[m.Name] = map[string]int64{
|
|
"cpu": m.Usage.Cpu().MilliValue(),
|
|
"mem": m.Usage.Memory().Value() / (1024 * 1024),
|
|
}
|
|
}
|
|
|
|
// --- get node capacities ---
|
|
nodes, err := clientset.CoreV1().
|
|
Nodes().
|
|
List(ctx, metav1.ListOptions{})
|
|
if err != nil {
|
|
log.Println("Failed to list nodes:", err)
|
|
return
|
|
}
|
|
|
|
nodeCapacity := make(map[string]map[string]int64)
|
|
for _, n := range nodes.Items {
|
|
nodeCapacity[n.Name] = map[string]int64{
|
|
"cpu": n.Status.Capacity.Cpu().MilliValue(),
|
|
"mem": n.Status.Capacity.Memory().Value() / (1024 * 1024),
|
|
}
|
|
}
|
|
|
|
// --- find overloaded and underloaded nodes ---
|
|
var overloaded, underloaded []string
|
|
|
|
for node, usage := range nodeUsage {
|
|
if nodeCapacity[node]["cpu"] == 0 {
|
|
continue
|
|
}
|
|
|
|
cpuPercent := usage["cpu"] * 100 / nodeCapacity[node]["cpu"]
|
|
|
|
if cpuPercent > CPUOverloadThreshold {
|
|
overloaded = append(overloaded, node)
|
|
} else if cpuPercent < CPUUnderloadThreshold {
|
|
underloaded = append(underloaded, node)
|
|
}
|
|
}
|
|
|
|
log.Println("Overloaded nodes:", overloaded)
|
|
log.Println("Underloaded nodes:", underloaded)
|
|
|
|
if len(overloaded) == 0 || len(underloaded) == 0 {
|
|
return
|
|
}
|
|
|
|
// --- move pods from overloaded to underloaded ---
|
|
for _, node := range overloaded {
|
|
|
|
pods, err := clientset.CoreV1().Pods("").List(ctx, metav1.ListOptions{
|
|
FieldSelector: fmt.Sprintf("spec.nodeName=%s", node),
|
|
})
|
|
if err != nil {
|
|
log.Println("Failed to list pods for node", node, err)
|
|
continue
|
|
}
|
|
|
|
for _, pod := range pods.Items {
|
|
|
|
if pod.Namespace == "kube-system" || isDaemonSet(&pod) {
|
|
continue
|
|
}
|
|
|
|
if pod.Status.Phase != corev1.PodRunning {
|
|
continue
|
|
}
|
|
|
|
if usesHostPath(&pod) {
|
|
log.Printf("Skipping pod %s/%s (uses hostPath)", pod.Namespace, pod.Name)
|
|
continue
|
|
}
|
|
|
|
targetNode := pickTargetNode(underloaded, nodeUsage, nodeCapacity)
|
|
if targetNode == "" {
|
|
continue
|
|
}
|
|
|
|
log.Printf("Rescheduling pod %s/%s from %s to %s\n",
|
|
pod.Namespace, pod.Name, node, targetNode)
|
|
|
|
// --- cordon the node ---
|
|
if err := cordonNode(clientset, node); err != nil {
|
|
log.Println("Failed to cordon node", node, err)
|
|
continue
|
|
}
|
|
log.Println("Succesfully cordoned node", node)
|
|
|
|
// --- delete pod ---
|
|
grace := int64(0)
|
|
err = clientset.CoreV1().Pods(pod.Namespace).Delete(ctx, pod.Name, metav1.DeleteOptions{
|
|
GracePeriodSeconds: &grace,
|
|
})
|
|
if err != nil {
|
|
log.Println("Failed to delete pod", pod.Name, err)
|
|
_ = uncordonNode(clientset, node)
|
|
continue
|
|
}
|
|
log.Println("Deleted pod", pod.Name)
|
|
|
|
// --- wait for replacement pod ---
|
|
var ownerUID types.UID
|
|
if len(pod.OwnerReferences) > 0 {
|
|
ownerUID = pod.OwnerReferences[0].UID
|
|
}
|
|
|
|
var newPod corev1.Pod
|
|
|
|
timeoutCtx, cancel := context.WithTimeout(ctx, PodRespawnTimeout)
|
|
ticker := time.NewTicker(2 * time.Second)
|
|
|
|
WAIT_LOOP:
|
|
for {
|
|
select {
|
|
case <-timeoutCtx.Done():
|
|
log.Println("Pod respawn timeout reached")
|
|
break WAIT_LOOP
|
|
|
|
case <-ticker.C:
|
|
|
|
podsList, err := clientset.CoreV1().
|
|
Pods(pod.Namespace).
|
|
List(ctx, metav1.ListOptions{})
|
|
if err != nil {
|
|
log.Println("Failed to list pods:", err)
|
|
continue
|
|
}
|
|
|
|
for _, p := range podsList.Items {
|
|
|
|
if p.Status.Phase != corev1.PodRunning {
|
|
continue
|
|
}
|
|
|
|
for _, ref := range p.OwnerReferences {
|
|
if ref.UID == ownerUID {
|
|
|
|
// MUST be on different node
|
|
if p.Spec.NodeName != node {
|
|
newPod = p
|
|
break WAIT_LOOP
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
log.Println("Waiting for replacement pod to be Running...")
|
|
}
|
|
}
|
|
|
|
ticker.Stop()
|
|
cancel()
|
|
|
|
// --- ALWAYS uncordon ---
|
|
if err := uncordonNode(clientset, node); err != nil {
|
|
log.Println("Failed to uncordon node", node, err)
|
|
}
|
|
|
|
log.Println("Succesfuly uncordoned node", node)
|
|
log.Println("About to emit PodMoveEvent")
|
|
|
|
bus.Emit(PodMoveEvent{
|
|
Time: time.Now(),
|
|
Namespace: pod.Namespace,
|
|
Pod: pod.Name,
|
|
FromNode: node,
|
|
ToNode: targetNode,
|
|
Reason: "node-utilization-imbalance",
|
|
})
|
|
|
|
// --- update node usage ---
|
|
if nodeUsage[node] != nil {
|
|
nodeUsage[node]["cpu"] -= estimatePodCPU(&pod)
|
|
nodeUsage[node]["mem"] -= estimatePodMem(&pod)
|
|
}
|
|
|
|
if nodeUsage[newPod.Spec.NodeName] == nil {
|
|
nodeUsage[newPod.Spec.NodeName] = map[string]int64{"cpu": 0, "mem": 0}
|
|
}
|
|
|
|
nodeUsage[newPod.Spec.NodeName]["cpu"] += estimatePodCPU(&pod)
|
|
nodeUsage[newPod.Spec.NodeName]["mem"] += estimatePodMem(&pod)
|
|
|
|
// only move one pod per invocation
|
|
return
|
|
}
|
|
}
|
|
}
|
|
|
|
//
|
|
// ─── HELPERS ────────────────────────────────────────────────────────────────────
|
|
//
|
|
// mustEnv returns the value of the environment variable key, or def when the
// variable is unset or set to the empty string.
func mustEnv(key, def string) string {
	val := os.Getenv(key)
	if val == "" {
		return def
	}
	return val
}
|
|
|
|
func cordonNode(clientset *kubernetes.Clientset, nodeName string) error {
|
|
ctx := context.Background()
|
|
for i := 0; i < 5; i++ { // retry up to 5 times
|
|
node, err := clientset.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{})
|
|
if err != nil {
|
|
return err
|
|
}
|
|
if node.Spec.Unschedulable {
|
|
return nil // already cordoned
|
|
}
|
|
node.Spec.Unschedulable = true
|
|
_, err = clientset.CoreV1().Nodes().Update(ctx, node, metav1.UpdateOptions{})
|
|
if err == nil {
|
|
return nil
|
|
}
|
|
log.Println("Cordoning failed, retrying:", err)
|
|
time.Sleep(500 * time.Millisecond)
|
|
}
|
|
return fmt.Errorf("failed to cordon node %s after retries", nodeName)
|
|
}
|
|
|
|
func uncordonNode(clientset *kubernetes.Clientset, nodeName string) error {
|
|
ctx := context.Background()
|
|
for i := 0; i < 5; i++ {
|
|
node, err := clientset.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{})
|
|
if err != nil {
|
|
return err
|
|
}
|
|
if !node.Spec.Unschedulable {
|
|
return nil // already uncordoned
|
|
}
|
|
node.Spec.Unschedulable = false
|
|
_, err = clientset.CoreV1().Nodes().Update(ctx, node, metav1.UpdateOptions{})
|
|
if err == nil {
|
|
return nil
|
|
}
|
|
log.Println("Uncordoning failed, retrying:", err)
|
|
time.Sleep(500 * time.Millisecond)
|
|
}
|
|
return fmt.Errorf("failed to uncordon node %s after retries", nodeName)
|
|
}
|
|
|
|
|
|
func isDaemonSet(pod *corev1.Pod) bool {
|
|
for _, owner := range pod.OwnerReferences {
|
|
if owner.Kind == "DaemonSet" {
|
|
return true
|
|
}
|
|
}
|
|
return false
|
|
}
|
|
|
|
func usesHostPath(pod *corev1.Pod) bool {
|
|
for _, vol := range pod.Spec.Volumes {
|
|
if vol.HostPath != nil {
|
|
return true
|
|
}
|
|
}
|
|
return false
|
|
}
|
|
|
|
// pickTargetNode returns the entry of nodes with the lowest combined load,
// where load is the CPU utilization percentage plus the memory utilization
// percentage computed from usage and capacity (both keyed by node name, then
// by "cpu" / "mem"). On a tie the earliest entry in nodes wins.
//
// Nodes whose CPU or memory capacity is missing or not positive are skipped,
// since dividing by such a capacity would panic. It returns "" when no node
// qualifies.
func pickTargetNode(nodes []string, usage map[string]map[string]int64, capacity map[string]map[string]int64) string {
	var bestNode string
	bestLoad := int64(1 << 62)
	for _, n := range nodes {
		// Indexing a missing node yields a nil inner map and therefore 0.
		cpuCap, memCap := capacity[n]["cpu"], capacity[n]["mem"]
		if cpuCap <= 0 || memCap <= 0 {
			continue
		}

		cpuPercent := usage[n]["cpu"] * 100 / cpuCap
		memPercent := usage[n]["mem"] * 100 / memCap
		if load := cpuPercent + memPercent; load < bestLoad {
			bestLoad = load
			bestNode = n
		}
	}
	return bestNode
}
|
|
|
|
func estimatePodCPU(pod *corev1.Pod) int64 {
|
|
var cpu int64
|
|
for _, c := range pod.Spec.Containers {
|
|
if q, ok := c.Resources.Requests[corev1.ResourceCPU]; ok {
|
|
cpu += q.MilliValue()
|
|
}
|
|
}
|
|
return cpu
|
|
}
|
|
|
|
func estimatePodMem(pod *corev1.Pod) int64 {
|
|
var mem int64
|
|
for _, c := range pod.Spec.Containers {
|
|
if q, ok := c.Resources.Requests[corev1.ResourceMemory]; ok {
|
|
mem += q.Value() / (1024 * 1024)
|
|
}
|
|
}
|
|
return mem
|
|
}
|