Files
2026-05-31 16:07:30 +02:00

217 lines
6.0 KiB
Plaintext
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
package main
import (
"context"
"fmt"
"log"
"os"
"strconv"
"time"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/client-go/kubernetes"
"k8s.io/client-go/rest"
metricsv "k8s.io/metrics/pkg/client/clientset/versioned"
)
// Load thresholds, expressed as integer percentages of a node's capacity.
// They are assigned in init from environment variables (with defaults) and
// compared against the usage percentages computed in balanceNodes.
var (
// CPUOverloadThreshold: a node whose CPU usage percentage is strictly above
// this value is classified as overloaded (CPU_OVERLOAD_THRESHOLD, default 80).
CPUOverloadThreshold int64
// MemoryOverloadThreshold: a node whose memory usage percentage is strictly
// above this value is classified as overloaded (MEMORY_OVERLOAD_THRESHOLD,
// default 80). Exceeding either overload threshold is sufficient.
MemoryOverloadThreshold int64
// CPUUnderloadThreshold: upper bound (exclusive) on CPU usage percentage for
// a node to count as underloaded (CPU_UNDERLOAD_THRESHOLD, default 50).
CPUUnderloadThreshold int64
// MemoryUnderloadThreshold: upper bound (exclusive) on memory usage
// percentage for a node to count as underloaded (MEMORY_UNDERLOAD_THRESHOLD,
// default 50). A node must be below both underload thresholds to qualify.
MemoryUnderloadThreshold int64
)
// getEnvInt64 reads the environment variable name and parses it as a base-10
// 64-bit signed integer.
//
// It returns defaultVal when the variable is unset or empty. When the value
// is present but cannot be parsed (non-numeric or out of range), the offending
// value is logged and defaultVal is returned as well.
func getEnvInt64(name string, defaultVal int64) int64 {
	raw := os.Getenv(name)
	if raw == "" {
		return defaultVal
	}

	if parsed, err := strconv.ParseInt(raw, 10, 64); err == nil {
		return parsed
	}

	log.Printf("Invalid %s: %s, using default %d", name, raw, defaultVal)
	return defaultVal
}
func init() {
CPUOverloadThreshold = getEnvInt64("CPU_OVERLOAD_THRESHOLD", 80)
MemoryOverloadThreshold = getEnvInt64("MEMORY_OVERLOAD_THRESHOLD", 80)
CPUUnderloadThreshold = getEnvInt64("CPU_UNDERLOAD_THRESHOLD", 50)
MemoryUnderloadThreshold = getEnvInt64("MEMORY_UNDERLOAD_THRESHOLD", 50)
}
// main builds the in-cluster Kubernetes and metrics clients, terminating the
// process if any of them cannot be created, and then runs a balancing pass
// every two minutes for as long as the process lives.
func main() {
	// Pause between the end of one balancing pass and the start of the next.
	const passInterval = 2 * time.Minute

	cfg, err := rest.InClusterConfig()
	if err != nil {
		log.Fatal(err)
	}

	kube, err := kubernetes.NewForConfig(cfg)
	if err != nil {
		log.Fatal(err)
	}

	metrics, err := metricsv.NewForConfig(cfg)
	if err != nil {
		log.Fatal(err)
	}

	for {
		balanceNodes(kube, metrics)
		time.Sleep(passInterval)
	}
}
// usagePercent returns used as an integer percentage of capacity. The second
// result is false when capacity is not positive, in which case no percentage
// can be computed (and dividing would panic).
func usagePercent(used, capacity int64) (int64, bool) {
	if capacity <= 0 {
		return 0, false
	}
	return used * 100 / capacity, true
}

// balanceNodes performs one balancing pass over the cluster.
//
// It reads current node usage from the metrics API and node capacity from the
// core API, classifies nodes as overloaded (CPU or memory above its overload
// threshold) or underloaded (CPU and memory both below their underload
// thresholds), and, if both groups are non-empty, deletes at most one eligible
// pod from an overloaded node.
//
// Pods in kube-system, DaemonSet pods, and pods without any owner reference
// are never deleted; the last group is skipped because no controller would
// recreate them. Errors are logged and end the pass (or, for a failed pod
// listing, move on to the next overloaded node); nothing is returned.
func balanceNodes(clientset *kubernetes.Clientset, metricsClient *metricsv.Clientset) {
	ctx := context.Background()

	// Get node metrics
	nodeMetricsList, err := metricsClient.MetricsV1beta1().NodeMetricses().List(ctx, metav1.ListOptions{})
	if err != nil {
		log.Println("Failed to get node metrics:", err)
		return
	}
	nodeUsage := make(map[string]map[string]int64, len(nodeMetricsList.Items))
	for _, m := range nodeMetricsList.Items {
		nodeUsage[m.Name] = map[string]int64{
			"cpu": m.Usage.Cpu().MilliValue(),
			"mem": m.Usage.Memory().Value() / (1024 * 1024), // MiB
		}
	}

	// Get node capacity
	nodes, err := clientset.CoreV1().Nodes().List(ctx, metav1.ListOptions{})
	if err != nil {
		log.Println("Failed to list nodes:", err)
		return
	}
	nodeCapacity := make(map[string]map[string]int64, len(nodes.Items))
	for _, n := range nodes.Items {
		nodeCapacity[n.Name] = map[string]int64{
			"cpu": n.Status.Capacity.Cpu().MilliValue(),
			"mem": n.Status.Capacity.Memory().Value() / (1024 * 1024),
		}
	}

	// Identify overloaded and underloaded nodes
	var overloaded, underloaded []string
	for node, usage := range nodeUsage {
		// The two lists come from separate API calls, so a node can have
		// metrics without a matching capacity entry (or report zero capacity).
		// Such a node cannot be classified; skip it instead of dividing by zero.
		cpuPercent, cpuOK := usagePercent(usage["cpu"], nodeCapacity[node]["cpu"])
		memPercent, memOK := usagePercent(usage["mem"], nodeCapacity[node]["mem"])
		if !cpuOK || !memOK {
			log.Printf("Skipping node %s: no usable capacity reported", node)
			continue
		}

		switch {
		case cpuPercent > CPUOverloadThreshold || memPercent > MemoryOverloadThreshold:
			overloaded = append(overloaded, node)
		case cpuPercent < CPUUnderloadThreshold && memPercent < MemoryUnderloadThreshold:
			underloaded = append(underloaded, node)
		}
	}

	// ORIGINAL LOGGING preserved
	fmt.Println("Overloaded nodes:", overloaded)
	fmt.Println("Underloaded nodes:", underloaded)

	if len(overloaded) == 0 || len(underloaded) == 0 {
		return
	}

	// The candidate list and the usage/capacity maps do not change before the
	// single deletion below, so the target only needs to be picked once.
	targetNode := pickTargetNode(underloaded, nodeUsage, nodeCapacity)
	if targetNode == "" {
		log.Println("No suitable underloaded node available")
		return
	}

	// Evict exactly ONE pod, then return
	for _, node := range overloaded {
		pods, err := clientset.CoreV1().Pods("").List(ctx, metav1.ListOptions{
			FieldSelector: fmt.Sprintf("spec.nodeName=%s", node),
		})
		if err != nil {
			log.Println("Failed to list pods for node", node, err)
			continue
		}

		for i := range pods.Items {
			pod := &pods.Items[i]
			if pod.Namespace == "kube-system" || isDaemonSet(pod) {
				continue
			}
			// A pod with no owner is not managed by any controller: deleting it
			// would remove the workload for good rather than reschedule it.
			if len(pod.OwnerReferences) == 0 {
				continue
			}

			// ORIGINAL LOGGING preserved
			// targetNode is only the least-loaded candidate; the deletion
			// below does not itself place the replacement pod anywhere.
			fmt.Printf(
				"Rescheduling pod %s/%s from %s to %s\n",
				pod.Namespace,
				pod.Name,
				node,
				targetNode,
			)

			// NOTE(review): a zero grace period requests immediate deletion,
			// so containers get no graceful-shutdown window — confirm this is
			// intended for every workload this can touch.
			grace := int64(0)
			err = clientset.CoreV1().Pods(pod.Namespace).Delete(ctx, pod.Name, metav1.DeleteOptions{
				GracePeriodSeconds: &grace,
			})
			if err != nil {
				log.Println("Failed to delete pod", pod.Name, err)
				return
			}

			// Update in-memory usage. The maps are local to this pass and the
			// function returns right after, so this bookkeeping only matters
			// if more than one deletion per pass is ever allowed.
			cpuReq, memReq := estimatePodCPU(pod), estimatePodMem(pod)
			nodeUsage[node]["cpu"] -= cpuReq
			nodeUsage[node]["mem"] -= memReq
			nodeUsage[targetNode]["cpu"] += cpuReq
			nodeUsage[targetNode]["mem"] += memReq

			// IMPORTANT: stop after one eviction
			return
		}
	}
}
// isDaemonSet reports whether any of the pod's owner references has the kind
// "DaemonSet".
func isDaemonSet(pod *corev1.Pod) bool {
	owners := pod.OwnerReferences
	for i := 0; i < len(owners); i++ {
		if owners[i].Kind == "DaemonSet" {
			return true
		}
	}
	return false
}
// pickTargetNode chooses, among nodes, the one with the lowest combined
// CPU + memory usage percentage.
//
// usage and capacity map a node name to its "cpu" and "mem" figures; each
// percentage is computed as usage*100/capacity with integer division.
// Candidates whose CPU or memory capacity is missing or not positive are
// skipped, because their load cannot be computed. On a tie the candidate that
// appears first in nodes wins. The empty string is returned when no candidate
// is usable (including when nodes is empty).
func pickTargetNode(nodes []string, usage map[string]map[string]int64, capacity map[string]map[string]int64) string {
	var (
		bestNode string
		bestLoad int64
		found    bool
	)
	for _, n := range nodes {
		// Indexing a missing node yields a nil inner map, whose lookups return
		// zero; the guard below therefore covers both "unknown node" and
		// "zero capacity" and prevents a divide-by-zero panic.
		cpuCap, memCap := capacity[n]["cpu"], capacity[n]["mem"]
		if cpuCap <= 0 || memCap <= 0 {
			continue
		}

		cpuPercent := usage[n]["cpu"] * 100 / cpuCap
		memPercent := usage[n]["mem"] * 100 / memCap
		load := cpuPercent + memPercent
		if !found || load < bestLoad {
			bestLoad = load
			bestNode = n
			found = true
		}
	}
	return bestNode
}
// estimatePodCPU approximates a pod's CPU usage as the sum, in millicores, of
// the CPU requests declared by the containers in pod.Spec.Containers.
// Containers that declare no CPU request contribute nothing.
func estimatePodCPU(pod *corev1.Pod) int64 {
	total := int64(0)
	containers := pod.Spec.Containers
	for i := range containers {
		request, declared := containers[i].Resources.Requests[corev1.ResourceCPU]
		if !declared {
			continue
		}
		total += request.MilliValue()
	}
	return total
}
// estimatePodMem approximates a pod's memory usage as the sum, in MiB, of the
// memory requests declared by the containers in pod.Spec.Containers. Each
// request is converted to whole MiB (integer division) before being added;
// containers that declare no memory request contribute nothing.
func estimatePodMem(pod *corev1.Pod) int64 {
	const bytesPerMiB = 1024 * 1024

	total := int64(0)
	containers := pod.Spec.Containers
	for i := range containers {
		request, declared := containers[i].Resources.Requests[corev1.ResourceMemory]
		if !declared {
			continue
		}
		total += request.Value() / bytesPerMiB // MiB
	}
	return total
}