From 89c32f8b9df6986bf72f7bc80bfc9c418166e6f1 Mon Sep 17 00:00:00 2001 From: liaosirui Date: Thu, 31 Oct 2024 20:12:59 +0800 Subject: [PATCH] =?UTF-8?q?=E4=BF=AE=E6=AD=A3ContainerMeta=E5=92=8CRemove?= =?UTF-8?q?=E4=BA=8B=E4=BB=B6=E5=A4=84=E7=90=86=E9=80=BB=E8=BE=91?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- deploy.sh | 6 +-- pkg/resmgr/pod.go | 2 +- pkg/resmgr/regulator.go | 82 ++++++++++++++++++++++++++--------------- 3 files changed, 57 insertions(+), 33 deletions(-) diff --git a/deploy.sh b/deploy.sh index efa9d0c..6b16f3e 100644 --- a/deploy.sh +++ b/deploy.sh @@ -77,11 +77,11 @@ plugin_install() { # 3. 部署NRI插件 # 3.1 启动grpc服务器 - echo "step3: start grpc server" - start_grpc_server "./bin/grpc_server" + #echo "step3: start grpc server" + #start_grpc_server "./bin/grpc_server" # 3.2 安装NRI插件 - echo "step4: apply nri plugin" + echo "step3: apply nri plugin" apply_nri_plugin "./deploy/numa-daemonset.yaml" if [ $? -eq 0 ]; then diff --git a/pkg/resmgr/pod.go b/pkg/resmgr/pod.go index cbd8e40..67ddfc3 100644 --- a/pkg/resmgr/pod.go +++ b/pkg/resmgr/pod.go @@ -57,7 +57,7 @@ type ContainerMeta struct { Name string // podname/container_name Pid uint32 ID string - PodID string + PodUID string CPUShares uint64 State api.ContainerState diff --git a/pkg/resmgr/regulator.go b/pkg/resmgr/regulator.go index c7c7cda..2215043 100644 --- a/pkg/resmgr/regulator.go +++ b/pkg/resmgr/regulator.go @@ -28,7 +28,7 @@ type Regulator struct { } func NewRegulator() (*Regulator, error) { - // TODO: 进行自定义设置 + // TODO: 进行自定义设置 cliConf := NewConfig() k8sConf, err := rest.InClusterConfig() @@ -87,7 +87,7 @@ func (r *Regulator) SyncInfo(pods []*api.PodSandbox, containers []*api.Container r.syncPodInfo(pods) // 将 Container 和 NUMA NOde 进行关系映射 - r.syncContainerInfo(containers) + r.syncContainerInfo(pods, containers) r.syncNumaNodeMap() } @@ -98,7 +98,7 @@ func (r *Regulator) syncPodInfo(pods []*api.PodSandbox) { klog.Errorf("Failed to get pod in %v", r.nodeName) } - for _, podSandBox := range pods { + for _, podSandBox := range pods { uid := podSandBox.Uid name := podSandBox.Name namespace := podSandBox.Namespace @@ -130,25 +130,34 @@ func getQosClass(podList *corev1.PodList, uid string) QoSClass { return "" } -func (r *Regulator) syncContainerInfo(containers []*api.Container) { +func (r *Regulator) syncContainerInfo(pods []*api.PodSandbox, containers []*api.Container) { containerCpuSpecMap := r.getCPULimitsAndRequests() + podToContainer := make(map[string]string) + // 建立 + for _, p := range pods { + podToContainer[p.Id] = p.Uid + } for _, container := range containers { name := container.Name pid := container.Pid id := container.Id podid := container.PodSandboxId + podUid, ok := podToContainer[podid] // TODO should be POD UID + if !ok { + klog.Infof("podid no match poduid, podid: %v", podid) + } cpushares := container.Linux.Resources.Cpu.Shares.GetValue() state := container.State resource := container.Linux.Resources cgrouppath := container.Linux.CgroupsPath - klog.Infof("name = %v, pid = %v, id = %v podid = %v cpushares = %v, state = %v, resource = %v cgrouppath = %v", - name, pid, id, podid, cpushares, state, resource, cgrouppath) + klog.Infof("name = %v, pid = %v, id = %v poduid = %v cpushares = %v, state = %v, resource = %v cgrouppath = %v", + name, pid, id, podUid, cpushares, state, resource, cgrouppath) r.containerInfo[id] = ContainerMeta{ Name: name, Pid: pid, ID: id, - PodID: podid, + PodUID: podUid, CPUShares: cpushares, State: state, Resource: resource, @@ -224,23 +233,26 @@ func (r *Regulator) syncNumaNodeMap() { } func (r *Regulator) Regulate(pod *api.PodSandbox, container *api.Container) (*api.ContainerAdjustment, []*api.ContainerUpdate, error) { - r.Lock() + r.Lock() defer r.Unlock() // Adjust cpuset and memset according to cpu Info - // 进行单个 NUMA 节点的空闲资源计算 numaNodeStat, allocatableNode := r.singleNodeEstimate(pod, container) + klog.Infof("After SingleNodeEstimate, allocatableNode: %v", len(allocatableNode)) + if len(allocatableNode) == 0 { klog.Info("Can't meet the demand, SuppressEstimate........") _, allocatableNode = r.suppressEstimate(pod, container, numaNodeStat) - + klog.Info("SuppressEstimate Done........") } - klog.Infof("After SingleNodeEstimate, allocatableNode: %v", len(allocatableNode)) - - // TODO: 尝试主动驱逐,暂时没有实现 - // r.evictEstimate(pod, container, adjustment, updates) + if len(allocatableNode) == 0 { + klog.Info("Can't meet the demand, return to Kubelet, Exit......") + // TODO: 尝试主动驱逐,暂时没有实现 + // r.evictEstimate(pod, container, adjustment, updates) + return nil, nil, nil + } // 选择cpu可用值最大的节点 maxNode := 0 @@ -322,6 +334,8 @@ func (r *Regulator) genUpdates(pod *api.PodSandbox, container *api.Container, no suppressSpace += (cpuLimit - cpuReq) } } + // 更新 containerInfo 信息 + return conUpdates, nil } @@ -457,7 +471,7 @@ func (r *Regulator) singleNodeEstimate(_ *api.PodSandbox, container *api.Contain } // 返回值:压缩统计后,压缩后能满足的空闲节点 -func (r *Regulator) suppressEstimate(_ *api.PodSandbox, container *api.Container, numaUsage []NUMANodeUsage) ([]NUMANodeUsage, []NUMANodeUsage) { +func (r *Regulator) suppressEstimate(pod *api.PodSandbox, container *api.Container, numaUsage []NUMANodeUsage) ([]NUMANodeUsage, []NUMANodeUsage) { // 按照 (cpu.limits 和 cpu.requests 差值 + 空闲核心值),排序 NUMA Node numaStat := []NUMANodeUsage{} @@ -467,8 +481,9 @@ func (r *Regulator) suppressEstimate(_ *api.PodSandbox, container *api.Container zipSpace := 0 // 对集群中的 Burstable Pod 下的容器进行压缩 for _, con := range r.nodemap.numaNode[numaId].containers { - podId := con.PodID - qos := r.podInfo[podId].QosClass + podUid := con.PodUID + qos := r.podInfo[podUid].QosClass + klog.Infof("container name = %v, pod id = %v, qos = %v", con.Name, podUid, qos) if qos == QoSBurstable { cpuspec := con.CPUAllocation @@ -477,9 +492,11 @@ func (r *Regulator) suppressEstimate(_ *api.PodSandbox, container *api.Container zipSpace += int(limit - request) } } + cpufree := zipSpace + v.cpuFree // 压缩的资源空间 + 原本空闲的资源空间 + cpuused := perNumaCpuNum - cpufree numaStat = append(numaStat, NUMANodeUsage{ - cpuFree: zipSpace + v.cpuFree, // 压缩的资源空间 + 原本空闲的资源空间 - cpuUsed: perNumaCpuNum - v.cpuFree, + cpuFree: cpufree, + cpuUsed: cpuused, nodeId: v.nodeId, }) } @@ -527,7 +544,7 @@ func (r *Regulator) Record(pod *api.PodSandbox, container *api.Container, ops Op Name: container.GetName(), Pid: container.GetPid(), ID: container.GetId(), - PodID: container.GetPodSandboxId(), + PodUID: pod.Uid, CPUShares: container.GetLinux().Resources.Cpu.Shares.GetValue(), State: container.GetState(), Resource: container.GetLinux().Resources, @@ -552,17 +569,21 @@ func (r *Regulator) Record(pod *api.PodSandbox, container *api.Container, ops Op // 从 ContainerInfo 中删除 delete(r.containerInfo, container.GetId()) // 从 NodeMap 中删除 Container - conCpulist := resources.GetCPUSetList(container.GetLinux().GetResources().Cpu.Cpus) + conCpulist := resources.GetCPUSetList(container.GetLinux().GetResources().GetCpu().GetCpus()) for _, cpuid := range conCpulist { numaNodeId := r.nodemap.topo.CPUDetails[cpuid].NUMANodeID numaInfo := r.nodemap.numaNode[numaNodeId] // 删除 container 信息 - idx := 0 - for _, v := range numaInfo.containers { - if v.ID != container.Id { - idx++ + idx := -1 + for i, v := range numaInfo.containers { + if v.ID == container.Id { + idx = i } } + if idx < 0 || idx >= len(numaInfo.containers) { + klog.Infof("There is no containerInfo: %v...", container.Id) + return nil + } numaInfo.containers = append(numaInfo.containers[:idx], numaInfo.containers[idx+1:]...) r.nodemap.numaNode[numaNodeId] = numaInfo klog.Infof("Remove pod into numaInfo --- nodeID: %v, PodMeta: %v", numaNodeId, numaInfo.pods) @@ -575,7 +596,7 @@ func (r *Regulator) Record(pod *api.PodSandbox, container *api.Container, ops Op delete(r.podInfo, uid) // 从 NodeMap 中删除Pod numaSet := map[int]int{} - podCpulist := resources.GetCPUSetList(pod.GetLinux().GetResources().Cpu.Cpus) + podCpulist := resources.GetCPUSetList(pod.GetLinux().GetResources().GetCpu().GetCpus()) for _, cpuid := range podCpulist { numaNodeId := r.nodemap.topo.CPUDetails[cpuid].NUMANodeID numaSet[numaNodeId] = 1 @@ -583,12 +604,16 @@ func (r *Regulator) Record(pod *api.PodSandbox, container *api.Container, ops Op for numaNodeId, _ := range numaSet { numaInfo := r.nodemap.numaNode[numaNodeId] // 删除 container 信息 - idx := 0 + idx := -1 for _, v := range numaInfo.pods { - if v.Uid != pod.Uid { + if v.Uid == pod.Uid { idx++ } } + if idx < 0 || idx >= len(numaInfo.pods) { + klog.Infof("There is no podInfo: %v...", uid) + return nil + } numaInfo.pods = append(numaInfo.pods[:idx], numaInfo.pods[idx+1:]...) r.nodemap.numaNode[numaNodeId] = numaInfo klog.Infof("Remove pod into numaInfo --- nodeID: %v, PodMeta: %v", numaNodeId, numaInfo.pods) @@ -619,4 +644,3 @@ func (r *Regulator) Record(pod *api.PodSandbox, container *api.Container, ops Op } return nil } - -- Gitee