From 0647df90862d1724875d23901ada8dfbe9f6a6a6 Mon Sep 17 00:00:00 2001
From: liyuanr
Date: Tue, 10 Sep 2024 19:17:54 +0800
Subject: [PATCH] bugfix (os-operator and proxy): fix the issue that some node
 configurations are not delivered

When configuring a node, the operator updates the osinstance and then the node.
Because each update depends on the APIServer's response time, the two updates may land
at different times. If the proxy completes the configuration immediately after the
osinstance is updated but before the node label is updated, the configuration label
on the node is never deleted. As a result, the node is skipped during the next configuration.
Therefore, an osinstance and node label check is added to the proxy: the configuration
is performed only after the operator has updated both objects.
In addition, the logs of the operator and proxy are optimized as follows:
1. Fix an issue where error logs are printed when the values of starttime and endtime are the same.
2. Delete the logs used during development from the times.go file.
3. Add a log for the successful deletion of the serial label by the operator.
4. Add a descriptive log before obtaining the nodes to be checked (whether to add
serial labels or to upgrade/configure).
5. Add a log when, in serial mode, the operator returns because a node is still being upgraded or configured.
6. Add some debug logs to the operator.

Signed-off-by: liyuanr
---
 ...or-and-proxy-fix-the-issue-that-some.patch | 272 ++++++++++++++++++
 KubeOS.spec                                   |  11 +-
 2 files changed, 281 insertions(+), 2 deletions(-)
 create mode 100644 0010-bugfix-os-operator-and-proxy-fix-the-issue-that-some.patch

diff --git a/0010-bugfix-os-operator-and-proxy-fix-the-issue-that-some.patch b/0010-bugfix-os-operator-and-proxy-fix-the-issue-that-some.patch
new file mode 100644
index 0000000..1950652
--- /dev/null
+++ b/0010-bugfix-os-operator-and-proxy-fix-the-issue-that-some.patch
@@ -0,0 +1,272 @@
+From 8fce3e81822b0a5818adfb4ed5112030ed6b957e Mon Sep 17 00:00:00 2001
+From: liyuanr
+Date: Tue, 10 Sep 2024 16:14:21 +0800
+Subject: [PATCH] bugfix (os-operator and proxy): fix the issue that some node
+ configurations are not delivered.
+
+When configuring a node, the operator updates the osinstance and then the node.
+Because each update depends on the APIServer's response time, the two updates may land
+at different times. If the proxy completes the configuration immediately after the
+osinstance is updated but before the node label is updated, the configuration label
+on the node is never deleted. As a result, the node is skipped during the next configuration.
+Therefore, an osinstance and node label check is added to the proxy: the configuration
+is performed only after the operator has updated both objects.
+In addition, the logs of the operator and proxy are optimized as follows:
+1. Fix an issue where error logs are printed when the values of starttime and endtime are the same.
+2. Delete the logs used during development from the times.go file.
+3. Add a log for the successful deletion of the serial label by the operator.
+4. Add a descriptive log before obtaining the nodes to be checked (whether to add
+serial labels or to upgrade/configure).
+5. Add a log when, in serial mode, the operator returns because a node is still being upgraded or configured.
+6. Add some debug logs to the operator.
+
+Signed-off-by: liyuanr
+---
+ .../proxy/src/controller/controller.rs    | 26 ++++++++++++++++---
+ KubeOS-Rust/proxy/src/controller/utils.rs |  6 ++---
+ KubeOS-Rust/proxy/src/main.rs             |  2 +-
+ cmd/operator/controllers/operation.go     |  1 +
+ cmd/operator/controllers/os_controller.go | 12 ++++++---
+ cmd/operator/controllers/times.go         |  4 +--
+ 6 files changed, 37 insertions(+), 14 deletions(-)
+
+diff --git a/KubeOS-Rust/proxy/src/controller/controller.rs b/KubeOS-Rust/proxy/src/controller/controller.rs
+index 40405b2d..787a0e1c 100644
+--- a/KubeOS-Rust/proxy/src/controller/controller.rs
++++ b/KubeOS-Rust/proxy/src/controller/controller.rs
+@@ -57,6 +57,7 @@ pub async fn reconcile(
+             return Ok(NO_REQUEUE)
+         }
+     }else {
++        debug!("osinstance correspending os name is None, not in upgrading or configuring");
+         return Ok(REQUEUE_NORMAL)
+     }
+
+@@ -68,7 +69,7 @@ pub async fn reconcile(
+         .as_ref()
+         .ok_or(Error::MissingSubResource { value: String::from("node.status.node_info") })?
+         .os_image;
+-    debug!("os expected osversion is {},actual osversion is {}", os_cr.spec.osversion, node_os_image);
++    debug!("os expected osversion is {}, actual osversion is {}", os_cr.spec.osversion, node_os_image);
+     if check_version(&os_cr.spec.osversion, node_os_image) {
+         match ConfigType::SysConfig.check_config_version(&os, &osinstance) {
+             ConfigOperation::Reassign => {
+@@ -94,10 +95,26 @@ pub async fn reconcile(
+             },
+             _ => {},
+         }
++        if node.labels().contains_key(LABEL_UPGRADING) || node.labels().contains_key(LABEL_CONFIGURING) {
++            if osinstance.spec.nodestatus == NODE_STATUS_IDLE {
++                info!(
++                    "node has upgrade/config label , but osinstance.spec.nodestatus is idle. Operation:refesh node and wait reassgin"
++                );
++                proxy_controller
++                    .refresh_node(
++                        node,
++                        osinstance,
++                        &get_config_version(os_cr.spec.upgradeconfigs.as_ref()),
++                        ConfigType::UpgradeConfig,
++                    )
++                    .await?;
++                return Ok(REQUEUE_NORMAL);
++            }
+         proxy_controller.set_config(&mut osinstance, ConfigType::SysConfig).await?;
+         proxy_controller
+             .refresh_node(node, osinstance, &get_config_version(os_cr.spec.sysconfigs.as_ref()), ConfigType::SysConfig)
+             .await?;
++        }
+     } else {
+         if os_cr.spec.opstype == NODE_STATUS_CONFIG {
+             return Err(Error::UpgradeBeforeConfig);
+@@ -117,7 +134,7 @@ pub async fn reconcile(
+     if node.labels().contains_key(LABEL_UPGRADING) {
+         if osinstance.spec.nodestatus == NODE_STATUS_IDLE {
+             info!(
+-                "node has upgrade label ,but osinstance.spec.nodestatus is idle. Operation:refesh node and wait reassgin"
++                "node has upgrade label , but osinstance.spec.nodestatus is idle. Operation:refesh node and wait reassgin"
+             );
+             proxy_controller
+                 .refresh_node(
+@@ -196,12 +213,13 @@ impl ProxyController {
+         let node_api: Api = Api::all(self.k8s_client.clone());
+         let labels = node.labels_mut();
+         if labels.contains_key(LABEL_UPGRADING) {
++            debug!("delete label {}", LABEL_UPGRADING);
+             labels.remove(LABEL_UPGRADING);
+             node = node_api.replace(&node.name(), &PostParams::default(), &node).await?;
+-        }else if labels.contains_key(LABEL_CONFIGURING) {
++        }else if labels.contains_key(LABEL_CONFIGURING){
++            debug!("delete label {}", LABEL_CONFIGURING);
+             labels.remove(LABEL_CONFIGURING);
+             node = node_api.replace(&node.name(), &PostParams::default(), &node).await?;
+-
+         }
+         if let Some(node_spec) = &node.spec {
+             if let Some(node_unschedulable) = node_spec.unschedulable {
+diff --git a/KubeOS-Rust/proxy/src/controller/utils.rs b/KubeOS-Rust/proxy/src/controller/utils.rs
+index 148ca24d..7e7b41d9 100644
+--- a/KubeOS-Rust/proxy/src/controller/utils.rs
++++ b/KubeOS-Rust/proxy/src/controller/utils.rs
+@@ -47,7 +47,7 @@ impl ConfigType {
+                 let os_config_version = get_config_version(os.spec.upgradeconfigs.as_ref());
+                 let osi_config_version = get_config_version(osinstance.spec.upgradeconfigs.as_ref());
+                 debug!(
+-                    "os upgradeconfig version is{},osinstance spec upragdeconfig version is{}",
++                    "os upgradeconfig version is {}, osinstance spec upragdeconfig version is {}",
+                     os_config_version, osi_config_version
+                 );
+                 if !check_version(&os_config_version, &osi_config_version) {
+@@ -61,7 +61,7 @@ impl ConfigType {
+                 let os_config_version = get_config_version(os.spec.sysconfigs.as_ref());
+                 let osi_config_version = get_config_version(osinstance.spec.sysconfigs.as_ref());
+                 debug!(
+-                    "os sysconfig version is{},osinstance spec sysconfig version is{}",
++                    "os sysconfig version is {},osinstance spec sysconfig version is {}",
+                     os_config_version, osi_config_version
+                 );
+                 if !check_version(&os_config_version, &osi_config_version) {
+@@ -108,7 +108,7 @@ impl ConfigType {
+             },
+         }
+         debug!(
+-            "osinstance soec config version is {},status config version is {}",
++            "osinstance spec config version is {}, status config version is {}",
+             spec_config_version, status_config_version
+         );
+         if spec_config_version != status_config_version && osinstance.spec.nodestatus != NODE_STATUS_IDLE {
+diff --git a/KubeOS-Rust/proxy/src/main.rs b/KubeOS-Rust/proxy/src/main.rs
+index 5c122ba2..c15aebed 100644
+--- a/KubeOS-Rust/proxy/src/main.rs
++++ b/KubeOS-Rust/proxy/src/main.rs
+@@ -27,7 +27,7 @@ use controller::{
+ const PROXY_VERSION: Option<&'static str> = option_env!("CARGO_PKG_VERSION");
+ #[tokio::main]
+ async fn main() -> Result<()> {
+-    Builder::from_env(Env::default().default_filter_or("info")).target(Target::Stdout).init();
++    Builder::from_env(Env::default().default_filter_or("proxy=info")).target(Target::Stdout).init();
+     let client = Client::try_default().await?;
+     let os: Api = Api::all(client.clone());
+     let controller_client = ControllerClient::new(client.clone());
+diff --git a/cmd/operator/controllers/operation.go b/cmd/operator/controllers/operation.go
+index 5ac3d6d4..9f130479 100644
+--- a/cmd/operator/controllers/operation.go
++++ b/cmd/operator/controllers/operation.go
+@@ -93,6 +93,7 @@ func deleteSerialLabel(ctx context.Context, r common.ReadStatusWriter, nodes []c
+ 				log.Error(err, "unable to delete serial label ", "node", node.Name+", skip this node")
+ 				errList = append(errList, err)
+ 			}
++			log.Info("delete node " + node.Name + " serial label " + values.LabelSerial + " successfully")
+ 		}
+ 	}
+ 	if len(errList) > 0 {
+diff --git a/cmd/operator/controllers/os_controller.go b/cmd/operator/controllers/os_controller.go
+index 9e2e8e49..f9e65b47 100644
+--- a/cmd/operator/controllers/os_controller.go
++++ b/cmd/operator/controllers/os_controller.go
+@@ -75,7 +75,6 @@ func Reconcile(ctx context.Context, r common.ReadStatusWriter, req ctrl.Request)
+ 			" , the end time " + os.Spec.TimeWindow.EndTime)
+ 		return values.Requeue, nil
+ 	}
+-
+ 	ops := os.Spec.OpsType
+ 	var opsInsatnce operation
+ 	switch ops {
+@@ -106,6 +105,7 @@ func Reconcile(ctx context.Context, r common.ReadStatusWriter, req ctrl.Request)
+ 	if err != nil {
+ 		return values.RequeueNow, err
+ 	}
++	log.V(1).Info("get all nodes num is " + strconv.Itoa(len(allNodes)))
+ 	switch os.Spec.ExecutionMode {
+ 	case ExecutionModeParallel:
+ 		result, err := excuteParallelOperation(ctx, r, os, opsInsatnce, len(allNodes))
+@@ -197,6 +197,7 @@ func calNodeLimit(ctx context.Context, r common.ReadStatusWriter,
+ func assignOperation(ctx context.Context, r common.ReadStatusWriter, os upgradev1.OS, limit int,
+ 	opsInstance operation, requirements []labels.Requirement) (int, error) {
+ 	if limit == 0 {
++		log.V(1).Info("limit is 0 , do not need to assign operation")
+ 		return 0, nil
+ 	}
+ 	nodes, err := getNodes(ctx, r, limit+1, requirements...) // one more to see if all nodes updated
+@@ -283,6 +284,7 @@ func setTimeInterval(timeInterval int) ctrl.Result {
+
+ func excuteParallelOperation(ctx context.Context, r common.ReadStatusWriter, os upgradev1.OS,
+ 	opsInsatnce operation, nodeNum int) (ctrl.Result, error) {
++	log.V(1).Info("start parallel operation")
+ 	opsLabel := opsInsatnce.getOpsLabel()
+ 	opsLabel.op = selection.Exists
+ 	opsNodesReq, err := newopsNodesRequirement(os.Spec.NodeSelector,
+@@ -294,6 +296,7 @@ func excuteParallelOperation(ctx context.Context, r common.ReadStatusWriter, os
+ 	if err != nil {
+ 		return values.RequeueNow, nil
+ 	}
++	log.V(1).Info("get limit is " + strconv.Itoa(limit))
+ 	opsLabel.op = selection.DoesNotExist
+ 	noOpsNodesReq, err := newopsNodesRequirement(os.Spec.NodeSelector,
+ 		selection.Equals, opsLabel).createNodeRequirement(ctx, r)
+@@ -308,6 +311,7 @@ func excuteParallelOperation(ctx context.Context, r common.ReadStatusWriter, os
+
+ func excuteSerialOperation(ctx context.Context, r common.ReadStatusWriter, os upgradev1.OS,
+ 	opsInsatnce operation, nodeNum int) (ctrl.Result, error) {
++	log.V(1).Info("start serial operation")
+ 	opsLabel := opsInsatnce.getOpsLabel()
+ 	opsLabel.op = selection.Exists
+ 	opsNodesReq, err := newopsNodesRequirement(os.Spec.NodeSelector,
+@@ -320,6 +324,7 @@ func excuteSerialOperation(ctx context.Context, r common.ReadStatusWriter, os up
+ 		return values.RequeueNow, nil
+ 	}
+ 	if len(opsNodeNum) > 0 {
++		log.V(1).Info("a node is being upgraded or configured. Wait until the node upgrade or configuration is complete.")
+ 		return values.Requeue, nil
+ 	}
+
+@@ -332,7 +337,7 @@ func excuteSerialOperation(ctx context.Context, r common.ReadStatusWriter, os up
+ 	if err != nil {
+ 		return values.RequeueNow, nil
+ 	}
+-
++	log.V(1).Info("get the number of nodes which need to be added serial label num is " + strconv.Itoa(serialNodeLimit))
+ 	noSerialNodesRequirement, err := newSerialNodesRequirement(os.Spec.NodeSelector,
+ 		selection.Equals, selection.DoesNotExist).createNodeRequirement(ctx, r)
+ 	if err != nil {
+@@ -342,10 +347,12 @@ func excuteSerialOperation(ctx context.Context, r common.ReadStatusWriter, os up
+ 	serialOpsInstance := serialOps{
+ 		label: opsInsatnce.getOpsLabel(),
+ 	}
++	log.V(1).Info("start add serial label to nodes")
+ 	if _, err := assignOperation(ctx, r, os, serialNodeLimit, serialOpsInstance, noSerialNodesRequirement); err != nil {
+ 		return values.RequeueNow, nil
+ 	}
+
++	log.V(1).Info("start check nodes needed to be upgrade/configure or not")
+ 	serialLimit := 1 // 1 is the number of operation nodes when excution mode in serial
+ 	count, err := assignOperation(ctx, r, os, serialLimit, opsInsatnce, serialNodesRequirement)
+ 	if err != nil {
+@@ -355,5 +362,4 @@ func excuteSerialOperation(ctx context.Context, r common.ReadStatusWriter, os up
+ 		return values.Requeue, nil
+ 	}
+ 	return setTimeInterval(os.Spec.TimeInterval), nil
+-
+ }
+diff --git a/cmd/operator/controllers/times.go b/cmd/operator/controllers/times.go
+index 3a72cce9..f651c0e4 100644
+--- a/cmd/operator/controllers/times.go
++++ b/cmd/operator/controllers/times.go
+@@ -62,14 +62,12 @@ func isWithinTimeWindow(start, end string) (bool, error) {
+ 	}
+ 	if endTime.Before(startTime) {
+ 		if layoutStart == DATE_TIME {
+-			return false, fmt.Errorf("invalid TimeWindow: Start %s Time is after end time %s",
++			return false, fmt.Errorf("invalid TimeWindow: start time %s is after end time %s",
+ 				startTime.Format(layoutStart), endTime.Format(layoutEnd))
+ 		}
+ 		endTime = endTime.Add(oneDayTime)
+-		fmt.Printf("endtime time add 24 hour is %s\n", endTime.Format(layoutStart))
+ 		if now.Before(startTime) {
+ 			now = now.Add(oneDayTime)
+-			fmt.Printf("now time add 24 hour is %s\n", now.Format(layoutStart))
+ 		}
+
+ 	}
+--
+2.33.0.windows.2
+
diff --git a/KubeOS.spec b/KubeOS.spec
index 7dfcb47..3b1b21a 100644
--- a/KubeOS.spec
+++ b/KubeOS.spec
@@ -2,7 +2,7 @@
 Name: KubeOS
 Version: 1.0.6
-Release: 3
+Release: 4
 Summary: O&M platform used to update the whole OS as an entirety
 License: Mulan PSL v2
 Source0: https://gitee.com/openeuler/KubeOS/repository/archive/v%{version}.tar.gz
@@ -15,6 +15,7 @@ Patch6: 0006-operator-delete-unnecessary-fmt-and-add-printing-for.patch
 Patch7: 0007-feat-os-operator-support-setting-TimeWindow-and-Time.patch
 Patch8: 0008-feat-os-proxy-add-ExcutionMode-to-os.patch
 Patch9: 0009-bugfix-fix-the-problem-that-proxy-will-get-all-os-fo.patch
+Patch10: 0010-bugfix-os-operator-and-proxy-fix-the-issue-that-some.patch
 BuildRoot: %{_tmppath}/%{name}-%{version}-build
 BuildRequires: make rust cargo openssl-devel
@@ -126,7 +127,13 @@ install -p -m 0600 ./files/os-release %{buildroot}/opt/kubeOS/files
 rm -rfv %{buildroot}

 %changelog
-* Tue Jun 11 2024 Yuhang Wei - 1.0.6-3
+* Tue Sep 10 2024 liyuanrong - 1.0.6-4
+- Type:requirement
+- CVE:NA
+- SUG:restart
+- DESC:fix the issue that some node configurations are not delivered
+
+* Wed Aug 21 2024 liyuanrong - 1.0.6-3
 - Type:requirement
 - CVE:NA
 - SUG:restart
--
Gitee
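
For readers reviewing the change, the guard the patch introduces in the proxy's reconcile path can be summarised in isolation. The sketch below is illustrative only and is not code from the repository: the types, constants, and the `decide` helper are simplified stand-ins for the real kube-rs/KubeOS-Rust structures, and the label key is assumed.

```rust
// Minimal sketch of the new check: the proxy only configures a node once the
// operator has updated BOTH the osinstance (nodestatus) and the node (label).
// All names below are simplified stand-ins, not the actual KubeOS-Rust types.
use std::collections::BTreeMap;

// Assumed label key; the real constant lives in the KubeOS-Rust sources.
const LABEL_CONFIGURING: &str = "upgrade.openeuler.org/configuring";
const NODE_STATUS_IDLE: &str = "idle";

enum Action {
    Configure,         // label present and nodestatus not idle: safe to configure
    RefreshAndRequeue, // label present but nodestatus idle: clear the label, wait for reassignment
    Requeue,           // nothing assigned to this node yet
}

fn decide(node_labels: &BTreeMap<String, String>, nodestatus: &str) -> Action {
    let labelled = node_labels.contains_key(LABEL_CONFIGURING);
    match (labelled, nodestatus == NODE_STATUS_IDLE) {
        (true, false) => Action::Configure,
        (true, true) => Action::RefreshAndRequeue,
        (false, _) => Action::Requeue,
    }
}

fn main() {
    // Race from the commit message: the osinstance already says "config" is
    // assigned, but the operator has not yet labelled the node. The proxy now waits.
    let unlabelled = BTreeMap::new();
    assert!(matches!(decide(&unlabelled, "config"), Action::Requeue));

    // Stale label with an idle osinstance: refresh the node instead of skipping it later.
    let mut labelled = BTreeMap::new();
    labelled.insert(LABEL_CONFIGURING.to_string(), String::new());
    assert!(matches!(decide(&labelled, NODE_STATUS_IDLE), Action::RefreshAndRequeue));
}
```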