From c51b0904f611689f8cf71d75ceb221eff7db82b0 Mon Sep 17 00:00:00 2001 From: Yuhang Wei Date: Wed, 11 Sep 2024 14:27:35 +0800 Subject: [PATCH 1/2] bugfix (os-operator and proxy): fix the issue that some node configurations are not delivered When configuring node, the operator updates osinstance and then node. The time of the two updates is affected by the response time of the APIServer. The update time may be different. If the proxy completes the configuration immediately after the osinstance is updated and the node label is updated, the configuration label on the node is not deleted. As a result, the node is skipped during the next configuration. Therefore, the osinstance and node label check is added to the proxy. The configuration is performed only after the operator is updated. In addition, the logs of the operator and proxy are optimized as follows: 1. Fix an issue where error logs are printed when the values of starttime and endtime are the same. 2. Delete the logs used during development from the times.go file. 3. The log about the successful deletion of the serial label by the operator is added. 4. Add a description before obtaining the logs of the node to be checked. (whether to add serial labels or upgrade/configuration) 5. Logs are added when a node is being upgraded or configuration is returned in serial mode. 6. Some debug logs are added to the operator. 
Signed-off-by: liyuanr --- ...or-and-proxy-fix-the-issue-that-some.patch | 272 ++++++++++++++++++ KubeOS.spec | 11 +- 2 files changed, 281 insertions(+), 2 deletions(-) create mode 100644 0010-bugfix-os-operator-and-proxy-fix-the-issue-that-some.patch diff --git a/0010-bugfix-os-operator-and-proxy-fix-the-issue-that-some.patch b/0010-bugfix-os-operator-and-proxy-fix-the-issue-that-some.patch new file mode 100644 index 0000000..1950652 --- /dev/null +++ b/0010-bugfix-os-operator-and-proxy-fix-the-issue-that-some.patch @@ -0,0 +1,272 @@ +From 8fce3e81822b0a5818adfb4ed5112030ed6b957e Mon Sep 17 00:00:00 2001 +From: liyuanr +Date: Tue, 10 Sep 2024 16:14:21 +0800 +Subject: [PATCH] bugfix (os-operator and proxy): fix the issue that some node + configurations are not delivered. + +When configuring node, the operator updates osinstance and then node. +The time of the two updates is affected by the response time of the APIServer. +The update time may be different. If the proxy completes the configuration immediately +after the osinstance is updated and the node label is updated, the configuration label +on the node is not deleted. As a result, the node is skipped during the next configuration. +Therefore, the osinstance and node label check is added to the proxy.The configuration is performed +only after the operator is updated. +In addition, the logs of the operator and proxy are optimized as follows: +1. Fixe an issue where error logs are printed when the values of starttime and endtime are the same. +2. Delete the logs used during development from the time.go file. +3. The log about the successful deletion of the serial label by the operator is added. +4. Add a description before obtaining the logs of the node to be checked. (whether to add +serial labels or upgrade/configuration) +5. Logs are added when a node is being upgraded or configuration is returned in serial mode. +6. Some debug logs are added to the operator. 
+ +Signed-off-by: liyuanr +--- + .../proxy/src/controller/controller.rs | 26 ++++++++++++++++--- + KubeOS-Rust/proxy/src/controller/utils.rs | 6 ++--- + KubeOS-Rust/proxy/src/main.rs | 2 +- + cmd/operator/controllers/operation.go | 1 + + cmd/operator/controllers/os_controller.go | 12 ++++++--- + cmd/operator/controllers/times.go | 4 +-- + 6 files changed, 37 insertions(+), 14 deletions(-) + +diff --git a/KubeOS-Rust/proxy/src/controller/controller.rs b/KubeOS-Rust/proxy/src/controller/controller.rs +index 40405b2d..787a0e1c 100644 +--- a/KubeOS-Rust/proxy/src/controller/controller.rs ++++ b/KubeOS-Rust/proxy/src/controller/controller.rs +@@ -57,6 +57,7 @@ pub async fn reconcile( + return Ok(NO_REQUEUE) + } + }else { ++ debug!("osinstance correspending os name is None, not in upgrading or configuring"); + return Ok(REQUEUE_NORMAL) + } + +@@ -68,7 +69,7 @@ pub async fn reconcile( + .as_ref() + .ok_or(Error::MissingSubResource { value: String::from("node.status.node_info") })? + .os_image; +- debug!("os expected osversion is {},actual osversion is {}", os_cr.spec.osversion, node_os_image); ++ debug!("os expected osversion is {}, actual osversion is {}", os_cr.spec.osversion, node_os_image); + if check_version(&os_cr.spec.osversion, node_os_image) { + match ConfigType::SysConfig.check_config_version(&os, &osinstance) { + ConfigOperation::Reassign => { +@@ -94,10 +95,26 @@ pub async fn reconcile( + }, + _ => {}, + } ++ if node.labels().contains_key(LABEL_UPGRADING) || node.labels().contains_key(LABEL_CONFIGURING) { ++ if osinstance.spec.nodestatus == NODE_STATUS_IDLE { ++ info!( ++ "node has upgrade/config label , but osinstance.spec.nodestatus is idle. 
Operation:refesh node and wait reassgin" ++ ); ++ proxy_controller ++ .refresh_node( ++ node, ++ osinstance, ++ &get_config_version(os_cr.spec.upgradeconfigs.as_ref()), ++ ConfigType::UpgradeConfig, ++ ) ++ .await?; ++ return Ok(REQUEUE_NORMAL); ++ } + proxy_controller.set_config(&mut osinstance, ConfigType::SysConfig).await?; + proxy_controller + .refresh_node(node, osinstance, &get_config_version(os_cr.spec.sysconfigs.as_ref()), ConfigType::SysConfig) + .await?; ++ } + } else { + if os_cr.spec.opstype == NODE_STATUS_CONFIG { + return Err(Error::UpgradeBeforeConfig); +@@ -117,7 +134,7 @@ pub async fn reconcile( + if node.labels().contains_key(LABEL_UPGRADING) { + if osinstance.spec.nodestatus == NODE_STATUS_IDLE { + info!( +- "node has upgrade label ,but osinstance.spec.nodestatus is idle. Operation:refesh node and wait reassgin" ++ "node has upgrade label , but osinstance.spec.nodestatus is idle. Operation:refesh node and wait reassgin" + ); + proxy_controller + .refresh_node( +@@ -196,12 +213,13 @@ impl ProxyController { + let node_api: Api = Api::all(self.k8s_client.clone()); + let labels = node.labels_mut(); + if labels.contains_key(LABEL_UPGRADING) { ++ debug!("delete label {}", LABEL_UPGRADING); + labels.remove(LABEL_UPGRADING); + node = node_api.replace(&node.name(), &PostParams::default(), &node).await?; +- }else if labels.contains_key(LABEL_CONFIGURING) { ++ }else if labels.contains_key(LABEL_CONFIGURING){ ++ debug!("delete label {}", LABEL_CONFIGURING); + labels.remove(LABEL_CONFIGURING); + node = node_api.replace(&node.name(), &PostParams::default(), &node).await?; +- + } + if let Some(node_spec) = &node.spec { + if let Some(node_unschedulable) = node_spec.unschedulable { +diff --git a/KubeOS-Rust/proxy/src/controller/utils.rs b/KubeOS-Rust/proxy/src/controller/utils.rs +index 148ca24d..7e7b41d9 100644 +--- a/KubeOS-Rust/proxy/src/controller/utils.rs ++++ b/KubeOS-Rust/proxy/src/controller/utils.rs +@@ -47,7 +47,7 @@ impl ConfigType { + let 
os_config_version = get_config_version(os.spec.upgradeconfigs.as_ref()); + let osi_config_version = get_config_version(osinstance.spec.upgradeconfigs.as_ref()); + debug!( +- "os upgradeconfig version is{},osinstance spec upragdeconfig version is{}", ++ "os upgradeconfig version is {}, osinstance spec upragdeconfig version is {}", + os_config_version, osi_config_version + ); + if !check_version(&os_config_version, &osi_config_version) { +@@ -61,7 +61,7 @@ impl ConfigType { + let os_config_version = get_config_version(os.spec.sysconfigs.as_ref()); + let osi_config_version = get_config_version(osinstance.spec.sysconfigs.as_ref()); + debug!( +- "os sysconfig version is{},osinstance spec sysconfig version is{}", ++ "os sysconfig version is {},osinstance spec sysconfig version is {}", + os_config_version, osi_config_version + ); + if !check_version(&os_config_version, &osi_config_version) { +@@ -108,7 +108,7 @@ impl ConfigType { + }, + } + debug!( +- "osinstance soec config version is {},status config version is {}", ++ "osinstance spec config version is {}, status config version is {}", + spec_config_version, status_config_version + ); + if spec_config_version != status_config_version && osinstance.spec.nodestatus != NODE_STATUS_IDLE { +diff --git a/KubeOS-Rust/proxy/src/main.rs b/KubeOS-Rust/proxy/src/main.rs +index 5c122ba2..c15aebed 100644 +--- a/KubeOS-Rust/proxy/src/main.rs ++++ b/KubeOS-Rust/proxy/src/main.rs +@@ -27,7 +27,7 @@ use controller::{ + const PROXY_VERSION: Option<&'static str> = option_env!("CARGO_PKG_VERSION"); + #[tokio::main] + async fn main() -> Result<()> { +- Builder::from_env(Env::default().default_filter_or("info")).target(Target::Stdout).init(); ++ Builder::from_env(Env::default().default_filter_or("proxy=info")).target(Target::Stdout).init(); + let client = Client::try_default().await?; + let os: Api = Api::all(client.clone()); + let controller_client = ControllerClient::new(client.clone()); +diff --git a/cmd/operator/controllers/operation.go 
b/cmd/operator/controllers/operation.go +index 5ac3d6d4..9f130479 100644 +--- a/cmd/operator/controllers/operation.go ++++ b/cmd/operator/controllers/operation.go +@@ -93,6 +93,7 @@ func deleteSerialLabel(ctx context.Context, r common.ReadStatusWriter, nodes []c + log.Error(err, "unable to delete serial label ", "node", node.Name+", skip this node") + errList = append(errList, err) + } ++ log.Info("delete node " + node.Name + " serial label " + values.LabelSerial + " successfully") + } + } + if len(errList) > 0 { +diff --git a/cmd/operator/controllers/os_controller.go b/cmd/operator/controllers/os_controller.go +index 9e2e8e49..f9e65b47 100644 +--- a/cmd/operator/controllers/os_controller.go ++++ b/cmd/operator/controllers/os_controller.go +@@ -75,7 +75,6 @@ func Reconcile(ctx context.Context, r common.ReadStatusWriter, req ctrl.Request) + " , the end time " + os.Spec.TimeWindow.EndTime) + return values.Requeue, nil + } +- + ops := os.Spec.OpsType + var opsInsatnce operation + switch ops { +@@ -106,6 +105,7 @@ func Reconcile(ctx context.Context, r common.ReadStatusWriter, req ctrl.Request) + if err != nil { + return values.RequeueNow, err + } ++ log.V(1).Info("get all nodes num is " + strconv.Itoa(len(allNodes))) + switch os.Spec.ExecutionMode { + case ExecutionModeParallel: + result, err := excuteParallelOperation(ctx, r, os, opsInsatnce, len(allNodes)) +@@ -197,6 +197,7 @@ func calNodeLimit(ctx context.Context, r common.ReadStatusWriter, + func assignOperation(ctx context.Context, r common.ReadStatusWriter, os upgradev1.OS, limit int, + opsInstance operation, requirements []labels.Requirement) (int, error) { + if limit == 0 { ++ log.V(1).Info("limit is 0 , do not need to assign operation") + return 0, nil + } + nodes, err := getNodes(ctx, r, limit+1, requirements...) 
// one more to see if all nodes updated +@@ -283,6 +284,7 @@ func setTimeInterval(timeInterval int) ctrl.Result { + + func excuteParallelOperation(ctx context.Context, r common.ReadStatusWriter, os upgradev1.OS, + opsInsatnce operation, nodeNum int) (ctrl.Result, error) { ++ log.V(1).Info("start parallel operation") + opsLabel := opsInsatnce.getOpsLabel() + opsLabel.op = selection.Exists + opsNodesReq, err := newopsNodesRequirement(os.Spec.NodeSelector, +@@ -294,6 +296,7 @@ func excuteParallelOperation(ctx context.Context, r common.ReadStatusWriter, os + if err != nil { + return values.RequeueNow, nil + } ++ log.V(1).Info("get limit is " + strconv.Itoa(limit)) + opsLabel.op = selection.DoesNotExist + noOpsNodesReq, err := newopsNodesRequirement(os.Spec.NodeSelector, + selection.Equals, opsLabel).createNodeRequirement(ctx, r) +@@ -308,6 +311,7 @@ func excuteParallelOperation(ctx context.Context, r common.ReadStatusWriter, os + + func excuteSerialOperation(ctx context.Context, r common.ReadStatusWriter, os upgradev1.OS, + opsInsatnce operation, nodeNum int) (ctrl.Result, error) { ++ log.V(1).Info("start serial operation") + opsLabel := opsInsatnce.getOpsLabel() + opsLabel.op = selection.Exists + opsNodesReq, err := newopsNodesRequirement(os.Spec.NodeSelector, +@@ -320,6 +324,7 @@ func excuteSerialOperation(ctx context.Context, r common.ReadStatusWriter, os up + return values.RequeueNow, nil + } + if len(opsNodeNum) > 0 { ++ log.V(1).Info("a node is being upgraded or configured. 
Wait until the node upgrade or configuration is complete.") + return values.Requeue, nil + } + +@@ -332,7 +337,7 @@ func excuteSerialOperation(ctx context.Context, r common.ReadStatusWriter, os up + if err != nil { + return values.RequeueNow, nil + } +- ++ log.V(1).Info("get the number of nodes which need to be added serial label num is " + strconv.Itoa(serialNodeLimit)) + noSerialNodesRequirement, err := newSerialNodesRequirement(os.Spec.NodeSelector, + selection.Equals, selection.DoesNotExist).createNodeRequirement(ctx, r) + if err != nil { +@@ -342,10 +347,12 @@ func excuteSerialOperation(ctx context.Context, r common.ReadStatusWriter, os up + serialOpsInstance := serialOps{ + label: opsInsatnce.getOpsLabel(), + } ++ log.V(1).Info("start add serial label to nodes") + if _, err := assignOperation(ctx, r, os, serialNodeLimit, serialOpsInstance, noSerialNodesRequirement); err != nil { + return values.RequeueNow, nil + } + ++ log.V(1).Info("start check nodes needed to be upgrade/configure or not") + serialLimit := 1 // 1 is the number of operation nodes when excution mode in serial + count, err := assignOperation(ctx, r, os, serialLimit, opsInsatnce, serialNodesRequirement) + if err != nil { +@@ -355,5 +362,4 @@ func excuteSerialOperation(ctx context.Context, r common.ReadStatusWriter, os up + return values.Requeue, nil + } + return setTimeInterval(os.Spec.TimeInterval), nil +- + } +diff --git a/cmd/operator/controllers/times.go b/cmd/operator/controllers/times.go +index 3a72cce9..f651c0e4 100644 +--- a/cmd/operator/controllers/times.go ++++ b/cmd/operator/controllers/times.go +@@ -62,14 +62,12 @@ func isWithinTimeWindow(start, end string) (bool, error) { + } + if endTime.Before(startTime) { + if layoutStart == DATE_TIME { +- return false, fmt.Errorf("invalid TimeWindow: Start %s Time is after end time %s", ++ return false, fmt.Errorf("invalid TimeWindow: start time %s is after end time %s", + startTime.Format(layoutStart), endTime.Format(layoutEnd)) + } + endTime = 
endTime.Add(oneDayTime) +- fmt.Printf("endtime time add 24 hour is %s\n", endTime.Format(layoutStart)) + if now.Before(startTime) { + now = now.Add(oneDayTime) +- fmt.Printf("now time add 24 hour is %s\n", now.Format(layoutStart)) + } + + } +-- +2.33.0.windows.2 + diff --git a/KubeOS.spec b/KubeOS.spec index 7dfcb47..3b1b21a 100644 --- a/KubeOS.spec +++ b/KubeOS.spec @@ -2,7 +2,7 @@ Name: KubeOS Version: 1.0.6 -Release: 3 +Release: 4 Summary: O&M platform used to update the whole OS as an entirety License: Mulan PSL v2 Source0: https://gitee.com/openeuler/KubeOS/repository/archive/v%{version}.tar.gz @@ -15,6 +15,7 @@ Patch6: 0006-operator-delete-unnecessary-fmt-and-add-printing-for.patch Patch7: 0007-feat-os-operator-support-setting-TimeWindow-and-Time.patch Patch8: 0008-feat-os-proxy-add-ExcutionMode-to-os.patch Patch9: 0009-bugfix-fix-the-problem-that-proxy-will-get-all-os-fo.patch +Patch10: 0010-bugfix-os-operator-and-proxy-fix-the-issue-that-some.patch BuildRoot: %{_tmppath}/%{name}-%{version}-build BuildRequires: make rust cargo openssl-devel @@ -126,7 +127,13 @@ install -p -m 0600 ./files/os-release %{buildroot}/opt/kubeOS/files rm -rfv %{buildroot} %changelog -* Tue Jun 11 2024 Yuhang Wei - 1.0.6-3 +* Tue Sep 10 2024 liyuanrong - 1.0.6-4 +- Type:requirement +- CVE:NA +- SUG:restart +- DESC:fix the issue that some node configurations are not delivered + +* Wed Aug 21 2024 liyuanrong - 1.0.6-3 - Type:requirement - CVE:NA - SUG:restart -- Gitee From 02d0e1225342d7b4c79e9e0d3171bef292cd3db7 Mon Sep 17 00:00:00 2001 From: Yuhang Wei Date: Thu, 12 Sep 2024 18:39:18 +0800 Subject: [PATCH 2/2] fix(os-agent,script): increase default rootfs size to 2.5GiB Signed-off-by: Yuhang Wei --- ...crease-default-rootfs-size-to-2.5GiB.patch | 60 +++++++++++++++++++ KubeOS.spec | 9 ++- 2 files changed, 68 insertions(+), 1 deletion(-) create mode 100644 0011-fix-increase-default-rootfs-size-to-2.5GiB.patch diff --git a/0011-fix-increase-default-rootfs-size-to-2.5GiB.patch 
b/0011-fix-increase-default-rootfs-size-to-2.5GiB.patch new file mode 100644 index 0000000..e310933 --- /dev/null +++ b/0011-fix-increase-default-rootfs-size-to-2.5GiB.patch @@ -0,0 +1,60 @@ +From 1b3ec61c6d90b68e17371a4d62f151202c76c73a Mon Sep 17 00:00:00 2001 +From: Yuhang Wei +Date: Thu, 12 Sep 2024 15:38:45 +0800 +Subject: [PATCH] fix: increase default rootfs size to 2.5GiB + +Currently, default rootfs size is 2100MiB which is not enough. Considering increase the default rootfs size to 2560MiB(2.5GiB) + +Signed-off-by: Yuhang Wei +--- + KubeOS-Rust/manager/src/utils/image_manager.rs | 2 +- + docs/quick-start.md | 2 +- + scripts/create/imageCreate.sh | 6 +++--- + 3 files changed, 5 insertions(+), 5 deletions(-) + +diff --git a/KubeOS-Rust/manager/src/utils/image_manager.rs b/KubeOS-Rust/manager/src/utils/image_manager.rs +index 90806cf8..d62b8872 100644 +--- a/KubeOS-Rust/manager/src/utils/image_manager.rs ++++ b/KubeOS-Rust/manager/src/utils/image_manager.rs +@@ -53,7 +53,7 @@ impl UpgradeImageManager { + let image_str = self.image_path_str()?; + + debug!("Create image {}", image_str); +- self.executor.run_command("dd", &["if=/dev/zero", &format!("of={}", image_str), "bs=2M", "count=1024"])?; ++ self.executor.run_command("dd", &["if=/dev/zero", &format!("of={}", image_str), "bs=2M", "count=1280"])?; + fs::set_permissions(&self.paths.image_path, Permissions::from_mode(permission))?; + Ok(()) + } +diff --git a/docs/quick-start.md b/docs/quick-start.md +index 1d59048e..2279154e 100644 +--- a/docs/quick-start.md ++++ b/docs/quick-start.md +@@ -138,7 +138,7 @@ + + * 其中 xxx.repo 为制作镜像所需要的 yum 源,yum 源建议配置为 openEuler 具体版本的 everything 仓库和 EPOL 仓库。 + * 容器 OS 镜像制作完成后,会在 scripts 目录下生成: +- * raw格式的系统镜像system.img,system.img大小默认为20G,支持的根文件系统分区大小<2020MiB,持久化分区<16GB。 ++ * raw格式的系统镜像system.img,system.img大小默认为20G,支持的根文件系统分区大小<2560MiB,持久化分区<14GB。 + * qcow2 格式的系统镜像 system.qcow2。 + * 可用于升级的根文件系统分区镜像 update.img 。 + * 制作出来的容器 OS 虚拟机镜像目前只能用于 CPU 架构为 x86 和 AArch64 的虚拟机场景。若x86 
架构的虚拟机需要使用 legacy 启动模式,需制作镜像时指定-l参数 +diff --git a/scripts/create/imageCreate.sh b/scripts/create/imageCreate.sh +index 4d02f9d1..f2ec44de 100644 +--- a/scripts/create/imageCreate.sh ++++ b/scripts/create/imageCreate.sh +@@ -26,9 +26,9 @@ function create_img() { + parted system.img -s mklabel gpt + parted system.img -s mkpart primary fat32 1MiB 60MiB + fi +- parted system.img -s mkpart primary ext4 60MiB 2160MiB +- parted system.img -s mkpart primary ext4 2160MiB 4260MiB +- parted system.img -s mkpart primary ext4 4260MiB 100% ++ parted system.img -s mkpart primary ext4 60MiB 2620MiB ++ parted system.img -s mkpart primary ext4 2620MiB 5180MiB ++ parted system.img -s mkpart primary ext4 5180MiB 100% + local device=$(losetup -f) + losetup "${device}" system.img + +-- +2.39.3 (Apple Git-146) + diff --git a/KubeOS.spec b/KubeOS.spec index 3b1b21a..6483887 100644 --- a/KubeOS.spec +++ b/KubeOS.spec @@ -2,7 +2,7 @@ Name: KubeOS Version: 1.0.6 -Release: 4 +Release: 5 Summary: O&M platform used to update the whole OS as an entirety License: Mulan PSL v2 Source0: https://gitee.com/openeuler/KubeOS/repository/archive/v%{version}.tar.gz @@ -16,6 +16,7 @@ Patch7: 0007-feat-os-operator-support-setting-TimeWindow-and-Time.patch Patch8: 0008-feat-os-proxy-add-ExcutionMode-to-os.patch Patch9: 0009-bugfix-fix-the-problem-that-proxy-will-get-all-os-fo.patch Patch10: 0010-bugfix-os-operator-and-proxy-fix-the-issue-that-some.patch +Patch11: 0011-fix-increase-default-rootfs-size-to-2.5GiB.patch BuildRoot: %{_tmppath}/%{name}-%{version}-build BuildRequires: make rust cargo openssl-devel @@ -127,6 +128,12 @@ install -p -m 0600 ./files/os-release %{buildroot}/opt/kubeOS/files rm -rfv %{buildroot} %changelog +* Thu Sep 12 2024 Yuhang Wei - 1.0.6-5 +- Type:requirement +- CVE:NA +- SUG:restart +- DESC:increase the default rootfs size to 2.5GiB + * Tue Sep 10 2024 liyuanrong - 1.0.6-4 - Type:requirement - CVE:NA -- Gitee