diff --git a/0001-Prepare-for-next-pointrelease.patch b/0001-Prepare-for-next-pointrelease.patch new file mode 100644 index 0000000000000000000000000000000000000000..c9b24c11616cb3e4a16fd3434ee32aec8a27d00e --- /dev/null +++ b/0001-Prepare-for-next-pointrelease.patch @@ -0,0 +1,97 @@ +From c209fe8c68e4275855154285901cfc0091bdf9e9 Mon Sep 17 00:00:00 2001 +From: Oleg Drokin +Date: Tue, 18 Jul 2023 10:54:54 -0400 +Subject: [PATCH 01/61] Prepare for next pointrelease. + +Change-Id: Idc2b25dea2b4c0f587f735dc1bdf2dd358d1f647 +--- + lustre/ChangeLog | 76 +++++++++++++++++++++++++++++++++++++++++++++++- + 1 file changed, 75 insertions(+), 1 deletion(-) + +diff --git a/lustre/ChangeLog b/lustre/ChangeLog +index 59b05d21f2..57699d2232 100644 +--- a/lustre/ChangeLog ++++ b/lustre/ChangeLog +@@ -1,4 +1,78 @@ +-06-19-2023 Whamcloud ++TBD Whamcloud ++ * version 2.15.4 ++ * See https://wiki.whamcloud.com/display/PUB/Lustre+Support+Matrix ++ for currently supported client and server kernel versions. ++ * Server primary kernels built and tested during release cycle: ++ 4.18.0-477.10.1.el8 (RHEL8.8) ++ * Other server kernels known to build and work at some point (others may also work): ++ 3.10.0-862.14.4.el7 (RHEL7.5) ++ 3.10.0-957.27.2.el7 (RHEL7.6) ++ 3.10.0-1062.18.1.el7 (RHEL7.7) ++ 3.10.0-1127.19.1.el7 (RHEL7.8) ++ 3.10.0-1160.88.1.el7 (RHEL7.9) ++ 4.18.0-80.11.2.el8 (RHEL8.0) ++ 4.18.0-147.8.1.el8 (RHEL8.1) ++ 4.18.0-193.28.1.el8 (RHEL8.2) ++ 4.18.0-240.22.1.el8 (RHEL8.3) ++ 4.18.0-305.25.1.el8 (RHEL8.4) ++ 4.18.0-348.23.1.el8 (RHEL8.5) ++ 4.18.0-372.32.1.el8 (RHEL8.6) ++ 4.18.0-425.3.1.el8 (RHEL8.7) ++ 4.4.120-92.70 (SLES12 SP2) ++ 4.4.180-94.100 (SLES12 SP3) ++ 4.4.0-131 (Ubuntu 16.04) ++ 4.15.0-32 (Ubuntu 18.04) ++ 5.4.0-48 (Ubuntu 20.04) ++ vanilla linux 5.4.0 (ZFS + ldiskfs) ++ vanilla linux 5.4.21 (ZFS + ldiskfs) ++ vanilla linux 5.4.136 (ZFS + ldiskfs) ++ * ldiskfs needs an ldiskfs patch series for that kernel, ZFS does not ++ * Client primary kernels built and tested 
during release cycle: ++ 5.14.0-284.11.1.el9 (RHEL9.2) ++ 5.14.0-162.23.1.el9 (RHEL9.1) ++ 4.18.0-477.10.1.el8 (RHEL8.8) ++ 5.4.0-37 (Ubuntu 20.04) ++ 5.14.21-150400.24.28 (SLES15 SP4) ++ * Other clients known to build on these kernels at some point (others may also work): ++ 3.10.0-862.14.4.el7 (RHEL7.5) ++ 3.10.0-957.27.2.el7 (RHEL7.6) ++ 4.14.0-49.13.1.el7a (RHEL7.5) ++ 4.14.0-115.2.2.el7a (RHEL7.6) ++ 3.10.0-1062.18.1.el7 (RHEL7.7) ++ 3.10.0-1127.19.1.el7 (RHEL7.8) ++ 3.10.0-1160.88.1.el7 (RHEL7.9) ++ 4.18.0-80.11.2.el8 (RHEL8.0) ++ 4.18.0-147.8.1.el8 (RHEL8.1) ++ 4.18.0-193.28.1.el8 (RHEL8.2) ++ 4.18.0-240.22.1.el8 (RHEL8.3) ++ 4.18.0-305.25.1.el8 (RHEL8.4) ++ 4.18.0-348.23.1.el8 (RHEL8.5) ++ 4.18.0-372.32.1.el8 (RHEL8.6) ++ 4.18.0-425.3.1.el8 (RHEL8.7) ++ 5.14.0-70.30.1.el9 (RHEL9.0) ++ 4.4.120-92.70 (SLES12 SP2) ++ 4.4.180-94.100 (SLES12 SP3) ++ 4.12.14-95.48 (SLES12 SP4) ++ 4.12.14-122.91 (SLES12 SP5) ++ 4.12.14-197.75 (SLES15 SP1) ++ 5.3.18-24.96 (SLES15 SP2) ++ 5.3.18-150300.59.93 (SLES15 SP3) ++ 4.4.0-131 (Ubuntu 16.04) ++ 4.15.0-48 (Ubuntu 18.04) ++ 5.8.0-53 (Ubuntu 20.04.2 HWE) ++ 5.11.0-31 (Ubuntu 20.04.3 HWE) ++ 5.11.0 (vanilla kernel.org) ++ * Recommended e2fsprogs version: 1.47.0-wc1 or newer ++ * Recommended ZFS version: 2.1.11 ++ * NFS export disabled when stack size < 8192 (32-bit Lustre clients), ++ since the NFSv4 export of Lustre filesystem with 4K stack may cause a ++ stack overflow. For more information, please refer to bugzilla 17630. ++ * NFSv4 reexport to 32-bit NFS client nodes requires Lustre client on ++ the re-exporting nodes to be mounted with "32bitapi" mount option ++ ++-------------------------------------------------------------------------------- ++ ++06-19-2023 Whamcloud + * version 2.15.3 + * See https://wiki.whamcloud.com/display/PUB/Lustre+Support+Matrix + for currently supported client and server kernel versions. 
+-- +2.33.0 + diff --git a/0002-LU-15821-ldlm-Prioritize-blocking-callbacks.patch b/0002-LU-15821-ldlm-Prioritize-blocking-callbacks.patch new file mode 100644 index 0000000000000000000000000000000000000000..1cdbf0de16d89a85477b05f23f295a7daa663496 --- /dev/null +++ b/0002-LU-15821-ldlm-Prioritize-blocking-callbacks.patch @@ -0,0 +1,159 @@ +From 8ca1186151faa778edd5abd361e92fcd5d8ff56b Mon Sep 17 00:00:00 2001 +From: Patrick Farrell +Date: Wed, 4 May 2022 20:50:57 -0400 +Subject: [PATCH 02/61] LU-15821 ldlm: Prioritize blocking callbacks + +The current code places bl_ast lock callbacks at the end of +the global BL callback queue. This is bad because it +causes urgent requests from the server to wait behind +non-urgent cleanup tasks to keep lru_size at the right +level. + +This can lead to evictions if there is a large queue of +items in the global queue so the callback is not serviced +in a timely manner. + +Put bl_ast callbacks on the priority queue so they do not +wait behind the background traffic. + +Add some additional debug in this area. + +Lustre-change: https://review.whamcloud.com/47215 +Lustre-commit: 2d59294d52b696125acc464e5910c893d9aef237 + +Signed-off-by: Patrick Farrell +Change-Id: Ic6eb65819a4a93e9d30e807d386ca18380b30c7d +Reviewed-by: Andreas Dilger +Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/49610 +Tested-by: jenkins +Tested-by: Maloo +Reviewed-by: Stephane Thiell +Reviewed-by: Oleg Drokin +--- + lustre/ldlm/ldlm_lockd.c | 60 ++++++++++++++++++++++++++++++++-------- + 1 file changed, 48 insertions(+), 12 deletions(-) + +diff --git a/lustre/ldlm/ldlm_lockd.c b/lustre/ldlm/ldlm_lockd.c +index f82df7df0e..7e59709ea6 100644 +--- a/lustre/ldlm/ldlm_lockd.c ++++ b/lustre/ldlm/ldlm_lockd.c +@@ -83,27 +83,29 @@ static inline timeout_t ldlm_get_rq_timeout(void) + } + + struct ldlm_bl_pool { +- spinlock_t blp_lock; ++ spinlock_t blp_lock; + + /* + * blp_prio_list is used for callbacks that should be handled + * as a priority. 
It is used for LDLM_FL_DISCARD_DATA requests. + * see b=13843 + */ +- struct list_head blp_prio_list; ++ struct list_head blp_prio_list; + + /* + * blp_list is used for all other callbacks which are likely + * to take longer to process. + */ +- struct list_head blp_list; +- +- wait_queue_head_t blp_waitq; +- struct completion blp_comp; +- atomic_t blp_num_threads; +- atomic_t blp_busy_threads; +- int blp_min_threads; +- int blp_max_threads; ++ struct list_head blp_list; ++ ++ wait_queue_head_t blp_waitq; ++ struct completion blp_comp; ++ atomic_t blp_num_threads; ++ atomic_t blp_busy_threads; ++ int blp_min_threads; ++ int blp_max_threads; ++ int blp_total_locks; ++ int blp_total_blwis; + }; + + struct ldlm_bl_work_item { +@@ -2116,22 +2118,41 @@ static int __ldlm_bl_to_thread(struct ldlm_bl_work_item *blwi, + enum ldlm_cancel_flags cancel_flags) + { + struct ldlm_bl_pool *blp = ldlm_state->ldlm_bl_pool; ++ char *prio = "regular"; ++ int count; + + ENTRY; + + spin_lock(&blp->blp_lock); ++ /* cannot access blwi after added to list and lock is dropped */ ++ count = blwi->blwi_lock ? 
1 : blwi->blwi_count; ++ ++ /* if the server is waiting on a lock to be cancelled (bl_ast), this is ++ * an urgent request and should go in the priority queue so it doesn't ++ * get stuck behind non-priority work (eg, lru size management) ++ * ++ * We also prioritize discard_data, which is for eviction handling ++ */ + if (blwi->blwi_lock && +- ldlm_is_discard_data(blwi->blwi_lock)) { +- /* add LDLM_FL_DISCARD_DATA requests to the priority list */ ++ (ldlm_is_discard_data(blwi->blwi_lock) || ++ ldlm_is_bl_ast(blwi->blwi_lock))) { + list_add_tail(&blwi->blwi_entry, &blp->blp_prio_list); ++ prio = "priority"; + } else { + /* other blocking callbacks are added to the regular list */ + list_add_tail(&blwi->blwi_entry, &blp->blp_list); + } ++ blp->blp_total_locks += count; ++ blp->blp_total_blwis++; + spin_unlock(&blp->blp_lock); + + wake_up(&blp->blp_waitq); + ++ /* unlocked read of blp values is intentional - OK for debug */ ++ CDEBUG(D_DLMTRACE, ++ "added %d/%d locks to %s blp list, %d blwis in pool\n", ++ count, blp->blp_total_locks, prio, blp->blp_total_blwis); ++ + /* + * can not check blwi->blwi_flags as blwi could be already freed in + * LCF_ASYNC mode +@@ -2749,10 +2770,23 @@ static int ldlm_bl_get_work(struct ldlm_bl_pool *blp, + if (++num_bl >= num_th) + num_bl = 0; + list_del(&blwi->blwi_entry); ++ blp->blp_total_locks -= blwi->blwi_lock ? 1 : blwi->blwi_count; ++ blp->blp_total_blwis--; + } + spin_unlock(&blp->blp_lock); + *p_blwi = blwi; + ++ /* intentional unlocked read of blp values - OK for debug */ ++ if (blwi) { ++ CDEBUG(D_DLMTRACE, ++ "Got %d locks of %d total in blp. (%d blwis in pool)\n", ++ blwi->blwi_lock ? 
1 : blwi->blwi_count, ++ blp->blp_total_locks, blp->blp_total_blwis); ++ } else { ++ CDEBUG(D_DLMTRACE, ++ "No blwi found in queue (no bl locks in queue)\n"); ++ } ++ + if (*p_exp != NULL && *p_blwi != NULL) { + obd_stale_export_put(*p_exp); + *p_exp = NULL; +@@ -3293,6 +3327,8 @@ static int ldlm_setup(void) + init_waitqueue_head(&blp->blp_waitq); + atomic_set(&blp->blp_num_threads, 0); + atomic_set(&blp->blp_busy_threads, 0); ++ blp->blp_total_locks = 0; ++ blp->blp_total_blwis = 0; + + if (ldlm_num_threads == 0) { + blp->blp_min_threads = LDLM_NTHRS_INIT; +-- +2.33.0 + diff --git a/0003-LU-14377-tests-make-parallel-scale-rr_alloc-less-str.patch b/0003-LU-14377-tests-make-parallel-scale-rr_alloc-less-str.patch new file mode 100644 index 0000000000000000000000000000000000000000..ba98d63af6b9b6c96f8ee253c0338438f4c6f511 --- /dev/null +++ b/0003-LU-14377-tests-make-parallel-scale-rr_alloc-less-str.patch @@ -0,0 +1,144 @@ +From c0b60c0c79a2d5d5be651570564d6d0407457a5f Mon Sep 17 00:00:00 2001 +From: Andreas Dilger +Date: Tue, 18 Oct 2022 18:37:58 -0600 +Subject: [PATCH 03/61] LU-14377 tests: make parallel-scale/rr_alloc less + strict + +test_rr_alloc() sometimes fails with a difference of 3-4 objects +per OST, after creating 1500+ objects on each OST. This should +not be considered fatal. Make the test more lenient, and allow +a difference of up to 0.3% of objects between the OSTs. + +Fix some code style issues in the test. 
+ +Lustre-change: https://review.whamcloud.com/48914 +Lustre-commit: b104c0a27713899a4d047f56fed57c30c39b8195 + +Test-Parameters: trivial testlist=parallel-scale env=ONLY=rr_alloc +Signed-off-by: Andreas Dilger +Change-Id: Ib6ba8c5d8e9d3245833448a52f8ed25308698a33 +Reviewed-by: Lai Siyao +Reviewed-by: Elena Gryaznova +(cherry picked from commit b104c0a27713899a4d047f56fed57c30c39b8195) +Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/51142 +Tested-by: jenkins +Tested-by: Maloo +Reviewed-by: Oleg Drokin +--- + lustre/tests/functions.sh | 52 +++++++++++++++++++++------------------ + 1 file changed, 28 insertions(+), 24 deletions(-) + +diff --git a/lustre/tests/functions.sh b/lustre/tests/functions.sh +index 1807bb8d17..3017fdb887 100644 +--- a/lustre/tests/functions.sh ++++ b/lustre/tests/functions.sh +@@ -1010,16 +1010,15 @@ cleanup_rr_alloc () { + + run_rr_alloc() { + remote_mds_nodsh && skip "remote MDS with nodsh" +- echo "===Test gives more reproduction percentage if number of "\ +- "client and ost are more. Test with 44 or more clients "\ +- "and 73 or more OSTs gives 100% reproduction rate==" ++ ++ echo "===Test gives more reproduction percentage if number of " ++ echo " client and ost are more. Test with 44 or more clients " ++ echo " and 73 or more OSTs gives 100% reproduction rate==" + + RR_ALLOC=${RR_ALLOC:-$(which rr_alloc 2> /dev/null || true)} + [ x$RR_ALLOC = x ] && skip_env "rr_alloc not found" + declare -a diff_max_min_arr +- # foeo = file on each ost. calc = calculated. 
+ local ost_idx +- local foeo_calc + local qos_prec_objs="${TMP}/qos_and_precreated_objects" + local rr_alloc_NFILES=${rr_alloc_NFILES:-555} + local rr_alloc_MNTPTS=${rr_alloc_MNTPTS:-11} +@@ -1036,9 +1035,6 @@ run_rr_alloc() { + error_exit "Failed to mount lustre on ${mntpt_root}$i $clients" + done + +- local cmd="$RR_ALLOC $mntpt_root/$tdir/ash $rr_alloc_NFILES \ +- $num_clients" +- + # Save mdt values, set threshold to 100% i.e always Round Robin, + # restore the saved values again after creating files... + save_lustre_params mds1 \ +@@ -1056,7 +1052,8 @@ run_rr_alloc() { + # per OST are not multiple of that then it will be set to nearest + # lower power of 2. So set 'create_count' to the upper power of 2. + +- foeo_calc=$((rr_alloc_NFILES * total_MNTPTS / OSTCOUNT)) ++ # foeo = file on each ost. calc = calculated. ++ local foeo_calc=$((rr_alloc_NFILES * total_MNTPTS / OSTCOUNT)) + local create_count=$((2 * foeo_calc)) + + # create_count accepted values: +@@ -1078,18 +1075,19 @@ run_rr_alloc() { + # is created per OSTs. + createmany -o $DIR/$tdir/foo- $(((old_create_count + 1) * OSTCOUNT)) \ + > /dev/null +- rm -f /$DIR/$tdir/foo* ++ unlinkmany $DIR/$tdir/foo- $(((old_create_count + 1) * OSTCOUNT)) + + # Check for enough precreated objects... We should not + # fail here because code(osp_precreate.c) also takes care of it. + # So we have good chances of passing test even if this check fails. 
+ local mdt_idx=0 +- for ost_idx in $(seq 0 $((OSTCOUNT - 1))); do +- [[ $(precreated_ost_obj_count $mdt_idx $ost_idx) -ge \ +- $foeo_calc ]] || echo "Warning: test may fail because" \ +- "of lack of precreated objects on OST${ost_idx}" ++ for ((ost_idx = 0; ost_idx < $OSTCOUNT; ost_idx++ )); do ++ (($(precreated_ost_obj_count $mdt_idx $ost_idx) >= foeo_calc))|| ++ echo "Warning: test may fail from too few objs on OST$ost_idx" + done + ++ local cmd="$RR_ALLOC $mntpt_root/$tdir/f $rr_alloc_NFILES $num_clients" ++ + if [[ $total_MNTPTS -ne 0 ]]; then + # Now start the actual file creation app. + mpi_run "-np $total_MNTPTS" $cmd || return +@@ -1101,12 +1099,14 @@ run_rr_alloc() { + rm -f $qos_prec_objs + + diff_max_min_arr=($($LFS getstripe -r $DIR/$tdir/ | +- grep "lmm_stripe_offset:" | awk '{print $2}' | sort -n | +- uniq -c | awk 'NR==1 {min=max=$1} \ +- { $1<min ? min=$1 : min; $1>max ? max=$1 : max} \ +- END {print max-min, max, min}')) ++ awk '/lmm_stripe_offset:/ {print $2}' | ++ sort | uniq -c | ++ awk 'NR==1 {min=max=$1} \ ++ { $1<min ? min=$1:min; $1>max ? max=$1:max} \ ++ END {print max-min, max, min}')) ++ ++ $LFS find $DIR/$tdir -type f | xargs -n1 -P8 unlink + +- rm -rf $DIR/$tdir + + # In-case of fairly large number of file creation using RR (round-robin) + # there can be two cases in which deviation will occur than the regular +@@ -1114,11 +1114,15 @@ run_rr_alloc() { + # 1- When rr_alloc does not start right with 'lqr_start_count' reseeded, + # 2- When rr_alloc does not finish with 'lqr_start_count == 0'. + # So the difference of files b/w any 2 OST should not be more than 2. +- [[ ${diff_max_min_arr[0]} -le 2 ]] || +- error "Uneven distribution detected: difference between" \ +- "maximum files per OST (${diff_max_min_arr[1]}) and" \ +- "minimum files per OST (${diff_max_min_arr[2]}) must not be" \ +- "greater than 2" ++ # In some cases it may be more, but shouldn't be > 0.3% of the files. ++ local max_diff=$((create_count > 600 ? 
create_count / 300 : 2)) ++ ++ (( ${diff_max_min_arr[0]} <= $max_diff )) || { ++ $LFS getstripe -r $DIR/$tdir | ++ awk '/lmm_stripe_offset:/ {print $2}' | sort | uniq -c ++ ++ error "max/min OST objects (${diff_max_min_arr[1]} : ${diff_max_min_arr[2]}) too different" ++ } + } + + run_fs_test() { +-- +2.33.0 + diff --git a/0004-LU-15123-tests-check-quota-reintegration-after-recov.patch b/0004-LU-15123-tests-check-quota-reintegration-after-recov.patch new file mode 100644 index 0000000000000000000000000000000000000000..003037814ffb440f9228c1b407d0398c23c04029 --- /dev/null +++ b/0004-LU-15123-tests-check-quota-reintegration-after-recov.patch @@ -0,0 +1,41 @@ +From 13805e3a2d4f520e297bc408d94b9971a6094f9a Mon Sep 17 00:00:00 2001 +From: Alex Zhuravlev +Date: Wed, 19 Apr 2023 10:20:33 +0300 +Subject: [PATCH 04/61] LU-15123 tests: check quota reintegration after + recovery + +4th step of quota reintegration (reconciliation) waits for recovery +completion. So the tests (like sanity-quota/7a) should wait for +recovery completion before checking reintegration results. 
+ +Lustre-change: https://review.whamcloud.com/50688 +Lustre-commit: 4432b6e2824775e292f96e202d6fc0db231bc749 + +Signed-off-by: Alex Zhuravlev +Change-Id: Id0aa5db01658621103d94ad6dafe91b2960b3a33 +Reviewed-by: Andreas Dilger +Reviewed-by: Sergey Cheremencev +Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/51233 +Tested-by: jenkins +Tested-by: Maloo +Reviewed-by: Oleg Drokin +--- + lustre/tests/sanity-quota.sh | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/lustre/tests/sanity-quota.sh b/lustre/tests/sanity-quota.sh +index f552d84fd6..047b1a9687 100755 +--- a/lustre/tests/sanity-quota.sh ++++ b/lustre/tests/sanity-quota.sh +@@ -285,6 +285,8 @@ wait_reintegration() { + cmd=${cmd}.quota_slave.info + + if $(facet_up $tgt); then ++ # reintegration starts after recovery completion ++ wait_recovery_complete $tgt + wait_update_facet $tgt "$cmd | + grep "$qtype" | awk '{ print \\\$3 }'" \ + "$result" $max || return 1 +-- +2.33.0 + diff --git a/0005-LU-13081-tests-skip-sanity-test_151-test_156.patch b/0005-LU-13081-tests-skip-sanity-test_151-test_156.patch new file mode 100644 index 0000000000000000000000000000000000000000..f8c97cbddef4d906fb9e0286f2ab426a38d902bd --- /dev/null +++ b/0005-LU-13081-tests-skip-sanity-test_151-test_156.patch @@ -0,0 +1,57 @@ +From d15f9619eec367805606f4a46d049e6ecb62f36d Mon Sep 17 00:00:00 2001 +From: Alex Deiter +Date: Thu, 27 Apr 2023 02:04:01 +0400 +Subject: [PATCH 05/61] LU-13081 tests: skip sanity test_151/test_156 + +Skip both sanity test_151 and test_156 during interop testing, +since this is really testing server-side functionality only +(OSS caching behavior). And it makes sense to just exclude +test_151 and test_156 during interop testing, otherwise it +seems that the client version of the test can become +inconsistent with the caching behavior/tunables on the OSS +and the failures don't mean anything. There is enough +non-interop testing to catch any regressions in the OSS +cache behavior. 
+ +Lustre-change: https://review.whamcloud.com/50777 +Lustre-commit: 305dda878d1dde822eab7a9dacfe8dec0b96cb3e + +Test-Parameters: trivial +Signed-off-by: Alex Deiter +Change-Id: I39a8b54894d5b0c7573e6c56d1f8e1ba02b3e3fe +Reviewed-by: Jian Yu +Reviewed-by: Andreas Dilger +Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/51286 +Reviewed-by: Alex Deiter +Reviewed-by: Oleg Drokin +Tested-by: jenkins +Tested-by: Maloo +--- + lustre/tests/sanity.sh | 4 ++++ + 1 file changed, 4 insertions(+) + +diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh +index fec778e9fe..d73b285eca 100755 +--- a/lustre/tests/sanity.sh ++++ b/lustre/tests/sanity.sh +@@ -15079,6 +15079,8 @@ function set_cache() { + test_151() { + [ $PARALLEL == "yes" ] && skip "skip parallel run" + remote_ost_nodsh && skip "remote OST with nodsh" ++ (( CLIENT_VERSION == OST1_VERSION )) || ++ skip "LU-13081: no interop testing for OSS cache" + + local CPAGES=3 + local list=$(comma_list $(osts_nodes)) +@@ -15754,6 +15756,8 @@ test_156() { + skip "stats not implemented on old servers" + [ "$ost1_FSTYPE" = "zfs" ] && + skip "LU-1956/LU-2261: stats not implemented on OSD ZFS" ++ (( CLIENT_VERSION == OST1_VERSION )) || ++ skip "LU-13081: no interop testing for OSS cache" + + local CPAGES=3 + local BEFORE +-- +2.33.0 + diff --git a/0006-LU-11785-tests-fix-conf-sanity-98-mount-check-on-64K.patch b/0006-LU-11785-tests-fix-conf-sanity-98-mount-check-on-64K.patch new file mode 100644 index 0000000000000000000000000000000000000000..980a0af2d463943a0a94ce67e44c29ca47635809 --- /dev/null +++ b/0006-LU-11785-tests-fix-conf-sanity-98-mount-check-on-64K.patch @@ -0,0 +1,45 @@ +From 3e9a06398a168a52f72a65450e5249ed502a86cd Mon Sep 17 00:00:00 2001 +From: Kevin Zhao +Date: Fri, 28 Oct 2022 10:05:24 +0800 +Subject: [PATCH 06/61] LU-11785 tests: fix conf-sanity/98 mount check on 64K + page + +This patch fix the mount option length check expectation +fail on 64K page. 
Since the maxopt_len is the minimum +value of page_size or 64K page_size, but the test cases +only hard code the length of option to the 4K one. This +patch adds the mount options according to the page size. + +Lustre-change: https://review.whamcloud.com/48177 +Lustre-commit: 4068ca725954db2a1fc42bf8d184f4672c2ed113 + +Test-Parameters: trivial testlist=conf-sanity env=ONLY=98 +Test-Parameters: testlist=conf-sanity env=ONLY=98 clientarch=aarch64 clientdistro=el8.7 +Signed-off-by: Kevin Zhao +Change-Id: Icdeb8b73308056e216c3f4ce71907b0c928d2c30 +Reviewed-by: Andreas Dilger +Reviewed-by: xinliang +Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/51288 +Tested-by: jenkins +Tested-by: Maloo +Reviewed-by: Oleg Drokin +--- + lustre/tests/conf-sanity.sh | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/lustre/tests/conf-sanity.sh b/lustre/tests/conf-sanity.sh +index dc00fb42ff..a52429afe0 100755 +--- a/lustre/tests/conf-sanity.sh ++++ b/lustre/tests/conf-sanity.sh +@@ -7783,7 +7783,7 @@ test_98() + setup + check_mount || error "mount failed" + mountopt="user_xattr" +- for ((x = 1; x <= 400; x++)); do ++ for ((x = 1; x <= PAGE_SIZE/11; x++)); do + mountopt="$mountopt,user_xattr" + done + mount_client $MOUNT remount,$mountopt 2>&1 | grep "too long" || +-- +2.33.0 + diff --git a/0007-LU-11388-tests-replay-single-131b-to-refresh-grants.patch b/0007-LU-11388-tests-replay-single-131b-to-refresh-grants.patch new file mode 100644 index 0000000000000000000000000000000000000000..a2734481af6e13d6620dabd48f8b4ff641479212 --- /dev/null +++ b/0007-LU-11388-tests-replay-single-131b-to-refresh-grants.patch @@ -0,0 +1,45 @@ +From 653ae754fa93ecf8b9d290675122956eaf63b6af Mon Sep 17 00:00:00 2001 +From: Alex Zhuravlev +Date: Mon, 17 Apr 2023 21:13:59 +0300 +Subject: [PATCH 07/61] LU-11388 tests: replay-single/131b to refresh grants + +so that the write (to be replayed after replay-barrier) +doesn't turn sync due to insufficient grant. 
+ +Lustre-change: https://review.whamcloud.com/50661 +Lustre-commit: 384e1e858eef826677bfa6913074a83c4fab37d3 + +Test-Parameters: trivial testlist=replay-single env=ONLY=131b,ONLY_REPEAT=30 +Fixes: cb3b2bb683 ("LU-11388 test: enable replay-single test_131b") +Signed-off-by: Alex Zhuravlev +Change-Id: If4656c1028b49c58eedd905abd0c329f3706f491 +Reviewed-by: Mikhail Pershin +Reviewed-by: Jian Yu +Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/51289 +Tested-by: jenkins +Tested-by: Maloo +Reviewed-by: Oleg Drokin +--- + lustre/tests/replay-single.sh | 6 ++++++ + 1 file changed, 6 insertions(+) + +diff --git a/lustre/tests/replay-single.sh b/lustre/tests/replay-single.sh +index a0ac037ead..c6641c0d0e 100755 +--- a/lustre/tests/replay-single.sh ++++ b/lustre/tests/replay-single.sh +@@ -4858,6 +4858,12 @@ test_131b() { + [ "$MDS1_VERSION" -lt $(version_code 2.10.90) ] && + skip "Do not support Data-on-MDT before 2.11" + ++ # refresh grants so write after replay_barrier doesn't ++ # turn sync ++ $LFS setstripe -E 1M -L mdt -E EOF -c 2 $DIR/$tfile-2 ++ stack_trap "rm -f $DIR/$tfile-2" ++ dd if=/dev/zero of=$DIR/$tfile-2 bs=64k count=2 || ++ error "can't dd" + $LFS setstripe -E 1M -L mdt -E EOF -c 2 $DIR/$tfile + replay_barrier $SINGLEMDS + echo "dom_data" | dd of=$DIR/$tfile bs=8 count=1 +-- +2.33.0 + diff --git a/0008-LU-16163-tests-skip-racer_on_nfs-for-NFSv3.patch b/0008-LU-16163-tests-skip-racer_on_nfs-for-NFSv3.patch new file mode 100644 index 0000000000000000000000000000000000000000..c683246dcfa46b57cfcffeaf9d15882f60eea636 --- /dev/null +++ b/0008-LU-16163-tests-skip-racer_on_nfs-for-NFSv3.patch @@ -0,0 +1,41 @@ +From 3626be5686cc395ce622d281a993603dba16e3e2 Mon Sep 17 00:00:00 2001 +From: Alex Deiter +Date: Fri, 7 Apr 2023 23:49:23 +0400 +Subject: [PATCH 08/61] LU-16163 tests: skip racer_on_nfs for NFSv3 + +Export ALWAYS_EXCEPT env for child NFS test + +Lustre-change: https://review.whamcloud.com/50579 +Lustre-commit: 
892d726f274c7cd4e505689ad69194ac68dc323b + +Fixes: 513eb670b0 ("LU-16163 tests: skip racer_on_nfs for NFSv3") +Test-Parameters: trivial testlist=parallel-scale-nfsv3 +Signed-off-by: Alex Deiter +Change-Id: Ibb4a9916166f13ab9bd2374b33d4313453972276 +Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/51282 +Tested-by: jenkins +Tested-by: Maloo +Reviewed-by: Andreas Dilger +Reviewed-by: Oleg Drokin +--- + lustre/tests/parallel-scale-nfsv3.sh | 7 ++++++- + 1 file changed, 6 insertions(+), 1 deletion(-) + +diff --git a/lustre/tests/parallel-scale-nfsv3.sh b/lustre/tests/parallel-scale-nfsv3.sh +index 014d2cda2a..fdbfc949c8 100755 +--- a/lustre/tests/parallel-scale-nfsv3.sh ++++ b/lustre/tests/parallel-scale-nfsv3.sh +@@ -4,4 +4,9 @@ LUSTRE=${LUSTRE:-$(dirname $0)/..} + . $LUSTRE/tests/test-framework.sh + init_test_env $@ + +-sh $LUSTRE/tests/parallel-scale-nfs.sh 3 ++export ALWAYS_EXCEPT="$PARALLEL_SCALE_NFSV3_EXCEPT " ++# Bug number for skipped test: LU-16163 ++ALWAYS_EXCEPT+=" racer_on_nfs " ++# UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT! 
++ ++$LUSTRE/tests/parallel-scale-nfs.sh 3 +-- +2.33.0 + diff --git a/0009-LU-14294-tests-fixed-NFS-configuration-issue.patch b/0009-LU-14294-tests-fixed-NFS-configuration-issue.patch new file mode 100644 index 0000000000000000000000000000000000000000..8aa1834bb0370423cc46628d83478a60e6f00c0e --- /dev/null +++ b/0009-LU-14294-tests-fixed-NFS-configuration-issue.patch @@ -0,0 +1,197 @@ +From cef89c354f22f873f1f2e09536de7c690852828b Mon Sep 17 00:00:00 2001 +From: Alex Deiter +Date: Mon, 7 Nov 2022 21:47:21 +0400 +Subject: [PATCH 09/61] LU-14294 tests: fixed NFS configuration issue + +* Used the systemctl command to manage system services +* Used the same order of parameters to setup and cleanup NFS +* Used tab for indentation + +Lustre-change: https://review.whamcloud.com/49062 +Lustre-commit: 1a8fe55b17ac2bc2195aaba446467ccdac67b564 + +Test-Parameters: trivial clientdistro=el7.9 \ +testlist=parallel-scale-nfsv3,parallel-scale-nfsv4 +Test-Parameters: clientdistro=el8.7 \ +testlist=parallel-scale-nfsv3,parallel-scale-nfsv4 +Test-Parameters: clientdistro=el9.0 \ +testlist=parallel-scale-nfsv3,parallel-scale-nfsv4 +Test-Parameters: clientdistro=sles12sp5 \ +testlist=parallel-scale-nfsv3,parallel-scale-nfsv4 +Test-Parameters: clientdistro=sles15sp4 \ +testlist=parallel-scale-nfsv3,parallel-scale-nfsv4 +Test-Parameters: clientdistro=ubuntu2004 \ +testlist=parallel-scale-nfsv3,parallel-scale-nfsv4 + +Change-Id: I6b087035ac7524aa99c0facad48f8c3fb7444cbc +Signed-off-by: Alex Deiter +Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/51283 +Tested-by: jenkins +Tested-by: Maloo +Reviewed-by: Andreas Dilger +Reviewed-by: Oleg Drokin +--- + lustre/tests/parallel-scale-nfs.sh | 11 ++- + lustre/tests/setup-nfs.sh | 107 +++++++++++------------------ + 2 files changed, 47 insertions(+), 71 deletions(-) + +diff --git a/lustre/tests/parallel-scale-nfs.sh b/lustre/tests/parallel-scale-nfs.sh +index 7f426ee0a2..ab88f379a4 100755 +--- a/lustre/tests/parallel-scale-nfs.sh 
++++ b/lustre/tests/parallel-scale-nfs.sh +@@ -47,7 +47,8 @@ cleanup_exit () { + } + + cleanup () { +- cleanup_nfs "$NFS_CLIMNTPT" "$LUSTRE_CLIENT_NFSSRV" "$NFS_CLIENTS" || ++ cleanup_nfs "$LUSTRE_CLIENT_NFSSRV" "$NFS_SRVMNTPT" \ ++ "$NFS_CLIENTS" "$NFS_CLIMNTPT" || \ + error_noexit false "failed to cleanup nfs" + zconf_umount $LUSTRE_CLIENT_NFSSRV $NFS_SRVMNTPT force || + error_noexit false "failed to umount lustre on"\ +@@ -63,8 +64,8 @@ zconf_mount $LUSTRE_CLIENT_NFSSRV $NFS_SRVMNTPT "$cl_mnt_opt" || + error "mount lustre on $LUSTRE_CLIENT_NFSSRV failed" + + # setup the nfs +-setup_nfs "$NFSVERSION" "$NFS_SRVMNTPT" "$LUSTRE_CLIENT_NFSSRV" \ +- "$NFS_CLIENTS" "$NFS_CLIMNTPT" || ++setup_nfs "$LUSTRE_CLIENT_NFSSRV" "$NFS_SRVMNTPT" "$NFS_CLIENTS" \ ++ "$NFS_CLIMNTPT" "$NFSVERSION" || \ + error false "setup nfs failed!" + + NFSCLIENT=true +@@ -101,6 +102,10 @@ MPI_RUNAS=${MPI_RUNAS:-"runas -u $MPI_USER_UID -g $MPI_USER_GID"} + $GSS_KRB5 && refresh_krb5_tgt $MPI_USER_UID $MPI_USER_GID $MPI_RUNAS + + test_compilebench() { ++ if [[ "$TESTSUITE" =~ "parallel-scale-nfs" ]]; then ++ skip "LU-12957 and LU-13068: compilebench for $TESTSUITE" ++ fi ++ + run_compilebench $TESTDIR + } + run_test compilebench "compilebench" +diff --git a/lustre/tests/setup-nfs.sh b/lustre/tests/setup-nfs.sh +index ab3afff36c..eb6720febd 100755 +--- a/lustre/tests/setup-nfs.sh ++++ b/lustre/tests/setup-nfs.sh +@@ -1,77 +1,48 @@ + #!/bin/bash +-#set -x +-EXPORT_OPTS=${EXPORT_OPTS:-"rw,async,no_root_squash"} + +-setup_nfs() { +- local NFS_VER=${1} +- local MNTPNT=${2} +- local LUSTRE_CLIENT=${3} +- local NFS_CLIENTS=${4} +- local nfs_climntpt=${5:-$MNTPNT} +- +- local export_opts_v=$EXPORT_OPTS +- +- echo "Exporting Lustre filesystem..." 
+- +- if [ "$NFS_VER" = "4" ]; then +- export_opts_v="$EXPORT_OPTS,fsid=0" +- do_nodes $LUSTRE_CLIENT "mkdir -p /var/lib/nfs/v4recovery" +- fi +- +- do_nodes $LUSTRE_CLIENT,$NFS_CLIENTS "grep -q rpc_pipefs' ' /proc/mounts ||\ +- { mkdir -p /var/lib/nfs/rpc_pipefs && \ +- mount -t rpc_pipefs sunrpc /var/lib/nfs/rpc_pipefs; }" || return 1 +- sleep 5 +- +- # get rid of old $MNTPNT entries in /etc/exports +- do_nodes $LUSTRE_CLIENT "sed -i '/${MNTPNT##*/}/d' /etc/exports && +- echo $MNTPNT *\($export_opts_v\) >> /etc/exports" || +- return 1 +- +- # restart nfs server according to distro +- do_nodes $LUSTRE_CLIENT "{ [[ -e /etc/SuSE-release ]] && +- service nfsserver restart; } || +- service nfs restart || +- service nfs-server restart" || return 1 ++DEFAULT_NFS_OPTIONS=${DEFAULT_NFS_OPTIONS:-"rw,async,no_root_squash"} ++DEFAULT_EXPORTS_FILE=${DEFAULT_EXPORTS_FILE:-"/etc/exports.d/lustre.exports"} + +- do_nodes $NFS_CLIENTS "chkconfig --list rpcidmapd 2>/dev/null | +- grep -q rpcidmapd && service rpcidmapd restart || +- true" +- +- echo -e "\nMounting NFS clients (version $NFS_VER)..." 
++setup_nfs() { ++ local LUSTRE_CLIENT=$1 ++ local LUSTRE_MOUNT_POINT=$2 ++ local NFS_CLIENTS=$3 ++ local NFS_MOUNT_POINT=$4 ++ local NFS_VERSION=$5 ++ local EXPORTS_FILE=$DEFAULT_EXPORTS_FILE ++ local NFS_OPTIONS=$DEFAULT_NFS_OPTIONS ++ ++ echo "Exporting Lustre filesystem via NFS version $NFS_VERSION" ++ do_nodes "$LUSTRE_CLIENT" \ ++ "echo '$LUSTRE_MOUNT_POINT *($NFS_OPTIONS)' | \ ++ tee $EXPORTS_FILE" || return 1 ++ do_nodes "$LUSTRE_CLIENT" "systemctl restart nfs-server" || return 1 ++ do_nodes "$LUSTRE_CLIENT" "systemctl restart nfs-idmapd" || return 1 ++ ++ echo "Mounting NFS clients version $NFS_VERSION" ++ do_nodes "$NFS_CLIENTS" "systemctl restart nfs-idmapd" || return 1 ++ do_nodes "$NFS_CLIENTS" "mkdir -p $NFS_MOUNT_POINT" || return 1 ++ do_nodes "$NFS_CLIENTS" \ ++ "mount -v -t nfs -o nfsvers=$NFS_VERSION,async \ ++ $LUSTRE_CLIENT:$LUSTRE_MOUNT_POINT \ ++ $NFS_MOUNT_POINT" || return 1 + +- do_nodes $NFS_CLIENTS "mkdir -p $nfs_climntpt" || return 1 +- if [ "$NFS_VER" = "4" ]; then +- do_nodes $NFS_CLIENTS \ +- "mount -t nfs$NFS_VER -o async \ +- $LUSTRE_CLIENT:/ $nfs_climntpt" || return 1 +- else +- do_nodes $NFS_CLIENTS \ +- "mount -t nfs -o nfsvers=$NFS_VER,async \ +- $LUSTRE_CLIENT:$MNTPNT $nfs_climntpt" || return 1 +- fi + return 0 + } + + cleanup_nfs() { +- local MNTPNT=${1} +- local LUSTRE_CLIENT=${2} +- local NFS_CLIENTS=${3} +- +- echo -e "\nUnmounting NFS clients..." +- do_nodes $NFS_CLIENTS "umount -f $MNTPNT" || true +- +- echo -e "\nUnexporting Lustre filesystem..." 
+- do_nodes $NFS_CLIENTS "chkconfig --list rpcidmapd 2>/dev/null | +- grep -q rpcidmapd && service rpcidmapd stop || +- true" +- +- do_nodes $LUSTRE_CLIENT "{ [[ -e /etc/SuSE-release ]] && +- service nfsserver stop; } || +- service nfs stop || +- service nfs-server stop" || return 1 +- +- do_nodes $LUSTRE_CLIENT "sed -i '/${MNTPNT##*/}/d' /etc/exports" || return 1 +- +- do_nodes $LUSTRE_CLIENT "exportfs -v" ++ local LUSTRE_CLIENT=$1 ++ local LUSTRE_MOUNT_POINT=$2 ++ local NFS_CLIENTS=$3 ++ local NFS_MOUNT_POINT=$4 ++ local EXPORTS_FILE=$DEFAULT_EXPORTS_FILE ++ ++ echo "Unmounting NFS clients" ++ do_nodes "$NFS_CLIENTS" "umount -v -f $NFS_MOUNT_POINT" || return 1 ++ do_nodes "$NFS_CLIENTS" "systemctl stop nfs-idmapd" || return 1 ++ ++ echo "Unexporting Lustre filesystem" ++ do_nodes "$LUSTRE_CLIENT" "systemctl stop nfs-server" || return 1 ++ do_nodes "$LUSTRE_CLIENT" "systemctl stop nfs-idmapd" || return 1 ++ do_nodes "$LUSTRE_CLIENT" "rm -v $EXPORTS_FILE" || return 1 + } +-- +2.33.0 + diff --git a/0010-LU-16717-mdt-treat-unknown-hash-type-as-sane-type.patch b/0010-LU-16717-mdt-treat-unknown-hash-type-as-sane-type.patch new file mode 100644 index 0000000000000000000000000000000000000000..f20d906bb8d0bbf3be285c44e928f82c29259007 --- /dev/null +++ b/0010-LU-16717-mdt-treat-unknown-hash-type-as-sane-type.patch @@ -0,0 +1,155 @@ +From e4208468b65a34c84c20d5d932f35b29f9025722 Mon Sep 17 00:00:00 2001 +From: Lai Siyao +Date: Sun, 23 Apr 2023 04:09:02 -0400 +Subject: [PATCH 10/61] LU-16717 mdt: treat unknown hash type as sane type + +Directory migration failure may leave directory hash type as +LMV_HASH_TYPE_UNKNOWN|LMV_HASH_FLAG_BAD_TYPE, which should be treated +as sane hash type on existing directories, otherwise such directories +can't be unlinked. + +Add sanity 230y. 
+ +Lustre-change: https://review.whamcloud.com/50796 +Lustre-commit: 05cdb71ba6813570123613993f3cfcf74fc83561 + +Signed-off-by: Lai Siyao +Change-Id: Ieffc0808d1db989d0bf9723f05cddb06f349e208 +Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/50796 +Reviewed-by: Andreas Dilger +Reviewed-by: Hongchao Zhang +Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/51235 +Tested-by: jenkins +Tested-by: Maloo +Reviewed-by: Oleg Drokin +--- + lustre/include/lustre_lmv.h | 4 +-- + lustre/include/obd_support.h | 1 + + .../include/uapi/linux/lustre/lustre_user.h | 10 ++++++++ + lustre/mdt/mdt_reint.c | 18 +++++++++++++ + lustre/tests/sanity.sh | 25 +++++++++++++++++++ + 5 files changed, 56 insertions(+), 2 deletions(-) + +diff --git a/lustre/include/lustre_lmv.h b/lustre/include/lustre_lmv.h +index 2ffd77fc57..b848408da9 100644 +--- a/lustre/include/lustre_lmv.h ++++ b/lustre/include/lustre_lmv.h +@@ -438,7 +438,7 @@ static inline bool lmv_is_sane(const struct lmv_mds_md_v1 *lmv) + if (le32_to_cpu(lmv->lmv_stripe_count) == 0) + goto insane; + +- if (!lmv_is_known_hash_type(le32_to_cpu(lmv->lmv_hash_type))) ++ if (!lmv_is_sane_hash_type(le32_to_cpu(lmv->lmv_hash_type))) + goto insane; + + return true; +@@ -460,7 +460,7 @@ static inline bool lmv_is_sane2(const struct lmv_mds_md_v1 *lmv) + if (le32_to_cpu(lmv->lmv_stripe_count) == 0) + goto insane; + +- if (!lmv_is_known_hash_type(le32_to_cpu(lmv->lmv_hash_type))) ++ if (!lmv_is_sane_hash_type(le32_to_cpu(lmv->lmv_hash_type))) + goto insane; + + return true; +diff --git a/lustre/include/obd_support.h b/lustre/include/obd_support.h +index cca40a688b..5a6998043b 100644 +--- a/lustre/include/obd_support.h ++++ b/lustre/include/obd_support.h +@@ -681,6 +681,7 @@ extern char obd_jobid_var[]; + + /* MIGRATE */ + #define OBD_FAIL_MIGRATE_ENTRIES 0x1801 ++#define OBD_FAIL_MIGRATE_BAD_HASH 0x1802 + + /* LMV */ + #define OBD_FAIL_UNKNOWN_LMV_STRIPE 0x1901 +diff --git a/lustre/include/uapi/linux/lustre/lustre_user.h 
b/lustre/include/uapi/linux/lustre/lustre_user.h +index b2ae5992d2..25fa6ad28d 100644 +--- a/lustre/include/uapi/linux/lustre/lustre_user.h ++++ b/lustre/include/uapi/linux/lustre/lustre_user.h +@@ -1046,6 +1046,16 @@ static inline bool lmv_is_known_hash_type(__u32 type) + + #define LMV_HASH_FLAG_KNOWN 0xbe000000 + ++/* migration failure may leave hash type as ++ * LMV_HASH_TYPE_UNKNOWN|LMV_HASH_FLAG_BAD_TYPE, which should be treated as ++ * sane, so such directory can be accessed (resume migration or unlink). ++ */ ++static inline bool lmv_is_sane_hash_type(__u32 type) ++{ ++ return lmv_is_known_hash_type(type) || ++ type == (LMV_HASH_TYPE_UNKNOWN | LMV_HASH_FLAG_BAD_TYPE); ++} ++ + /* both SPLIT and MIGRATION are set for directory split */ + static inline bool lmv_hash_is_splitting(__u32 hash) + { +diff --git a/lustre/mdt/mdt_reint.c b/lustre/mdt/mdt_reint.c +index c800ca64be..a2ddbbcf07 100644 +--- a/lustre/mdt/mdt_reint.c ++++ b/lustre/mdt/mdt_reint.c +@@ -2312,6 +2312,24 @@ int mdt_reint_migrate(struct mdt_thread_info *info, + if (rc) + GOTO(put_parent, rc); + ++ if (CFS_FAIL_CHECK(OBD_FAIL_MIGRATE_BAD_HASH) && ++ (ma->ma_valid & MA_LMV) && ++ lmv_is_migrating(&ma->ma_lmv->lmv_md_v1)) { ++ struct lu_buf *buf = &info->mti_buf; ++ struct lmv_mds_md_v1 *lmv = &ma->ma_lmv->lmv_md_v1; ++ __u32 version = le32_to_cpu(lmv->lmv_layout_version); ++ ++ lmv->lmv_hash_type = cpu_to_le32(LMV_HASH_TYPE_UNKNOWN | ++ LMV_HASH_FLAG_BAD_TYPE); ++ lmv->lmv_layout_version = cpu_to_le32(version + 1); ++ buf->lb_buf = lmv; ++ buf->lb_len = sizeof(*lmv); ++ rc = mo_xattr_set(env, mdt_object_child(pobj), buf, ++ XATTR_NAME_LMV, LU_XATTR_REPLACE); ++ mo_invalidate(env, mdt_object_child(pobj)); ++ GOTO(put_parent, rc); ++ } ++ + lock_parent: + /* lock parent object */ + lhp = &info->mti_lh[MDT_LH_PARENT]; +diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh +index d73b285eca..6cd539a737 100755 +--- a/lustre/tests/sanity.sh ++++ b/lustre/tests/sanity.sh +@@ -20636,6 +20636,31 @@ 
test_230w() { + } + run_test 230w "non-recursive mode dir migration" + ++test_230y() { ++ (( MDSCOUNT > 1 )) || skip "needs >= 2 MDTs" ++ (( MDS1_VERSION >= $(version_code 2.15.3) )) || ++ skip "Need MDS version at least 2.15.3" ++ ++ local pid ++ ++ test_mkdir -c -1 $DIR/$tdir || error "mkdir $tdir failed" ++ $LFS getdirstripe $DIR/$tdir ++ createmany -d $DIR/$tdir/d 100 || error "createmany failed" ++ $LFS migrate -m 1 -c 2 $DIR/$tdir & ++ pid=$! ++ sleep 1 ++ ++ #OBD_FAIL_MIGRATE_BAD_HASH 0x1802 ++ do_facet mds2 lctl set_param fail_loc=0x1802 ++ ++ wait $pid ++ do_facet mds2 lctl set_param fail_loc=0 ++ $LFS getdirstripe $DIR/$tdir ++ unlinkmany -d $DIR/$tdir/d 100 || error "unlinkmany failed" ++ rmdir $DIR/$tdir || error "rmdir $tdir failed" ++} ++run_test 230y "unlink dir with bad hash type" ++ + test_231a() + { + # For simplicity this test assumes that max_pages_per_rpc +-- +2.33.0 + diff --git a/0011-LU-15481-llog-Add-LLOG_SKIP_PLAIN-to-skip-llog-plain.patch b/0011-LU-15481-llog-Add-LLOG_SKIP_PLAIN-to-skip-llog-plain.patch new file mode 100644 index 0000000000000000000000000000000000000000..45ceec2df09047888d3dbd0f0e93e21526e3afeb --- /dev/null +++ b/0011-LU-15481-llog-Add-LLOG_SKIP_PLAIN-to-skip-llog-plain.patch @@ -0,0 +1,170 @@ +From 053ab4a47a3391ff2b3b252b649b88e4579ab88f Mon Sep 17 00:00:00 2001 +From: Etienne AUJAMES +Date: Wed, 22 Feb 2023 11:18:49 -0800 +Subject: [PATCH 11/61] LU-15481 llog: Add LLOG_SKIP_PLAIN to skip llog plain + +Add the catalog callback return LLOG_SKIP_PLAIN to conditionally skip +an entire llog plain. + +This could speed up the catalog processing for specific usages when a +record needs to be accessed in the "middle" of the catalog. This could +be useful for changelog with several users or HSM. + +This patch modifies chlg_read_cat_process_cb() to use LLOG_SKIP_PLAIN. 
+The main idea came from: d813c75d ("LU-14688 mdt: changelog purge +deletes plain llog") + +**Performance test:** + +* Environment: +2474195 changelog records stored on the mds0 (40 llog plain): +mds# lctl get_param -n mdd.lustrefs-MDT0000.changelog_users +current index: 2474195 +ID index (idle seconds) +cl1 0 (3509) + +* Test +Access to records at the end of the catalog (offset: 2474194): +client# time lfs changelog lustrefs-MDT0000 2474194 >/dev/null + +* Results +- with the patch: real 0m0.592s +- without the patch: real 0m17.835s (x30) + +Lustre-change: https://review.whamcloud.com/46310 +Lustre-commit: aa22a6826ee521ab14994a4533b0dbffb529aab0 + +Signed-off-by: Etienne AUJAMES +Change-Id: I887d5bef1f3a6a31c46bc58959e0f508266c53d2 +Reviewed-by: Alexander Boyko +Reviewed-by: Andreas Dilger +Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/48771 +Tested-by: jenkins +Tested-by: Maloo +Reviewed-by: Oleg Drokin +Reviewed-by: Etienne AUJAMES +--- + lustre/include/lustre_log.h | 17 +++++++++++++++++ + lustre/mdc/mdc_changelog.c | 5 +++++ + lustre/mdd/mdd_device.c | 4 ++-- + lustre/obdclass/llog.c | 6 ++++-- + lustre/obdclass/llog_cat.c | 6 ++++++ + 5 files changed, 34 insertions(+), 4 deletions(-) + +diff --git a/lustre/include/lustre_log.h b/lustre/include/lustre_log.h +index 360ba26dd5..1fdca73715 100644 +--- a/lustre/include/lustre_log.h ++++ b/lustre/include/lustre_log.h +@@ -340,6 +340,7 @@ struct llog_ctxt { + #define LLOG_PROC_BREAK 0x0001 + #define LLOG_DEL_RECORD 0x0002 + #define LLOG_DEL_PLAIN 0x0003 ++#define LLOG_SKIP_PLAIN 0x0004 + + static inline int llog_obd2ops(struct llog_ctxt *ctxt, + const struct llog_operations **lop) +@@ -519,6 +520,22 @@ static inline int llog_is_full(struct llog_handle *llh) + return llh->lgh_last_idx >= LLOG_HDR_BITMAP_SIZE(llh->lgh_hdr) - 1; + } + ++/* Determine if a llog plain of a catalog could be skipped based on record ++ * custom indexes. ++ * This assumes that indexes follow each other. 
The number of records to skip ++ * can be computed based on a starting offset and the index of the current ++ * record (in llog catalog callback). ++ */ ++static inline int llog_is_plain_skipable(struct llog_log_hdr *lh, ++ struct llog_rec_hdr *rec, ++ __u64 curr, __u64 start) ++{ ++ if (start == 0 || curr >= start) ++ return 0; ++ ++ return (LLOG_HDR_BITMAP_SIZE(lh) - rec->lrh_index) < (start - curr); ++} ++ + struct llog_cfg_rec { + struct llog_rec_hdr lcr_hdr; + struct lustre_cfg lcr_cfg; +diff --git a/lustre/mdc/mdc_changelog.c b/lustre/mdc/mdc_changelog.c +index 843c4de8a4..aea492b9f5 100644 +--- a/lustre/mdc/mdc_changelog.c ++++ b/lustre/mdc/mdc_changelog.c +@@ -228,6 +228,11 @@ static int chlg_read_cat_process_cb(const struct lu_env *env, + RETURN(rc); + } + ++ /* Check if we can skip the entire llog plain */ ++ if (llog_is_plain_skipable(llh->lgh_hdr, hdr, rec->cr.cr_index, ++ crs->crs_start_offset)) ++ RETURN(LLOG_SKIP_PLAIN); ++ + /* Skip undesired records */ + if (rec->cr.cr_index < crs->crs_start_offset) + RETURN(0); +diff --git a/lustre/mdd/mdd_device.c b/lustre/mdd/mdd_device.c +index 37667afde0..94fb1f64a5 100644 +--- a/lustre/mdd/mdd_device.c ++++ b/lustre/mdd/mdd_device.c +@@ -393,8 +393,8 @@ static int llog_changelog_cancel_cb(const struct lu_env *env, + * last cr_index at this plain llog. And if it less then cookie endrec + * cancel the whole file. 
+ */ +- if ((LLOG_HDR_BITMAP_SIZE(llh->lgh_hdr) - hdr->lrh_index + +- rec->cr.cr_index) < cl_cookie->endrec) { ++ if (llog_is_plain_skipable(llh->lgh_hdr, hdr, rec->cr.cr_index, ++ cl_cookie->endrec)) { + int rc; + + if (unlikely(OBD_FAIL_PRECHECK(OBD_FAIL_MDS_CHANGELOG_DEL))) { +diff --git a/lustre/obdclass/llog.c b/lustre/obdclass/llog.c +index 2c45c9673a..79f89fe365 100644 +--- a/lustre/obdclass/llog.c ++++ b/lustre/obdclass/llog.c +@@ -736,7 +736,8 @@ repeat: + lgc->lgc_index = tmp_idx; + } + +- if (rc == LLOG_PROC_BREAK) { ++ if (rc == LLOG_PROC_BREAK || ++ rc == LLOG_SKIP_PLAIN) { + GOTO(out, rc); + } else if (rc == LLOG_DEL_RECORD) { + rc = llog_cancel_rec(lpi->lpi_env, +@@ -1005,7 +1006,8 @@ int llog_reverse_process(const struct lu_env *env, + sizeof(*tail); + + rc = cb(env, loghandle, rec, data); +- if (rc == LLOG_PROC_BREAK) { ++ if (rc == LLOG_PROC_BREAK || ++ rc == LLOG_SKIP_PLAIN) { + GOTO(out, rc); + } else if (rc == LLOG_DEL_RECORD) { + rc = llog_cancel_rec(env, loghandle, +diff --git a/lustre/obdclass/llog_cat.c b/lustre/obdclass/llog_cat.c +index ba44ad3003..4f5b0b1133 100644 +--- a/lustre/obdclass/llog_cat.c ++++ b/lustre/obdclass/llog_cat.c +@@ -895,6 +895,9 @@ out: + if (rc == LLOG_DEL_PLAIN || rc == LLOG_DEL_RECORD) + /* clear wrong catalog entry */ + rc = llog_cat_cleanup(env, cat_llh, llh, rec->lrh_index); ++ else if (rc == LLOG_SKIP_PLAIN) ++ /* processing callback ask to skip the llog -> continue */ ++ rc = 0; + + if (llh) + llog_handle_put(env, llh); +@@ -1060,6 +1063,9 @@ static int llog_cat_reverse_process_cb(const struct lu_env *env, + } else if (rc == LLOG_DEL_RECORD) { + /* clear wrong catalog entry */ + rc = llog_cat_cleanup(env, cat_llh, NULL, rec->lrh_index); ++ } else if (rc == LLOG_SKIP_PLAIN) { ++ /* processing callback ask to skip the llog -> continue */ ++ rc = 0; + } + if (rc) + RETURN(rc); +-- +2.33.0 + diff --git a/0012-LU-6612-utils-strengthen-llog_reader-vs-wrong-format.patch 
b/0012-LU-6612-utils-strengthen-llog_reader-vs-wrong-format.patch new file mode 100644 index 0000000000000000000000000000000000000000..43c82ae57f9fa191007f9f4710a59496b91b4f1b --- /dev/null +++ b/0012-LU-6612-utils-strengthen-llog_reader-vs-wrong-format.patch @@ -0,0 +1,74 @@ +From badba63a54e905129dbdf28e31026580453ea337 Mon Sep 17 00:00:00 2001 +From: Bruno Faccini +Date: Wed, 22 Feb 2023 11:21:06 -0800 +Subject: [PATCH 12/61] LU-6612 utils: strengthen llog_reader vs wrong + format/header + +The following snippet shows that llog_reader can be puzzled due to +an invalid 0 for the number of records when parsing an expected +LLOG file header : +root# dd if=/dev/zero bs=4096 count=1 of=/tmp/zeroes +1+0 records in +1+0 records out +4096 bytes (4.1 kB) copied, 0.000263962 s, 15.5 MB/s +root# llog_reader /tmp/zeroes +Memory Alloc for recs_buf error. +Could not pack buffer; rc=-12 + +Lustre-change: https://review.whamcloud.com/15654 +Lustre-commit: 45291b8c06eebf33d3654db3a7d3cfc5836004a6 + +Test-Parameters: trivial testlist=sanity,sanity-hsm +Signed-off-by: Bruno Faccini +Change-Id: I12be79e6c6a5da384a5fd81878a76a7ea8aa5834 +Reviewed-by: Andreas Dilger +Reviewed-by: Mike Pershin +Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/48900 +Tested-by: jenkins +Tested-by: Maloo +Reviewed-by: Oleg Drokin +Reviewed-by: Mikhail Pershin +--- + lustre/utils/llog_reader.c | 8 ++++++-- + 1 file changed, 6 insertions(+), 2 deletions(-) + +diff --git a/lustre/utils/llog_reader.c b/lustre/utils/llog_reader.c +index dc2ef05127..153a471cc2 100644 +--- a/lustre/utils/llog_reader.c ++++ b/lustre/utils/llog_reader.c +@@ -222,6 +222,9 @@ int llog_pack_buffer(int fd, struct llog_log_hdr **llog, + int count; + int i, last_idx; + ++ *recs = NULL; ++ *recs_number = 0; ++ + rc = fstat(fd, &st); + if (rc < 0) { + rc = -errno; +@@ -270,11 +273,13 @@ int llog_pack_buffer(int fd, struct llog_log_hdr **llog, + } else if (count == 0) { + llapi_printf(LLAPI_MSG_NORMAL, + "uninitialized 
llog: zero record number\n"); +- *recs_number = 0; + goto clear_file_buf; + } ++ + /* the llog header not countable here.*/ + recs_num = count - 1; ++ if (recs_num == 0) ++ goto clear_file_buf; + + recs_buf = calloc(recs_num, sizeof(**recs_pr)); + if (!recs_buf) { +@@ -343,7 +348,6 @@ int llog_pack_buffer(int fd, struct llog_log_hdr **llog, + + *recs = recs_pr; + *recs_number = recs_num; +- + out: + return rc; + +-- +2.33.0 + diff --git a/0013-LU-16052-llog-handle-EBADR-for-catalog-processing.patch b/0013-LU-16052-llog-handle-EBADR-for-catalog-processing.patch new file mode 100644 index 0000000000000000000000000000000000000000..486e3472ac5d4f2b177e337eb2131d382b2190b9 --- /dev/null +++ b/0013-LU-16052-llog-handle-EBADR-for-catalog-processing.patch @@ -0,0 +1,46 @@ +From b996d1e0276fdf6c084410cd1dcfac0df13437fe Mon Sep 17 00:00:00 2001 +From: Mikhail Pershin +Date: Mon, 17 Oct 2022 16:29:52 -0700 +Subject: [PATCH 13/61] LU-16052 llog: handle -EBADR for catalog processing + +Llog catalog processing might retry to get the last llog block +to check for new records if any. That might return -EBADR code +which should be considered as valid. Previously -EIO was +returned in all cases. 
+ +Run conf-sanity test_106 several times as specific test + +Lustre-change: https://review.whamcloud.com/48070 +Lustre-commit: e260f751f2a21fa126eeb4bc9e94250ba3e815f1 + +Test-Parameters: testlist=conf-sanity env=ONLY=106,SLOW=yes,ONLY_REPEAT=10 +Signed-off-by: Mikhail Pershin +Change-Id: I30e04ba2c91c8bdce72c95675a1209639e9f0570 +Reviewed-by: Andreas Dilger +Reviewed-by: Etienne AUJAMES +Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/48772 +Tested-by: jenkins +Tested-by: Maloo +Reviewed-by: Oleg Drokin +--- + lustre/obdclass/llog.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/lustre/obdclass/llog.c b/lustre/obdclass/llog.c +index 79f89fe365..b23645d8b4 100644 +--- a/lustre/obdclass/llog.c ++++ b/lustre/obdclass/llog.c +@@ -576,8 +576,8 @@ repeat: + GOTO(out, rc = 0); + /* we`ve tried to reread the chunk, but there is no + * new records */ +- if (rc == -EIO && repeated && (chunk_offset + buf_offset) == +- cur_offset) ++ if (repeated && (chunk_offset + buf_offset) == cur_offset && ++ (rc == -EBADR || rc == -EIO)) + GOTO(out, rc = 0); + if (rc != 0) + GOTO(out, rc); +-- +2.33.0 + diff --git a/0014-LU-16717-mdt-resume-dir-migration-with-bad_type.patch b/0014-LU-16717-mdt-resume-dir-migration-with-bad_type.patch new file mode 100644 index 0000000000000000000000000000000000000000..e8d45dc8cd68f11ae95f36a62a7bcfdc50429f32 --- /dev/null +++ b/0014-LU-16717-mdt-resume-dir-migration-with-bad_type.patch @@ -0,0 +1,144 @@ +From 1c882aebeaac4970c78a3616f1dd96d0920d133f Mon Sep 17 00:00:00 2001 +From: Lai Siyao +Date: Fri, 28 Apr 2023 05:22:03 -0400 +Subject: [PATCH 14/61] LU-16717 mdt: resume dir migration with bad_type + +LFSCK may set hash type to "none,bad_type" upon migration failure, +set it back to "fnv_1a_64,migrating,bad_type,fixed" to allow +migration resumption. fnv_1a_64 is set because it's the default hash +type, and now that we don't know the hash type in the original +migration command, just try with it. 
+ +LFSCK now just adds the "bad_type" flag on such a directory, so that such +migration can always be resumed in the future. + +Add sanity 230z. + +Lustre-change: https://review.whamcloud.com/50797 +Lustre-commit: 151650e468ab423e831c30d635ea380e0434a122 + +Signed-off-by: Lai Siyao +Change-Id: I19606aefcb9115e6724843785aea89a1c380e23f +Reviewed-by: Andreas Dilger +Reviewed-by: Hongchao Zhang +Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/51243 +Tested-by: jenkins +Tested-by: Maloo +Reviewed-by: Oleg Drokin +--- + lustre/lfsck/lfsck_striped_dir.c | 6 +++++- + lustre/mdt/mdt_reint.c | 32 ++++++++++++++++++++++++++++++++ + lustre/tests/sanity.sh | 30 ++++++++++++++++++++++++++++++ + 3 files changed, 67 insertions(+), 1 deletion(-) + +diff --git a/lustre/lfsck/lfsck_striped_dir.c b/lustre/lfsck/lfsck_striped_dir.c +index 63a875403f..2aa2c7d6e3 100644 +--- a/lustre/lfsck/lfsck_striped_dir.c ++++ b/lustre/lfsck/lfsck_striped_dir.c +@@ -1567,7 +1567,11 @@ int lfsck_namespace_repair_bad_name_hash(const struct lu_env *env, + GOTO(log, rc = 1); + + *lmv2 = llmv->ll_lmv; +- lmv2->lmv_hash_type = LMV_HASH_TYPE_UNKNOWN | LMV_HASH_FLAG_BAD_TYPE; ++ /* only set BAD_TYPE here, do not clear hash type or MIGRATION flag, ++ * so that user can resume dir migration if this is caused by dir ++ * migration failure. 
++ */ ++ lmv2->lmv_hash_type |= LMV_HASH_FLAG_BAD_TYPE; + rc = lfsck_namespace_set_lmv_master(env, com, parent, lmv2, + lfsck_dto2fid(shard), + llmv->ll_lmv.lmv_master_mdt_index, +diff --git a/lustre/mdt/mdt_reint.c b/lustre/mdt/mdt_reint.c +index a2ddbbcf07..23948879cd 100644 +--- a/lustre/mdt/mdt_reint.c ++++ b/lustre/mdt/mdt_reint.c +@@ -2207,6 +2207,17 @@ close: + return rc ?: rc2; + } + ++/* LFSCK used to clear hash type and MIGRATION flag upon migration failure */ ++static inline bool lmv_is_failed_migration(const struct lmv_mds_md_v1 *lmv) ++{ ++ return le32_to_cpu(lmv->lmv_hash_type) == ++ (LMV_HASH_TYPE_UNKNOWN | LMV_HASH_FLAG_BAD_TYPE) && ++ lmv_is_known_hash_type(le32_to_cpu(lmv->lmv_migrate_hash)) && ++ le32_to_cpu(lmv->lmv_migrate_offset) > 0 && ++ le32_to_cpu(lmv->lmv_migrate_offset) < ++ le32_to_cpu(lmv->lmv_stripe_count); ++} ++ + /* + * migrate file in below steps: + * 1. lock parent and its stripes +@@ -2384,6 +2395,27 @@ lock_parent: + if ((ma->ma_valid & MA_LMV) && + lmv_is_restriping(&ma->ma_lmv->lmv_md_v1)) + GOTO(unlock_links, rc = -EBUSY); ++ else if (lmv_is_failed_migration(&ma->ma_lmv->lmv_md_v1)) { ++ struct lu_buf *buf = &info->mti_buf; ++ struct lmv_mds_md_v1 *lmv = &ma->ma_lmv->lmv_md_v1; ++ __u32 version = le32_to_cpu(lmv->lmv_layout_version); ++ ++ /* migration failed before, and LFSCK cleared hash type ++ * and flags, fake it to resume migration. 
++ */ ++ lmv->lmv_hash_type = ++ cpu_to_le32(LMV_HASH_TYPE_FNV_1A_64 | ++ LMV_HASH_FLAG_MIGRATION | ++ LMV_HASH_FLAG_BAD_TYPE | ++ LMV_HASH_FLAG_FIXED); ++ lmv->lmv_layout_version = cpu_to_le32(version + 1); ++ buf->lb_buf = lmv; ++ buf->lb_len = sizeof(*lmv); ++ rc = mo_xattr_set(env, mdt_object_child(sobj), buf, ++ XATTR_NAME_LMV, LU_XATTR_REPLACE); ++ mo_invalidate(env, mdt_object_child(sobj)); ++ GOTO(unlock_links, rc = -EALREADY); ++ } + } + + /* if migration HSM is allowed */ +diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh +index 6cd539a737..93f8c3b30c 100755 +--- a/lustre/tests/sanity.sh ++++ b/lustre/tests/sanity.sh +@@ -20661,6 +20661,36 @@ test_230y() { + } + run_test 230y "unlink dir with bad hash type" + ++test_230z() { ++ (( MDSCOUNT > 1 )) || skip "needs >= 2 MDTs" ++ (( MDS1_VERSION >= $(version_code 2.15.3) )) || ++ skip "Need MDS version at least 2.15.3" ++ ++ local pid ++ ++ test_mkdir -c -1 $DIR/$tdir || error "mkdir $tdir failed" ++ $LFS getdirstripe $DIR/$tdir ++ createmany -d $DIR/$tdir/d 100 || error "createmany failed" ++ $LFS migrate -m 1 -c 2 -H fnv_1a_64 $DIR/$tdir & ++ pid=$! 
++ sleep 1 ++ ++ #OBD_FAIL_MIGRATE_BAD_HASH 0x1802 ++ do_facet mds2 lctl set_param fail_loc=0x1802 ++ ++ wait $pid ++ do_facet mds2 lctl set_param fail_loc=0 ++ $LFS getdirstripe $DIR/$tdir ++ ++ # resume migration ++ $LFS migrate -m 1 -c 2 -H fnv_1a_64 $DIR/$tdir || ++ error "resume migration failed" ++ $LFS getdirstripe $DIR/$tdir ++ [ $($LFS getdirstripe -H $DIR/$tdir) == "fnv_1a_64,fixed" ] || ++ error "migration is not finished" ++} ++run_test 230z "resume dir migration with bad hash type" ++ + test_231a() + { + # For simplicity this test assumes that max_pages_per_rpc +-- +2.33.0 + diff --git a/0015-LU-14668-lnet-Lock-primary-NID-logic.patch b/0015-LU-14668-lnet-Lock-primary-NID-logic.patch new file mode 100644 index 0000000000000000000000000000000000000000..7a2a69d26b47a75d48c104a2abe7793af06c779a --- /dev/null +++ b/0015-LU-14668-lnet-Lock-primary-NID-logic.patch @@ -0,0 +1,241 @@ +From b341288179d9b3ad594b461586d826d6811db5a1 Mon Sep 17 00:00:00 2001 +From: Amir Shehata +Date: Wed, 5 May 2021 11:35:06 -0700 +Subject: [PATCH 15/61] LU-14668 lnet: Lock primary NID logic + +If a peer is created by Lustre make sure to lock that peer's +primary NID. This peer can be discovered in the background. +There is no need to block until discovery is complete, as Lustre +can continue on with the primary NID it provided. + +Discovery will populate the peer with other interfaces the peer has +but will not change the peer's primary NID. It can also delete +peer's NIDs which Lustre told it about (not the Primary NID). + +If a peer has been manually discovered via + lnetctl discover +command, then make sure to delete the manually discovered +peer and recreate it with the Lustre NID information +provided for us. 
+ +Lustre-change: https://review.whamcloud.com/50106 +Lustre-commit: aacb16191a72bc6db1155030849efb0d6971a572 + +Signed-off-by: Amir Shehata +Change-Id: I8fc8a69caccca047e3085bb33d026a3f09fb359b +Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/51130 +Tested-by: jenkins +Tested-by: Maloo +Reviewed-by: Frank Sehr +Reviewed-by: Cyril Bordage +Reviewed-by: Oleg Drokin +--- + lnet/lnet/peer.c | 113 +++++++++++++++++++++++++++++++++++++---------- + 1 file changed, 90 insertions(+), 23 deletions(-) + +diff --git a/lnet/lnet/peer.c b/lnet/lnet/peer.c +index 7a438ea086..01eb9aa8aa 100644 +--- a/lnet/lnet/peer.c ++++ b/lnet/lnet/peer.c +@@ -546,6 +546,15 @@ lnet_peer_del_nid(struct lnet_peer *lp, lnet_nid_t nid4, unsigned int flags) + } + + lpni = lnet_peer_ni_find_locked(&nid); ++ /* If we're asked to lock down the primary NID we shouldn't be ++ * deleting it ++ */ ++ if (lp->lp_state & LNET_PEER_LOCK_PRIMARY && ++ nid_same(&primary_nid, &nid)) { ++ rc = -EPERM; ++ goto out; ++ } ++ + if (!lpni) { + rc = -ENOENT; + goto out; +@@ -1420,6 +1429,19 @@ LNetAddPeer(lnet_nid_t *nids, __u32 num_nids) + if (!pnid) { + pnid = nids[i]; + rc = lnet_add_peer_ni(pnid, LNET_NID_ANY, mr, true); ++ if (rc == -EALREADY) { ++ struct lnet_peer *lp; ++ ++ CDEBUG(D_NET, "A peer exists for NID %s\n", ++ libcfs_nid2str(pnid)); ++ rc = 0; ++ /* Adds a refcount */ ++ lp = lnet_find_peer4(pnid); ++ LASSERT(lp); ++ pnid = lnet_nid_to_nid4(&lp->lp_primary_nid); ++ /* Drop refcount from lookup */ ++ lnet_peer_decref_locked(lp); ++ } + } else if (lnet_peer_discovery_disabled) { + rc = lnet_add_peer_ni(nids[i], LNET_NID_ANY, mr, true); + } else { +@@ -1466,13 +1488,20 @@ LNetPrimaryNID(lnet_nid_t nid) + * down then this discovery can introduce long delays into the mount + * process, so skip it if it isn't necessary. 
+ */ +- while (!lnet_peer_discovery_disabled && !lnet_peer_is_uptodate(lp)) { +- spin_lock(&lp->lp_lock); ++ spin_lock(&lp->lp_lock); ++ if (!lnet_peer_discovery_disabled && ++ (!(lp->lp_state & LNET_PEER_LOCK_PRIMARY) || ++ !lnet_peer_is_uptodate_locked(lp))) { + /* force a full discovery cycle */ +- lp->lp_state |= LNET_PEER_FORCE_PING | LNET_PEER_FORCE_PUSH; ++ lp->lp_state |= LNET_PEER_FORCE_PING | LNET_PEER_FORCE_PUSH | ++ LNET_PEER_LOCK_PRIMARY; + spin_unlock(&lp->lp_lock); + +- rc = lnet_discover_peer_locked(lpni, cpt, true); ++ /* start discovery in the background. Messages to that ++ * peer will not go through until the discovery is ++ * complete ++ */ ++ rc = lnet_discover_peer_locked(lpni, cpt, false); + if (rc) + goto out_decref; + /* The lpni (or lp) for this NID may have changed and our ref is +@@ -1486,14 +1515,8 @@ LNetPrimaryNID(lnet_nid_t nid) + goto out_unlock; + } + lp = lpni->lpni_peer_net->lpn_peer; +- +- /* If we find that the peer has discovery disabled then we will +- * not modify whatever primary NID is currently set for this +- * peer. Thus, we can break out of this loop even if the peer +- * is not fully up to date. 
+- */ +- if (lnet_is_discovery_disabled(lp)) +- break; ++ } else { ++ spin_unlock(&lp->lp_lock); + } + primary_nid = lnet_nid_to_nid4(&lp->lp_primary_nid); + out_decref: +@@ -1530,9 +1553,9 @@ lnet_peer_get_net_locked(struct lnet_peer *peer, __u32 net_id) + */ + static int + lnet_peer_attach_peer_ni(struct lnet_peer *lp, +- struct lnet_peer_net *lpn, +- struct lnet_peer_ni *lpni, +- unsigned flags) ++ struct lnet_peer_net *lpn, ++ struct lnet_peer_ni *lpni, ++ unsigned flags) + { + struct lnet_peer_table *ptable; + bool new_lpn = false; +@@ -1599,6 +1622,8 @@ lnet_peer_attach_peer_ni(struct lnet_peer *lp, + lnet_peer_clr_non_mr_pref_nids(lp); + } + } ++ if (flags & LNET_PEER_LOCK_PRIMARY) ++ lp->lp_state |= LNET_PEER_LOCK_PRIMARY; + spin_unlock(&lp->lp_lock); + + lp->lp_nnis++; +@@ -1661,13 +1686,28 @@ lnet_peer_add(lnet_nid_t nid4, unsigned int flags) + else if ((lp->lp_state ^ flags) & LNET_PEER_MULTI_RAIL) + rc = -EPERM; + goto out; +- } else if (!(flags & LNET_PEER_CONFIGURED)) { +- if (lnet_nid_to_nid4(&lp->lp_primary_nid) == nid4) { ++ } else if (lp->lp_state & LNET_PEER_LOCK_PRIMARY) { ++ if (nid_same(&lp->lp_primary_nid, &nid)) { + rc = -EEXIST; + goto out; + } ++ /* we're trying to recreate an existing peer which ++ * has already been created and its primary ++ * locked. This is likely due to two servers ++ * existing on the same node. So we'll just refer ++ * to that node with the primary NID which was ++ * first added by Lustre ++ */ ++ rc = -EALREADY; ++ goto out; + } +- /* Delete and recreate as a configured peer. */ ++ /* Delete and recreate the peer. ++ * We can get here: ++ * 1. If the peer is being recreated as a configured NID ++ * 2. 
if there already exists a peer which ++ * was discovered manually, but is recreated via Lustre ++ * with PRIMARY_lock ++ */ + rc = lnet_peer_del(lp); + if (rc) + goto out; +@@ -1760,9 +1800,27 @@ lnet_peer_add_nid(struct lnet_peer *lp, lnet_nid_t nid4, unsigned int flags) + } + /* If this is the primary NID, destroy the peer. */ + if (lnet_peer_ni_is_primary(lpni)) { +- struct lnet_peer *rtr_lp = ++ struct lnet_peer *lp2 = + lpni->lpni_peer_net->lpn_peer; +- int rtr_refcount = rtr_lp->lp_rtr_refcount; ++ int rtr_refcount = lp2->lp_rtr_refcount; ++ ++ /* If the new peer that this NID belongs to is ++ * a primary NID for another peer which we're ++ * suppose to preserve the Primary for then we ++ * don't want to mess with it. But the ++ * configuration is wrong at this point, so we ++ * should flag both of these peers as in a bad ++ * state ++ */ ++ if (lp2->lp_state & LNET_PEER_LOCK_PRIMARY) { ++ spin_lock(&lp->lp_lock); ++ lp->lp_state |= LNET_PEER_BAD_CONFIG; ++ spin_unlock(&lp->lp_lock); ++ spin_lock(&lp2->lp_lock); ++ lp2->lp_state |= LNET_PEER_BAD_CONFIG; ++ spin_unlock(&lp2->lp_lock); ++ goto out_free_lpni; ++ } + /* + * if we're trying to delete a router it means + * we're moving this peer NI to a new peer so must +@@ -1770,9 +1828,9 @@ lnet_peer_add_nid(struct lnet_peer *lp, lnet_nid_t nid4, unsigned int flags) + */ + if (rtr_refcount > 0) { + flags |= LNET_PEER_RTR_NI_FORCE_DEL; +- lnet_rtr_transfer_to_peer(rtr_lp, lp); ++ lnet_rtr_transfer_to_peer(lp2, lp); + } +- lnet_peer_del(lpni->lpni_peer_net->lpn_peer); ++ lnet_peer_del(lp2); + lnet_peer_ni_decref_locked(lpni); + lpni = lnet_peer_ni_alloc(&nid); + if (!lpni) { +@@ -1830,7 +1888,8 @@ lnet_peer_set_primary_nid(struct lnet_peer *lp, lnet_nid_t nid, + if (lnet_nid_to_nid4(&lp->lp_primary_nid) == nid) + goto out; + +- lnet_nid4_to_nid(nid, &lp->lp_primary_nid); ++ if (!(lp->lp_state & LNET_PEER_LOCK_PRIMARY)) ++ lnet_nid4_to_nid(nid, &lp->lp_primary_nid); + + rc = lnet_peer_add_nid(lp, nid, flags); + if 
(rc) { +@@ -1838,6 +1897,14 @@ lnet_peer_set_primary_nid(struct lnet_peer *lp, lnet_nid_t nid, + goto out; + } + out: ++ /* if this is a configured peer or the primary for that peer has ++ * been locked, then we don't want to flag this scenario as ++ * a failure ++ */ ++ if (lp->lp_state & LNET_PEER_CONFIGURED || ++ lp->lp_state & LNET_PEER_LOCK_PRIMARY) ++ return 0; ++ + CDEBUG(D_NET, "peer %s NID %s: %d\n", + libcfs_nidstr(&old), libcfs_nid2str(nid), rc); + +-- +2.33.0 + diff --git a/0016-LU-14668-lnet-Peers-added-via-kernel-API-should-be-p.patch b/0016-LU-14668-lnet-Peers-added-via-kernel-API-should-be-p.patch new file mode 100644 index 0000000000000000000000000000000000000000..30b717fc43fd506df0f0bff79ec4178dbbdafb5a --- /dev/null +++ b/0016-LU-14668-lnet-Peers-added-via-kernel-API-should-be-p.patch @@ -0,0 +1,167 @@ +From f63e87f0a88a856d5cc38039afef704676ff5521 Mon Sep 17 00:00:00 2001 +From: Chris Horn +Date: Tue, 25 May 2021 11:17:49 -0500 +Subject: [PATCH 16/61] LU-14668 lnet: Peers added via kernel API should be + permanent + +The LNetAddPeer() API allows Lustre to predefine the Peer for LNet. +Originally these peers would be temporary and potentially re-created +via discovery. Instead, let's make these peers permanent. This allows +Lustre to dictate the primary NID of the peer. LNet makes sure this +primary NID is not changed afterwards. 
+ +Lustre-change: https://review.whamcloud.com/43788 +Lustre-commit: 41733dadd8ad0e87e44dd19e25e576e90484cb9b + +Test-Parameters: trivial +Signed-off-by: Amir Shehata +Signed-off-by: Chris Horn +Change-Id: I3f54c04719c9e0374176682af08183f0c93ef737 +Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/51131 +Tested-by: jenkins +Tested-by: Maloo +Reviewed-by: Frank Sehr +Reviewed-by: Cyril Bordage +Reviewed-by: Oleg Drokin +--- + lnet/include/lnet/lib-lnet.h | 2 +- + lnet/lnet/api-ni.c | 7 +++--- + lnet/lnet/peer.c | 43 ++++++++++++++++++++++-------------- + 3 files changed, 32 insertions(+), 20 deletions(-) + +diff --git a/lnet/include/lnet/lib-lnet.h b/lnet/include/lnet/lib-lnet.h +index 223c6d328b..a1261d137d 100644 +--- a/lnet/include/lnet/lib-lnet.h ++++ b/lnet/include/lnet/lib-lnet.h +@@ -1011,8 +1011,8 @@ void lnet_peer_clr_pref_rtrs(struct lnet_peer_ni *lpni); + int lnet_peer_add_pref_rtr(struct lnet_peer_ni *lpni, struct lnet_nid *nid); + int lnet_peer_ni_set_non_mr_pref_nid(struct lnet_peer_ni *lpni, + struct lnet_nid *nid); +-int lnet_add_peer_ni(lnet_nid_t key_nid, lnet_nid_t nid, bool mr, bool temp); + int lnet_del_peer_ni(lnet_nid_t key_nid, lnet_nid_t nid); ++int lnet_user_add_peer_ni(lnet_nid_t prim_nid, struct lnet_nid *nid, bool mr); + int lnet_get_peer_info(struct lnet_ioctl_peer_cfg *cfg, void __user *bulk); + int lnet_get_peer_ni_info(__u32 peer_index, __u64 *nid, + char alivness[LNET_MAX_STR_LEN], +diff --git a/lnet/lnet/api-ni.c b/lnet/lnet/api-ni.c +index b99c85b73d..08c28d5aac 100644 +--- a/lnet/lnet/api-ni.c ++++ b/lnet/lnet/api-ni.c +@@ -4204,9 +4204,10 @@ LNetCtl(unsigned int cmd, void *arg) + return -EINVAL; + + mutex_lock(&the_lnet.ln_api_mutex); +- rc = lnet_add_peer_ni(cfg->prcfg_prim_nid, +- cfg->prcfg_cfg_nid, +- cfg->prcfg_mr, false); ++ lnet_nid4_to_nid(cfg->prcfg_cfg_nid, &nid); ++ rc = lnet_user_add_peer_ni(cfg->prcfg_prim_nid, ++ &nid, ++ cfg->prcfg_mr); + mutex_unlock(&the_lnet.ln_api_mutex); + return rc; + } +diff 
--git a/lnet/lnet/peer.c b/lnet/lnet/peer.c +index 01eb9aa8aa..040fa36d34 100644 +--- a/lnet/lnet/peer.c ++++ b/lnet/lnet/peer.c +@@ -47,6 +47,8 @@ + #define LNET_REDISCOVER_PEER (1) + + static int lnet_peer_queue_for_discovery(struct lnet_peer *lp); ++static int lnet_add_peer_ni(lnet_nid_t prim_nid, lnet_nid_t nid, bool mr, ++ unsigned int flags); + + static void + lnet_peer_remove_from_remote_list(struct lnet_peer_ni *lpni) +@@ -1428,7 +1430,8 @@ LNetAddPeer(lnet_nid_t *nids, __u32 num_nids) + + if (!pnid) { + pnid = nids[i]; +- rc = lnet_add_peer_ni(pnid, LNET_NID_ANY, mr, true); ++ rc = lnet_add_peer_ni(pnid, LNET_NID_ANY, mr, ++ LNET_PEER_LOCK_PRIMARY); + if (rc == -EALREADY) { + struct lnet_peer *lp; + +@@ -1443,9 +1446,11 @@ LNetAddPeer(lnet_nid_t *nids, __u32 num_nids) + lnet_peer_decref_locked(lp); + } + } else if (lnet_peer_discovery_disabled) { +- rc = lnet_add_peer_ni(nids[i], LNET_NID_ANY, mr, true); ++ rc = lnet_add_peer_ni(nids[i], LNET_NID_ANY, mr, ++ LNET_PEER_LOCK_PRIMARY); + } else { +- rc = lnet_add_peer_ni(pnid, nids[i], mr, true); ++ rc = lnet_add_peer_ni(pnid, nids[i], mr, ++ LNET_PEER_LOCK_PRIMARY); + } + + if (rc && rc != -EEXIST) +@@ -1977,20 +1982,18 @@ out: + * The caller must hold ln_api_mutex. This prevents the peer from + * being created/modified/deleted by a different thread. 
+ */ +-int +-lnet_add_peer_ni(lnet_nid_t prim_nid, lnet_nid_t nid, bool mr, bool temp) ++static int ++lnet_add_peer_ni(lnet_nid_t prim_nid, lnet_nid_t nid, bool mr, ++ unsigned int flags) ++__must_hold(&the_lnet.ln_api_mutex) + { + struct lnet_peer *lp = NULL; + struct lnet_peer_ni *lpni; +- unsigned int flags = 0; + + /* The prim_nid must always be specified */ + if (prim_nid == LNET_NID_ANY) + return -EINVAL; + +- if (!temp) +- flags = LNET_PEER_CONFIGURED; +- + if (mr) + flags |= LNET_PEER_MULTI_RAIL; + +@@ -2008,13 +2011,6 @@ lnet_add_peer_ni(lnet_nid_t prim_nid, lnet_nid_t nid, bool mr, bool temp) + lnet_peer_ni_decref_locked(lpni); + lp = lpni->lpni_peer_net->lpn_peer; + +- /* Peer must have been configured. */ +- if (!temp && !(lp->lp_state & LNET_PEER_CONFIGURED)) { +- CDEBUG(D_NET, "peer %s was not configured\n", +- libcfs_nid2str(prim_nid)); +- return -ENOENT; +- } +- + /* Primary NID must match */ + if (lnet_nid_to_nid4(&lp->lp_primary_nid) != prim_nid) { + CDEBUG(D_NET, "prim_nid %s is not primary for peer %s\n", +@@ -2030,9 +2026,24 @@ lnet_add_peer_ni(lnet_nid_t prim_nid, lnet_nid_t nid, bool mr, bool temp) + return -EPERM; + } + ++ if ((flags & LNET_PEER_LOCK_PRIMARY) && ++ (lnet_peer_is_uptodate(lp) && ++ (lp->lp_state & LNET_PEER_LOCK_PRIMARY))) { ++ CDEBUG(D_NET, ++ "Don't add temporary peer NI for uptodate peer %s\n", ++ libcfs_nidstr(&lp->lp_primary_nid)); ++ return -EINVAL; ++ } ++ + return lnet_peer_add_nid(lp, nid, flags); + } + ++int lnet_user_add_peer_ni(lnet_nid_t prim_nid, struct lnet_nid *nid, bool mr) ++{ ++ return lnet_add_peer_ni(prim_nid, lnet_nid_to_nid4(nid), mr, ++ LNET_PEER_CONFIGURED); ++} ++ + /* + * Implementation of IOC_LIBCFS_DEL_PEER_NI. 
+ * +-- +2.33.0 + diff --git a/0017-LU-14668-lnet-don-t-delete-peer-created-by-Lustre.patch b/0017-LU-14668-lnet-don-t-delete-peer-created-by-Lustre.patch new file mode 100644 index 0000000000000000000000000000000000000000..9e6098db4ef95024c4e2b76ed6acb71454c7247b --- /dev/null +++ b/0017-LU-14668-lnet-don-t-delete-peer-created-by-Lustre.patch @@ -0,0 +1,98 @@ +From 26d11f254795a2869ae30a7e5d6ebf2bee59f879 Mon Sep 17 00:00:00 2001 +From: Amir Shehata +Date: Wed, 5 May 2021 23:02:22 -0700 +Subject: [PATCH 17/61] LU-14668 lnet: don't delete peer created by Lustre + +Peers created by Lustre have their primary NIDs locked. +If that peer is deleted, it'll confuse lustre. So when manually +deleting a peer using: + lnetctl peer del --prim_nid ... +We must continue to preserve the primary NID. Therefore we delete +all the constituent NIDs, but keep the primary NID. We then +flag the peer for rediscovery. + +Lustre-change: https://review.whamcloud.com/43565 +Lustre-commit: 7cc5b4329fc2eecbf09dbda85efe58f4ad5a32b9 + +Signed-off-by: Amir Shehata +Change-Id: I34eef9b0049435a01fde87dc8263dd50f631c551 +Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/51132 +Tested-by: jenkins +Tested-by: Maloo +Reviewed-by: Frank Sehr +Reviewed-by: Cyril Bordage +Reviewed-by: Oleg Drokin +--- + lnet/lnet/peer.c | 49 ++++++++++++++++++++++++++++++++++++++++++++++-- + 1 file changed, 47 insertions(+), 2 deletions(-) + +diff --git a/lnet/lnet/peer.c b/lnet/lnet/peer.c +index 040fa36d34..3b45639b19 100644 +--- a/lnet/lnet/peer.c ++++ b/lnet/lnet/peer.c +@@ -2044,6 +2044,43 @@ int lnet_user_add_peer_ni(lnet_nid_t prim_nid, struct lnet_nid *nid, bool mr) + LNET_PEER_CONFIGURED); + } + ++static int ++lnet_reset_peer(struct lnet_peer *lp) ++{ ++ struct lnet_peer_net *lpn, *lpntmp; ++ struct lnet_peer_ni *lpni, *lpnitmp; ++ unsigned int flags; ++ int rc; ++ ++ lnet_peer_cancel_discovery(lp); ++ ++ flags = LNET_PEER_CONFIGURED; ++ if (lp->lp_state & LNET_PEER_MULTI_RAIL) ++ flags |= 
LNET_PEER_MULTI_RAIL; ++ ++ list_for_each_entry_safe(lpn, lpntmp, &lp->lp_peer_nets, ++ lpn_peer_nets) { ++ list_for_each_entry_safe(lpni, lpnitmp, &lpn->lpn_peer_nis, ++ lpni_peer_nis) { ++ if (nid_same(&lpni->lpni_nid, &lp->lp_primary_nid)) ++ continue; ++ ++ rc = lnet_peer_del_nid(lp, ++ lnet_nid_to_nid4(&lpni->lpni_nid), ++ flags); ++ if (rc) { ++ CERROR("Failed to delete %s from peer %s\n", ++ libcfs_nidstr(&lpni->lpni_nid), ++ libcfs_nidstr(&lp->lp_primary_nid)); ++ } ++ } ++ } ++ ++ /* mark it for discovery the next time we use it */ ++ lp->lp_state &= ~LNET_PEER_NIDS_UPTODATE; ++ return 0; ++} ++ + /* + * Implementation of IOC_LIBCFS_DEL_PEER_NI. + * +@@ -2087,8 +2124,16 @@ lnet_del_peer_ni(lnet_nid_t prim_nid, lnet_nid_t nid) + } + lnet_net_unlock(LNET_LOCK_EX); + +- if (nid == LNET_NID_ANY || nid == lnet_nid_to_nid4(&lp->lp_primary_nid)) +- return lnet_peer_del(lp); ++ if (nid == LNET_NID_ANY || ++ nid == lnet_nid_to_nid4(&lp->lp_primary_nid)) { ++ if (lp->lp_state & LNET_PEER_LOCK_PRIMARY) { ++ CERROR("peer %s created by Lustre. Must preserve primary NID, but will remove other NIDs\n", ++ libcfs_nidstr(&lp->lp_primary_nid)); ++ return lnet_reset_peer(lp); ++ } else { ++ return lnet_peer_del(lp); ++ } ++ } + + flags = LNET_PEER_CONFIGURED; + if (lp->lp_state & LNET_PEER_MULTI_RAIL) +-- +2.33.0 + diff --git a/0018-LU-14668-lnet-add-force-option-to-lnetctl-peer-del.patch b/0018-LU-14668-lnet-add-force-option-to-lnetctl-peer-del.patch new file mode 100644 index 0000000000000000000000000000000000000000..e007e314e07faf4f96a18a8c0aee7809d48cf57f --- /dev/null +++ b/0018-LU-14668-lnet-add-force-option-to-lnetctl-peer-del.patch @@ -0,0 +1,375 @@ +From 8c4df87ec21bf5d61dab4b6580fc7f7ecfa91e37 Mon Sep 17 00:00:00 2001 +From: Serguei Smirnov +Date: Mon, 27 Feb 2023 15:41:19 -0800 +Subject: [PATCH 18/61] LU-14668 lnet: add 'force' option to lnetctl peer del + +Add --force option to 'lnetctl peer del' command. 
+If the peer has primary NID locked, this option allows +for the peer to be deleted manually: + lnetctl peer del --prim_nid --force + +Add --prim_lock option to 'lnetctl peer add' command. +If specified, the primary NID of the peer is locked +such that it is going to be the NID used to identify +the peer in communications with Lustre layer. + +Lustre-change: https://review.whamcloud.com/50149 +Lustre-commit: f1b2d8d60c593a670b36006bcf9b040549d8c13a + +Test-Parameters: trivial +Signed-off-by: Serguei Smirnov +Change-Id: Ia6001856cfbce7b0c3288cff9b244b569d259647 +Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/51133 +Tested-by: jenkins +Tested-by: Maloo +Reviewed-by: Frank Sehr +Reviewed-by: Cyril Bordage +Reviewed-by: Oleg Drokin +--- + lnet/include/lnet/lib-lnet.h | 5 +++-- + lnet/include/uapi/linux/lnet/lnet-dlc.h | 4 +++- + lnet/lnet/api-ni.c | 6 +++-- + lnet/lnet/peer.c | 12 +++++----- + lnet/utils/lnetconfig/liblnetconfig.c | 24 +++++++++++--------- + lnet/utils/lnetconfig/liblnetconfig.h | 5 +++-- + lnet/utils/lnetctl.c | 29 +++++++++++++++++++++---- + lustre/doc/lnetctl.8 | 6 +++++ + lustre/tests/sanity-lnet.sh | 18 +++++++++++++++ + 9 files changed, 83 insertions(+), 26 deletions(-) + +diff --git a/lnet/include/lnet/lib-lnet.h b/lnet/include/lnet/lib-lnet.h +index a1261d137d..5805586fb2 100644 +--- a/lnet/include/lnet/lib-lnet.h ++++ b/lnet/include/lnet/lib-lnet.h +@@ -1011,8 +1011,9 @@ void lnet_peer_clr_pref_rtrs(struct lnet_peer_ni *lpni); + int lnet_peer_add_pref_rtr(struct lnet_peer_ni *lpni, struct lnet_nid *nid); + int lnet_peer_ni_set_non_mr_pref_nid(struct lnet_peer_ni *lpni, + struct lnet_nid *nid); +-int lnet_del_peer_ni(lnet_nid_t key_nid, lnet_nid_t nid); +-int lnet_user_add_peer_ni(lnet_nid_t prim_nid, struct lnet_nid *nid, bool mr); ++int lnet_del_peer_ni(lnet_nid_t key_nid, lnet_nid_t nid, int force); ++int lnet_user_add_peer_ni(lnet_nid_t prim_nid, struct lnet_nid *nid, bool mr, ++ bool lock_prim); + int 
lnet_get_peer_info(struct lnet_ioctl_peer_cfg *cfg, void __user *bulk); + int lnet_get_peer_ni_info(__u32 peer_index, __u64 *nid, + char alivness[LNET_MAX_STR_LEN], +diff --git a/lnet/include/uapi/linux/lnet/lnet-dlc.h b/lnet/include/uapi/linux/lnet/lnet-dlc.h +index 2b2c05fa3b..6718f34d93 100644 +--- a/lnet/include/uapi/linux/lnet/lnet-dlc.h ++++ b/lnet/include/uapi/linux/lnet/lnet-dlc.h +@@ -273,7 +273,9 @@ struct lnet_ioctl_peer_cfg { + struct libcfs_ioctl_hdr prcfg_hdr; + lnet_nid_t prcfg_prim_nid; + lnet_nid_t prcfg_cfg_nid; +- __u32 prcfg_count; ++ __u32 prcfg_count; /* ADD_PEER_NI: used for 'lock_prim' option ++ * DEL_PEER_NI: used for 'force' option ++ */ + __u32 prcfg_mr; + __u32 prcfg_state; + __u32 prcfg_size; +diff --git a/lnet/lnet/api-ni.c b/lnet/lnet/api-ni.c +index 08c28d5aac..3a56ea83a1 100644 +--- a/lnet/lnet/api-ni.c ++++ b/lnet/lnet/api-ni.c +@@ -4207,7 +4207,8 @@ LNetCtl(unsigned int cmd, void *arg) + lnet_nid4_to_nid(cfg->prcfg_cfg_nid, &nid); + rc = lnet_user_add_peer_ni(cfg->prcfg_prim_nid, + &nid, +- cfg->prcfg_mr); ++ cfg->prcfg_mr, ++ cfg->prcfg_count == 1); + mutex_unlock(&the_lnet.ln_api_mutex); + return rc; + } +@@ -4220,7 +4221,8 @@ LNetCtl(unsigned int cmd, void *arg) + + mutex_lock(&the_lnet.ln_api_mutex); + rc = lnet_del_peer_ni(cfg->prcfg_prim_nid, +- cfg->prcfg_cfg_nid); ++ cfg->prcfg_cfg_nid, ++ cfg->prcfg_count); + mutex_unlock(&the_lnet.ln_api_mutex); + return rc; + } +diff --git a/lnet/lnet/peer.c b/lnet/lnet/peer.c +index 3b45639b19..33a16f8470 100644 +--- a/lnet/lnet/peer.c ++++ b/lnet/lnet/peer.c +@@ -2038,10 +2038,12 @@ __must_hold(&the_lnet.ln_api_mutex) + return lnet_peer_add_nid(lp, nid, flags); + } + +-int lnet_user_add_peer_ni(lnet_nid_t prim_nid, struct lnet_nid *nid, bool mr) ++int lnet_user_add_peer_ni(lnet_nid_t prim_nid, struct lnet_nid *nid, bool mr, ++ bool lock_prim) + { +- return lnet_add_peer_ni(prim_nid, lnet_nid_to_nid4(nid), mr, +- LNET_PEER_CONFIGURED); ++ int fl = LNET_PEER_CONFIGURED | 
(LNET_PEER_LOCK_PRIMARY * lock_prim); ++ ++ return lnet_add_peer_ni(prim_nid, lnet_nid_to_nid4(nid), mr, fl); + } + + static int +@@ -2093,7 +2095,7 @@ lnet_reset_peer(struct lnet_peer *lp) + * being modified/deleted by a different thread. + */ + int +-lnet_del_peer_ni(lnet_nid_t prim_nid, lnet_nid_t nid) ++lnet_del_peer_ni(lnet_nid_t prim_nid, lnet_nid_t nid, int force) + { + struct lnet_peer *lp; + struct lnet_peer_ni *lpni; +@@ -2126,7 +2128,7 @@ lnet_del_peer_ni(lnet_nid_t prim_nid, lnet_nid_t nid) + + if (nid == LNET_NID_ANY || + nid == lnet_nid_to_nid4(&lp->lp_primary_nid)) { +- if (lp->lp_state & LNET_PEER_LOCK_PRIMARY) { ++ if (!force && lp->lp_state & LNET_PEER_LOCK_PRIMARY) { + CERROR("peer %s created by Lustre. Must preserve primary NID, but will remove other NIDs\n", + libcfs_nidstr(&lp->lp_primary_nid)); + return lnet_reset_peer(lp); +diff --git a/lnet/utils/lnetconfig/liblnetconfig.c b/lnet/utils/lnetconfig/liblnetconfig.c +index 9f0dabd903..bbf5d3824a 100644 +--- a/lnet/utils/lnetconfig/liblnetconfig.c ++++ b/lnet/utils/lnetconfig/liblnetconfig.c +@@ -630,7 +630,7 @@ int lustre_lnet_discover_nid(char *ping_nids, int force, int seq_no, + } + + static int lustre_lnet_handle_peer_nidlist(lnet_nid_t *nidlist, int num_nids, +- bool is_mr, __u32 cmd, ++ bool is_mr, int option, __u32 cmd, + char *cmd_type, char *err_str) + { + struct lnet_ioctl_peer_cfg data; +@@ -645,6 +645,7 @@ static int lustre_lnet_handle_peer_nidlist(lnet_nid_t *nidlist, int num_nids, + data.prcfg_mr = is_mr; + data.prcfg_prim_nid = nidlist[0]; + data.prcfg_cfg_nid = LNET_NID_ANY; ++ data.prcfg_count = option; + + rc = dispatch_peer_ni_cmd(cmd, &data, err_str, cmd_type); + +@@ -660,6 +661,7 @@ static int lustre_lnet_handle_peer_nidlist(lnet_nid_t *nidlist, int num_nids, + data.prcfg_mr = is_mr; + data.prcfg_prim_nid = nidlist[0]; + data.prcfg_cfg_nid = nidlist[nid_idx]; ++ data.prcfg_count = option; + + rc = dispatch_peer_ni_cmd(cmd, &data, err_str, cmd_type); + +@@ -675,6 +677,7 @@ 
static int lustre_lnet_handle_peer_nidlist(lnet_nid_t *nidlist, int num_nids, + LIBCFS_IOC_INIT_V2(data, prcfg_hdr); + data.prcfg_prim_nid = nidlist[0]; + data.prcfg_cfg_nid = LNET_NID_ANY; ++ data.prcfg_count = option; + + rc = dispatch_peer_ni_cmd(cmd, &data, err_str, cmd_type); + } +@@ -684,8 +687,8 @@ static int lustre_lnet_handle_peer_nidlist(lnet_nid_t *nidlist, int num_nids, + + static int + lustre_lnet_mod_peer_nidlist(lnet_nid_t pnid, lnet_nid_t *lnet_nidlist, +- int cmd, int num_nids, bool is_mr, int seq_no, +- struct cYAML **err_rc) ++ int cmd, int num_nids, bool is_mr, int option, ++ int seq_no, struct cYAML **err_rc) + { + int rc = LUSTRE_CFG_RC_NO_ERR; + char err_str[LNET_MAX_STR_LEN]; +@@ -706,8 +709,8 @@ lustre_lnet_mod_peer_nidlist(lnet_nid_t pnid, lnet_nid_t *lnet_nidlist, + (num_nids - 1)); + + rc = lustre_lnet_handle_peer_nidlist(lnet_nidlist2, +- num_nids, is_mr, ioc_cmd, +- cmd_str, err_str); ++ num_nids, is_mr, option, ++ ioc_cmd, cmd_str, err_str); + out: + if (lnet_nidlist2) + free(lnet_nidlist2); +@@ -734,8 +737,8 @@ replace_sep(char *str, char sep, char newsep) + } + } + +-int lustre_lnet_modify_peer(char *prim_nid, char *nids, bool is_mr, +- int cmd, int seq_no, struct cYAML **err_rc) ++int lustre_lnet_modify_peer(char *prim_nid, char *nids, bool is_mr, int cmd, ++ int option, int seq_no, struct cYAML **err_rc) + { + int num_nids, rc; + char err_str[LNET_MAX_STR_LEN] = "Error"; +@@ -774,7 +777,7 @@ int lustre_lnet_modify_peer(char *prim_nid, char *nids, bool is_mr, + + rc = lustre_lnet_mod_peer_nidlist(pnid, lnet_nidlist, + cmd, num_nids, is_mr, +- -1, err_rc); ++ option, -1, err_rc); + + out: + if (rc != LUSTRE_CFG_RC_NO_ERR) +@@ -4696,6 +4699,7 @@ static int handle_yaml_peer_common(struct cYAML *tree, struct cYAML **show_rc, + struct cYAML *seq_no, *prim_nid, *mr, *peer_nis; + lnet_nid_t lnet_nidlist[LNET_MAX_NIDS_PER_PEER]; + lnet_nid_t pnid = LNET_NID_ANY; ++ int force = 0; + + seq_no = cYAML_get_object_item(tree, "seq_no"); + seqn = 
seq_no ? seq_no->cy_valueint : -1; +@@ -4762,8 +4766,8 @@ static int handle_yaml_peer_common(struct cYAML *tree, struct cYAML **show_rc, + } + + rc = lustre_lnet_mod_peer_nidlist(pnid, lnet_nidlist, cmd, +- num_nids, mr_value, seqn, +- err_rc); ++ num_nids, mr_value, force, ++ seqn, err_rc); + + failed: + if (nidstr) +diff --git a/lnet/utils/lnetconfig/liblnetconfig.h b/lnet/utils/lnetconfig/liblnetconfig.h +index 5096ac0c60..2c5397dce3 100644 +--- a/lnet/utils/lnetconfig/liblnetconfig.h ++++ b/lnet/utils/lnetconfig/liblnetconfig.h +@@ -664,11 +664,12 @@ int lustre_lnet_reset_stats(int seq_no, struct cYAML **err_rc); + * nids - a comma separated string of nids + * is_mr - Specifies whether this peer is MR capable. + * cmd - CONFIG or DELETE ++ * force - whether force-deleting a peer with locked primary nid + * seq_no - sequence number of the command + * err_rc - YAML structure of the resultant return code + */ +-int lustre_lnet_modify_peer(char *prim_nid, char *nids, bool is_mr, +- int cmd, int seq_no, struct cYAML **err_rc); ++int lustre_lnet_modify_peer(char *prim_nid, char *nids, bool is_mr, int cmd, ++ int force, int seq_no, struct cYAML **err_rc); + + /* + * lustre_lnet_show_peer +diff --git a/lnet/utils/lnetctl.c b/lnet/utils/lnetctl.c +index 51dfab605e..0f89e206e8 100644 +--- a/lnet/utils/lnetctl.c ++++ b/lnet/utils/lnetctl.c +@@ -258,12 +258,14 @@ command_t peer_cmds[] = { + "\t--prim_nid: Primary NID of the peer.\n" + "\t--nid: one or more peer NIDs\n" + "\t--non_mr: create this peer as not Multi-Rail capable\n" +- "\t--ip2nets: specify a range of nids per peer"}, ++ "\t--ip2nets: specify a range of nids per peer\n" ++ "\t--lock_prim: lock primary nid\n"}, + {"del", jt_del_peer_nid, 0, "delete a peer NID\n" + "\t--prim_nid: Primary NID of the peer.\n" + "\t--nid: list of NIDs to remove. 
If none provided,\n" + "\t peer is deleted\n" +- "\t--ip2nets: specify a range of nids per peer"}, ++ "\t--ip2nets: specify a range of nids per peer\n" ++ "\t--force: force-delete locked primary NID\n"}, + {"show", jt_show_peer, 0, "show peer information\n" + "\t--nid: NID of peer to filter on.\n" + "\t--verbose: display detailed output per peer." +@@ -2082,12 +2084,15 @@ static int jt_peer_nid_common(int argc, char **argv, int cmd) + char *prim_nid = NULL, *nidstr = NULL; + char err_str[LNET_MAX_STR_LEN] = "Error"; + struct cYAML *err_rc = NULL; ++ int force_lock = 0; + +- const char *const short_opts = "k:mn:"; ++ const char *const short_opts = "k:m:n:f:l"; + const struct option long_opts[] = { + { .name = "prim_nid", .has_arg = required_argument, .val = 'k' }, + { .name = "non_mr", .has_arg = no_argument, .val = 'm' }, + { .name = "nid", .has_arg = required_argument, .val = 'n' }, ++ { .name = "force", .has_arg = no_argument, .val = 'f' }, ++ { .name = "lock_prim", .has_arg = no_argument, .val = 'l' }, + { .name = NULL } }; + + rc = check_cmd(peer_cmds, "peer", "add", 2, argc, argv); +@@ -2112,6 +2117,22 @@ static int jt_peer_nid_common(int argc, char **argv, int cmd) + } + is_mr = false; + break; ++ case 'f': ++ if (cmd == LNETCTL_ADD_CMD) { ++ rc = LUSTRE_CFG_RC_BAD_PARAM; ++ snprintf(err_str, LNET_MAX_STR_LEN, ++ "Unrecognized option '-%c'", opt); ++ } ++ force_lock = 1; ++ break; ++ case 'l': ++ if (cmd == LNETCTL_DEL_CMD) { ++ rc = LUSTRE_CFG_RC_BAD_PARAM; ++ snprintf(err_str, LNET_MAX_STR_LEN, ++ "Unrecognized option '-%c'", opt); ++ } ++ force_lock = 1; ++ break; + case '?': + print_help(peer_cmds, "peer", + cmd == LNETCTL_ADD_CMD ? 
"add" : "del"); +@@ -2121,7 +2142,7 @@ static int jt_peer_nid_common(int argc, char **argv, int cmd) + } + + rc = lustre_lnet_modify_peer(prim_nid, nidstr, is_mr, cmd, +- -1, &err_rc); ++ force_lock, -1, &err_rc); + if (rc != LUSTRE_CFG_RC_NO_ERR) + goto out; + +diff --git a/lustre/doc/lnetctl.8 b/lustre/doc/lnetctl.8 +index f2a2793825..0df13e2493 100644 +--- a/lustre/doc/lnetctl.8 ++++ b/lustre/doc/lnetctl.8 +@@ -125,6 +125,9 @@ Configure an LNET peer with at least one supplied NID\. The primary NID must be + \-\-non_mr: create this peer as not Multi-Rail capable\. + . + .br ++\-\-lock_prim: lock primary NID of the peer for the purpose of identification with Lustre\. ++. ++.br + + .TP + \fBlnetctl peer\fR del +@@ -139,6 +142,9 @@ Delete a peer NID. The primary NID must be specified. If the removed NID is th + \-\-prim_nid: Primary NID of the peer\. + . + .br ++\-\-force: optional, use to delete a peer with primary NID locked\. ++. ++.br + + .TP + \fBlnetctl peer\fR show +diff --git a/lustre/tests/sanity-lnet.sh b/lustre/tests/sanity-lnet.sh +index 367cb19d15..4af5e96ef8 100755 +--- a/lustre/tests/sanity-lnet.sh ++++ b/lustre/tests/sanity-lnet.sh +@@ -932,6 +932,24 @@ EOF + } + run_test 25 "Delete all secondary nids from peer (tcp, gni and o2ib)" + ++test_26() { ++ reinit_dlc || return $? ++ ++ do_lnetctl peer add --prim_nid 1.1.1.1@tcp --lock_prim || ++ error "Peer add with --lock_prim option failed $?" ++ do_lnetctl peer del --prim_nid 1.1.1.1@tcp || ++ error "Peer del failed $?" ++ $LNETCTL peer show --nid 1.1.1.1@tcp | grep -q 1.1.1.1@tcp || ++ error "1.1.1.1@tcp is not listed" ++ do_lnetctl peer del --prim_nid 1.1.1.1@tcp --force || ++ error "Peer del --force failed $?" ++ do_lnetctl peer show --nid 1.1.1.1@tcp && ++ error "failed to delete 1.1.1.1@tcp" ++ ++ return 0 ++} ++run_test 26 "Delete peer with primary nid locked" ++ + test_99a() { + reinit_dlc || return $? 
+ +-- +2.33.0 + diff --git a/0019-LU-14668-lnet-add-lock_prim_nid-lnet-module-paramete.patch b/0019-LU-14668-lnet-add-lock_prim_nid-lnet-module-paramete.patch new file mode 100644 index 0000000000000000000000000000000000000000..5b41c683d88b92f89cda9652ef901087717a0db4 --- /dev/null +++ b/0019-LU-14668-lnet-add-lock_prim_nid-lnet-module-paramete.patch @@ -0,0 +1,245 @@ +From 6cfc8e55a2e77c9c91b81a8842e2cbd886025298 Mon Sep 17 00:00:00 2001 +From: Serguei Smirnov +Date: Tue, 28 Feb 2023 15:02:20 -0800 +Subject: [PATCH 19/61] LU-14668 lnet: add 'lock_prim_nid" lnet module + parameter + +Add 'lock_prim_nid' lnet module parameter to allow control +of how Lustre peer primary NID is selected. +If set to 1 (default), the NID specified by Lustre when +calling LNet API is designated as primary for the peer, +allowing for non-blocking discovery in the background. +If set to 0, peer discovery is blocking until complete +and the NID listed first in discovery response is designated +as primary. + +Lustre-change: https://review.whamcloud.com/50159 +Lustre-commit: fc7a0d6013b46ebc17cdfdccc04a5d1d92c6af24 + +Signed-off-by: Serguei Smirnov +Change-Id: I6ed1cb0c637f4aa7a7340a6f01819ba9a85858f4 +Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/51134 +Tested-by: jenkins +Tested-by: Maloo +Reviewed-by: Frank Sehr +Reviewed-by: Cyril Bordage +Reviewed-by: Oleg Drokin +--- + lnet/include/lnet/lib-lnet.h | 1 + + lnet/lnet/api-ni.c | 5 ++ + lnet/lnet/peer.c | 106 +++++++++++++++++++++++------------ + 3 files changed, 75 insertions(+), 37 deletions(-) + +diff --git a/lnet/include/lnet/lib-lnet.h b/lnet/include/lnet/lib-lnet.h +index 5805586fb2..5821d20be2 100644 +--- a/lnet/include/lnet/lib-lnet.h ++++ b/lnet/include/lnet/lib-lnet.h +@@ -615,6 +615,7 @@ extern int alive_router_check_interval; + extern int live_router_check_interval; + extern int dead_router_check_interval; + extern int portal_rotor; ++extern int lock_prim_nid; + + void lnet_mt_event_handler(struct lnet_event 
*event); + +diff --git a/lnet/lnet/api-ni.c b/lnet/lnet/api-ni.c +index 3a56ea83a1..fa7d702c2f 100644 +--- a/lnet/lnet/api-ni.c ++++ b/lnet/lnet/api-ni.c +@@ -243,6 +243,11 @@ module_param_call(lnet_response_tracking, response_tracking_set, param_get_int, + MODULE_PARM_DESC(lnet_response_tracking, + "(0|1|2|3) LNet Internal Only|GET Reply only|PUT ACK only|Full Tracking (default)"); + ++int lock_prim_nid = 1; ++module_param(lock_prim_nid, int, 0444); ++MODULE_PARM_DESC(lock_prim_nid, ++ "Whether nid passed down by Lustre is locked as primary"); ++ + #define LNET_LND_TIMEOUT_DEFAULT ((LNET_TRANSACTION_TIMEOUT_DEFAULT - 1) / \ + (LNET_RETRY_COUNT_DEFAULT + 1)) + unsigned int lnet_lnd_timeout = LNET_LND_TIMEOUT_DEFAULT; +diff --git a/lnet/lnet/peer.c b/lnet/lnet/peer.c +index 33a16f8470..4eb7a44cf4 100644 +--- a/lnet/lnet/peer.c ++++ b/lnet/lnet/peer.c +@@ -1411,6 +1411,7 @@ LNetAddPeer(lnet_nid_t *nids, __u32 num_nids) + lnet_nid_t pnid = 0; + bool mr; + int i, rc; ++ int flags = lock_prim_nid ? LNET_PEER_LOCK_PRIMARY : 0; + + if (!nids || num_nids < 1) + return -EINVAL; +@@ -1431,7 +1432,7 @@ LNetAddPeer(lnet_nid_t *nids, __u32 num_nids) + if (!pnid) { + pnid = nids[i]; + rc = lnet_add_peer_ni(pnid, LNET_NID_ANY, mr, +- LNET_PEER_LOCK_PRIMARY); ++ flags); + if (rc == -EALREADY) { + struct lnet_peer *lp; + +@@ -1447,10 +1448,10 @@ LNetAddPeer(lnet_nid_t *nids, __u32 num_nids) + } + } else if (lnet_peer_discovery_disabled) { + rc = lnet_add_peer_ni(nids[i], LNET_NID_ANY, mr, +- LNET_PEER_LOCK_PRIMARY); ++ flags); + } else { + rc = lnet_add_peer_ni(pnid, nids[i], mr, +- LNET_PEER_LOCK_PRIMARY); ++ flags); + } + + if (rc && rc != -EEXIST) +@@ -1493,37 +1494,54 @@ LNetPrimaryNID(lnet_nid_t nid) + * down then this discovery can introduce long delays into the mount + * process, so skip it if it isn't necessary. 
+ */ ++again: + spin_lock(&lp->lp_lock); +- if (!lnet_peer_discovery_disabled && +- (!(lp->lp_state & LNET_PEER_LOCK_PRIMARY) || +- !lnet_peer_is_uptodate_locked(lp))) { +- /* force a full discovery cycle */ +- lp->lp_state |= LNET_PEER_FORCE_PING | LNET_PEER_FORCE_PUSH | +- LNET_PEER_LOCK_PRIMARY; ++ if (!(lp->lp_state & LNET_PEER_LOCK_PRIMARY) && lock_prim_nid) ++ lp->lp_state |= LNET_PEER_LOCK_PRIMARY; ++ ++ /* DD disabled, nothing to do */ ++ if (lnet_peer_discovery_disabled) { ++ nid = lnet_nid_to_nid4(&lp->lp_primary_nid); + spin_unlock(&lp->lp_lock); ++ goto out_decref; ++ } + +- /* start discovery in the background. Messages to that +- * peer will not go through until the discovery is +- * complete +- */ +- rc = lnet_discover_peer_locked(lpni, cpt, false); +- if (rc) +- goto out_decref; +- /* The lpni (or lp) for this NID may have changed and our ref is +- * the only thing keeping the old one around. Release the ref +- * and lookup the lpni again +- */ +- lnet_peer_ni_decref_locked(lpni); +- lpni = lnet_find_peer_ni_locked(nid); +- if (!lpni) { +- rc = -ENOENT; +- goto out_unlock; +- } +- lp = lpni->lpni_peer_net->lpn_peer; +- } else { ++ /* Peer already up to date, nothing to do */ ++ if (lnet_peer_is_uptodate_locked(lp)) { ++ nid = lnet_nid_to_nid4(&lp->lp_primary_nid); + spin_unlock(&lp->lp_lock); ++ goto out_decref; ++ } ++ spin_unlock(&lp->lp_lock); ++ ++ /* If primary nid locking is enabled, discovery is performed ++ * in the background. ++ * If primary nid locking is disabled, discovery blocks here. ++ * Messages to the peer will not go through until the discovery is ++ * complete. ++ */ ++ if (lock_prim_nid) ++ rc = lnet_discover_peer_locked(lpni, cpt, false); ++ else ++ rc = lnet_discover_peer_locked(lpni, cpt, true); ++ if (rc) ++ goto out_decref; ++ ++ /* The lpni (or lp) for this NID may have changed and our ref is ++ * the only thing keeping the old one around. 
Release the ref ++ * and lookup the lpni again ++ */ ++ lnet_peer_ni_decref_locked(lpni); ++ lpni = lnet_find_peer_ni_locked(nid); ++ if (!lpni) { ++ rc = -ENOENT; ++ goto out_unlock; + } +- primary_nid = lnet_nid_to_nid4(&lp->lp_primary_nid); ++ lp = lpni->lpni_peer_net->lpn_peer; ++ ++ if (!lock_prim_nid && !lnet_is_discovery_disabled(lp)) ++ goto again; ++ nid = lnet_nid_to_nid4(&lp->lp_primary_nid); + out_decref: + lnet_peer_ni_decref_locked(lpni); + out_unlock: +@@ -1614,7 +1632,6 @@ lnet_peer_attach_peer_ni(struct lnet_peer *lp, + ptable->pt_peers++; + } + +- + /* Update peer state */ + spin_lock(&lp->lp_lock); + if (flags & LNET_PEER_CONFIGURED) { +@@ -1692,10 +1709,8 @@ lnet_peer_add(lnet_nid_t nid4, unsigned int flags) + rc = -EPERM; + goto out; + } else if (lp->lp_state & LNET_PEER_LOCK_PRIMARY) { +- if (nid_same(&lp->lp_primary_nid, &nid)) { ++ if (nid_same(&lp->lp_primary_nid, &nid)) + rc = -EEXIST; +- goto out; +- } + /* we're trying to recreate an existing peer which + * has already been created and its primary + * locked. This is likely due to two servers +@@ -1703,8 +1718,19 @@ lnet_peer_add(lnet_nid_t nid4, unsigned int flags) + * to that node with the primary NID which was + * first added by Lustre + */ +- rc = -EALREADY; ++ else ++ rc = -EALREADY; + goto out; ++ } else if (!(flags & ++ (LNET_PEER_LOCK_PRIMARY | LNET_PEER_CONFIGURED))) { ++ /* if not recreating peer as configured and ++ * not locking primary nid, no need to ++ * do anything if primary nid is not being changed ++ */ ++ if (nid_same(&lp->lp_primary_nid, &nid)) { ++ rc = -EEXIST; ++ goto out; ++ } + } + /* Delete and recreate the peer. + * We can get here: +@@ -2011,6 +2037,14 @@ __must_hold(&the_lnet.ln_api_mutex) + lnet_peer_ni_decref_locked(lpni); + lp = lpni->lpni_peer_net->lpn_peer; + ++ /* Peer must have been configured. 
*/ ++ if ((flags & LNET_PEER_CONFIGURED) && ++ !(lp->lp_state & LNET_PEER_CONFIGURED)) { ++ CDEBUG(D_NET, "peer %s was not configured\n", ++ libcfs_nid2str(prim_nid)); ++ return -ENOENT; ++ } ++ + /* Primary NID must match */ + if (lnet_nid_to_nid4(&lp->lp_primary_nid) != prim_nid) { + CDEBUG(D_NET, "prim_nid %s is not primary for peer %s\n", +@@ -2026,9 +2060,7 @@ __must_hold(&the_lnet.ln_api_mutex) + return -EPERM; + } + +- if ((flags & LNET_PEER_LOCK_PRIMARY) && +- (lnet_peer_is_uptodate(lp) && +- (lp->lp_state & LNET_PEER_LOCK_PRIMARY))) { ++ if (lnet_peer_is_uptodate(lp) && !(flags & LNET_PEER_CONFIGURED)) { + CDEBUG(D_NET, + "Don't add temporary peer NI for uptodate peer %s\n", + libcfs_nidstr(&lp->lp_primary_nid)); +-- +2.33.0 + diff --git a/0020-LU-14668-tests-verify-state-of-peer-added-with-lock_.patch b/0020-LU-14668-tests-verify-state-of-peer-added-with-lock_.patch new file mode 100644 index 0000000000000000000000000000000000000000..9a4646b8a6256075fc707ff123422a567b772c1c --- /dev/null +++ b/0020-LU-14668-tests-verify-state-of-peer-added-with-lock_.patch @@ -0,0 +1,48 @@ +From 7ee579d25a614946ba22a5a08fdc4373c41ef8f1 Mon Sep 17 00:00:00 2001 +From: Serguei Smirnov +Date: Thu, 9 Mar 2023 15:00:46 -0800 +Subject: [PATCH 20/61] LU-14668 tests: verify state of peer added with + '--lock_prim' + +Add peer state verification to sanity-lnet test_26: +check that peer state has corresponding bit set for a peer +created with '--lock_prim' option. 
+ +Lustre-change: https://review.whamcloud.com/50249 +Lustre-commit: 9b6fcfa334b153e52caec16d4cfd180306826a3a + +Test-Parameters: trivial testlist=sanity-lnet +Fixes: 05f7f6a0b ("LU-14668 lnet: add 'force' option to lnetctl peer del") +Signed-off-by: Serguei Smirnov +Change-Id: Id5fde036907f9dd19a21e8e6611a070321310f0e +Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/51135 +Tested-by: jenkins +Tested-by: Maloo +Reviewed-by: Frank Sehr +Reviewed-by: Cyril Bordage +Reviewed-by: Oleg Drokin +--- + lustre/tests/sanity-lnet.sh | 7 +++++++ + 1 file changed, 7 insertions(+) + +diff --git a/lustre/tests/sanity-lnet.sh b/lustre/tests/sanity-lnet.sh +index 4af5e96ef8..1fb298776c 100755 +--- a/lustre/tests/sanity-lnet.sh ++++ b/lustre/tests/sanity-lnet.sh +@@ -937,6 +937,13 @@ test_26() { + + do_lnetctl peer add --prim_nid 1.1.1.1@tcp --lock_prim || + error "Peer add with --lock_prim option failed $?" ++ local peer_state=$($LNETCTL peer show -v 4 --nid 1.1.1.1@tcp | ++ awk '/peer state/ {print $NF}') ++ # This relies on the following peer state definition: ++ # #define LNET_PEER_LOCK_PRIMARY BIT(20) ++ if ((!("$peer_state" & (1 << 20)))); then ++ error "Peer state does not have 'locked' bit set: $peer_state" ++ fi + do_lnetctl peer del --prim_nid 1.1.1.1@tcp || + error "Peer del failed $?" + $LNETCTL peer show --nid 1.1.1.1@tcp | grep -q 1.1.1.1@tcp || +-- +2.33.0 + diff --git a/0021-LU-11787-test-Fix-checkfilemap-tests-for-64K-page.patch b/0021-LU-11787-test-Fix-checkfilemap-tests-for-64K-page.patch new file mode 100644 index 0000000000000000000000000000000000000000..0197a03a34887ba38cf8a1639d14b1311546a949 --- /dev/null +++ b/0021-LU-11787-test-Fix-checkfilemap-tests-for-64K-page.patch @@ -0,0 +1,93 @@ +From 66d9916dc947064b598f52476fb6482c0bbaff10 Mon Sep 17 00:00:00 2001 +From: James Simmons +Date: Mon, 31 Jan 2022 12:44:46 -0500 +Subject: [PATCH 21/61] LU-11787 test: Fix checkfilemap tests for 64K page + +File mapping is page size aligned. 
Modify the tests to handle 64K +page. + +Lustre-change: https://review.whamcloud.com/45629 +Lustre-commit: 7c88dfd28b5cc6114a85f187ecb2473657d42c9d + +Test-Parameters: trivial clientdistro=el8.7 clientarch=aarch64 testlist=sanityn env=ONLY="71a 71b" +Change-Id: I316a197db8cdd0f9064431f8c572b43adf6110b8 +Signed-off-by: James Simmons +Signed-off-by: Xinliang Liu +Reviewed-by: Arshad Hussain +Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/51287 +Tested-by: jenkins +Tested-by: Maloo +Reviewed-by: Oleg Drokin +--- + lustre/tests/sanityn.sh | 22 ++++++++-------------- + 1 file changed, 8 insertions(+), 14 deletions(-) + +diff --git a/lustre/tests/sanityn.sh b/lustre/tests/sanityn.sh +index 17a1231d4c..ca366fd10a 100755 +--- a/lustre/tests/sanityn.sh ++++ b/lustre/tests/sanityn.sh +@@ -22,12 +22,6 @@ ALWAYS_EXCEPT="$SANITYN_EXCEPT " + ALWAYS_EXCEPT+=" 28 " + # UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT! + +-# skip tests for PPC until they are fixed +-if [[ $(uname -m) = ppc64 ]]; then +- # bug number: LU-11787 +- ALWAYS_EXCEPT+=" 71a" +-fi +- + if [ $mds1_FSTYPE = "zfs" ]; then + # bug number: LU-15757 (test_102() causes crash in umount later) + ALWAYS_EXCEPT+=" 102" +@@ -3560,28 +3554,28 @@ test_71a() { + checkfiemap --test || + skip "checkfiemap not runnable: $?" 
+ # write data this way: hole - data - hole - data +- dd if=/dev/urandom of=$DIR1/$tfile bs=40K seek=1 count=1 ++ dd if=/dev/urandom of=$DIR1/$tfile bs=64K seek=1 count=1 + [ "$(facet_fstype ost$(($($LFS getstripe -i $DIR1/$tfile) + 1)))" = \ + "zfs" ] && + skip "ORI-366/LU-1941: FIEMAP unimplemented on ZFS" && return 0 +- dd if=/dev/urandom of=$DIR1/$tfile bs=40K seek=3 count=1 ++ dd if=/dev/urandom of=$DIR1/$tfile bs=64K seek=3 count=1 + GET_STAT="lctl get_param -n ldlm.services.ldlm_cbd.stats" + stat $DIR2/$tfile + local can1=$($GET_STAT | awk '/ldlm_bl_callback/ {print $2}') + echo $can1 +- checkfiemap $DIR2/$tfile 81920 || ++ checkfiemap $DIR2/$tfile 131072 || + error "data is not flushed from client" + local can2=$($GET_STAT | awk '/ldlm_bl_callback/ {print $2}') + echo $can2 + + # common case of "create file, copy file" on a single node + # should not flush data from ost +- dd if=/dev/urandom of=$DIR1/$tfile bs=40K seek=1 count=1 +- dd if=/dev/urandom of=$DIR1/$tfile bs=40K seek=3 count=1 ++ dd if=/dev/urandom of=$DIR1/$tfile bs=64K seek=1 count=1 ++ dd if=/dev/urandom of=$DIR1/$tfile bs=64K seek=3 count=1 + stat $DIR1/$tfile + local can3=$($GET_STAT | awk '/ldlm_bl_callback/ {print $2}') + echo $can3 +- checkfiemap $DIR1/$tfile 81920 || ++ checkfiemap $DIR1/$tfile 131072 || + error 4 + local can4=$($GET_STAT | awk '/ldlm_bl_callback/ {print $2}') + echo $can2 +@@ -3605,11 +3599,11 @@ test_71b() { + mkdir -p $DIR1/$tdir + + $LFS setstripe -c -1 $DIR1/$tdir || error "setstripe failed" +- dd if=/dev/urandom of=$DIR1/$tdir/$tfile bs=40K count=1 ++ dd if=/dev/urandom of=$DIR1/$tdir/$tfile bs=64K count=1 + [ "$(facet_fstype ost$(($($LFS getstripe -i $DIR1/$tdir/$tfile) + 1)))" = \ + "zfs" ] && + skip "ORI-366/LU-1941: FIEMAP unimplemented on ZFS" && return 0 +- checkfiemap $DIR1/$tdir/$tfile 40960 || error "checkfiemap failed" ++ checkfiemap $DIR1/$tdir/$tfile 65536 || error "checkfiemap failed" + } + run_test 71b "check fiemap support for stripecount > 1" + +-- 
+2.33.0 + diff --git a/0022-LU-15800-ofd-take-a-read-lock-for-fallocate.patch b/0022-LU-15800-ofd-take-a-read-lock-for-fallocate.patch new file mode 100644 index 0000000000000000000000000000000000000000..42a83dff35b0a8eb6717677774f6aa8ad0d07c00 --- /dev/null +++ b/0022-LU-15800-ofd-take-a-read-lock-for-fallocate.patch @@ -0,0 +1,51 @@ +From 8299b3fd77ebcc372b5d929eaa08231fc703c431 Mon Sep 17 00:00:00 2001 +From: Alex Zhuravlev +Date: Tue, 10 May 2022 10:48:55 +0300 +Subject: [PATCH 22/61] LU-15800 ofd: take a read lock for fallocate + +there is no need to take a write (exclusive) object's +lock for fallocate - we just need to serialize fallocate +vs destroy, all internal structures should be protected +by OSD and disk filesystem like the write path does. + +Lustre-change: https://review.whamcloud.com/47268 +Lustre-commit: 5fae80066162ea637c8649f6439fc14e1d9a7cf8 + +Fixes: cdaaa87f6b ("LU-14214 ofd: fix locking in ofd_object_fallocate()") +Signed-off-by: Alex Zhuravlev +Change-Id: I65986745865ee329c5257a7efca5e79403830608 +Reviewed-by: Arshad Hussain +Reviewed-by: Andreas Dilger +Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/51702 +Tested-by: jenkins +Tested-by: Maloo +Reviewed-by: Oleg Drokin +--- + lustre/ofd/ofd_objects.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/lustre/ofd/ofd_objects.c b/lustre/ofd/ofd_objects.c +index 41fe74bb11..16f9fba3d8 100644 +--- a/lustre/ofd/ofd_objects.c ++++ b/lustre/ofd/ofd_objects.c +@@ -801,7 +801,7 @@ int ofd_object_fallocate(const struct lu_env *env, struct ofd_object *fo, + if (rc) + GOTO(stop, rc); + +- ofd_write_lock(env, fo); ++ ofd_read_lock(env, fo); + if (!ofd_object_exists(fo)) + GOTO(unlock, rc = -ENOENT); + +@@ -824,7 +824,7 @@ int ofd_object_fallocate(const struct lu_env *env, struct ofd_object *fo, + filter_fid_le_to_cpu(&fo->ofo_ff, ff, sizeof(*ff)); + } + unlock: +- ofd_write_unlock(env, fo); ++ ofd_read_unlock(env, fo); + stop: + ofd_trans_stop(env, ofd, th, rc); 
+ RETURN(rc); +-- +2.33.0 + diff --git a/0023-LU-16873-osd-update-OI_Scrub-file-with-new-magic.patch b/0023-LU-16873-osd-update-OI_Scrub-file-with-new-magic.patch new file mode 100644 index 0000000000000000000000000000000000000000..18b58437ab774fb197aff042f3a97ba7a91a8963 --- /dev/null +++ b/0023-LU-16873-osd-update-OI_Scrub-file-with-new-magic.patch @@ -0,0 +1,93 @@ +From 703d8a994bf33ac07f8bd4c956880db3d9abb016 Mon Sep 17 00:00:00 2001 +From: Alexander Zarochentsev +Date: Sun, 28 May 2023 08:42:27 -0400 +Subject: [PATCH 23/61] LU-16873 osd: update OI_Scrub file with new magic + +The fix for LUS-11542 detects the format change correctly +but does not write new oi scrub file magic, so new mount +triggers the "oi files counter reset" again and again. + +Lustre-change: https://review.whamcloud.com/51226 +Lustre-commit: 38b7c408212f60d684c9b114d90b4514e0044ffe + +Fixes: 126275ba83 ("LU-16655 scrub: upgrade scrub_file from 2.12 format") +HPE-bug-id: LUS-11646 +Signed-off-by: Alexander Zarochentsev +Change-Id: Ia13fcfaf0d8f2c4ee9331dd9fec0ff159d195186 +Reviewed-by: Andreas Dilger +Reviewed-by: Andrew Perepechko +Signed-off-by: Etienne AUJAMES +Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/51525 +Tested-by: jenkins +Tested-by: Maloo +Reviewed-by: Oleg Drokin +--- + lustre/obdclass/scrub.c | 2 ++ + lustre/osd-ldiskfs/osd_oi.c | 15 ++++++--------- + 2 files changed, 8 insertions(+), 9 deletions(-) + +diff --git a/lustre/obdclass/scrub.c b/lustre/obdclass/scrub.c +index 89c3f752da..cc100244c0 100644 +--- a/lustre/obdclass/scrub.c ++++ b/lustre/obdclass/scrub.c +@@ -127,6 +127,7 @@ EXPORT_SYMBOL(scrub_file_init); + void scrub_file_reset(struct lustre_scrub *scrub, uuid_t uuid, u64 flags) + { + struct scrub_file *sf = &scrub->os_file; ++ ENTRY; + + CDEBUG(D_LFSCK, "%s: reset OI scrub file, old flags = " + "%#llx, add flags = %#llx\n", +@@ -150,6 +151,7 @@ void scrub_file_reset(struct lustre_scrub *scrub, uuid_t uuid, u64 flags) + sf->sf_items_igif = 0; + if 
(!scrub->os_in_join) + sf->sf_items_updated_prior = 0; ++ EXIT; + } + EXPORT_SYMBOL(scrub_file_reset); + +diff --git a/lustre/osd-ldiskfs/osd_oi.c b/lustre/osd-ldiskfs/osd_oi.c +index 21468fec15..b361796e41 100644 +--- a/lustre/osd-ldiskfs/osd_oi.c ++++ b/lustre/osd-ldiskfs/osd_oi.c +@@ -316,7 +316,7 @@ osd_oi_table_open(struct osd_thread_info *info, struct osd_device *osd, + + if (rc == -ENOENT && create == false) { + if (oi_count == 0) +- return count; ++ RETURN(count); + + rc = 0; + ldiskfs_set_bit(i, sf->sf_oi_bitmap); +@@ -425,23 +425,20 @@ int osd_oi_init(struct osd_thread_info *info, struct osd_device *osd, + if (count == sf->sf_oi_count) + GOTO(out, rc = count); + +- if (sf->sf_oi_count == 0) { +- if (likely((count & (count - 1)) == 0)) +- GOTO(out, rc = count); +- +- LCONSOLE_WARN( +- "%s: invalid oi count %d, remove them, then set it to %d\n", ++ /* Trust the counted number of OI files if it is sane */ ++ if ((count & (count - 1)) != 0) { ++ LCONSOLE_WARN("%s: invalid oi count %d, remove them, then set it to %d\n", + osd_dev2name(osd), count, osd_oi_count); + osd_oi_table_put(info, oi, count); + rc = osd_remove_ois(info, osd); + if (rc) + GOTO(out, rc); + +- sf->sf_oi_count = osd_oi_count; ++ count = osd_oi_count; + } + + scrub_file_reset(scrub, osd->od_uuid, SF_RECREATED); +- count = sf->sf_oi_count; ++ sf->sf_oi_count = count; + goto create; + } + +-- +2.33.0 + diff --git a/0024-LU-15519-quota-fallocate-does-not-increase-projectid.patch b/0024-LU-15519-quota-fallocate-does-not-increase-projectid.patch new file mode 100644 index 0000000000000000000000000000000000000000..3d77bca48d74362ec892b7db8eb1d50d6c4328f5 --- /dev/null +++ b/0024-LU-15519-quota-fallocate-does-not-increase-projectid.patch @@ -0,0 +1,224 @@ +From 691387d77bfe9260b65669978fb3f988055fc1e0 Mon Sep 17 00:00:00 2001 +From: Arshad Hussain +Date: Mon, 14 Feb 2022 14:06:47 +0530 +Subject: [PATCH 24/61] LU-15519 quota: fallocate does not increase projectid + usage + +fallocate() was not 
accounting for projectid quota usage. +This was happening due to two reasons. 1) the projectid +was not properly passed to md_op_data in ll_set_project() +and 2) the OBD_MD_FLPROJID flag was not set to receive the +projectid. + +This patch addresses the above reasons. + +Test-case: sanity-quota/78a added + +Lustre-change: https://review.whamcloud.com/46676 +Lustre-commit: 5fc934ebbbe665f24e2f11fe224065dd8e9a08ba + +Fixes: 48457868a02a ("LU-3606 fallocate: Implement fallocate preallocate operation") +Signed-off-by: Arshad Hussain +Change-Id: I3ed44e7ef7ca8fe49a08133449c33b62b1eff500 +Reviewed-by: Andreas Dilger +Reviewed-by: Hongchao Zhang +Signed-off-by: Etienne AUJAMES +Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/51535 +Tested-by: jenkins +Tested-by: Maloo +Reviewed-by: Oleg Drokin +--- + lustre/include/cl_object.h | 1 + + lustre/llite/file.c | 15 ++++++++---- + lustre/llite/vvp_object.c | 3 ++- + lustre/lov/lov_io.c | 2 ++ + lustre/osc/osc_io.c | 9 ++++--- + lustre/tests/sanity-quota.sh | 47 ++++++++++++++++++++++++++++++++++++ + 6 files changed, 68 insertions(+), 9 deletions(-) + +diff --git a/lustre/include/cl_object.h b/lustre/include/cl_object.h +index e4fb55a705..d44d59b074 100644 +--- a/lustre/include/cl_object.h ++++ b/lustre/include/cl_object.h +@@ -1874,6 +1874,7 @@ struct cl_io { + loff_t sa_falloc_end; + uid_t sa_falloc_uid; + gid_t sa_falloc_gid; ++ __u32 sa_falloc_projid; + } ci_setattr; + struct cl_data_version_io { + u64 dv_data_version; +diff --git a/lustre/llite/file.c b/lustre/llite/file.c +index 219931e06b..71f11a9ea2 100644 +--- a/lustre/llite/file.c ++++ b/lustre/llite/file.c +@@ -2834,7 +2834,7 @@ static int ll_do_fiemap(struct inode *inode, struct fiemap *fiemap, + GOTO(out, rc); + } + +- fmkey.lfik_oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP; ++ fmkey.lfik_oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP | OBD_MD_FLPROJID; + obdo_from_inode(&fmkey.lfik_oa, inode, OBD_MD_FLSIZE); + obdo_set_parent_fid(&fmkey.lfik_oa, 
&ll_i2info(inode)->lli_fid); + +@@ -3575,8 +3575,8 @@ int ll_ioctl_check_project(struct inode *inode, __u32 xflags, + + static int ll_set_project(struct inode *inode, __u32 xflags, __u32 projid) + { +- struct md_op_data *op_data; + struct ptlrpc_request *req = NULL; ++ struct md_op_data *op_data; + struct cl_object *obj; + unsigned int inode_flags; + int rc = 0; +@@ -3594,7 +3594,10 @@ static int ll_set_project(struct inode *inode, __u32 xflags, __u32 projid) + op_data->op_attr_flags = ll_inode_to_ext_flags(inode_flags); + if (xflags & FS_XFLAG_PROJINHERIT) + op_data->op_attr_flags |= LUSTRE_PROJINHERIT_FL; ++ ++ /* pass projid to md_op_data */ + op_data->op_projid = projid; ++ + op_data->op_xvalid |= OP_XVALID_PROJID | OP_XVALID_FLAGS; + rc = md_setattr(ll_i2sbi(inode)->ll_md_exp, op_data, NULL, 0, &req); + ptlrpc_req_finished(req); +@@ -5454,11 +5457,11 @@ int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat) + int cl_falloc(struct file *file, struct inode *inode, int mode, loff_t offset, + loff_t len) + { ++ loff_t size = i_size_read(inode); + struct lu_env *env; + struct cl_io *io; + __u16 refcheck; + int rc; +- loff_t size = i_size_read(inode); + + ENTRY; + +@@ -5477,12 +5480,14 @@ int cl_falloc(struct file *file, struct inode *inode, int mode, loff_t offset, + io->u.ci_setattr.sa_falloc_end = offset + len; + io->u.ci_setattr.sa_subtype = CL_SETATTR_FALLOCATE; + +- CDEBUG(D_INODE, "UID %u GID %u\n", ++ CDEBUG(D_INODE, "UID %u GID %u PRJID %u\n", + from_kuid(&init_user_ns, inode->i_uid), +- from_kgid(&init_user_ns, inode->i_gid)); ++ from_kgid(&init_user_ns, inode->i_gid), ++ ll_i2info(inode)->lli_projid); + + io->u.ci_setattr.sa_falloc_uid = from_kuid(&init_user_ns, inode->i_uid); + io->u.ci_setattr.sa_falloc_gid = from_kgid(&init_user_ns, inode->i_gid); ++ io->u.ci_setattr.sa_falloc_projid = ll_i2info(inode)->lli_projid; + + if (io->u.ci_setattr.sa_falloc_end > size) { + loff_t newsize = io->u.ci_setattr.sa_falloc_end; +diff --git 
a/lustre/llite/vvp_object.c b/lustre/llite/vvp_object.c +index 2413da9498..ab5c68b797 100644 +--- a/lustre/llite/vvp_object.c ++++ b/lustre/llite/vvp_object.c +@@ -198,7 +198,8 @@ static void vvp_req_attr_set(const struct lu_env *env, struct cl_object *obj, + { + struct inode *inode; + struct obdo *oa; +- u64 valid_flags = OBD_MD_FLTYPE | OBD_MD_FLUID | OBD_MD_FLGID; ++ u64 valid_flags = OBD_MD_FLTYPE | OBD_MD_FLUID | OBD_MD_FLGID | ++ OBD_MD_FLPROJID; + + oa = attr->cra_oa; + inode = vvp_object_inode(obj); +diff --git a/lustre/lov/lov_io.c b/lustre/lov/lov_io.c +index ce4fa30b84..381ae56699 100644 +--- a/lustre/lov/lov_io.c ++++ b/lustre/lov/lov_io.c +@@ -694,6 +694,8 @@ static void lov_io_sub_inherit(struct lov_io_sub *sub, struct lov_io *lio, + parent->u.ci_setattr.sa_falloc_uid; + io->u.ci_setattr.sa_falloc_gid = + parent->u.ci_setattr.sa_falloc_gid; ++ io->u.ci_setattr.sa_falloc_projid = ++ parent->u.ci_setattr.sa_falloc_projid; + } + if (cl_io_is_trunc(io)) { + loff_t new_size = parent->u.ci_setattr.sa_attr.lvb_size; +diff --git a/lustre/osc/osc_io.c b/lustre/osc/osc_io.c +index e4bd2738a6..86399655f5 100644 +--- a/lustre/osc/osc_io.c ++++ b/lustre/osc/osc_io.c +@@ -680,11 +680,14 @@ static int osc_io_setattr_start(const struct lu_env *env, + oa->o_blocks = io->u.ci_setattr.sa_falloc_end; + oa->o_uid = io->u.ci_setattr.sa_falloc_uid; + oa->o_gid = io->u.ci_setattr.sa_falloc_gid; ++ oa->o_projid = io->u.ci_setattr.sa_falloc_projid; + oa->o_valid |= OBD_MD_FLSIZE | OBD_MD_FLBLOCKS | +- OBD_MD_FLUID | OBD_MD_FLGID; ++ OBD_MD_FLUID | OBD_MD_FLGID | OBD_MD_FLPROJID; + +- CDEBUG(D_INODE, "size %llu blocks %llu uid %u gid %u\n", +- oa->o_size, oa->o_blocks, oa->o_uid, oa->o_gid); ++ CDEBUG(D_INODE, ++ "size %llu blocks %llu uid %u gid %u prjid %u\n", ++ oa->o_size, oa->o_blocks, oa->o_uid, oa->o_gid, ++ oa->o_projid); + result = osc_fallocate_base(osc_export(cl2osc(obj)), + oa, osc_async_upcall, + cbargs, falloc_mode); +diff --git a/lustre/tests/sanity-quota.sh 
b/lustre/tests/sanity-quota.sh +index 047b1a9687..f36601ce5d 100755 +--- a/lustre/tests/sanity-quota.sh ++++ b/lustre/tests/sanity-quota.sh +@@ -5251,6 +5251,53 @@ test_78() + } + run_test 78 "Check fallocate increase quota usage" + ++test_78a() ++{ ++ (( $CLIENT_VERSION >= $(version_code 2.15.3) )) || ++ skip "need client at least 2.15.3" ++ (( $OST1_VERSION >= $(version_code 2.15.3) )) || ++ skip "need OST at least 2.15.3" ++ check_set_fallocate_or_skip ++ ++ setup_quota_test || error "setup quota failed with $?" ++ ++ # enable ost quota ++ set_ost_qtype $QTYPE || error "enable ost quota failed" ++ ++ mkdir -p $DIR/$tdir || error "failed to create $tdir" ++ ++ local projectid=5200 # Random project id to test ++ ++ change_project -sp $projectid $DIR/$tdir ++ ++ # setup quota limit ++ $LFS setquota -p $projectid -b25M -B25M $DIR/$tdir || ++ error "lfs setquota project failed" ++ ++ # call fallocate ++ fallocate -l 204800 $DIR/$tdir/$tfile ++ ++ # Get curspace (kbytes) for $projectid ++ local kbytes=$(getquota -p $projectid global curspace) ++ ++ echo "kbytes returned:$kbytes" ++ ++ # For file size of 204800. We should be having roughly 200 kbytes ++ # returned. Anything alarmingly low (50 taken as arbitrary value) ++ # would bail out this TC. Also this also avoids $kbytes of 0 ++ # to be used in calculation below. ++ (( $kbytes > 50 )) || ++ error "fallocate did not use projectid. kbytes returned:$kbytes" ++ ++ local expect_lo=$(($kbytes * 95 / 100)) # 5% below ++ local expect_hi=$(($kbytes * 105 / 100)) # 5% above ++ ++ # Verify kbytes is 200 (204800/1024). 
With a permited 5% drift ++ (( $kbytes >= $expect_lo && $kbytes <= $expect_hi )) || ++ error "fallocate did not use quota projectid correctly" ++} ++run_test 78a "Check fallocate increase projectid usage" ++ + test_79() + { + local qpool="qpool1" +-- +2.33.0 + diff --git a/0025-LU-16060-osd-ldiskfs-copy-nul-byte-terminator-in-wri.patch b/0025-LU-16060-osd-ldiskfs-copy-nul-byte-terminator-in-wri.patch new file mode 100644 index 0000000000000000000000000000000000000000..68197cb5eec84ac5e6f2a20a481cc10fefebf5ed --- /dev/null +++ b/0025-LU-16060-osd-ldiskfs-copy-nul-byte-terminator-in-wri.patch @@ -0,0 +1,47 @@ +From d1f0ef129d8e9af13c31dc821a00639de3349873 Mon Sep 17 00:00:00 2001 +From: Alexander Zarochentsev +Date: Wed, 20 Jul 2022 19:05:53 +0300 +Subject: [PATCH 25/61] LU-16060 osd-ldiskfs: copy nul byte terminator in + writelink + +memcpy() call in osd_ldiskfs_writelink() doesn't copy the nul +terminator byte from the source buffer, leaving the space +after target link name uninitialized which is ok for the kernel +code and debugfs but not e2fsck. 
+ +HPE-bug-id: LUS-11103 + +Lustre-change: https://review.whamcloud.com/48092 +Lustre-commit: 907dc0a2d333f2df2d654a968fc50f8cc05b779d + +Signed-off-by: Alexander Zarochentsev +Change-Id: I914f2c78e1a6571bf360a23b0ede8c70502bf0df +Reviewed-by: Artem Blagodarenko +Reviewed-by: Andrew Perepechko +Reviewed-by: Andreas Dilger +Signed-off-by: Etienne AUJAMES +Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/51356 +Tested-by: jenkins +Tested-by: Maloo +Reviewed-by: Oleg Drokin +--- + lustre/osd-ldiskfs/osd_io.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/lustre/osd-ldiskfs/osd_io.c b/lustre/osd-ldiskfs/osd_io.c +index f44420dda2..7131add58d 100644 +--- a/lustre/osd-ldiskfs/osd_io.c ++++ b/lustre/osd-ldiskfs/osd_io.c +@@ -2022,7 +2022,8 @@ static int osd_ldiskfs_writelink(struct inode *inode, char *buffer, int buflen) + /* LU-2634: clear the extent format for fast symlink */ + ldiskfs_clear_inode_flag(inode, LDISKFS_INODE_EXTENTS); + +- memcpy((char *)&LDISKFS_I(inode)->i_data, (char *)buffer, buflen); ++ /* Copying the NUL byte terminating the link target as well */ ++ memcpy((char *)&LDISKFS_I(inode)->i_data, (char *)buffer, buflen + 1); + spin_lock(&inode->i_lock); + LDISKFS_I(inode)->i_disksize = buflen; + i_size_write(inode, buflen); +-- +2.33.0 + diff --git a/0026-LU-16934-kernel-update-RHEL-8.8-4.18.0-477.15.1.el8_.patch b/0026-LU-16934-kernel-update-RHEL-8.8-4.18.0-477.15.1.el8_.patch new file mode 100644 index 0000000000000000000000000000000000000000..130c8b1c5cf4f9f710106b3ce9b070c404217213 --- /dev/null +++ b/0026-LU-16934-kernel-update-RHEL-8.8-4.18.0-477.15.1.el8_.patch @@ -0,0 +1,79 @@ +From f144190bbe3728dc15fc731612f10fe3fad98bef Mon Sep 17 00:00:00 2001 +From: Jian Yu +Date: Fri, 28 Jul 2023 20:36:03 -0700 +Subject: [PATCH 26/61] LU-16934 kernel: update RHEL 8.8 + [4.18.0-477.15.1.el8_8] + +Update RHEL 8.8 kernel to 4.18.0-477.15.1.el8_8. 
+ +Lustre-change: https://review.whamcloud.com/51517 +Lustre-commit: 830bf7a1f8de73a4f46248e6b8d2bbcd944a1f09 + +Test-Parameters: trivial fstype=ldiskfs \ +clientdistro=el8.8 serverdistro=el8.8 testlist=sanity + +Test-Parameters: trivial fstype=zfs \ +clientdistro=el8.8 serverdistro=el8.8 testlist=sanity + +Change-Id: I66365dce63065a0a07958a182a3c705e9948d424 +Signed-off-by: Jian Yu +Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/51518 +Tested-by: jenkins +Tested-by: Maloo +Reviewed-by: Yang Sheng +Reviewed-by: xinliang +Reviewed-by: Oleg Drokin +--- + lustre/ChangeLog | 4 ++-- + lustre/kernel_patches/targets/4.18-rhel8.8.target.in | 2 +- + lustre/kernel_patches/which_patch | 2 +- + 3 files changed, 4 insertions(+), 4 deletions(-) + +diff --git a/lustre/ChangeLog b/lustre/ChangeLog +index 57699d2232..4b21088869 100644 +--- a/lustre/ChangeLog ++++ b/lustre/ChangeLog +@@ -3,7 +3,7 @@ TBD Whamcloud + * See https://wiki.whamcloud.com/display/PUB/Lustre+Support+Matrix + for currently supported client and server kernel versions. 
+ * Server primary kernels built and tested during release cycle: +- 4.18.0-477.10.1.el8 (RHEL8.8) ++ 4.18.0-477.15.1.el8 (RHEL8.8) + * Other server kernels known to build and work at some point (others may also work): + 3.10.0-862.14.4.el7 (RHEL7.5) + 3.10.0-957.27.2.el7 (RHEL7.6) +@@ -30,7 +30,7 @@ TBD Whamcloud + * Client primary kernels built and tested during release cycle: + 5.14.0-284.11.1.el9 (RHEL9.2) + 5.14.0-162.23.1.el9 (RHEL9.1) +- 4.18.0-477.10.1.el8 (RHEL8.8) ++ 4.18.0-477.15.1.el8 (RHEL8.8) + 5.4.0-37 (Ubuntu 20.04) + 5.14.21-150400.24.28 (SLES15 SP4) + * Other clients known to build on these kernels at some point (others may also work): +diff --git a/lustre/kernel_patches/targets/4.18-rhel8.8.target.in b/lustre/kernel_patches/targets/4.18-rhel8.8.target.in +index 172b7ca8c9..215a468fe4 100644 +--- a/lustre/kernel_patches/targets/4.18-rhel8.8.target.in ++++ b/lustre/kernel_patches/targets/4.18-rhel8.8.target.in +@@ -1,5 +1,5 @@ + lnxmaj="4.18.0" +-lnxrel="477.10.1.el8_8" ++lnxrel="477.15.1.el8_8" + + KERNEL_SRPM=kernel-${lnxmaj}-${lnxrel}.src.rpm + SERIES=4.18-rhel8.8.series +diff --git a/lustre/kernel_patches/which_patch b/lustre/kernel_patches/which_patch +index 0047bf08eb..50ec106010 100644 +--- a/lustre/kernel_patches/which_patch ++++ b/lustre/kernel_patches/which_patch +@@ -25,6 +25,6 @@ PATCH SERIES FOR SERVER KERNELS: + 4.18-rhel8.5.series 4.18.0-348.23.1.el8 (RHEL 8.5) + 4.18-rhel8.6.series 4.18.0-372.32.1.el8 (RHEL 8.6) + 4.18-rhel8.7.series 4.18.0-425.3.1.el8 (RHEL 8.7) +-4.18-rhel8.8.series 4.18.0-477.10.1.el8 (RHEL 8.8) ++4.18-rhel8.8.series 4.18.0-477.15.1.el8 (RHEL 8.8) + + See lustre/ChangeLog for supported client kernel versions. 
+-- +2.33.0 + diff --git a/0027-LU-15740-tests-scale-fs_log_size-by-OSTCOUNT.patch b/0027-LU-15740-tests-scale-fs_log_size-by-OSTCOUNT.patch new file mode 100644 index 0000000000000000000000000000000000000000..50d501e0ce14df6904eee377bd61cb083f488b9d --- /dev/null +++ b/0027-LU-15740-tests-scale-fs_log_size-by-OSTCOUNT.patch @@ -0,0 +1,188 @@ +From 34e1409cad412086d509349f64dc9d77911d2fb8 Mon Sep 17 00:00:00 2001 +From: Andreas Dilger +Date: Fri, 24 Mar 2023 17:09:44 -0600 +Subject: [PATCH 27/61] LU-15740 tests: scale fs_log_size by OSTCOUNT + +The fs_log_size "free space skew" was being scaled by MDSCOUNT, +but in fact this parameter is only ever used to compare the OST +free space usage, so the OSTCOUNT should be used when scaling it. + +It is likely that the skew is actually caused by blocks allocated +by OST object directories and not llogs (no llogs used on OSTs for +many years), but it isn't worthwhile to rename the function. + +Lustre-change: https://review.whamcloud.com/50419 +Lustre-commit: fabec6f2cb39950a2f208567dac716e21880fa9f + +Test-Parameters: trivial testlist=runtests +Signed-off-by: Andreas Dilger +Change-Id: I97f05b10fa7ec367534b5bdce09feae5e93ebbe5 +Reviewed-by: Arshad Hussain +Reviewed-by: Alex Deiter +Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/51606 +Tested-by: jenkins +Tested-by: Maloo +Reviewed-by: James Simmons +Reviewed-by: Oleg Drokin +--- + lustre/tests/replay-ost-single.sh | 20 ++++++++------------ + lustre/tests/replay-single.sh | 17 ++++++----------- + lustre/tests/test-framework.sh | 8 +++++--- + 3 files changed, 19 insertions(+), 26 deletions(-) + +diff --git a/lustre/tests/replay-ost-single.sh b/lustre/tests/replay-ost-single.sh +index e6ba4d0430..5ce0b7f28d 100755 +--- a/lustre/tests/replay-ost-single.sh ++++ b/lustre/tests/replay-ost-single.sh +@@ -169,10 +169,6 @@ test_5() { + } + run_test 5 "Fail OST during iozone" + +-kbytesfree() { +- calc_osc_kbytes kbytesfree +-} +- + test_6() { + remote_mds_nodsh && 
skip "remote MDS with nodsh" && return 0 + +@@ -185,7 +181,7 @@ test_6() { + wait_destroy_complete || error "first wait_destroy_complete failed" + sync_all_data + +- local before=$(kbytesfree) ++ local before=$(calc_osc_kbytes kbytesfree) + dd if=/dev/urandom bs=4096 count=1280 of=$f || error "dd failed" + $LFS getstripe $f || error "$LFS getstripe $f failed" + local stripe_index=$(lfs getstripe -i $f) +@@ -200,13 +196,13 @@ test_6() { + wait_mds_ost_sync || error "second wait_mds_ost_sync failed" + + # retry till statfs returns useful results +- local after_dd=$(kbytesfree) ++ local after_dd=$(calc_osc_kbytes kbytesfree) + local i=0 + while (( $before <= $after_dd && $i < 20 )); do + sync + sleep 1 + let ++i +- after_dd=$(kbytesfree) ++ after_dd=$(calc_osc_kbytes kbytesfree) + done + + log "before_free: $before after_dd_free: $after_dd took $i seconds" +@@ -222,7 +218,7 @@ test_6() { + # let the delete happen + wait_mds_ost_sync || error "third wait_mds_ost_sync failed" + wait_delete_completed || error "second wait_delete_completed failed" +- local after=$(kbytesfree) ++ local after=$(calc_osc_kbytes kbytesfree) + log "free_before: $before free_after: $after" + (( $before <= $after + $(fs_log_size) )) || + error "$before > $after + logsize $(fs_log_size)" +@@ -238,18 +234,18 @@ test_7() { + wait_mds_ost_sync || error "wait_mds_ost_sync failed" + wait_destroy_complete || error "wait_destroy_complete failed" + +- local before=$(kbytesfree) ++ local before=$(calc_osc_kbytes kbytesfree) + dd if=/dev/urandom bs=4096 count=1280 of=$f || + error "dd to file failed: $?" 
+ + sync +- local after_dd=$(kbytesfree) ++ local after_dd=$(calc_osc_kbytes kbytesfree) + local i=0 + while (( $before <= $after_dd && $i < 10 )); do + sync + sleep 1 + let ++i +- after_dd=$(kbytesfree) ++ after_dd=$(calc_osc_kbytes kbytesfree) + done + + log "before: $before after_dd: $after_dd took $i seconds" +@@ -264,7 +260,7 @@ test_7() { + # let the delete happen + wait_mds_ost_sync || error "wait_mds_ost_sync failed" + wait_delete_completed || error "wait_delete_completed failed" +- local after=$(kbytesfree) ++ local after=$(calc_osc_kbytes kbytesfree) + log "before: $before after: $after" + (( $before <= $after + $(fs_log_size) )) || + error "$before > $after + logsize $(fs_log_size)" +diff --git a/lustre/tests/replay-single.sh b/lustre/tests/replay-single.sh +index c6641c0d0e..c05570a850 100755 +--- a/lustre/tests/replay-single.sh ++++ b/lustre/tests/replay-single.sh +@@ -3289,23 +3289,17 @@ test_88() { #bug 17485 + } + run_test 88 "MDS should not assign same objid to different files " + +-function calc_osc_kbytes_used() { +- local kbtotal=$(calc_osc_kbytes kbytestotal) +- local kbfree=$(calc_osc_kbytes kbytesfree) +- echo $((kbtotal-kbfree)) +-} +- + test_89() { + cancel_lru_locks osc + mkdir_on_mdt0 $DIR/$tdir || error "mkdir $DIR/$tdir failed" + rm -f $DIR/$tdir/$tfile + wait_mds_ost_sync || error "initial MDS-OST sync timed out" + wait_delete_completed || error "initial wait delete timed out" +- local blocks1=$(calc_osc_kbytes_used) ++ local before=$(calc_osc_kbytes kbytesfree) + local write_size=$(fs_log_size) + + $LFS setstripe -i 0 -c 1 $DIR/$tdir/$tfile +- [ $write_size -lt 1024 ] && write_size=1024 ++ (( $write_size >= 1024 )) || write_size=1024 + dd if=/dev/zero bs=${write_size}k count=10 of=$DIR/$tdir/$tfile + sync + stop ost1 +@@ -3323,10 +3317,11 @@ test_89() { + + wait_mds_ost_sync || error "MDS-OST sync timed out" + wait_delete_completed || error "wait delete timed out" +- local blocks2=$(calc_osc_kbytes_used) ++ local 
after=$(calc_osc_kbytes kbytesfree) + +- [ $((blocks2 - blocks1)) -le $(fs_log_size) ] || +- error $((blocks2 - blocks1)) blocks leaked ++ log "free_before: $before free_after: $after" ++ (( $before <= $after + $(fs_log_size) )) || ++ error "kbytesfree $before > $after + margin $(fs_log_size)" + } + run_test 89 "no disk space leak on late ost connection" + +diff --git a/lustre/tests/test-framework.sh b/lustre/tests/test-framework.sh +index 4d556d05bd..825b7189d7 100755 +--- a/lustre/tests/test-framework.sh ++++ b/lustre/tests/test-framework.sh +@@ -918,18 +918,20 @@ unload_modules() { + } + + fs_log_size() { +- local facet=${1:-$SINGLEMDS} ++ local facet=${1:-ost1} + local size=0 ++ local mult=$OSTCOUNT + + case $(facet_fstype $facet) in +- ldiskfs) size=72;; # largest seen is 64, leave some headroom ++ ldiskfs) size=32;; # largest seen is 64 with multiple OSTs + # grant_block_size is in bytes, allow at least 2x max blocksize + zfs) size=$(lctl get_param osc.$FSNAME*.import | + awk '/grant_block_size:/ {print $2/512; exit;}') + ;; + esac + +- echo -n $((size * MDSCOUNT)) ++ [[ $facet =~ mds ]] && mult=$MDTCOUNT ++ echo -n $((size * mult)) + } + + fs_inode_ksize() { +-- +2.33.0 + diff --git a/0028-LU-16943-tests-fix-replay-single-135-under-hard-fail.patch b/0028-LU-16943-tests-fix-replay-single-135-under-hard-fail.patch new file mode 100644 index 0000000000000000000000000000000000000000..f0b0f7c1bef4bd2a20c1662f829ae85631f99fdd --- /dev/null +++ b/0028-LU-16943-tests-fix-replay-single-135-under-hard-fail.patch @@ -0,0 +1,84 @@ +From dc943357276b3bec99d24c89d9b36f7b3cf20aeb Mon Sep 17 00:00:00 2001 +From: Jian Yu +Date: Fri, 14 Jul 2023 14:04:42 +0800 +Subject: [PATCH 28/61] LU-16943 tests: fix replay-single/135 under hard + failure mode + +This patch fixes replay-single test_135() to load libcfs module +on the failover partner node to avoid 'fail_val' setting error. +It also fixes the issue that not all of the OSTs are mounted after +failing back ost1. 
+ +Lustre-change: https://review.whamcloud.com/51574 +Lustre-commit: 74140e5df4c094f7f0e923e1b82c464b18e8a7cc + +Test-Parameters: trivial testlist=replay-single +Test-Parameters: trivial fstype=zfs testlist=replay-single + +Test-Parameters: trivial env=FAILURE_MODE=HARD \ + clientcount=4 mdtcount=1 mdscount=2 osscount=2 \ + austeroptions=-R failover=true iscsi=1 \ + testlist=replay-single + +Change-Id: Id46c722a6db9d832829a739f41f7462b32a6d9d9 +Signed-off-by: Jian Yu +Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/51608 +Reviewed-by: Oleg Drokin +Reviewed-by: Alex Deiter +Tested-by: jenkins +Tested-by: Maloo +--- + lustre/tests/replay-single.sh | 7 +++++++ + lustre/tests/test-framework.sh | 9 +++++++++ + 2 files changed, 16 insertions(+) + +diff --git a/lustre/tests/replay-single.sh b/lustre/tests/replay-single.sh +index c05570a850..f631b73125 100755 +--- a/lustre/tests/replay-single.sh ++++ b/lustre/tests/replay-single.sh +@@ -4977,6 +4977,8 @@ test_135() { + + #define OBD_FAIL_TGT_REPLAY_RECONNECT 0x32d + # Make sure lock replay server side never completes and errors out. 
++ do_rpc_nodes $(facet_active_host ost1) \ ++ load_module ../libcfs/libcfs/libcfs + do_facet ost1 "$LCTL set_param fail_val=20" + do_facet ost1 "$LCTL set_param fail_loc=0x32d" + +@@ -4993,8 +4995,13 @@ test_135() { + change_active ost1 + wait_for_facet ost1 + ++ do_rpc_nodes $(facet_active_host ost1) \ ++ load_module ../libcfs/libcfs/libcfs + do_facet ost1 "$LCTL set_param fail_loc=0" + mount_facet ost1 ++ unmountoss ++ mountoss ++ clients_up || clients_up || error "$LFS df $MOUNT failed" + echo blah > $DIR/$tdir/file.test2 + + rm -rf $DIR/$tdir +diff --git a/lustre/tests/test-framework.sh b/lustre/tests/test-framework.sh +index 825b7189d7..2c92993117 100755 +--- a/lustre/tests/test-framework.sh ++++ b/lustre/tests/test-framework.sh +@@ -5033,6 +5033,15 @@ mountmds() { + done + } + ++unmountoss() { ++ local num ++ ++ for num in $(seq $OSTCOUNT); do ++ stop ost$num -f ++ rm -f $TMP/ost${num}active ++ done ++} ++ + mountoss() { + local num + local devname +-- +2.33.0 + diff --git a/0029-LU-16517-build-pass-extra-configure-options-to-make-.patch b/0029-LU-16517-build-pass-extra-configure-options-to-make-.patch new file mode 100644 index 0000000000000000000000000000000000000000..d181bf3bee9ab0d11a2093c5ffa28d47c0e04a36 --- /dev/null +++ b/0029-LU-16517-build-pass-extra-configure-options-to-make-.patch @@ -0,0 +1,514 @@ +From 5abc979eb64d8a78888aea63c0cf7e285fa0a4dd Mon Sep 17 00:00:00 2001 +From: Jian Yu +Date: Tue, 30 May 2023 23:40:09 -0700 +Subject: [PATCH 29/61] LU-16517 build: pass extra configure options to "make + debs" + +While running "make debs", the configure command in debian/rules +ignores some user defined configure options. This patch fixes +the issue by adding the detection of the extra options into +debian/rules. 
+ +Lustre-change: https://review.whamcloud.com/50464 +Lustre-commit: 3989529f22f5c54a98e445674b4b3cc443a3af5f + +Test-Parameters: trivial clientdistro=ubuntu2004 + +Change-Id: Ia9db4e05abf33834cb3c853f4f0829dadc8d7400 +Signed-off-by: Jian Yu +Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/51178 +Reviewed-by: Andreas Dilger +Reviewed-by: Alex Deiter +Reviewed-by: Oleg Drokin +Tested-by: Maloo +Tested-by: jenkins +--- + autoMakefile.am | 95 ++++++++++++++++++++++++++++++++ + debian/rules | 39 ++++++++----- + libcfs/autoconf/lustre-libcfs.m4 | 39 ++++++++++--- + lnet/autoconf/lustre-lnet.m4 | 12 ++++ + lustre/autoconf/lustre-core.m4 | 87 +++++++++++++++++++++-------- + lustre/ldlm/ldlm_lib.c | 2 +- + lustre/llite/llite_lib.c | 4 +- + 7 files changed, 228 insertions(+), 50 deletions(-) + +diff --git a/autoMakefile.am b/autoMakefile.am +index 8c2eeb7f4c..8a2cd04953 100644 +--- a/autoMakefile.am ++++ b/autoMakefile.am +@@ -302,6 +302,101 @@ debs: undef.h debs_common + if test "x@systemdsystemunitdir@" != "x"; then \ + export DEB_BUILD_PROFILES="$${DEB_BUILD_PROFILES} systemd"; \ + fi; \ ++ if test "x@ENABLE_PINGER@" = "xyes"; then \ ++ export DEB_BUILD_PROFILES="$${DEB_BUILD_PROFILES} pinger"; \ ++ elif test "x@ENABLE_PINGER@" = "xno"; then \ ++ export DEB_BUILD_PROFILES="$${DEB_BUILD_PROFILES} nopinger"; \ ++ fi; \ ++ if test "x@ENABLE_CHECKSUM@" = "xyes"; then \ ++ export DEB_BUILD_PROFILES="$${DEB_BUILD_PROFILES} checksum"; \ ++ elif test "x@ENABLE_CHECKSUM@" = "xno"; then \ ++ export DEB_BUILD_PROFILES="$${DEB_BUILD_PROFILES} nochecksum"; \ ++ fi; \ ++ if test "x@ENABLE_FLOCK@" = "xyes"; then \ ++ export DEB_BUILD_PROFILES="$${DEB_BUILD_PROFILES} flock"; \ ++ elif test "x@ENABLE_FLOCK@" = "xno"; then \ ++ export DEB_BUILD_PROFILES="$${DEB_BUILD_PROFILES} noflock"; \ ++ fi; \ ++ if test "x@ENABLE_HEALTH_WRITE@" = "xyes"; then \ ++ export DEB_BUILD_PROFILES="$${DEB_BUILD_PROFILES} health_write"; \ ++ elif test "x@ENABLE_HEALTH_WRITE@" = "xno"; then \ 
++ export DEB_BUILD_PROFILES="$${DEB_BUILD_PROFILES} nohealth_write"; \ ++ fi; \ ++ if test "x@ENABLE_LRU_RESIZE@" = "xyes"; then \ ++ export DEB_BUILD_PROFILES="$${DEB_BUILD_PROFILES} lru-resize"; \ ++ elif test "x@ENABLE_LRU_RESIZE@" = "xno"; then \ ++ export DEB_BUILD_PROFILES="$${DEB_BUILD_PROFILES} nolru-resize"; \ ++ fi; \ ++ if test "x@ENABLE_MINDF@" = "xyes"; then \ ++ export DEB_BUILD_PROFILES="$${DEB_BUILD_PROFILES} mindf"; \ ++ elif test "x@ENABLE_MINDF@" = "xno"; then \ ++ export DEB_BUILD_PROFILES="$${DEB_BUILD_PROFILES} nomindf"; \ ++ fi; \ ++ if test "x@ENABLE_FAIL_ALLOC@" = "xyes"; then \ ++ export DEB_BUILD_PROFILES="$${DEB_BUILD_PROFILES} fail-alloc"; \ ++ elif test "x@ENABLE_FAIL_ALLOC@" = "xno"; then \ ++ export DEB_BUILD_PROFILES="$${DEB_BUILD_PROFILES} nofail-alloc"; \ ++ fi; \ ++ if test "x@ENABLE_INVARIANTS@" = "xyes"; then \ ++ export DEB_BUILD_PROFILES="$${DEB_BUILD_PROFILES} invariants"; \ ++ elif test "x@ENABLE_INVARIANTS@" = "xno"; then \ ++ export DEB_BUILD_PROFILES="$${DEB_BUILD_PROFILES} noinvariants"; \ ++ fi; \ ++ if test "x@ENABLE_LU_REF@" = "xyes"; then \ ++ export DEB_BUILD_PROFILES="$${DEB_BUILD_PROFILES} lu_ref"; \ ++ elif test "x@ENABLE_LU_REF@" = "xno"; then \ ++ export DEB_BUILD_PROFILES="$${DEB_BUILD_PROFILES} nolu_ref"; \ ++ fi; \ ++ if test "x@ENABLE_PGSTAT_TRACK@" = "xyes"; then \ ++ export DEB_BUILD_PROFILES="$${DEB_BUILD_PROFILES} pgstate-track"; \ ++ elif test "x@ENABLE_PGSTAT_TRACK@" = "xno"; then \ ++ export DEB_BUILD_PROFILES="$${DEB_BUILD_PROFILES} nopgstate-track"; \ ++ fi; \ ++ if test "x@ENABLE_LIBCFS_CDEBUG@" = "xyes"; then \ ++ export DEB_BUILD_PROFILES="$${DEB_BUILD_PROFILES} libcfs-cdebug"; \ ++ elif test "x@ENABLE_LIBCFS_CDEBUG@" = "xno"; then \ ++ export DEB_BUILD_PROFILES="$${DEB_BUILD_PROFILES} nolibcfs-cdebug"; \ ++ fi; \ ++ if test "x@ENABLE_LIBCFS_TRACE@" = "xyes"; then \ ++ export DEB_BUILD_PROFILES="$${DEB_BUILD_PROFILES} libcfs-trace"; \ ++ elif test "x@ENABLE_LIBCFS_TRACE@" = "xno"; then \ ++ 
export DEB_BUILD_PROFILES="$${DEB_BUILD_PROFILES} nolibcfs-trace"; \ ++ fi; \ ++ if test "x@ENABLE_LIBCFS_ASSERT@" = "xyes"; then \ ++ export DEB_BUILD_PROFILES="$${DEB_BUILD_PROFILES} libcfs-assert"; \ ++ elif test "x@ENABLE_LIBCFS_ASSERT@" = "xno"; then \ ++ export DEB_BUILD_PROFILES="$${DEB_BUILD_PROFILES} nolibcfs-assert"; \ ++ fi; \ ++ if test "x@ENABLE_PANIC_DUMPLOG@" = "xyes"; then \ ++ export DEB_BUILD_PROFILES="$${DEB_BUILD_PROFILES} panic_dumplog"; \ ++ elif test "x@ENABLE_PANIC_DUMPLOG@" = "xno"; then \ ++ export DEB_BUILD_PROFILES="$${DEB_BUILD_PROFILES} nopanic_dumplog"; \ ++ fi; \ ++ if test "x@ENABLE_READLINE@" = "xyes"; then \ ++ export DEB_BUILD_PROFILES="$${DEB_BUILD_PROFILES} readline"; \ ++ elif test "x@ENABLE_READLINE@" = "xno"; then \ ++ export DEB_BUILD_PROFILES="$${DEB_BUILD_PROFILES} noreadline"; \ ++ fi; \ ++ if test "x@ENABLE_LIBPTHREAD@" = "xyes"; then \ ++ export DEB_BUILD_PROFILES="$${DEB_BUILD_PROFILES} libpthread"; \ ++ elif test "x@ENABLE_LIBPTHREAD@" = "xno"; then \ ++ export DEB_BUILD_PROFILES="$${DEB_BUILD_PROFILES} nolibpthread"; \ ++ fi; \ ++ if test "x@ENABLE_BACKOFF@" = "xyes"; then \ ++ export DEB_BUILD_PROFILES="$${DEB_BUILD_PROFILES} backoff"; \ ++ elif test "x@ENABLE_BACKOFF@" = "xno"; then \ ++ export DEB_BUILD_PROFILES="$${DEB_BUILD_PROFILES} nobackoff"; \ ++ fi; \ ++ if test "x@ENABLE_GNI@" = "xyes"; then \ ++ export DEB_BUILD_PROFILES="$${DEB_BUILD_PROFILES} gni"; \ ++ elif test "x@ENABLE_GNI@" = "xno"; then \ ++ export DEB_BUILD_PROFILES="$${DEB_BUILD_PROFILES} nogni"; \ ++ fi; \ ++ if test "x@ENABLE_EFENCE@" = "xyes"; then \ ++ export DEB_BUILD_PROFILES="$${DEB_BUILD_PROFILES} efence"; \ ++ elif test "x@ENABLE_EFENCE@" = "xno"; then \ ++ export DEB_BUILD_PROFILES="$${DEB_BUILD_PROFILES} noefence"; \ ++ fi; \ + export KERNEL_OBJ="$(LINUX_OBJ)"; \ + export KERNEL_SRC="$(LINUX)"; \ + echo "Enabled Build Profiles: $${DEB_BUILD_PROFILES}"; \ +diff --git a/debian/rules b/debian/rules +index 074b59ac39..344f255c04 100755 
+--- a/debian/rules ++++ b/debian/rules +@@ -189,16 +189,18 @@ configure-stamp: autogen-stamp debian/control.main debian/control.modules.in + if echo "$${DEB_BUILD_PROFILES}" | grep -q "o2ib"; then \ + export EXTRAFLAGS="$${EXTRAFLAGS} --with-o2ib=$${O2IB_SRC}"; \ + fi; \ +- if echo "$${DEB_BUILD_PROFILES}" | grep -qw "gss"; then \ +- export EXTRAFLAGS="$${EXTRAFLAGS} --enable-gss"; \ +- elif echo "$${DEB_BUILD_PROFILES}" | grep -qw "nogss"; then \ +- export EXTRAFLAGS="$${EXTRAFLAGS} --disable-gss"; \ +- fi; \ +- if echo "$${DEB_BUILD_PROFILES}" | grep -qw "crypto"; then \ +- export EXTRAFLAGS="$${EXTRAFLAGS} --enable-crypto"; \ +- elif echo "$${DEB_BUILD_PROFILES}" | grep -qw "nocrypto"; then \ +- export EXTRAFLAGS="$${EXTRAFLAGS} --disable-crypto"; \ +- fi; \ ++ options="gss crypto pinger checksum flock health_write lru-resize"; \ ++ options="$${options} mindf fail-alloc invariants lu_ref pgstate-track"; \ ++ options="$${options} libcfs-cdebug libcfs-trace libcfs-assert"; \ ++ options="$${options} panic_dumplog readline libpthread"; \ ++ options="$${options} backoff gni efence"; \ ++ for option in $${options}; do \ ++ if echo "$${DEB_BUILD_PROFILES}" | grep -qw "$${option}"; then \ ++ export EXTRAFLAGS="$${EXTRAFLAGS} --enable-$${option}"; \ ++ elif echo "$${DEB_BUILD_PROFILES}" | grep -qw "no$${option}"; then \ ++ export EXTRAFLAGS="$${EXTRAFLAGS} --disable-$${option}"; \ ++ fi; \ ++ done; \ + if [ -f "$${CONFIG_CACHE_FILE}" ]; then \ + export TMP_CACHE_FILE=$$(mktemp); \ + sed -e "/ac_cv_env/d" \ +@@ -490,11 +492,18 @@ kdist_config: prep-deb-files patch-stamp + if echo "$${DEB_BUILD_PROFILES}" | grep -q "o2ib"; then \ + export EXTRAFLAGS="$${EXTRAFLAGS} --with-o2ib=$${O2IB_SRC}"; \ + fi; \ +- if echo "$${DEB_BUILD_PROFILES}" | grep -qw "crypto"; then \ +- export EXTRAFLAGS="$${EXTRAFLAGS} --enable-crypto"; \ +- elif echo "$${DEB_BUILD_PROFILES}" | grep -qw "nocrypto"; then \ +- export EXTRAFLAGS="$${EXTRAFLAGS} --disable-crypto"; \ +- fi; \ ++ options="gss 
crypto pinger checksum flock health_write lru-resize"; \ ++ options="$${options} mindf fail-alloc invariants lu_ref pgstate-track"; \ ++ options="$${options} libcfs-cdebug libcfs-trace libcfs-assert"; \ ++ options="$${options} panic_dumplog readline libpthread"; \ ++ options="$${options} backoff gni efence"; \ ++ for option in $${options}; do \ ++ if echo "$${DEB_BUILD_PROFILES}" | grep -qw "$${option}"; then \ ++ export EXTRAFLAGS="$${EXTRAFLAGS} --enable-$${option}"; \ ++ elif echo "$${DEB_BUILD_PROFILES}" | grep -qw "no$${option}"; then \ ++ export EXTRAFLAGS="$${EXTRAFLAGS} --disable-$${option}"; \ ++ fi; \ ++ done; \ + if [ -f "$${CONFIG_CACHE_FILE}" ]; then \ + export TMP_CACHE_FILE=$$(mktemp --tmpdir newconfig-XXXXXXXX.cache); \ + sed -e "/ac_cv_env/d" \ +diff --git a/libcfs/autoconf/lustre-libcfs.m4 b/libcfs/autoconf/lustre-libcfs.m4 +index 66c97eb044..2e99cff179 100644 +--- a/libcfs/autoconf/lustre-libcfs.m4 ++++ b/libcfs/autoconf/lustre-libcfs.m4 +@@ -10,8 +10,12 @@ AC_ARG_ENABLE([libcfs_cdebug], + [disable libcfs CDEBUG, CWARN]), + [], [enable_libcfs_cdebug="yes"]) + AC_MSG_RESULT([$enable_libcfs_cdebug]) +-AS_IF([test "x$enable_libcfs_cdebug" = xyes], +- [AC_DEFINE(CDEBUG_ENABLED, 1, [enable libcfs CDEBUG, CWARN])]) ++AS_IF([test "x$enable_libcfs_cdebug" = xyes], [ ++ AC_DEFINE(CDEBUG_ENABLED, 1, [enable libcfs CDEBUG, CWARN]) ++ AC_SUBST(ENABLE_LIBCFS_CDEBUG, yes) ++], [ ++ AC_SUBST(ENABLE_LIBCFS_CDEBUG, no) ++]) + + AC_MSG_CHECKING([whether to enable ENTRY/EXIT]) + AC_ARG_ENABLE([libcfs_trace], +@@ -19,8 +23,12 @@ AC_ARG_ENABLE([libcfs_trace], + [disable libcfs ENTRY/EXIT]), + [], [enable_libcfs_trace="yes"]) + AC_MSG_RESULT([$enable_libcfs_trace]) +-AS_IF([test "x$enable_libcfs_trace" = xyes], +- [AC_DEFINE(CDEBUG_ENTRY_EXIT, 1, [enable libcfs ENTRY/EXIT])]) ++AS_IF([test "x$enable_libcfs_trace" = xyes], [ ++ AC_DEFINE(CDEBUG_ENTRY_EXIT, 1, [enable libcfs ENTRY/EXIT]) ++ AC_SUBST(ENABLE_LIBCFS_TRACE, yes) ++], [ ++ AC_SUBST(ENABLE_LIBCFS_TRACE, no) 
++]) + + AC_MSG_CHECKING([whether to enable LASSERT, LASSERTF]) + AC_ARG_ENABLE([libcfs_assert], +@@ -28,8 +36,12 @@ AC_ARG_ENABLE([libcfs_assert], + [disable libcfs LASSERT, LASSERTF]), + [], [enable_libcfs_assert="yes"]) + AC_MSG_RESULT([$enable_libcfs_assert]) +-AS_IF([test x$enable_libcfs_assert = xyes], +- [AC_DEFINE(LIBCFS_DEBUG, 1, [enable libcfs LASSERT, LASSERTF])]) ++AS_IF([test x$enable_libcfs_assert = xyes], [ ++ AC_DEFINE(LIBCFS_DEBUG, 1, [enable libcfs LASSERT, LASSERTF]) ++ AC_SUBST(ENABLE_LIBCFS_ASSERT, yes) ++], [ ++ AC_SUBST(ENABLE_LIBCFS_ASSERT, no) ++]) + ]) # LIBCFS_CONFIG_CDEBUG + + # +@@ -44,8 +56,12 @@ AC_ARG_ENABLE([panic_dumplog], + [enable panic_dumplog]), + [], [enable_panic_dumplog="no"]) + AC_MSG_RESULT([$enable_panic_dumplog]) +-AS_IF([test "x$enable_panic_dumplog" = xyes], +- [AC_DEFINE(LNET_DUMP_ON_PANIC, 1, [use dumplog on panic])]) ++AS_IF([test "x$enable_panic_dumplog" = xyes], [ ++ AC_DEFINE(LNET_DUMP_ON_PANIC, 1, [use dumplog on panic]) ++ AC_SUBST(ENABLE_PANIC_DUMPLOG, yes) ++], [ ++ AC_SUBST(ENABLE_PANIC_DUMPLOG, no) ++]) + ]) # LIBCFS_CONFIG_PANIC_DUMPLOG + + # +@@ -2534,6 +2550,10 @@ AS_IF([test "x$enable_readline" = xyes], [ + AC_DEFINE(HAVE_LIBREADLINE, 1, + [readline library is available]) + ]) ++ ++ AC_SUBST(ENABLE_READLINE, yes) ++], [ ++ AC_SUBST(ENABLE_READLINE, no) + ]) + AC_SUBST(LIBREADLINE) + +@@ -2551,7 +2571,10 @@ AS_IF([test "x$enable_libpthread" = xyes], [ + AC_DEFINE([HAVE_LIBPTHREAD], 1, + [use libpthread for libcfs library]) + ]) ++ ++ AC_SUBST(ENABLE_LIBPTHREAD, yes) + ], [ ++ AC_SUBST(ENABLE_LIBPTHREAD, no) + AC_MSG_WARN([Using libpthread for libcfs library is disabled explicitly]) + ]) + AC_SUBST(PTHREAD_LIBS) +diff --git a/lnet/autoconf/lustre-lnet.m4 b/lnet/autoconf/lustre-lnet.m4 +index 05dd808a6a..da6fd582b2 100644 +--- a/lnet/autoconf/lustre-lnet.m4 ++++ b/lnet/autoconf/lustre-lnet.m4 +@@ -20,6 +20,11 @@ AS_IF([test "x$enable_backoff" = xyes], [ + ], [ + AC_MSG_RESULT([no]) + ]) ++ ++ 
AC_SUBST(ENABLE_BACKOFF, yes) ++], [ ++ AC_SUBST(ENABLE_BACKOFF, no) ++ + ]) + ]) # LN_CONFIG_BACKOFF + +@@ -681,6 +686,10 @@ AS_IF([test "x$enable_gni" = xyes], [ + GNICPPFLAGS="$GNICPPFLAGS -DGNILND_USE_RCA=1" + ]) + EXTRA_KCFLAGS="$EXTRA_KCFLAGS_save" ++ ++ AC_SUBST(ENABLE_GNI, yes) ++], [ ++ AC_SUBST(ENABLE_GNI, no) + ]) + AC_SUBST(GNICPPFLAGS) + AC_SUBST(GNILND) +@@ -1020,6 +1029,9 @@ AS_IF([test "$enable_efence" = yes], [ + LIBEFENCE="-lefence" + AC_DEFINE(HAVE_LIBEFENCE, 1, + [libefence support is requested]) ++ AC_SUBST(ENABLE_EFENCE, yes) ++], [ ++ AC_SUBST(ENABLE_EFENCE, no) + ]) + AC_SUBST(LIBEFENCE) + +diff --git a/lustre/autoconf/lustre-core.m4 b/lustre/autoconf/lustre-core.m4 +index 524645f708..2927df790c 100644 +--- a/lustre/autoconf/lustre-core.m4 ++++ b/lustre/autoconf/lustre-core.m4 +@@ -137,8 +137,12 @@ AC_ARG_ENABLE([pinger], + [disable recovery pinger support]), + [], [enable_pinger="yes"]) + AC_MSG_RESULT([$enable_pinger]) +-AS_IF([test "x$enable_pinger" != xno], +- [AC_DEFINE(CONFIG_LUSTRE_FS_PINGER, 1,[Use the Pinger])]) ++AS_IF([test "x$enable_pinger" != xno], [ ++ AC_DEFINE(CONFIG_LUSTRE_FS_PINGER, 1, [Use the Pinger]) ++ AC_SUBST(ENABLE_PINGER, yes) ++], [ ++ AC_SUBST(ENABLE_PINGER, no) ++]) + ]) # LC_CONFIG_PINGER + + # +@@ -153,8 +157,12 @@ AC_ARG_ENABLE([checksum], + [disable data checksum support]), + [], [enable_checksum="yes"]) + AC_MSG_RESULT([$enable_checksum]) +-AS_IF([test "x$enable_checksum" != xno], +- [AC_DEFINE(ENABLE_CHECKSUM, 1, [do data checksums])]) ++AS_IF([test "x$enable_checksum" != xno], [ ++ AC_DEFINE(CONFIG_ENABLE_CHECKSUM, 1, [do data checksums]) ++ AC_SUBST(ENABLE_CHECKSUM, yes) ++], [ ++ AC_SUBST(ENABLE_CHECKSUM, no) ++]) + ]) # LC_CONFIG_CHECKSUM + + # +@@ -169,8 +177,12 @@ AC_ARG_ENABLE([flock], + [disable flock by default]), + [], [enable_flock="yes"]) + AC_MSG_RESULT([$enable_flock]) +-AS_IF([test "x$enable_flock" != xno], +- [AC_DEFINE(ENABLE_FLOCK, 1, [enable flock by default])]) ++AS_IF([test 
"x$enable_flock" != xno], [ ++ AC_DEFINE(CONFIG_ENABLE_FLOCK, 1, [enable flock by default]) ++ AC_SUBST(ENABLE_FLOCK, yes) ++], [ ++ AC_SUBST(ENABLE_FLOCK, no) ++]) + ]) # LC_CONFIG_FLOCK + + # +@@ -185,8 +197,12 @@ AC_ARG_ENABLE([health_write], + [enable disk writes when doing health check]), + [], [enable_health_write="no"]) + AC_MSG_RESULT([$enable_health_write]) +-AS_IF([test "x$enable_health_write" != xno], +- [AC_DEFINE(USE_HEALTH_CHECK_WRITE, 1, [Write when Checking Health])]) ++AS_IF([test "x$enable_health_write" != xno], [ ++ AC_DEFINE(USE_HEALTH_CHECK_WRITE, 1, [Write when Checking Health]) ++ AC_SUBST(ENABLE_HEALTH_WRITE, yes) ++], [ ++ AC_SUBST(ENABLE_HEALTH_WRITE, no) ++]) + ]) # LC_CONFIG_HEALTH_CHECK_WRITE + + # +@@ -199,8 +215,12 @@ AC_ARG_ENABLE([lru_resize], + [enable lru resize support]), + [], [enable_lru_resize="yes"]) + AC_MSG_RESULT([$enable_lru_resize]) +-AS_IF([test "x$enable_lru_resize" != xno], +- [AC_DEFINE(HAVE_LRU_RESIZE_SUPPORT, 1, [Enable lru resize support])]) ++AS_IF([test "x$enable_lru_resize" != xno], [ ++ AC_DEFINE(HAVE_LRU_RESIZE_SUPPORT, 1, [Enable lru resize support]) ++ AC_SUBST(ENABLE_LRU_RESIZE, yes) ++], [ ++ AC_SUBST(ENABLE_LRU_RESIZE, no) ++]) + ]) # LC_CONFIG_LRU_RESIZE + + # +@@ -3311,8 +3331,12 @@ AC_ARG_ENABLE([mindf], + [Make statfs report the minimum available space on any single OST instead of the sum of free space on all OSTs]), + [], [enable_mindf="no"]) + AC_MSG_RESULT([$enable_mindf]) +-AS_IF([test "$enable_mindf" = "yes"], +- [AC_DEFINE([MIN_DF], 1, [Report minimum OST free space])]) ++AS_IF([test "$enable_mindf" = "yes"], [ ++ AC_DEFINE([MIN_DF], 1, [Report minimum OST free space]) ++ AC_SUBST(ENABLE_MINDF, yes) ++], [ ++ AC_SUBST(ENABLE_MINDF, no) ++]) + + AC_MSG_CHECKING([whether to randomly failing memory alloc]) + AC_ARG_ENABLE([fail_alloc], +@@ -3320,9 +3344,12 @@ AC_ARG_ENABLE([fail_alloc], + [disable randomly alloc failure]), + [], [enable_fail_alloc="yes"]) + AC_MSG_RESULT([$enable_fail_alloc]) 
+-AS_IF([test "x$enable_fail_alloc" != xno], +- [AC_DEFINE([RANDOM_FAIL_ALLOC], 1, +- [enable randomly alloc failure])]) ++AS_IF([test "x$enable_fail_alloc" != xno], [ ++ AC_DEFINE([RANDOM_FAIL_ALLOC], 1, [enable randomly alloc failure]) ++ AC_SUBST(ENABLE_FAIL_ALLOC, yes) ++], [ ++ AC_SUBST(ENABLE_FAIL_ALLOC, no) ++]) + + AC_MSG_CHECKING([whether to check invariants (expensive cpu-wise)]) + AC_ARG_ENABLE([invariants], +@@ -3330,9 +3357,13 @@ AC_ARG_ENABLE([invariants], + [enable invariant checking (cpu intensive)]), + [], [enable_invariants="no"]) + AC_MSG_RESULT([$enable_invariants]) +-AS_IF([test "x$enable_invariants" = xyes], +- [AC_DEFINE([CONFIG_LUSTRE_DEBUG_EXPENSIVE_CHECK], 1, +- [enable invariant checking])]) ++AS_IF([test "x$enable_invariants" = xyes], [ ++ AC_DEFINE([CONFIG_LUSTRE_DEBUG_EXPENSIVE_CHECK], 1, ++ [enable invariant checking]) ++ AC_SUBST(ENABLE_INVARIANTS, yes) ++], [ ++ AC_SUBST(ENABLE_INVARIANTS, no) ++]) + + AC_MSG_CHECKING([whether to track references with lu_ref]) + AC_ARG_ENABLE([lu_ref], +@@ -3340,9 +3371,13 @@ AC_ARG_ENABLE([lu_ref], + [enable lu_ref reference tracking code]), + [], [enable_lu_ref="no"]) + AC_MSG_RESULT([$enable_lu_ref]) +-AS_IF([test "x$enable_lu_ref" = xyes], +- [AC_DEFINE([CONFIG_LUSTRE_DEBUG_LU_REF], 1, +- [enable lu_ref reference tracking code])]) ++AS_IF([test "x$enable_lu_ref" = xyes], [ ++ AC_DEFINE([CONFIG_LUSTRE_DEBUG_LU_REF], 1, ++ [enable lu_ref reference tracking code]) ++ AC_SUBST(ENABLE_LU_REF, yes) ++], [ ++ AC_SUBST(ENABLE_LU_REF, no) ++]) + + AC_MSG_CHECKING([whether to enable page state tracking]) + AC_ARG_ENABLE([pgstate-track], +@@ -3350,9 +3385,13 @@ AC_ARG_ENABLE([pgstate-track], + [enable page state tracking]), + [], [enable_pgstat_track="no"]) + AC_MSG_RESULT([$enable_pgstat_track]) +-AS_IF([test "x$enable_pgstat_track" = xyes], +- [AC_DEFINE([CONFIG_DEBUG_PAGESTATE_TRACKING], 1, +- [enable page state tracking code])]) ++AS_IF([test "x$enable_pgstat_track" = xyes], [ ++ 
AC_DEFINE([CONFIG_DEBUG_PAGESTATE_TRACKING], 1, ++ [enable page state tracking code]) ++ AC_SUBST(ENABLE_PGSTAT_TRACK, yes) ++], [ ++ AC_SUBST(ENABLE_PGSTAT_TRACK, no) ++]) + + PKG_PROG_PKG_CONFIG + AC_MSG_CHECKING([systemd unit file directory]) +diff --git a/lustre/ldlm/ldlm_lib.c b/lustre/ldlm/ldlm_lib.c +index bf61555c33..1d7a29d76f 100644 +--- a/lustre/ldlm/ldlm_lib.c ++++ b/lustre/ldlm/ldlm_lib.c +@@ -465,7 +465,7 @@ int client_obd_setup(struct obd_device *obd, struct lustre_cfg *lcfg) + + cli->cl_supp_cksum_types = OBD_CKSUM_CRC32; + cli->cl_preferred_cksum_type = 0; +-#ifdef ENABLE_CHECKSUM ++#ifdef CONFIG_ENABLE_CHECKSUM + /* Turn on checksumming by default. */ + cli->cl_checksum = 1; + /* +diff --git a/lustre/llite/llite_lib.c b/lustre/llite/llite_lib.c +index dfb6b185b2..f67ea3f7eb 100644 +--- a/lustre/llite/llite_lib.c ++++ b/lustre/llite/llite_lib.c +@@ -162,10 +162,10 @@ static struct ll_sb_info *ll_init_sbi(void) + atomic_set(&sbi->ll_ra_info.ra_async_inflight, 0); + + set_bit(LL_SBI_VERBOSE, sbi->ll_flags); +-#ifdef ENABLE_CHECKSUM ++#ifdef CONFIG_ENABLE_CHECKSUM + set_bit(LL_SBI_CHECKSUM, sbi->ll_flags); + #endif +-#ifdef ENABLE_FLOCK ++#ifdef CONFIG_ENABLE_FLOCK + set_bit(LL_SBI_FLOCK, sbi->ll_flags); + #endif + +-- +2.33.0 + diff --git a/0030-LU-15193-quota-expand-QUOTA_MAX_TRANSIDS-to-12.patch b/0030-LU-15193-quota-expand-QUOTA_MAX_TRANSIDS-to-12.patch new file mode 100644 index 0000000000000000000000000000000000000000..8e3843fc1a341a5e9af9e7be47304a8e8bcc4ce3 --- /dev/null +++ b/0030-LU-15193-quota-expand-QUOTA_MAX_TRANSIDS-to-12.patch @@ -0,0 +1,89 @@ +From c20d23cd92c5bc748a618e9ed96e6eddd794ab45 Mon Sep 17 00:00:00 2001 +From: Lei Feng +Date: Thu, 4 Nov 2021 19:41:06 +0800 +Subject: [PATCH 30/61] LU-15193 quota: expand QUOTA_MAX_TRANSIDS to 12 + +In some rare cases 12 quota ids are needed. +Usually (user, group) * (block, inode) * (inode, parent) = 8 qids +are needed. 
But with project id, +(user, group, project) * (block, inode) * (inode, parent) = 12 qids +are needed. + +Lustre-change: https://review.whamcloud.com/45456 +Lustre-commit: 61481796ac85e9ab2469b8d2f4cc75088c65d298 + +Change-Id: I4b3ee197f6e274abda06edf60b246f089fe28d10 +Signed-off-by: Lei Feng +Test-Parameters: trivial testlist=sanity-quota +Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/49611 +Tested-by: jenkins +Tested-by: Maloo +Reviewed-by: Stephane Thiell +Reviewed-by: Oleg Drokin +Reviewed-by: Andreas Dilger +--- + lustre/include/lustre_quota.h | 5 +++-- + lustre/tests/sanity-quota.sh | 31 +++++++++++++++++++++++++++++++ + 2 files changed, 34 insertions(+), 2 deletions(-) + +diff --git a/lustre/include/lustre_quota.h b/lustre/include/lustre_quota.h +index 4b674d8b12..6b98f4c9a2 100644 +--- a/lustre/include/lustre_quota.h ++++ b/lustre/include/lustre_quota.h +@@ -234,8 +234,9 @@ struct lquota_id_info { + * a single transaction for inode and block quota, which is chown transaction: + * original uid and gid, new uid and gid. + * +- * This value might need to be revised when directory quota is added. */ +-#define QUOTA_MAX_TRANSIDS 8 ++ * Given a parent dir and a sub dir, with different uid, gid and project id, ++ * need (inode, parent) x (user, group, project) x (block, inode) = 12 ids */ ++#define QUOTA_MAX_TRANSIDS 12 + + /* all qids involved in a single transaction */ + struct lquota_trans { +diff --git a/lustre/tests/sanity-quota.sh b/lustre/tests/sanity-quota.sh +index f36601ce5d..5b9ede2d26 100755 +--- a/lustre/tests/sanity-quota.sh ++++ b/lustre/tests/sanity-quota.sh +@@ -5414,6 +5414,37 @@ test_81() { + } + run_test 81 "Race qmt_start_pool_recalc with qmt_pool_free" + ++test_82() ++{ ++ (( $MDS1_VERSION >= $(version_code 2.14.55) )) || ++ skip "need MDS 2.14.55 or later" ++ is_project_quota_supported || ++ skip "skip project quota unsupported" ++ ++ setup_quota_test || error "setup quota failed with $?" 
++ stack_trap cleanup_quota_test ++ quota_init ++ ++ local parent_dir="$DIR/$tdir.parent" ++ local child_dir="$parent_dir/child" ++ ++ mkdir -p $child_dir ++ stack_trap "chown -R 0:0 $parent_dir" ++ ++ chown $TSTUSR:$TSTUSR $parent_dir || ++ error "failed to chown on $parent_dir" ++ chown $TSTUSR2:$TSTUSRS2 $child_dir || ++ error "failed to chown on $parent_dir" ++ ++ $LFS project -p 1000 $parent_dir || ++ error "failed to set project id on $parent_dir" ++ $LFS project -p 1001 $child_dir || ++ error "failed to set project id on $child_dir" ++ ++ rmdir $child_dir || error "cannot remove child dir, test failed" ++} ++run_test 82 "verify more than 8 qids for single operation" ++ + quota_fini() + { + do_nodes $(comma_list $(nodes_list)) \ +-- +2.33.0 + diff --git a/0031-LU-16916-tests-fix-client_evicted-not-to-ignore-EOPN.patch b/0031-LU-16916-tests-fix-client_evicted-not-to-ignore-EOPN.patch new file mode 100644 index 0000000000000000000000000000000000000000..a8c67788c0e16f7b7473b3f8c5969e1b944b0851 --- /dev/null +++ b/0031-LU-16916-tests-fix-client_evicted-not-to-ignore-EOPN.patch @@ -0,0 +1,76 @@ +From ff9e29d3b534959261950e45595f970d7f39213f Mon Sep 17 00:00:00 2001 +From: Jian Yu +Date: Fri, 14 Jul 2023 13:22:18 +0800 +Subject: [PATCH 31/61] LU-16916 tests: fix client_evicted() not to ignore + EOPNOTSUPP + +After RHEL 9.x or Ubuntu 22.04 client is evicted, "lfs df" returns +error code 95 (EOPNOTSUPP), which is ignored in check_lfs_df_ret_val() +and then causes client_evicted() to ignore that error. + +This patch fixes client_evicted() to check the return value +from "lfs df" directly so as not to ignore EOPNOTSUPP. 
+ +Lustre-change: https://review.whamcloud.com/51667 +Lustre-commit: a5a9ded43b72238c2df8e0a74f03151ea3d4ce99 + +Test-Parameters: trivial clientdistro=el9.2 testlist=replay-vbr +Test-Parameters: trivial clientdistro=el8.8 testlist=replay-vbr + +Change-Id: I633ae8769fc563b8068f433e2afae29463ac5553 +Signed-off-by: Jian Yu +Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/51668 +Reviewed-by: Oleg Drokin +Reviewed-by: Andreas Dilger +Reviewed-by: Arshad Hussain +Tested-by: Maloo +Tested-by: jenkins +--- + lustre/tests/test-framework.sh | 16 +++++++++++++--- + 1 file changed, 13 insertions(+), 3 deletions(-) + +diff --git a/lustre/tests/test-framework.sh b/lustre/tests/test-framework.sh +index 2c92993117..6126541ef4 100755 +--- a/lustre/tests/test-framework.sh ++++ b/lustre/tests/test-framework.sh +@@ -3583,16 +3583,25 @@ wait_remote_prog () { + return $rc + } + +-lfs_df_check() { ++_lfs_df_check() { + local clients=${1:-$CLIENTS} + local rc=0 + +- if [ -z "$clients" ]; then ++ if [[ -z "$clients" ]]; then + $LFS df $MOUNT > /dev/null || rc=$? + else + $PDSH $clients "$LFS df $MOUNT" > /dev/null || rc=$? + fi + ++ return $rc ++} ++ ++lfs_df_check() { ++ local clients=${1:-$CLIENTS} ++ local rc=0 ++ ++ _lfs_df_check "$clients" || rc=$? ++ + check_lfs_df_ret_val $rc + } + +@@ -3622,7 +3631,8 @@ client_up() { + } + + client_evicted() { +- ! client_up $1 ++ sleep 1 ++ ! 
_lfs_df_check $1 + } + + client_reconnect_try() { +-- +2.33.0 + diff --git a/0032-LU-16626-build-remove-python2-dependencies.patch b/0032-LU-16626-build-remove-python2-dependencies.patch new file mode 100644 index 0000000000000000000000000000000000000000..5c1451ad878660d49a0b1f734482c1dd7dc0b611 --- /dev/null +++ b/0032-LU-16626-build-remove-python2-dependencies.patch @@ -0,0 +1,108 @@ +From 53f40033ccf9eeb4155b18f4046f33b19252aa18 Mon Sep 17 00:00:00 2001 +From: Alex Deiter +Date: Thu, 9 Mar 2023 18:09:19 +0400 +Subject: [PATCH 32/61] LU-16626 build: remove python2 dependencies + +Fixed packaging issue caused by zfsobj2fid script. + +Lustre-change: https://review.whamcloud.com/50241 +Lustre-commit: 404a1e827b0a9d86864695c8699e1ca076be6c9d + +Test-Parameters: trivial +Signed-off-by: Alex Deiter +Change-Id: I4375038b0d2c2b42ac4080fe834d35bdd3ef54f8 +Reviewed-by: Minh Diep +Reviewed-by: Jian Yu +Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/51426 +Reviewed-by: Oleg Drokin +Tested-by: jenkins +Tested-by: Maloo +--- + lustre/scripts/zfsobj2fid | 47 ++++++++++++++++++++++----------------- + 1 file changed, 26 insertions(+), 21 deletions(-) + +diff --git a/lustre/scripts/zfsobj2fid b/lustre/scripts/zfsobj2fid +index f7ae96bd17..a76095e1de 100755 +--- a/lustre/scripts/zfsobj2fid ++++ b/lustre/scripts/zfsobj2fid +@@ -1,4 +1,4 @@ +-#!/usr/bin/env python2 ++#!/usr/bin/env python3 + + # Copyright (c) 2014, Lawrence Livermore National Security, LLC. + # Produced at the Lawrence Livermore National Laboratory. 
+@@ -27,21 +27,31 @@ + import sys + import subprocess + +-def from_bytes(b): +- return sum(b[i] << i*8 for i in range(len(b))) ++def from_bytes(data): ++ return hex(sum(data[i] << i * 8 for i in range(len(data)))) + + def main(): + if len(sys.argv) != 3: +- print "Usage:", sys.argv[0], " " ++ print('Usage:', sys.argv[0], '', '') + return 1 + +- p = subprocess.Popen(["zdb", "-e", "-vvv", sys.argv[1], sys.argv[2]], +- stdout=subprocess.PIPE) +- pout, perr = p.communicate() ++ cmd = ['zdb', '-e', '-vvv', sys.argv[1], sys.argv[2]] ++ process = subprocess.Popen(cmd, ++ stdout=subprocess.PIPE, ++ stderr=subprocess.PIPE, ++ universal_newlines=True) ++ stdout, stderr = process.communicate() ++ result = process.returncode + +- b = bytearray() ++ if result != 0: ++ msg = 'Error %d on %s: %s %s' % (result, cmd, stdout, stderr) ++ raise RuntimeError(msg) ++ ++ lines = stdout.splitlines() ++ data = bytearray() + found_fid = False +- for line in pout.split('\n'): ++ ++ for line in lines: + part = line.split() + if not part or part[0] != 'trusted.fid': + continue +@@ -53,24 +63,19 @@ def main(): + if val == '\\': + val = fid[0:3] + fid = fid[3:] +- b.append(int(val, 8)) ++ data.append(int(val, 8)) + else: +- b.append(ord(val)) ++ data.append(ord(val)) + break + + if not found_fid: +- print "FID not found on", sys.argv[1], sys.argv[2] ++ print('FID not found on', sys.argv[1], sys.argv[2]) + return 1 + +- print '[' \ +- + hex(from_bytes(b[0:8])) \ +- + ':' \ +- + hex(from_bytes(b[8:12])) \ +- + ':' \ +- + hex(from_bytes(b[12:16])) \ +- + ']' +- ++ print('[%s:%s:%s]' % (from_bytes(data[0:8]), ++ from_bytes(data[8:12]), ++ from_bytes(data[12:16]))) + return 0 + + if __name__ == '__main__': +- sys.exit(main()) ++ sys.exit(main()) +-- +2.33.0 + diff --git a/0033-LU-16943-tests-use-primary-ost1-server-in-replay-sin.patch b/0033-LU-16943-tests-use-primary-ost1-server-in-replay-sin.patch new file mode 100644 index 
0000000000000000000000000000000000000000..e55124be8d0be22b443f2daa40e45ee341a94962 --- /dev/null +++ b/0033-LU-16943-tests-use-primary-ost1-server-in-replay-sin.patch @@ -0,0 +1,48 @@ +From db9108300af2949a7ff334973f3df4486cec4a6a Mon Sep 17 00:00:00 2001 +From: Jian Yu +Date: Wed, 23 Aug 2023 18:01:20 -0700 +Subject: [PATCH 33/61] LU-16943 tests: use primary ost1 server in + replay-single/135 + +This patch fixes replay-single test_135() to make sure +the primary ost1 server is used at the beginning of the test. + +Lustre-change: https://review.whamcloud.com/52058 +Lustre-commit: cdd8b056bff0d48155eaf4b7732d1d8880ceda55 + +Test-Parameters: trivial testlist=replay-single + +Test-Parameters: trivial env=FAILURE_MODE=HARD \ + clientcount=4 mdtcount=1 mdscount=2 osscount=2 \ + austeroptions=-R failover=true iscsi=1 \ + testlist=replay-single,mmp + +Fixes: 81418be83ed8 ("LU-16943 tests: fix replay-single/135 under hard failure mode") +Change-Id: Ia25314255c9f00ba71687e1f757517f37031caed +Signed-off-by: Jian Yu +Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/52059 +Reviewed-by: Oleg Drokin +Reviewed-by: Alex Deiter +Tested-by: jenkins +Tested-by: Maloo +--- + lustre/tests/replay-single.sh | 3 +++ + 1 file changed, 3 insertions(+) + +diff --git a/lustre/tests/replay-single.sh b/lustre/tests/replay-single.sh +index f631b73125..c02bcc2c18 100755 +--- a/lustre/tests/replay-single.sh ++++ b/lustre/tests/replay-single.sh +@@ -4958,6 +4958,9 @@ run_test 134 "replay creation of a file created in a pool" + + # LU-14027 + test_135() { ++ # make sure we are using the primary server ++ [[ $(facet_active ost1) == "ost1" ]] || fail ost1 ++ + mkdir $DIR/$tdir || error "mkdir $DIR/$tdir failed" + + # All files to ost1 +-- +2.33.0 + diff --git a/0034-LU-16585-build-remove-python2-dependencies.patch b/0034-LU-16585-build-remove-python2-dependencies.patch new file mode 100644 index 0000000000000000000000000000000000000000..48f695d4e5e2dc89eab280c0a0f572acd8180e3f --- 
/dev/null +++ b/0034-LU-16585-build-remove-python2-dependencies.patch @@ -0,0 +1,164 @@ +From cfc219ced11adf837e3bd1e356217f33cbe68317 Mon Sep 17 00:00:00 2001 +From: Alex Deiter +Date: Wed, 22 Feb 2023 02:27:47 +0400 +Subject: [PATCH 34/61] LU-16585 build: remove python2 dependencies + +Fixed packaging issue caused by scripts and control files. + +Lustre-change: https://review.whamcloud.com/50084 +Lustre-commit: bea3f81f84fd16d2d403682ef25b8abe314acd0f + +Test-Parameters: trivial +Signed-off-by: Alex Deiter +Change-Id: I6c9b24bf811269928494af17c15627902e5fe27b +Reviewed-by: Patrick Farrell +Reviewed-by: Feng Lei +Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/52176 +Tested-by: jenkins +Tested-by: Maloo +Reviewed-by: Oleg Drokin +--- + contrib/scripts/gerrit_checkpatch.py | 29 ++++++++++++++-------------- + debian/control | 2 +- + debian/control.main | 2 +- + 3 files changed, 17 insertions(+), 16 deletions(-) + +diff --git a/contrib/scripts/gerrit_checkpatch.py b/contrib/scripts/gerrit_checkpatch.py +index 9dbf1383ca..1563413abb 100755 +--- a/contrib/scripts/gerrit_checkpatch.py ++++ b/contrib/scripts/gerrit_checkpatch.py +@@ -1,4 +1,4 @@ +-#!/usr/bin/env python2 ++#!/usr/bin/env python3 + # + # GPL HEADER START + # +@@ -33,6 +33,7 @@ Gerrit Checkpatch Reviewer Daemon + * POST reviews back to gerrit based on checkpatch output. 
+ """ + ++from __future__ import print_function + import base64 + import fnmatch + import logging +@@ -41,7 +42,7 @@ import os + import requests + import subprocess + import time +-import urllib ++from six.moves.urllib.parse import quote + + def _getenv_list(key, default=None, sep=':'): + """ +@@ -144,7 +145,7 @@ def parse_checkpatch_output(out, path_line_comments, warning_count): + except ValueError: + level, kind, message = None, None, None + +- if level != 'ERROR' and level != 'WARNING': ++ if level not in ('ERROR', 'WARNING'): + level, kind, message = None, None, None + + +@@ -155,9 +156,9 @@ def review_input_and_score(path_line_comments, warning_count): + """ + review_comments = {} + +- for path, line_comments in path_line_comments.iteritems(): ++ for path, line_comments in list(path_line_comments.items()): + path_comments = [] +- for line, comment_list in line_comments.iteritems(): ++ for line, comment_list in list(line_comments.items()): + message = '\n'.join(comment_list) + path_comments.append({'line': line, 'message': message}) + review_comments[path] = path_comments +@@ -194,7 +195,7 @@ def review_input_and_score(path_line_comments, warning_count): + + def _now(): + """_""" +- return long(time.time()) ++ return int(time.time()) + + + class Reviewer(object): +@@ -214,7 +215,7 @@ class Reviewer(object): + self.history_path = history_path + self.history_mode = 'rw' + self.history = {} +- self.timestamp = 0L ++ self.timestamp = 0 + self.post_enabled = True + self.post_interval = 10 + self.update_interval = 300 +@@ -292,7 +293,7 @@ class Reviewer(object): + for line in history_file: + epoch, change_id, revision, score = line.split() + if change_id == '-': +- self.timestamp = long(float(epoch)) ++ self.timestamp = int(float(epoch)) + else: + self.history[change_id + ' ' + revision] = score + +@@ -311,7 +312,7 @@ class Reviewer(object): + + if 'w' in self.history_mode: + with open(self.history_path, 'a') as history_file: +- print >> history_file, epoch, 
change_id, revision, score ++ print(epoch, change_id, revision, score, file=history_file) + + def in_history(self, change_id, revision): + """ +@@ -323,8 +324,8 @@ class Reviewer(object): + """ + GET one change by id. + """ +- path = ('/changes/' + urllib.quote(self.project, safe='') + '~' + +- urllib.quote(self.branch, safe='') + '~' + change_id + ++ path = ('/changes/' + quote(self.project, safe='') + '~' + ++ quote(self.branch, safe='') + '~' + change_id + + '?o=CURRENT_REVISION') + res = self._get(path) + if not res: +@@ -343,11 +344,11 @@ class Reviewer(object): + """ + query = dict(query) + project = query.get('project', self.project) +- query['project'] = urllib.quote(project, safe='') ++ query['project'] = quote(project, safe='') + branch = query.get('branch', self.branch) +- query['branch'] = urllib.quote(branch, safe='') ++ query['branch'] = quote(branch, safe='') + path = ('/changes/?q=' + +- '+'.join(k + ':' + v for k, v in query.iteritems()) + ++ '+'.join(k + ':' + v for k, v in list(query.items())) + + '&o=CURRENT_REVISION') + res = self._get(path) + if not res: +diff --git a/debian/control b/debian/control +index ac91ac2f24..836623edc4 100644 +--- a/debian/control ++++ b/debian/control +@@ -72,7 +72,7 @@ Package: lustre-iokit + Section: utils + Architecture: i386 armhf powerpc ppc64el amd64 ia64 arm64 + Priority: optional +-Depends: lustre-client-utils (= ${binary:Version}), python2, perl, sg3-utils ++Depends: lustre-client-utils (= ${binary:Version}), python3, perl, sg3-utils + Description: Collection of benchmark tools for the Lustre filesystem + Lustre is a scalable, secure, robust, highly-available cluster file system. 
+ This release is maintained by Whamcloud and available from +diff --git a/debian/control.main b/debian/control.main +index ac91ac2f24..836623edc4 100644 +--- a/debian/control.main ++++ b/debian/control.main +@@ -72,7 +72,7 @@ Package: lustre-iokit + Section: utils + Architecture: i386 armhf powerpc ppc64el amd64 ia64 arm64 + Priority: optional +-Depends: lustre-client-utils (= ${binary:Version}), python2, perl, sg3-utils ++Depends: lustre-client-utils (= ${binary:Version}), python3, perl, sg3-utils + Description: Collection of benchmark tools for the Lustre filesystem + Lustre is a scalable, secure, robust, highly-available cluster file system. + This release is maintained by Whamcloud and available from +-- +2.33.0 + diff --git a/0035-LU-15660-statahead-statahead-thread-doesn-t-stop.patch b/0035-LU-15660-statahead-statahead-thread-doesn-t-stop.patch new file mode 100644 index 0000000000000000000000000000000000000000..c58014eff9fa71dc1fc19912a7eb318aa92a4052 --- /dev/null +++ b/0035-LU-15660-statahead-statahead-thread-doesn-t-stop.patch @@ -0,0 +1,96 @@ +From f7438d13b2aadbdf8e90e5b0b7732eeb20f3d475 Mon Sep 17 00:00:00 2001 +From: Yang Sheng +Date: Fri, 17 Jun 2022 20:30:34 +0800 +Subject: [PATCH 35/61] LU-15660 statahead: statahead thread doesn't stop + +Add a barrier to ensure sai_task changing can be seen +when access it without locking. Else the statahead +thread could sleep forever since wake_up was lost. 
+ +Lustre-change: https://review.whamcloud.com/47673 +Lustre-commit: b977caa2dc7dddcec9e20d393ee79dfa9fe31c0d + +Signed-off-by: Yang Sheng +Change-Id: I211e99f1bdddaaaf028a205658f603fda034d389 +Reviewed-by: Neil Brown +Reviewed-by: Andreas Dilger +Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/52300 +Tested-by: jenkins +Tested-by: Maloo +Reviewed-by: Oleg Drokin +--- + lustre/llite/statahead.c | 21 +++++++++++++-------- + 1 file changed, 13 insertions(+), 8 deletions(-) + +diff --git a/lustre/llite/statahead.c b/lustre/llite/statahead.c +index faf2860c8b..9df5137d9a 100644 +--- a/lustre/llite/statahead.c ++++ b/lustre/llite/statahead.c +@@ -1055,7 +1055,8 @@ static int ll_statahead_thread(void *arg) + if (!op_data) + GOTO(out, rc = -ENOMEM); + +- while (pos != MDS_DIR_END_OFF && sai->sai_task) { ++ /* matches smp_store_release() in ll_deauthorize_statahead() */ ++ while (pos != MDS_DIR_END_OFF && smp_load_acquire(&sai->sai_task)) { + struct lu_dirpage *dp; + struct lu_dirent *ent; + +@@ -1081,7 +1082,8 @@ static int ll_statahead_thread(void *arg) + + dp = page_address(page); + for (ent = lu_dirent_start(dp); +- ent != NULL && sai->sai_task && ++ /* matches smp_store_release() in ll_deauthorize_statahead() */ ++ ent != NULL && smp_load_acquire(&sai->sai_task) && + !sa_low_hit(sai); + ent = lu_dirent_next(ent)) { + __u64 hash; +@@ -1134,7 +1136,9 @@ static int ll_statahead_thread(void *arg) + fid_le_to_cpu(&fid, &ent->lde_fid); + + while (({set_current_state(TASK_IDLE); +- sai->sai_task; })) { ++ /* matches smp_store_release() in ++ * ll_deauthorize_statahead() */ ++ smp_load_acquire(&sai->sai_task); })) { + if (sa_has_callback(sai)) { + __set_current_state(TASK_RUNNING); + sa_handle_callback(sai); +@@ -1217,7 +1221,8 @@ static int ll_statahead_thread(void *arg) + * for file release to stop me. 
+ */ + while (({set_current_state(TASK_IDLE); +- sai->sai_task; })) { ++ /* matches smp_store_release() in ll_deauthorize_statahead() */ ++ smp_load_acquire(&sai->sai_task); })) { + if (sa_has_callback(sai)) { + __set_current_state(TASK_RUNNING); + sa_handle_callback(sai); +@@ -1304,7 +1309,8 @@ void ll_deauthorize_statahead(struct inode *dir, void *key) + */ + struct task_struct *task = sai->sai_task; + +- sai->sai_task = NULL; ++ /* matches smp_load_acquire() in ll_statahead_thread() */ ++ smp_store_release(&sai->sai_task, NULL); + wake_up_process(task); + } + spin_unlock(&lli->lli_sa_lock); +@@ -1686,11 +1692,10 @@ static int start_statahead_thread(struct inode *dir, struct dentry *dentry, + GOTO(out, rc); + } + +- if (test_bit(LL_SBI_AGL_ENABLED, ll_i2sbi(parent->d_inode)->ll_flags) && +- agl) ++ if (test_bit(LL_SBI_AGL_ENABLED, sbi->ll_flags) && agl) + ll_start_agl(parent, sai); + +- atomic_inc(&ll_i2sbi(parent->d_inode)->ll_sa_total); ++ atomic_inc(&sbi->ll_sa_total); + sai->sai_task = task; + + wake_up_process(task); +-- +2.33.0 + diff --git a/0036-LU-16042-tests-can-not-get-cache-size-on-Arm64.patch b/0036-LU-16042-tests-can-not-get-cache-size-on-Arm64.patch new file mode 100644 index 0000000000000000000000000000000000000000..230feeca894d87d47312951b5460100dc12b2a16 --- /dev/null +++ b/0036-LU-16042-tests-can-not-get-cache-size-on-Arm64.patch @@ -0,0 +1,51 @@ +From a9e47f3bf9255047890d9aa886954432fe058ef5 Mon Sep 17 00:00:00 2001 +From: Kevin Zhao +Date: Mon, 25 Jul 2022 15:53:44 +0800 +Subject: [PATCH 36/61] LU-16042 tests: can not get cache size on Arm64 + +This fixes the test failure on Arm64: the cache size is not +displayed in /proc/cpuinfo. Even in a VM and on some +older Arm64 CPUs, we cannot get the cache size. So it's +better to fall back to a pre-set value here if we don't get +the cache size.
+ +Lustre-change: https://review.whamcloud.com/48030 +Lustre-commit: f276f1cb0859e8718448e69bd99ee305f5e62d42 + +Test-Parameters: trivial +Test-Parameters: clientarch=aarch64 clientdistro=el8.7 \ + testlist=sanity env=ONLY=155 + +Signed-off-by: Kevin Zhao +Change-Id: I17ce1d8accc69d1489db2071a2741b3927fff302 +Reviewed-by: Andreas Dilger +Reviewed-by: James Simmons +Signed-off-by: Xinliang Liu +Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/51179 +Tested-by: Maloo +Tested-by: jenkins +Reviewed-by: Oleg Drokin +--- + lustre/tests/sanity.sh | 6 ++++++ + 1 file changed, 6 insertions(+) + +diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh +index 93f8c3b30c..7b4d43ef12 100755 +--- a/lustre/tests/sanity.sh ++++ b/lustre/tests/sanity.sh +@@ -15602,6 +15602,12 @@ test_155_big_load() { + free_min_max + local cache_size=$(do_facet ost$((MAXI+1)) \ + "awk '/cache/ {sum+=\\\$4} END {print sum}' /proc/cpuinfo") ++ ++ # LU-16042: can not get the cache size on Arm64 VM here, fallback to a ++ # pre-set value ++ if [ -z "$cache_size" ]; then ++ cache_size=256 ++ fi + local large_file_size=$((cache_size * 2)) + + echo "OSS cache size: $cache_size KB" +-- +2.33.0 + diff --git a/0037-LU-16662-autoconf-fix-configure-test-compile-for-CON.patch b/0037-LU-16662-autoconf-fix-configure-test-compile-for-CON.patch new file mode 100644 index 0000000000000000000000000000000000000000..0f3224cb3461401121ddebef9a2099cffdfed0bf --- /dev/null +++ b/0037-LU-16662-autoconf-fix-configure-test-compile-for-CON.patch @@ -0,0 +1,76 @@ +From da20815eeddf748ec815071fec8a786f1bc7c37b Mon Sep 17 00:00:00 2001 +From: Xinliang Liu +Date: Thu, 6 Apr 2023 23:24:04 -0700 +Subject: [PATCH 37/61] LU-16662 autoconf: fix configure test compile for + CONFIG_KEYS + +This fixes below configure error on Linux v5.19+: +$ ./configure --disable-server +... +checking whether to enable gss keyring backend... yes +checking if Linux kernel was built with CONFIG_KEYS in or as module... 
+no +configure: WARNING: GSS keyring backend requires that CONFIG_KEYS be +enabled in your kernel. +checking for keyctl_search in -lkeyutils... yes +configure: error: Cannot enable gss_keyring. See above for details. +$ grep CONFIG_KEYS -rn /boot/config-* +6884:CONFIG_KEYS=y + +For in-tree IB support and without passing Linux src path when run +./configure, the LINUX_OBJ maybe just a soft link to O2IBPATH, they +are pointing to the same dir. E.g.: +O2IBPATH='/usr/src/kernels/6.1.8-3.0.0.7.oe1.aarch64' +LINUX_OBJ='/lib/modules/6.1.8-3.0.0.7.oe1.aarch64/build' +$ ls -l /lib/modules/6.1.8-3.0.0.7.oe1.aarch64/build +lrwxrwxrwx 1 root root 42 Feb 7 00:00 +/lib/modules/6.1.8-3.0.0.7.oe1.aarch64/build -> +/usr/src/kernels/6.1.8-3.0.0.7.oe1.aarch64 +In this case, current configure will put kernel's Module.symvers to +variable KBUILD_EXTRA_SYMBOLS. This should be avoided after kernel +v5.19 which contains commit "b8422711080f modpost: make multiple export +error". This making multiple export symbol as an error from a warning +which can be seen in the config.log: +... +ERROR: modpost: vmlinux: 'init_uts_ns' exported twice. Previous export +was in vmlinux +... + +Lustre-change: https://review.whamcloud.com/50399 +Lustre-commit: 321a533b868908f37d01a4b787f5a463a02e427c + +Test-Parameters: trivial +Change-Id: I35295b3acc7fffb93716362f5d8c659eb922afcb +Signed-off-by: Xinliang Liu +Reviewed-by: Shaun Tancheff +Reviewed-by: James Simmons +Signed-off-by: Xinliang Liu +--- + lnet/autoconf/lustre-lnet.m4 | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/lnet/autoconf/lustre-lnet.m4 b/lnet/autoconf/lustre-lnet.m4 +index da6fd582b2..f1dce43dd3 100644 +--- a/lnet/autoconf/lustre-lnet.m4 ++++ b/lnet/autoconf/lustre-lnet.m4 +@@ -251,7 +251,7 @@ AS_IF([test $ENABLEO2IB = "no"], [ + O2IB_SYMVER=$LINUX_OBJ/Module.symvers + fi + if test -n "$O2IB_SYMVER"; then +- if test "$O2IB_SYMVER" != "$LINUX_OBJ/Module.symvers"; then ++ if test ! 
"$O2IB_SYMVER" -ef "$LINUX_OBJ/Module.symvers"; then + AC_MSG_NOTICE([adding $O2IB_SYMVER to Symbol Path O2IB]) + EXTRA_SYMBOLS="$EXTRA_SYMBOLS $O2IB_SYMVER" + AC_SUBST(EXTRA_SYMBOLS) +@@ -292,7 +292,7 @@ AC_SUBST(ENABLEO2IB) + + AS_IF([test $ENABLEO2IB != "no"], [ + EXTRA_CHECK_INCLUDE="$EXTRA_OFED_CONFIG $EXTRA_OFED_INCLUDE" +- if test $O2IBPATH != $LINUX_OBJ; then ++ if test ! $O2IBPATH -ef $LINUX_OBJ; then + KBUILD_EXTRA_SYMBOLS="$KBUILD_EXTRA_SYMBOLS $O2IBPATH/Module.symvers" + fi + +-- +2.33.0 + diff --git a/0038-LU-16322-build-Add-client-build-support-for-openEule.patch b/0038-LU-16322-build-Add-client-build-support-for-openEule.patch new file mode 100644 index 0000000000000000000000000000000000000000..31340c5079ce1f4810c550cb204bc89fd5fc6029 --- /dev/null +++ b/0038-LU-16322-build-Add-client-build-support-for-openEule.patch @@ -0,0 +1,308 @@ +From 1efdc0f82bf67c06ad50d09c24cab97a7f0e2e89 Mon Sep 17 00:00:00 2001 +From: Xinliang Liu +Date: Wed, 26 Oct 2022 08:58:14 +0000 +Subject: [PATCH 38/61] LU-16322: build: Add client build support for openEuler + +The kernel of current openEuler LTS version 22.03 is based on Linux +5.10.0 which is already supported in Lustre master. Thus we only need +to add build support for openEuler client. + +OpenEuler Linux although is not compatible with RHEL, but it uses the +same package manager DNF/tools as RHEL and references the package +naming of RHEL. Thus we can reuse most of the RHEL build logic/scripts +for openEuler client building. + +OpenEuler Linux is becoming the mainstream Linux distro in China. So +adding support for it makes sense for the users. For more details about +it see: https://www.openeuler.org/en/. 
+ +Lustre-change: https://review.whamcloud.com/49187 +Lustre-commit: d622b26d8d8a7f13b3a078d4dd58e795b77d232f + +Test-Parameters: trivial +Change-Id: I8e8b59d36e566c6e49b12346c2fde985153f014d +Signed-off-by: Xinliang Liu +Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/49187 +Reviewed-by: Oleg Drokin +Reviewed-by: Jian Yu +Reviewed-by: Andreas Dilger +Tested-by: jenkins +Tested-by: Maloo +(cherry picked from commit d622b26d8d8a7f13b3a078d4dd58e795b77d232f) +Signed-off-by: Xinliang Liu +--- + config/lustre-build-linux.m4 | 20 +++++++++- + contrib/lbuild/funcs.sh | 6 +++ + contrib/lbuild/lbuild | 3 ++ + contrib/lbuild/lbuild-oe2203 | 39 +++++++++++++++++++ + lustre.spec.in | 20 +++++++--- + lustre/ChangeLog | 1 + + lustre/autoconf/lustre-core.m4 | 1 + + .../targets/5.10-oe2203.target.in | 21 ++++++++++ + 8 files changed, 103 insertions(+), 8 deletions(-) + create mode 100644 contrib/lbuild/lbuild-oe2203 + create mode 100644 lustre/kernel_patches/targets/5.10-oe2203.target.in + +diff --git a/config/lustre-build-linux.m4 b/config/lustre-build-linux.m4 +index db1b2ea86a..63e212a3d1 100644 +--- a/config/lustre-build-linux.m4 ++++ b/config/lustre-build-linux.m4 +@@ -112,6 +112,7 @@ AC_DEFUN([LB_LINUX_RELEASE], [ + RHEL_KERNEL="no" + SUSE_KERNEL="no" + UBUNTU_KERNEL="no" ++ OPENEULER_KERNEL="no" + # And if any of the above kernels has been detected yet + KERNEL_FOUND="no" + +@@ -171,9 +172,23 @@ AC_DEFUN([LB_LINUX_RELEASE], [ + ]) + ]) + ++ # Check for openEuler ++ AS_IF([test "x$KERNEL_FOUND" = "xno"], [ ++ AC_CACHE_CHECK([for openEuler kernel signature], lb_cv_openeuler_kernel_sig, [ ++ lb_cv_openeuler_kernel_sig="no" ++ AS_IF([fgrep -q 'openEuler' $LINUX_OBJ/include/linux/kabi.h], [ ++ lb_cv_openeuler_kernel_sig="yes" ++ ]) ++ ]) ++ AS_IF([test "x$lb_cv_openeuler_kernel_sig" = "xyes"], [ ++ OPENEULER_KERNEL="yes" ++ KERNEL_FOUND="yes" ++ ]) ++ ]) ++ + # If still no kernel was found, a warning is issued + AS_IF([test "x$KERNEL_FOUND" = "xno"], [ +- 
AC_MSG_WARN([Kernel Distro seems to be neither RedHat, SuSE nor Ubuntu]) ++ AC_MSG_WARN([Kernel Distro seems to be neither RedHat, SuSE, openEuler nor Ubuntu]) + ]) + + AC_MSG_CHECKING([for Linux kernel module package directory]) +@@ -183,9 +198,10 @@ AC_DEFUN([LB_LINUX_RELEASE], [ + [KMP_MODDIR=$withval + IN_KERNEL=''],[ + AS_IF([test x$RHEL_KERNEL = xyes], [KMP_MODDIR="extra/kernel"], ++ [test x$OPENEULER_KERNEL = xyes], [KMP_MODDIR="extra/kernel"], + [test x$SUSE_KERNEL = xyes], [KMP_MODDIR="updates/kernel"], + [test x$UBUNTU_KERNEL = xyes], [KMP_MODDIR="updates/kernel"], +- [AC_MSG_WARN([Kernel Distro seems to be neither RedHat, SuSE nor Ubuntu])] ++ [AC_MSG_WARN([Kernel Distro seems to be neither RedHat, SuSE, openEuler nor Ubuntu])] + ) + IN_KERNEL="${PACKAGE}"]) + AC_MSG_RESULT($KMP_MODDIR) +diff --git a/contrib/lbuild/funcs.sh b/contrib/lbuild/funcs.sh +index a6c39473cf..78e2af9468 100644 +--- a/contrib/lbuild/funcs.sh ++++ b/contrib/lbuild/funcs.sh +@@ -154,6 +154,11 @@ autodetect_distro() { + "Fedora") + name="fc" + ;; ++ "openEuler") ++ name="oe" ++ # Change from YY.MM to YYMM, let DISTROMAJ contain MM part ++ version=${version/./} ++ ;; + *) + fatal 1 "I don't know what distro name $name and version $version is.\nEither update autodetect_distro() or use the --distro argument." + ;; +@@ -207,6 +212,7 @@ autodetect_target() { + sles15.3) target="$(uname -r | cut -d . -f 1,2)-sles15sp3";; + sles15.4) target="$(uname -r | cut -d . 
-f 1,2)-sles15sp4";; + fc18) target="3.x-fc18";; ++ oe2203) target="5.10-oe2203";; + *) fatal 1 "I don't know what distro $distro is.\nEither update autodetect_target() or use the --target argument.";; + esac + +diff --git a/contrib/lbuild/lbuild b/contrib/lbuild/lbuild +index c5af86132c..90d3668aff 100755 +--- a/contrib/lbuild/lbuild ++++ b/contrib/lbuild/lbuild +@@ -329,6 +329,9 @@ check_options() { + 3.0-sles11) + CANONICAL_TARGET="sles11" + ;; ++ 5.10-oe2203) ++ CANONICAL_TARGET="oe2203" ++ ;; + esac + + local timestampnodig=$(echo $TIMESTAMP | sed -e s/[0-9]*//g) +diff --git a/contrib/lbuild/lbuild-oe2203 b/contrib/lbuild/lbuild-oe2203 +new file mode 100644 +index 0000000000..2abdcb6c52 +--- /dev/null ++++ b/contrib/lbuild/lbuild-oe2203 +@@ -0,0 +1,39 @@ ++source ${LBUILD_DIR}/lbuild-rhel ++ ++# increment this if you have made a change that should force a new kernel ++# to build built ++BUILD_GEN+=".0" ++ ++SPEC_NAME="kernel.spec" ++DEVEL_PATH_ARCH_DELIMETER="." ++USE_KABI=false ++RPM_HELPERS_DIR="/usr/lib/rpm/openEuler" ++ ++# force local definition of %dist into ~/.rpmmacros ++# to avoid verbose extended strings like ".el9.centos" ++# in kernel version and rpm names ++# ++RMAC=$HOME/.rpmmacros ++grep '^%dist' $RMAC &> /dev/null || echo '%dist .oe2203' >> $RMAC ++ ++unpack_linux_devel_rpm-oe2203() { ++ local callers_rpm="$1" ++ ++ unpack_linux_devel_rpm-rhel "$callers_rpm" ++} ++ ++find_linux_rpm-oe2203() { ++ local prefix="$1" ++ local wanted_kernel="$2" ++ local pathtorpms=${3:-"$KERNELRPMSBASE/$lnxmaj/$DISTROMAJ/$TARGET_ARCH"} ++ ++ find_linux_rpm-rhel "$prefix" "$wanted_kernel" "$pathtorpms" ++} ++ ++apply_kmod_requires_conflicts() { ++ if $PATCHLESS; then ++ # don't allow the patched kernel to be considered as ++ # a valid kernel for the patchless client ++ echo "Conflicts: kernel-lustre" >> rpm/kmp-lustre.preamble ++ fi ++} +diff --git a/lustre.spec.in b/lustre.spec.in +index f4ac0c894d..a8c0934428 100644 +--- a/lustre.spec.in ++++ b/lustre.spec.in +@@ 
-136,7 +136,7 @@ + # requires want to set a version including epoch + %global krequires %(echo %{kver} | sed -e 's/\.x86_64$//' -e 's/\.i[3456]86$//' -e 's/-smp$//' -e 's/-bigsmp$//' -e 's/[-.]ppc64$//' -e 's/\.aarch64$//' -e 's/-default$//' -e 's/-%{_flavor}//') + +-%if "%{_vendor}" == "redhat" || "%{_vendor}" == "fedora" ++%if "%{_vendor}" == "redhat" || "%{_vendor}" == "fedora" || "%{_vendor}" == "openEuler" + %global requires_kmod_name kmod-%{lustre_name} + %global requires_kmod_osd_zfs_name kmod-%{lustre_name}-osd-zfs + %if %{with lustre_tests} +@@ -175,6 +175,11 @@ + %define with_systemd 1 + %endif + ++# openEuler comes with systemd ++%if "%{_vendor}" == "openEuler" ++%define with_systemd 1 ++%endif ++ + %{!?mpi_name: %global mpi_name openmpi} + + Summary: Lustre File System +@@ -207,6 +212,9 @@ BuildRequires: libtool libyaml-devel zlib-devel libnl3-devel flex bison + BuildRequires: redhat-rpm-config + BuildRequires: pkgconfig + %else ++%if "%{_vendor}" == "openEuler" ++BuildRequires: openEuler-rpm-config ++%endif + BuildRequires: pkg-config + %endif + %if %{with gss} +@@ -220,20 +228,20 @@ Provides: lustre-server = %{version}-%{release} + %endif + Obsoletes: lustre-client < %{version} + Provides: lustre-client = %{version}-%{release} +-%if "%{_vendor}" == "redhat" || "%{_vendor}" == "fedora" ++%if "%{_vendor}" == "redhat" || "%{_vendor}" == "fedora" || "%{_vendor}" == "openEuler" + #suse don't support selinux + BuildRequires: libselinux-devel + %endif + %if %{with lustre_modules} + %if %{with mofed} + BuildRequires: mlnx-ofa_kernel-devel +-%if "%{_vendor}" == "redhat" ++%if "%{_vendor}" == "redhat" || "%{_vendor}" == "openEuler" + Requires: kmod-mlnx-ofa_kernel + %else + Requires: mlnx-ofa_kernel-kmp + %endif + %endif +-%if 0%{?rhel} >= 8 ++%if 0%{?rhel} >= 8 || "%{_vendor}" == "openEuler" + BuildRequires: kernel-rpm-macros + %endif + BuildRequires: %kernel_module_package_buildreqs +@@ -374,7 +382,7 @@ Requires: attr, rsync, perl, lsof, /usr/bin/getconf + 
BuildRequires: mpich-devel + %endif + %if "%{mpi_name}" == "openmpi" +-%if "%{_vendor}" == "redhat" || 0%{?suse_version} < 1500 ++%if "%{_vendor}" == "redhat" || "%{_vendor}" == "openEuler" || 0%{?suse_version} < 1500 + BuildRequires: openmpi-devel + %else + BuildRequires: openmpi2-devel +@@ -626,7 +634,7 @@ echo '%{_sysconfdir}/ha.d/resource.d/Lustre' >>lustre.files + echo '%{_unitdir}/lnet.service' >>lustre.files + %endif + +-%if "%{_vendor}" == "redhat" ++%if "%{_vendor}" == "redhat" || "%{_vendor}" == "openEuler" + # The following scripts are Red Hat specific + %if %{with servers} + echo '%{_sysconfdir}/init.d/lustre' >>lustre.files +diff --git a/lustre/ChangeLog b/lustre/ChangeLog +index 4b21088869..192662875e 100644 +--- a/lustre/ChangeLog ++++ b/lustre/ChangeLog +@@ -206,6 +206,7 @@ TBD Whamcloud + 5.8.0-53 (Ubuntu 20.04.2 HWE) + 5.11.0-31 (Ubuntu 20.04.3 HWE) + 5.11.0 (vanilla kernel.org) ++ 5.10.0-60.56.0.84.oe2203 (openEuler 22.03 LTS) + * Recommended e2fsprogs version: 1.46.5.wc1 or newer + * Recommended ZFS version: 2.1.5 + * NFS export disabled when stack size < 8192 (32-bit Lustre clients), +diff --git a/lustre/autoconf/lustre-core.m4 b/lustre/autoconf/lustre-core.m4 +index 2927df790c..12afe95fbc 100644 +--- a/lustre/autoconf/lustre-core.m4 ++++ b/lustre/autoconf/lustre-core.m4 +@@ -3496,6 +3496,7 @@ lustre/kernel_patches/targets/5.3-sles15sp2.target + lustre/kernel_patches/targets/5.3-sles15sp3.target + lustre/kernel_patches/targets/5.14-sles15sp4.target + lustre/kernel_patches/targets/3.x-fc18.target ++lustre/kernel_patches/targets/5.10-oe2203.target + lustre/ldlm/Makefile + lustre/fid/Makefile + lustre/fid/autoMakefile +diff --git a/lustre/kernel_patches/targets/5.10-oe2203.target.in b/lustre/kernel_patches/targets/5.10-oe2203.target.in +new file mode 100644 +index 0000000000..5ba89d235f +--- /dev/null ++++ b/lustre/kernel_patches/targets/5.10-oe2203.target.in +@@ -0,0 +1,21 @@ ++lnxmaj="5.10.0" ++lnxrel="60.56.0.84.oe2203" ++ 
++KERNEL_SRPM=kernel-${lnxmaj}-${lnxrel}.src.rpm ++SERIES="" ++EXTRA_VERSION=${lnxrel}_lustre.@VERSION@ ++LUSTRE_VERSION=@VERSION@ ++ ++DEVEL_PATH_ARCH_DELIMETER="." ++OFED_VERSION=inkernel ++ ++#SMP_ARCHS="i686 x86_64 ia64 ppc64" ++# openEuler doesn't use smp specific kernels ++SMP_ARCHS="" ++ ++for cc in gcc ; do ++ if which $cc >/dev/null 2>/dev/null ; then ++ export CC=$cc ++ break ++ fi ++done +-- +2.33.0 + diff --git a/0039-LU-16481-build-add-server-support-for-openEuler.patch b/0039-LU-16481-build-add-server-support-for-openEuler.patch new file mode 100644 index 0000000000000000000000000000000000000000..b1da3fde7f3ee16d3f5f37f7deeb72023e9da73d --- /dev/null +++ b/0039-LU-16481-build-add-server-support-for-openEuler.patch @@ -0,0 +1,2063 @@ +From f989b37b3bd1e4f7a465e959eaf7559438bab248 Mon Sep 17 00:00:00 2001 +From: Xinliang Liu +Date: Mon, 21 Nov 2022 03:36:38 +0000 +Subject: [PATCH 39/61] LU-16481 build: add server support for openEuler + +openEuler uses dnf as its rpm pkg manager, so it is somewhat like RHEL/Fedora. +The current openEuler LTS 22.03 kernel is based on Linux 5.10.0. + +The ldiskfs patches are based on ldiskfs-5.10.0-ml.series; the patches that +differ from ldiskfs-5.10.0-ml.series are: +oe2203/ext4-misc.patch +oe2203/ext4-pdirop.patch + used because the openEuler kernel backports new bugfixes, and + based on ldiskfs-5.14.21-sles15sp4.series +linux-5.16/ext4-inode-version.patch +ubuntu20.04.3/ext4-simple-blockalloc.patch +linux-5.14/ext4-xattr-disable-credits-check.patch + used because the openEuler kernel backports new bugfixes. + +This patch also fixes lbuild so that no kernel config file is needed for a +patchless-server build, and adds a check that a patched-server build has a +patch series.
+ +Test notes +---------- +This patch is tested with below lbuild cmd: +../lustre-release/contrib/lbuild/lbuild --ccache + --kerneldir=/home/openeuler/kernel-src-rpm/ + --kernelrpm=/home/openeuler/kernel-src-rpm/ + --lustre=/home/openeuler/lustre-release/lustre-2.15.54_1_xxx.tar.gz + --patchless-server --disable-zfs +Note that, due to zfs openEuler build support patches[1] haven't been +backported to the stable release branch zfs-2.1-release and tag 2.1.5, +current lbuild doesn't support zfs rpms build for openEuler you need +to build zfs rpms in the zfs source code individually with cmd 'make +rpms'. +And until the openEuler gcc issue[2] is fixed, or you need to apply +Lustre rpm spec patch[3]. +Until the openEuler kernel symbols providing issue[4] is fixed, or you +need to install kmod rpms with cmd 'sudo rpm -ivh --nodeps +./*.aarch64.rpm ' +[1] https://github.com/openzfs/zfs/pulls?q=is%3Apr+is%3Aclosed+openeuler +[2] https://gitee.com/openeuler/gcc/issues/I5XMD0 +[3] diff lustre.spec.in +... 
+-%define optflags -g -O2 -Werror ++%define optflags -g -O2 -Werror -Wno-stringop-overflow +[4] https://gitee.com/src-openeuler/kernel/issues/I6DQDX + +Test-Parameters: trivial +Change-Id: Ie00e7d37ba3965e409b924109085a675bf3f7f4f +Signed-off-by: Xinliang Liu +Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/49652 +Tested-by: jenkins +Tested-by: Maloo +Reviewed-by: Jian Yu +Reviewed-by: Shaun Tancheff +Reviewed-by: Oleg Drokin +Signed-off-by: Xinliang Liu +--- + config/lustre-build-ldiskfs.m4 | 6 +- + config/lustre-build-linux.m4 | 14 +- + config/lustre-build.m4 | 2 +- + contrib/lbuild/lbuild | 11 +- + contrib/lbuild/lbuild-oe2203 | 12 + + contrib/lbuild/lbuild-rhel | 2 + + .../patches/oe2203/ext4-delayed-iput.patch | 187 ++++ + .../patches/oe2203/ext4-inode-version.patch | 50 + + .../patches/oe2203/ext4-misc.patch | 202 ++++ + .../patches/oe2203/ext4-pdirop.patch | 926 ++++++++++++++++++ + .../oe2203/ext4-simple-blockalloc.patch | 340 +++++++ + .../ext4-xattr-disable-credits-check.patch | 24 + + .../series/ldiskfs-5.10.0-oe2203.series | 33 + + lustre/ChangeLog | 3 +- + .../targets/5.10-oe2203.target.in | 2 +- + 15 files changed, 1802 insertions(+), 12 deletions(-) + create mode 100644 ldiskfs/kernel_patches/patches/oe2203/ext4-delayed-iput.patch + create mode 100644 ldiskfs/kernel_patches/patches/oe2203/ext4-inode-version.patch + create mode 100644 ldiskfs/kernel_patches/patches/oe2203/ext4-misc.patch + create mode 100644 ldiskfs/kernel_patches/patches/oe2203/ext4-pdirop.patch + create mode 100644 ldiskfs/kernel_patches/patches/oe2203/ext4-simple-blockalloc.patch + create mode 100644 ldiskfs/kernel_patches/patches/oe2203/ext4-xattr-disable-credits-check.patch + create mode 100644 ldiskfs/kernel_patches/series/ldiskfs-5.10.0-oe2203.series + +diff --git a/config/lustre-build-ldiskfs.m4 b/config/lustre-build-ldiskfs.m4 +index 14ad1e2215..8bc334b951 100644 +--- a/config/lustre-build-ldiskfs.m4 ++++ b/config/lustre-build-ldiskfs.m4 +@@ -123,9 +123,13 @@ 
AS_IF([test x$RHEL_KERNEL = xyes], [ + [LDISKFS_SERIES="5.8.0-53-ubuntu20.series"], + [LDISKFS_SERIES="5.8.0-63-ubuntu20.series"], + [LDISKFS_SERIES="5.8.0-ml.series"]) ++], [test x$OPENEULER_KERNEL = xyes], [ ++ case $OPENEULER_VERSION_NO in ++ 2203.0) LDISKFS_SERIES="5.10.0-oe2203.series" ;; ++ esac + ]) + ]) +-# Not RHEL/SLES or Ubuntu .. probably mainline ++# Not RHEL/SLES/openEuler or Ubuntu .. probably mainline + AS_IF([test -z "$LDISKFS_SERIES"], + [ + AS_VERSION_COMPARE([$LINUXRELEASE],[5.4.0],[], +diff --git a/config/lustre-build-linux.m4 b/config/lustre-build-linux.m4 +index 63e212a3d1..1ff6c47856 100644 +--- a/config/lustre-build-linux.m4 ++++ b/config/lustre-build-linux.m4 +@@ -174,15 +174,19 @@ AC_DEFUN([LB_LINUX_RELEASE], [ + + # Check for openEuler + AS_IF([test "x$KERNEL_FOUND" = "xno"], [ +- AC_CACHE_CHECK([for openEuler kernel signature], lb_cv_openeuler_kernel_sig, [ +- lb_cv_openeuler_kernel_sig="no" +- AS_IF([fgrep -q 'openEuler' $LINUX_OBJ/include/linux/kabi.h], [ +- lb_cv_openeuler_kernel_sig="yes" ++ AC_CACHE_CHECK([for openEuler kernel version number], lb_cv_openeuler_kernel_version, [ ++ lb_cv_openeuler_kernel_version="" ++ AS_IF([fgrep -q OPENEULER_VERSION $LINUX_OBJ/include/$VERSION_HDIR/version.h], [ ++ lb_cv_openeuler_kernel_version=$(awk '/ OPENEULER_MAJOR / { print [$]3 }' \ ++ $LINUX_OBJ/include/$VERSION_HDIR/version.h).$(awk \ ++ '/ OPENEULER_MINOR / { print [$]3 }' \ ++ $LINUX_OBJ/include/$VERSION_HDIR/version.h) + ]) + ]) +- AS_IF([test "x$lb_cv_openeuler_kernel_sig" = "xyes"], [ ++ AS_IF([test -n "$lb_cv_openeuler_kernel_version"], [ + OPENEULER_KERNEL="yes" + KERNEL_FOUND="yes" ++ OPENEULER_VERSION_NO=$lb_cv_openeuler_kernel_version + ]) + ]) + +diff --git a/config/lustre-build.m4 b/config/lustre-build.m4 +index 28eb3ecfe8..0e775d9d41 100644 +--- a/config/lustre-build.m4 ++++ b/config/lustre-build.m4 +@@ -448,7 +448,7 @@ AM_CONDITIONAL([DOC], [test x$ENABLE_DOC = x1]) + AM_CONDITIONAL([MANPAGES], [test x$enable_manpages = 
xyes]) + AM_CONDITIONAL([LINUX], [test x$lb_target_os = xlinux]) + AM_CONDITIONAL([USE_QUILT], [test x$use_quilt = xyes]) +-AM_CONDITIONAL([RHEL], [test -f /etc/redhat-release]) ++AM_CONDITIONAL([RHEL], [test -f /etc/redhat-release -o -f /etc/openEuler-release]) + AM_CONDITIONAL([SUSE], [test -f /etc/SUSE-brand -o -f /etc/SuSE-release]) + AM_CONDITIONAL([UBUNTU], [test x$UBUNTU_KERNEL = xyes]) + AM_CONDITIONAL([BUILD_LUTF], [test x$enable_lutf = xyes]) +diff --git a/contrib/lbuild/lbuild b/contrib/lbuild/lbuild +index 90d3668aff..0ad235afeb 100755 +--- a/contrib/lbuild/lbuild ++++ b/contrib/lbuild/lbuild +@@ -672,6 +672,12 @@ load_target() { + + . "$TARGET_FILE" + ++ ++ # patched kernel build needs a series patches ++ if ! $PATCHLESS && ! $PATCHLESS_SERVER && [ ! "$SERIES" ]; then ++ fatal 1 "Distro $DISTRO doesn't support patched kernel server build!! Please use option --patchless-server" ++ fi ++ + if [ -n "$env_OFED_VERSION" ]; then + OFED_VERSION="$env_OFED_VERSION" + fi +@@ -729,7 +735,7 @@ load_target() { + # a new variable. + PRISTINE_EXTRA_VERSION=$EXTRA_VERSION + +- if ! $PATCHLESS && [ ! -f "$CONFIG_FILE" ]; then ++ if ! $PATCHLESS && ! $PATCHLESS_SERVER && [ ! -f "$CONFIG_FILE" ]; then + fatal 1 "Config file for target $TARGET missing from $TOPDIR/lustre/lustre/kernel_patches/kernel_configs/." + fi + +@@ -1746,8 +1752,7 @@ build_with_srpm() { + fatal 1 "Could not find the Linux tree in $kernelrpm" + fi + # download and unpack kernel-debuginfo-common (only in EL) +- if [[ $DISTROMAJ =~ rhel ]]; then +- local KERNEL_DEBUGINFO="kernel-debuginfo-common-${TARGET_ARCH}-${lnxmaj}-${lnxrel}.${TARGET_ARCH}.rpm" ++ if [[ -n "$KERNEL_DEBUGINFO" ]]; then + download_debuginfo_common "$KERNEL_DEBUGINFO" + if ! 
lnxrel="$lnxrel" unpack_linux_devel_rpm \ + "$KERNELRPMSBASE/$lnxmaj/$DISTROMAJ/$TARGET_ARCH/$KERNEL_DEBUGINFO"; then +diff --git a/contrib/lbuild/lbuild-oe2203 b/contrib/lbuild/lbuild-oe2203 +index 2abdcb6c52..d49a1e67f8 100644 +--- a/contrib/lbuild/lbuild-oe2203 ++++ b/contrib/lbuild/lbuild-oe2203 +@@ -8,6 +8,8 @@ SPEC_NAME="kernel.spec" + DEVEL_PATH_ARCH_DELIMETER="." + USE_KABI=false + RPM_HELPERS_DIR="/usr/lib/rpm/openEuler" ++# Pkg which contains ext4 source code ++KERNEL_DEBUGINFO="kernel-debugsource-${lnxmaj}-${lnxrel}.${TARGET_ARCH}.rpm" + + # force local definition of %dist into ~/.rpmmacros + # to avoid verbose extended strings like ".el9.centos" +@@ -30,6 +32,16 @@ find_linux_rpm-oe2203() { + find_linux_rpm-rhel "$prefix" "$wanted_kernel" "$pathtorpms" + } + ++kernel_debuginfo_location() { ++ local base_os="https://repo.openeuler.org/openEuler-22.03-LTS" ++ ++ echo "$base_os/update/$TARGET_ARCH/Packages/" ++} ++ ++cleanup_rpmmacros() { ++ sed -i "/^%kernel_module_package/,/^)}$/d" $RMAC ++} ++ + apply_kmod_requires_conflicts() { + if $PATCHLESS; then + # don't allow the patched kernel to be considered as +diff --git a/contrib/lbuild/lbuild-rhel b/contrib/lbuild/lbuild-rhel +index b5058c4b33..a0522070b2 100644 +--- a/contrib/lbuild/lbuild-rhel ++++ b/contrib/lbuild/lbuild-rhel +@@ -11,6 +11,8 @@ BUILD_GEN+=".2" # LU-9850 + + DEVEL_KERNEL_TYPE="devel" + RPM_HELPERS_DIR="/usr/lib/rpm/redhat" ++# Pkg which contains ext4 source code ++KERNEL_DEBUGINFO="kernel-debuginfo-common-${TARGET_ARCH}-${lnxmaj}-${lnxrel}.${TARGET_ARCH}.rpm" + + # a method which can be overriden by the release specific code + get_rpmbuildopts() { +diff --git a/ldiskfs/kernel_patches/patches/oe2203/ext4-delayed-iput.patch b/ldiskfs/kernel_patches/patches/oe2203/ext4-delayed-iput.patch +new file mode 100644 +index 0000000000..8dd44bd4af +--- /dev/null ++++ b/ldiskfs/kernel_patches/patches/oe2203/ext4-delayed-iput.patch +@@ -0,0 +1,187 @@ ++From 5230b17f70f7d30161db506d4f631131befb319d Mon 
Sep 17 00:00:00 2001 ++From: Xinliang Liu ++Date: Wed, 10 May 2023 09:46:14 +0000 ++Subject: [PATCH] ext4 delayed iput ++ ++When changing a large xattr value to a different large xattr value, ++the old xattr inode is freed. Truncate during the final iput causes ++current transaction restart. Eventually, parent inode bh is marked ++dirty and kernel panic happens when jbd2 figures out that this bh ++belongs to the committed transaction. ++ ++A possible fix is to call this final iput in a separate thread. ++This way, setxattr transactions will never be split into two. ++Since the setxattr code adds xattr inodes with nlink=0 into the ++orphan list, old xattr inodes will be properly cleaned up in ++any case. ++ ++Signed-off-by: Andrew Perepechko ++HPE-bug-id: LUS-10534 ++ ++Changes since v1: ++- fixed a bug added during the porting ++- fixed a workqueue related deadlock reported by Tetsuo Handa ++ ++Signed-off-by: Xinliang Liu ++--- ++ fs/ext4/ext4.h | 7 +++++-- ++ fs/ext4/page-io.c | 2 +- ++ fs/ext4/super.c | 15 ++++++++------- ++ fs/ext4/xattr.c | 39 +++++++++++++++++++++++++++++++++++++-- ++ 4 files changed, 51 insertions(+), 12 deletions(-) ++ ++diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h ++index 213a2b6..bbf50b6 100644 ++--- a/fs/ext4/ext4.h +++++ b/fs/ext4/ext4.h ++@@ -1599,8 +1599,11 @@ struct ext4_sb_info { ++ struct flex_groups * __rcu *s_flex_groups; ++ ext4_group_t s_flex_groups_allocated; ++ ++- /* workqueue for reserved extent conversions (buffered io) */ ++- struct workqueue_struct *rsv_conversion_wq; +++ /* +++ * workqueue for reserved extent conversions (buffered io) +++ * and large ea inodes reclaim +++ */ +++ struct workqueue_struct *s_misc_wq; ++ ++ /* timer for periodic error stats printing */ ++ struct timer_list s_err_report; ++diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c ++index b076fab..c027af5 100644 ++--- a/fs/ext4/page-io.c +++++ b/fs/ext4/page-io.c ++@@ -233,7 +233,7 @@ static void ext4_add_complete_io(ext4_io_end_t *io_end) ++ 
WARN_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN)); ++ WARN_ON(!io_end->handle && sbi->s_journal); ++ spin_lock_irqsave(&ei->i_completed_io_lock, flags); ++- wq = sbi->rsv_conversion_wq; +++ wq = sbi->s_misc_wq; ++ if (list_empty(&ei->i_rsv_conversion_list)) ++ queue_work(wq, &ei->i_rsv_conversion_work); ++ list_add_tail(&io_end->list, &ei->i_rsv_conversion_list); ++diff --git a/fs/ext4/super.c b/fs/ext4/super.c ++index 8829e9d..b0fd2bc 100644 ++--- a/fs/ext4/super.c +++++ b/fs/ext4/super.c ++@@ -1223,10 +1223,11 @@ static void ext4_put_super(struct super_block *sb) ++ ext4_unregister_sysfs(sb); ++ ++ ext4_unregister_li_request(sb); +++ flush_workqueue(sbi->s_misc_wq); ++ ext4_quota_off_umount(sb); ++ ++ flush_work(&sbi->s_error_work); ++- destroy_workqueue(sbi->rsv_conversion_wq); +++ destroy_workqueue(sbi->s_misc_wq); ++ ++ if (sbi->s_journal) { ++ aborted = is_journal_aborted(sbi->s_journal); ++@@ -5001,9 +5002,9 @@ no_journal: ++ * The maximum number of concurrent works can be high and ++ * concurrency isn't really necessary. Limit it to 1. 
++ */ ++- EXT4_SB(sb)->rsv_conversion_wq = ++- alloc_workqueue("ext4-rsv-conversion", WQ_MEM_RECLAIM | WQ_UNBOUND, 1); ++- if (!EXT4_SB(sb)->rsv_conversion_wq) { +++ EXT4_SB(sb)->s_misc_wq = +++ alloc_workqueue("ext4-misc", WQ_MEM_RECLAIM | WQ_UNBOUND, 1); +++ if (!EXT4_SB(sb)->s_misc_wq) { ++ printk(KERN_ERR "EXT4-fs: failed to create workqueue\n"); ++ ret = -ENOMEM; ++ goto failed_mount4; ++@@ -5225,8 +5226,8 @@ failed_mount4a: ++ sb->s_root = NULL; ++ failed_mount4: ++ ext4_msg(sb, KERN_ERR, "mount failed"); ++- if (EXT4_SB(sb)->rsv_conversion_wq) ++- destroy_workqueue(EXT4_SB(sb)->rsv_conversion_wq); +++ if (EXT4_SB(sb)->s_misc_wq) +++ destroy_workqueue(EXT4_SB(sb)->s_misc_wq); ++ failed_mount_wq: ++ ext4_xattr_destroy_cache(sbi->s_ea_inode_cache); ++ sbi->s_ea_inode_cache = NULL; ++@@ -5800,7 +5801,7 @@ static int ext4_sync_fs(struct super_block *sb, int wait) ++ return 0; ++ ++ trace_ext4_sync_fs(sb, wait); ++- flush_workqueue(sbi->rsv_conversion_wq); +++ flush_workqueue(sbi->s_misc_wq); ++ /* ++ * Writeback quota in non-journalled quota case - journalled quota has ++ * no dirty dquots ++diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c ++index 1cb753a..dd72157 100644 ++--- a/fs/ext4/xattr.c +++++ b/fs/ext4/xattr.c ++@@ -1597,6 +1597,36 @@ static int ext4_xattr_inode_lookup_create(handle_t *handle, struct inode *inode, ++ return 0; ++ } ++ +++struct delayed_iput_work { +++ struct work_struct work; +++ struct inode *inode; +++}; +++ +++static void delayed_iput_fn(struct work_struct *work) +++{ +++ struct delayed_iput_work *diwork; +++ +++ diwork = container_of(work, struct delayed_iput_work, work); +++ iput(diwork->inode); +++ kfree(diwork); +++} +++ +++static void delayed_iput(struct inode *inode, struct delayed_iput_work *work) +++{ +++ if (!inode) { +++ kfree(work); +++ return; +++ } +++ +++ if (!work) { +++ iput(inode); +++ } else { +++ INIT_WORK(&work->work, delayed_iput_fn); +++ work->inode = inode; +++ queue_work(EXT4_SB(inode->i_sb)->s_misc_wq, 
&work->work); +++ } +++} +++ ++ /* ++ * Reserve min(block_size/8, 1024) bytes for xattr entries/names if ea_inode ++ * feature is enabled. ++@@ -1614,6 +1644,7 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i, ++ int in_inode = i->in_inode; ++ struct inode *old_ea_inode = NULL; ++ struct inode *new_ea_inode = NULL; +++ struct delayed_iput_work *diwork = NULL; ++ size_t old_size, new_size; ++ int ret; ++ ++@@ -1690,7 +1721,11 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i, ++ * Finish that work before doing any modifications to the xattr data. ++ */ ++ if (!s->not_found && here->e_value_inum) { ++- ret = ext4_xattr_inode_iget(inode, +++ diwork = kmalloc(sizeof(*diwork), GFP_NOFS); +++ if (!diwork) +++ ret = -ENOMEM; +++ else +++ ret = ext4_xattr_inode_iget(inode, ++ le32_to_cpu(here->e_value_inum), ++ le32_to_cpu(here->e_hash), ++ &old_ea_inode); ++@@ -1843,7 +1878,7 @@ update_hash: ++ ++ ret = 0; ++ out: ++- iput(old_ea_inode); +++ delayed_iput(old_ea_inode, diwork); ++ iput(new_ea_inode); ++ return ret; ++ } ++-- ++2.33.0 ++ +diff --git a/ldiskfs/kernel_patches/patches/oe2203/ext4-inode-version.patch b/ldiskfs/kernel_patches/patches/oe2203/ext4-inode-version.patch +new file mode 100644 +index 0000000000..e487696f68 +--- /dev/null ++++ b/ldiskfs/kernel_patches/patches/oe2203/ext4-inode-version.patch +@@ -0,0 +1,50 @@ ++diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h ++index d248a01..a154919 100644 ++--- a/fs/ext4/ext4.h +++++ b/fs/ext4/ext4.h ++@@ -1165,6 +1165,8 @@ struct ext4_inode_info { ++ struct dquot *i_dquot[MAXQUOTAS]; ++ #endif ++ +++ __u64 i_fs_version; +++ ++ /* Precomputed uuid+inum+igen checksum for seeding inode checksums */ ++ __u32 i_csum_seed; ++ ++diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c ++index f73e5eb..5d0a11d 100644 ++--- a/fs/ext4/ialloc.c +++++ b/fs/ext4/ialloc.c ++@@ -1264,6 +1264,7 @@ got: ++ ei->i_dtime = 0; ++ ei->i_block_group = group; ++ ei->i_last_alloc_group = ~0; +++ ei->i_fs_version = 0; ++ ++ 
ext4_set_inode_flags(inode, true); ++ if (IS_DIRSYNC(inode)) ++diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c ++index 3bdfe01..b22628a 100644 ++--- a/fs/ext4/inode.c +++++ b/fs/ext4/inode.c ++@@ -4220,7 +4220,7 @@ static inline u64 ext4_inode_peek_iversion(const struct inode *inode) ++ if (unlikely(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) ++ return inode_peek_iversion_raw(inode); ++ else ++- return inode_peek_iversion(inode); +++ return EXT4_I(inode)->i_fs_version; ++ } ++ ++ static int ext4_inode_blocks_set(struct ext4_inode *raw_inode, ++@@ -4658,7 +4658,7 @@ static inline void ext4_inode_set_iversion_queried(struct inode *inode, u64 val) ++ if (unlikely(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) ++ inode_set_iversion_raw(inode, val); ++ else ++- inode_set_iversion_queried(inode, val); +++ EXT4_I(inode)->i_fs_version = val; ++ } ++ ++ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, ++-- ++2.27.0 ++ +diff --git a/ldiskfs/kernel_patches/patches/oe2203/ext4-misc.patch b/ldiskfs/kernel_patches/patches/oe2203/ext4-misc.patch +new file mode 100644 +index 0000000000..0e1528ff18 +--- /dev/null ++++ b/ldiskfs/kernel_patches/patches/oe2203/ext4-misc.patch +@@ -0,0 +1,202 @@ ++diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h ++index 57169367362a..f3e369fc4dae 100644 ++--- a/fs/ext4/ext4.h +++++ b/fs/ext4/ext4.h ++@@ -1870,6 +1870,8 @@ static inline bool ext4_verity_in_progress(struct inode *inode) ++ ++ #define NEXT_ORPHAN(inode) EXT4_I(inode)->i_dtime ++ +++#define JOURNAL_START_HAS_3ARGS 1 +++ ++ /* ++ * Codes for operating systems ++ */ ++@@ -2110,7 +2112,21 @@ static inline bool ext4_has_unknown_ext##ver##_incompat_features(struct super_bl ++ ++ EXTN_FEATURE_FUNCS(2) ++ EXTN_FEATURE_FUNCS(3) ++-EXTN_FEATURE_FUNCS(4) +++static inline bool ext4_has_unknown_ext4_compat_features(struct super_block *sb) +++{ +++ return ((EXT4_SB(sb)->s_es->s_feature_compat & +++ cpu_to_le32(~EXT4_FEATURE_COMPAT_SUPP)) != 0); +++} +++static inline bool 
ext4_has_unknown_ext4_ro_compat_features(struct super_block *sb) +++{ +++ return ((EXT4_SB(sb)->s_es->s_feature_ro_compat & +++ cpu_to_le32(~EXT4_FEATURE_RO_COMPAT_SUPP)) != 0); +++} +++static inline bool ext4_has_unknown_ext4_incompat_features(struct super_block *sb) +++{ +++ return ((EXT4_SB(sb)->s_es->s_feature_incompat & +++ cpu_to_le32(~EXT4_FEATURE_INCOMPAT_SUPP)) != 0); +++} ++ ++ static inline bool ext4_has_compat_features(struct super_block *sb) ++ { ++@@ -3570,6 +3586,13 @@ struct ext4_extent; ++ #define EXT_MAX_BLOCKS 0xffffffff ++ ++ extern void ext4_ext_tree_init(handle_t *handle, struct inode *inode); +++extern struct buffer_head *ext4_read_inode_bitmap(struct super_block *sb, +++ ext4_group_t block_group); +++extern void ext4_inc_count(struct inode *inode); +++extern void ext4_dec_count(struct inode *inode); +++extern struct buffer_head *ext4_append(handle_t *handle, +++ struct inode *inode, +++ ext4_lblk_t *block); ++ extern int ext4_ext_index_trans_blocks(struct inode *inode, int extents); ++ extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, ++ struct ext4_map_blocks *map, int flags); ++diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c ++index 875af329c43e..646c00c81479 100644 ++--- a/fs/ext4/ialloc.c +++++ b/fs/ext4/ialloc.c ++@@ -120,7 +120,7 @@ static int ext4_validate_inode_bitmap(struct super_block *sb, ++ * ++ * Return buffer_head of bitmap on success, or an ERR_PTR on error. ++ */ ++-static struct buffer_head * +++struct buffer_head * ++ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) ++ { ++ struct ext4_group_desc *desc; ++@@ -215,6 +215,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) ++ put_bh(bh); ++ return ERR_PTR(err); ++ } +++EXPORT_SYMBOL(ext4_read_inode_bitmap); ++ ++ /* ++ * NOTE! 
When we get the inode, we're the only people ++diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c ++index e868b33ed8f5..70f3bb4ef5eb 100644 ++--- a/fs/ext4/inode.c +++++ b/fs/ext4/inode.c ++@@ -6215,3 +6215,19 @@ vm_fault_t ext4_filemap_fault(struct vm_fault *vmf) ++ ++ return ret; ++ } +++EXPORT_SYMBOL(ext4_map_blocks); +++EXPORT_SYMBOL(ext4_truncate); +++EXPORT_SYMBOL(ext4_iget); +++EXPORT_SYMBOL(ext4_bread); +++EXPORT_SYMBOL(ext4_itable_unused_count); +++EXPORT_SYMBOL(ext4_force_commit); +++EXPORT_SYMBOL(__ext4_mark_inode_dirty); +++EXPORT_SYMBOL(ext4_get_group_desc); +++EXPORT_SYMBOL(__ext4_journal_get_write_access); +++EXPORT_SYMBOL(__ext4_journal_start_sb); +++EXPORT_SYMBOL(__ext4_journal_stop); +++EXPORT_SYMBOL(__ext4_handle_dirty_metadata); +++EXPORT_SYMBOL(__ext4_std_error); +++EXPORT_SYMBOL(ext4fs_dirhash); +++EXPORT_SYMBOL(ext4_get_inode_loc); +++EXPORT_SYMBOL(__ext4_journal_ensure_credits); ++diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c ++index f4dd3a7ee965..708822bdbfc1 100644 ++--- a/fs/ext4/namei.c +++++ b/fs/ext4/namei.c ++@@ -50,7 +50,7 @@ ++ #define NAMEI_RA_BLOCKS 4 ++ #define NAMEI_RA_SIZE (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS) ++ ++-static struct buffer_head *ext4_append(handle_t *handle, +++struct buffer_head *ext4_append(handle_t *handle, ++ struct inode *inode, ++ ext4_lblk_t *block) ++ { ++@@ -100,6 +100,8 @@ static struct buffer_head *ext4_append(handle_t *handle, ++ return ERR_PTR(err); ++ } ++ +++#define assert(test) J_ASSERT(test) +++ ++ static int ext4_dx_csum_verify(struct inode *inode, ++ struct ext4_dir_entry *dirent); ++ ++@@ -209,6 +211,7 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode, ++ } ++ return bh; ++ } +++EXPORT_SYMBOL(ext4_append); ++ ++ #ifndef assert ++ #define assert(test) J_ASSERT(test) ++@@ -2626,23 +2629,25 @@ static int ext4_delete_entry(handle_t *handle, ++ * for checking S_ISDIR(inode) (since the INODE_INDEX feature will not be set ++ * on regular files) and to avoid creating huge/slow non-HTREE 
directories. ++ */ ++-static void ext4_inc_count(struct inode *inode) +++void ext4_inc_count(struct inode *inode) ++ { ++ inc_nlink(inode); ++ if (is_dx(inode) && ++ (inode->i_nlink > EXT4_LINK_MAX || inode->i_nlink == 2)) ++ set_nlink(inode, 1); ++ } +++EXPORT_SYMBOL(ext4_inc_count); ++ ++ /* ++ * If a directory had nlink == 1, then we should let it be 1. This indicates ++ * directory has >EXT4_LINK_MAX subdirs. ++ */ ++-static void ext4_dec_count(struct inode *inode) +++void ext4_dec_count(struct inode *inode) ++ { ++ if (!S_ISDIR(inode->i_mode) || inode->i_nlink > 2) ++ drop_nlink(inode); ++ } +++EXPORT_SYMBOL(ext4_dec_count); ++ ++ ++ /* ++diff --git a/fs/ext4/super.c b/fs/ext4/super.c ++index a461beea2aca..00437b8cd097 100644 ++--- a/fs/ext4/super.c +++++ b/fs/ext4/super.c ++@@ -5607,7 +5607,7 @@ static void ext4_update_super(struct super_block *sb) ++ __ext4_update_tstamp(&es->s_first_error_time, ++ &es->s_first_error_time_hi, ++ sbi->s_first_error_time); ++- strncpy(es->s_first_error_func, sbi->s_first_error_func, +++ strlcpy(es->s_first_error_func, sbi->s_first_error_func, ++ sizeof(es->s_first_error_func)); ++ es->s_first_error_line = ++ cpu_to_le32(sbi->s_first_error_line); ++@@ -5621,7 +5621,7 @@ static void ext4_update_super(struct super_block *sb) ++ __ext4_update_tstamp(&es->s_last_error_time, ++ &es->s_last_error_time_hi, ++ sbi->s_last_error_time); ++- strncpy(es->s_last_error_func, sbi->s_last_error_func, +++ strlcpy(es->s_last_error_func, sbi->s_last_error_func, ++ sizeof(es->s_last_error_func)); ++ es->s_last_error_line = cpu_to_le32(sbi->s_last_error_line); ++ es->s_last_error_ino = cpu_to_le32(sbi->s_last_error_ino); ++@@ -6837,8 +6837,6 @@ static int __init ext4_init_fs(void) ++ if (err) ++ goto out05; ++ ++- register_as_ext3(); ++- register_as_ext2(); ++ err = register_filesystem(&ext4_fs_type); ++ if (err) ++ goto out; ++@@ -6848,8 +6846,6 @@ static int __init ext4_init_fs(void) ++ printk(KERN_ERR "EXT4-fs: Cannot create netlink socket.\n"); 
++ return 0; ++ out: ++- unregister_as_ext2(); ++- unregister_as_ext3(); ++ ext4_fc_destroy_dentry_cache(); ++ out05: ++ destroy_inodecache(); ++@@ -6874,8 +6870,6 @@ static int __init ext4_init_fs(void) ++ static void __exit ext4_exit_fs(void) ++ { ++ ext4_destroy_lazyinit_thread(); ++- unregister_as_ext2(); ++- unregister_as_ext3(); ++ unregister_filesystem(&ext4_fs_type); ++ ext4_fc_destroy_dentry_cache(); ++ destroy_inodecache(); +diff --git a/ldiskfs/kernel_patches/patches/oe2203/ext4-pdirop.patch b/ldiskfs/kernel_patches/patches/oe2203/ext4-pdirop.patch +new file mode 100644 +index 0000000000..183832c029 +--- /dev/null ++++ b/ldiskfs/kernel_patches/patches/oe2203/ext4-pdirop.patch +@@ -0,0 +1,926 @@ ++Single directory performance is critical for HPC workloads. In a ++typical use case an application creates a separate output file for ++each node and task in a job. As nodes and tasks increase, hundreds ++of thousands of files may be created in a single directory within ++a short window of time. ++Today, both filename lookup and file system modifying operations ++(such as create and unlink) are protected with a single lock for ++an entire ldiskfs directory. PDO project will remove this ++bottleneck by introducing a parallel locking mechanism for entire ++ldiskfs directories. This work will enable multiple application ++threads to simultaneously lookup, create and unlink in parallel. 
++ ++This patch contains: ++ - pdirops support for ldiskfs ++ - integrate with osd-ldiskfs ++--- ++ fs/ext4/Makefile | 1 + ++ fs/ext4/ext4.h | 78 ++++++++ ++ fs/ext4/namei.c | 464 +++++++++++++++++++++++++++++++++++++++++++---- ++ fs/ext4/super.c | 1 + ++ 4 files changed, 504 insertions(+), 40 deletions(-) ++ ++diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile ++index 49e7af6..f7ced03 100644 ++--- a/fs/ext4/Makefile +++++ b/fs/ext4/Makefile ++@@ -7,6 +7,7 @@ obj-$(CONFIG_EXT4_FS) += ext4.o ++ ++ ext4-y := balloc.o bitmap.o block_validity.o dir.o ext4_jbd2.o extents.o \ ++ extents_status.o file.o fsmap.o fsync.o hash.o ialloc.o \ +++ htree_lock.o \ ++ indirect.o inline.o inode.o ioctl.o mballoc.o migrate.o \ ++ mmp.o move_extent.o namei.o page-io.o readpage.o resize.o \ ++ super.o symlink.o sysfs.o xattr.o xattr_hurd.o xattr_trusted.o \ ++diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h ++index 3c6fa2b..c4c5aae 100644 ++--- a/fs/ext4/ext4.h +++++ b/fs/ext4/ext4.h ++@@ -29,6 +29,7 @@ ++ #include ++ #include ++ #include +++#include ++ #include ++ #include ++ #include ++@@ -1013,6 +1014,9 @@ struct ext4_inode_info { ++ __u32 i_dtime; ++ ext4_fsblk_t i_file_acl; ++ +++ /* following fields for parallel directory operations -bzzz */ +++ struct semaphore i_append_sem; +++ ++ /* ++ * i_block_group is the number of the block group which contains ++ * this file's inode. 
Constant across the lifetime of the inode, ++@@ -2429,6 +2433,72 @@ struct dx_hash_info ++ */ ++ #define HASH_NB_ALWAYS 1 ++ +++/* assume name-hash is protected by upper layer */ +++#define EXT4_HTREE_LOCK_HASH 0 +++ +++enum ext4_pdo_lk_types { +++#if EXT4_HTREE_LOCK_HASH +++ EXT4_LK_HASH, +++#endif +++ EXT4_LK_DX, /* index block */ +++ EXT4_LK_DE, /* directory entry block */ +++ EXT4_LK_SPIN, /* spinlock */ +++ EXT4_LK_MAX, +++}; +++ +++/* read-only bit */ +++#define EXT4_LB_RO(b) (1 << (b)) +++/* read + write, high bits for writer */ +++#define EXT4_LB_RW(b) ((1 << (b)) | (1 << (EXT4_LK_MAX + (b)))) +++ +++enum ext4_pdo_lock_bits { +++ /* DX lock bits */ +++ EXT4_LB_DX_RO = EXT4_LB_RO(EXT4_LK_DX), +++ EXT4_LB_DX = EXT4_LB_RW(EXT4_LK_DX), +++ /* DE lock bits */ +++ EXT4_LB_DE_RO = EXT4_LB_RO(EXT4_LK_DE), +++ EXT4_LB_DE = EXT4_LB_RW(EXT4_LK_DE), +++ /* DX spinlock bits */ +++ EXT4_LB_SPIN_RO = EXT4_LB_RO(EXT4_LK_SPIN), +++ EXT4_LB_SPIN = EXT4_LB_RW(EXT4_LK_SPIN), +++ /* accurate searching */ +++ EXT4_LB_EXACT = EXT4_LB_RO(EXT4_LK_MAX << 1), +++}; +++ +++enum ext4_pdo_lock_opc { +++ /* external */ +++ EXT4_HLOCK_READDIR = (EXT4_LB_DE_RO | EXT4_LB_DX_RO), +++ EXT4_HLOCK_LOOKUP = (EXT4_LB_DE_RO | EXT4_LB_SPIN_RO | +++ EXT4_LB_EXACT), +++ EXT4_HLOCK_DEL = (EXT4_LB_DE | EXT4_LB_SPIN_RO | +++ EXT4_LB_EXACT), +++ EXT4_HLOCK_ADD = (EXT4_LB_DE | EXT4_LB_SPIN_RO), +++ +++ /* internal */ +++ EXT4_HLOCK_LOOKUP_SAFE = (EXT4_LB_DE_RO | EXT4_LB_DX_RO | +++ EXT4_LB_EXACT), +++ EXT4_HLOCK_DEL_SAFE = (EXT4_LB_DE | EXT4_LB_DX_RO | EXT4_LB_EXACT), +++ EXT4_HLOCK_SPLIT = (EXT4_LB_DE | EXT4_LB_DX | EXT4_LB_SPIN), +++}; +++ +++extern struct htree_lock_head *ext4_htree_lock_head_alloc(unsigned hbits); +++#define ext4_htree_lock_head_free(lhead) htree_lock_head_free(lhead) +++ +++extern struct htree_lock *ext4_htree_lock_alloc(void); +++#define ext4_htree_lock_free(lck) htree_lock_free(lck) +++ +++extern void ext4_htree_lock(struct htree_lock *lck, +++ struct htree_lock_head *lhead, +++ 
struct inode *dir, unsigned flags); +++#define ext4_htree_unlock(lck) htree_unlock(lck) +++ +++extern struct buffer_head *ext4_find_entry_locked(struct inode *dir, +++ const struct qstr *d_name, +++ struct ext4_dir_entry_2 **res_dir, +++ int *inlined, struct htree_lock *lck); +++extern int ext4_add_entry_locked(handle_t *handle, struct dentry *dentry, +++ struct inode *inode, struct htree_lock *lck); +++ ++ struct ext4_filename { ++ const struct qstr *usr_fname; ++ struct fscrypt_str disk_name; ++@@ -2803,12 +2873,20 @@ void ext4_insert_dentry(struct inode *inode, ++ struct ext4_filename *fname, void *data); ++ static inline void ext4_update_dx_flag(struct inode *inode) ++ { +++ /* Disable it for ldiskfs, because going from a DX directory to +++ * a non-DX directory while it is in use will completely break +++ * the htree-locking. +++ * If we really want to support this operation in the future, +++ * we need to exclusively lock the directory at here which will +++ * increase complexity of code */ +++#if 0 ++ if (!ext4_has_feature_dir_index(inode->i_sb) && ++ ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) { ++ /* ext4_iget() should have caught this... 
*/ ++ WARN_ON_ONCE(ext4_has_feature_metadata_csum(inode->i_sb)); ++ ext4_clear_inode_flag(inode, EXT4_INODE_INDEX); ++ } +++#endif ++ } ++ static const unsigned char ext4_filetype_table[] = { ++ DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK ++diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c ++index 24e1276..ae94c33 100644 ++--- a/fs/ext4/namei.c +++++ b/fs/ext4/namei.c ++@@ -56,6 +56,7 @@ struct buffer_head *ext4_append(handle_t *handle, ++ { ++ struct ext4_map_blocks map; ++ struct buffer_head *bh; +++ struct ext4_inode_info *ei = EXT4_I(inode); ++ int err; ++ ++ if (unlikely(EXT4_SB(inode->i_sb)->s_max_dir_size_kb && ++@@ -63,6 +64,10 @@ struct buffer_head *ext4_append(handle_t *handle, ++ EXT4_SB(inode->i_sb)->s_max_dir_size_kb))) ++ return ERR_PTR(-ENOSPC); ++ +++ /* with parallel dir operations all appends +++ * have to be serialized -bzzz */ +++ down(&ei->i_append_sem); +++ ++ *block = inode->i_size >> inode->i_sb->s_blocksize_bits; ++ map.m_lblk = *block; ++ map.m_len = 1; ++@@ -73,16 +78,21 @@ struct buffer_head *ext4_append(handle_t *handle, ++ * directory. 
++ */ ++ err = ext4_map_blocks(NULL, inode, &map, 0); ++- if (err < 0) +++ if (err < 0) { +++ up(&ei->i_append_sem); ++ return ERR_PTR(err); +++ } ++ if (err) { +++ up(&ei->i_append_sem); ++ EXT4_ERROR_INODE(inode, "Logical block already allocated"); ++ return ERR_PTR(-EFSCORRUPTED); ++ } ++ ++ bh = ext4_bread(handle, inode, *block, EXT4_GET_BLOCKS_CREATE); ++- if (IS_ERR(bh)) +++ if (IS_ERR(bh)) { +++ up(&ei->i_append_sem); ++ return bh; +++ } ++ inode->i_size += inode->i_sb->s_blocksize; ++ EXT4_I(inode)->i_disksize = inode->i_size; ++ err = ext4_mark_inode_dirty(handle, inode); ++@@ -92,9 +102,11 @@ struct buffer_head *ext4_append(handle_t *handle, ++ err = ext4_journal_get_write_access(handle, bh); ++ if (err) ++ goto out; +++ up(&ei->i_append_sem); ++ return bh; ++ ++ out: +++ up(&ei->i_append_sem); ++ brelse(bh); ++ ext4_std_error(inode->i_sb, err); ++ return ERR_PTR(err); ++@@ -301,7 +313,8 @@ static unsigned dx_node_limit(struct inode *dir); ++ static struct dx_frame *dx_probe(struct ext4_filename *fname, ++ struct inode *dir, ++ struct dx_hash_info *hinfo, ++- struct dx_frame *frame); +++ struct dx_frame *frame, +++ struct htree_lock *lck); ++ static void dx_release(struct dx_frame *frames); ++ static int dx_make_map(struct inode *dir, struct buffer_head *bh, ++ struct dx_hash_info *hinfo, ++@@ -315,12 +328,13 @@ static void dx_insert_block(struct dx_frame *frame, ++ static int ext4_htree_next_block(struct inode *dir, __u32 hash, ++ struct dx_frame *frame, ++ struct dx_frame *frames, ++- __u32 *start_hash); +++ __u32 *start_hash, struct htree_lock *lck); ++ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, ++ struct ext4_filename *fname, ++- struct ext4_dir_entry_2 **res_dir); +++ struct ext4_dir_entry_2 **res_dir, struct htree_lock *lck); ++ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, ++- struct inode *dir, struct inode *inode); +++ struct inode *dir, struct inode *inode, +++ struct htree_lock *lck); ++ ++ 
/* checksumming functions */ ++ void ext4_initialize_dirent_tail(struct buffer_head *bh, ++@@ -784,6 +798,227 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir, ++ } ++ #endif /* DX_DEBUG */ ++ +++/* private data for htree_lock */ +++struct ext4_dir_lock_data { +++ unsigned ld_flags; /* bits-map for lock types */ +++ unsigned ld_count; /* # entries of the last DX block */ +++ struct dx_entry ld_at_entry; /* copy of leaf dx_entry */ +++ struct dx_entry *ld_at; /* position of leaf dx_entry */ +++}; +++ +++#define ext4_htree_lock_data(l) ((struct ext4_dir_lock_data *)(l)->lk_private) +++#define ext4_find_entry(dir, name, dirent, inline) \ +++ ext4_find_entry_locked(dir, name, dirent, inline, NULL) +++#define ext4_add_entry(handle, dentry, inode) \ +++ ext4_add_entry_locked(handle, dentry, inode, NULL) +++ +++/* NB: ext4_lblk_t is 32 bits so we use high bits to identify invalid blk */ +++#define EXT4_HTREE_NODE_CHANGED (0xcafeULL << 32) +++ +++static void ext4_htree_event_cb(void *target, void *event) +++{ +++ u64 *block = (u64 *)target; +++ +++ if (*block == dx_get_block((struct dx_entry *)event)) +++ *block = EXT4_HTREE_NODE_CHANGED; +++} +++ +++struct htree_lock_head *ext4_htree_lock_head_alloc(unsigned hbits) +++{ +++ struct htree_lock_head *lhead; +++ +++ lhead = htree_lock_head_alloc(EXT4_LK_MAX, hbits, 0); +++ if (lhead != NULL) { +++ htree_lock_event_attach(lhead, EXT4_LK_SPIN, HTREE_EVENT_WR, +++ ext4_htree_event_cb); +++ } +++ return lhead; +++} +++EXPORT_SYMBOL(ext4_htree_lock_head_alloc); +++ +++struct htree_lock *ext4_htree_lock_alloc(void) +++{ +++ return htree_lock_alloc(EXT4_LK_MAX, +++ sizeof(struct ext4_dir_lock_data)); +++} +++EXPORT_SYMBOL(ext4_htree_lock_alloc); +++ +++static htree_lock_mode_t ext4_htree_mode(unsigned flags) +++{ +++ switch (flags) { +++ default: /* 0 or unknown flags require EX lock */ +++ return HTREE_LOCK_EX; +++ case EXT4_HLOCK_READDIR: +++ return HTREE_LOCK_PR; +++ case EXT4_HLOCK_LOOKUP: +++ return 
HTREE_LOCK_CR; +++ case EXT4_HLOCK_DEL: +++ case EXT4_HLOCK_ADD: +++ return HTREE_LOCK_CW; +++ } +++} +++ +++/* return PR for read-only operations, otherwise return EX */ +++static inline htree_lock_mode_t ext4_htree_safe_mode(unsigned flags) +++{ +++ int writer = (flags & EXT4_LB_DE) == EXT4_LB_DE; +++ +++ /* 0 requires EX lock */ +++ return (flags == 0 || writer) ? HTREE_LOCK_EX : HTREE_LOCK_PR; +++} +++ +++static int ext4_htree_safe_locked(struct htree_lock *lck) +++{ +++ int writer; +++ +++ if (lck == NULL || lck->lk_mode == HTREE_LOCK_EX) +++ return 1; +++ +++ writer = (ext4_htree_lock_data(lck)->ld_flags & EXT4_LB_DE) == +++ EXT4_LB_DE; +++ if (writer) /* all readers & writers are excluded? */ +++ return lck->lk_mode == HTREE_LOCK_EX; +++ +++ /* all writers are excluded? */ +++ return lck->lk_mode == HTREE_LOCK_PR || +++ lck->lk_mode == HTREE_LOCK_PW || +++ lck->lk_mode == HTREE_LOCK_EX; +++} +++ +++/* relock htree_lock with EX mode if it's change operation, otherwise +++ * relock it with PR mode. It's noop if PDO is disabled. */ +++static void ext4_htree_safe_relock(struct htree_lock *lck) +++{ +++ if (!ext4_htree_safe_locked(lck)) { +++ unsigned flags = ext4_htree_lock_data(lck)->ld_flags; +++ +++ htree_change_lock(lck, ext4_htree_safe_mode(flags)); +++ } +++} +++ +++void ext4_htree_lock(struct htree_lock *lck, struct htree_lock_head *lhead, +++ struct inode *dir, unsigned flags) +++{ +++ htree_lock_mode_t mode = is_dx(dir) ? ext4_htree_mode(flags) : +++ ext4_htree_safe_mode(flags); +++ +++ ext4_htree_lock_data(lck)->ld_flags = flags; +++ htree_lock(lck, lhead, mode); +++ if (!is_dx(dir)) +++ ext4_htree_safe_relock(lck); /* make sure it's safe locked */ +++} +++EXPORT_SYMBOL(ext4_htree_lock); +++ +++static int ext4_htree_node_lock(struct htree_lock *lck, struct dx_entry *at, +++ unsigned lmask, int wait, void *ev) +++{ +++ u32 key = (at == NULL) ? 
0 : dx_get_block(at); +++ u32 mode; +++ +++ /* NOOP if htree is well protected or caller doesn't require the lock */ +++ if (ext4_htree_safe_locked(lck) || +++ !(ext4_htree_lock_data(lck)->ld_flags & lmask)) +++ return 1; +++ +++ mode = (ext4_htree_lock_data(lck)->ld_flags & lmask) == lmask ? +++ HTREE_LOCK_PW : HTREE_LOCK_PR; +++ while (1) { +++ if (htree_node_lock_try(lck, mode, key, ffz(~lmask), wait, ev)) +++ return 1; +++ if (!(lmask & EXT4_LB_SPIN)) /* not a spinlock */ +++ return 0; +++ cpu_relax(); /* spin until granted */ +++ } +++} +++ +++static int ext4_htree_node_locked(struct htree_lock *lck, unsigned lmask) +++{ +++ return ext4_htree_safe_locked(lck) || +++ htree_node_is_granted(lck, ffz(~lmask)); +++} +++ +++static void ext4_htree_node_unlock(struct htree_lock *lck, +++ unsigned lmask, void *buf) +++{ +++ /* NB: it's safe to call multiple times or even if it's not locked */ +++ if (!ext4_htree_safe_locked(lck) && +++ htree_node_is_granted(lck, ffz(~lmask))) +++ htree_node_unlock(lck, ffz(~lmask), buf); +++} +++ +++#define ext4_htree_dx_lock(lck, key) \ +++ ext4_htree_node_lock(lck, key, EXT4_LB_DX, 1, NULL) +++#define ext4_htree_dx_lock_try(lck, key) \ +++ ext4_htree_node_lock(lck, key, EXT4_LB_DX, 0, NULL) +++#define ext4_htree_dx_unlock(lck) \ +++ ext4_htree_node_unlock(lck, EXT4_LB_DX, NULL) +++#define ext4_htree_dx_locked(lck) \ +++ ext4_htree_node_locked(lck, EXT4_LB_DX) +++ +++static void ext4_htree_dx_need_lock(struct htree_lock *lck) +++{ +++ struct ext4_dir_lock_data *ld; +++ +++ if (ext4_htree_safe_locked(lck)) +++ return; +++ +++ ld = ext4_htree_lock_data(lck); +++ switch (ld->ld_flags) { +++ default: +++ return; +++ case EXT4_HLOCK_LOOKUP: +++ ld->ld_flags = EXT4_HLOCK_LOOKUP_SAFE; +++ return; +++ case EXT4_HLOCK_DEL: +++ ld->ld_flags = EXT4_HLOCK_DEL_SAFE; +++ return; +++ case EXT4_HLOCK_ADD: +++ ld->ld_flags = EXT4_HLOCK_SPLIT; +++ return; +++ } +++} +++ +++#define ext4_htree_de_lock(lck, key) \ +++ ext4_htree_node_lock(lck, key, 
EXT4_LB_DE, 1, NULL) +++#define ext4_htree_de_unlock(lck) \ +++ ext4_htree_node_unlock(lck, EXT4_LB_DE, NULL) +++ +++#define ext4_htree_spin_lock(lck, key, event) \ +++ ext4_htree_node_lock(lck, key, EXT4_LB_SPIN, 0, event) +++#define ext4_htree_spin_unlock(lck) \ +++ ext4_htree_node_unlock(lck, EXT4_LB_SPIN, NULL) +++#define ext4_htree_spin_unlock_listen(lck, p) \ +++ ext4_htree_node_unlock(lck, EXT4_LB_SPIN, p) +++ +++static void ext4_htree_spin_stop_listen(struct htree_lock *lck) +++{ +++ if (!ext4_htree_safe_locked(lck) && +++ htree_node_is_listening(lck, ffz(~EXT4_LB_SPIN))) +++ htree_node_stop_listen(lck, ffz(~EXT4_LB_SPIN)); +++} +++ +++enum { +++ DX_HASH_COL_IGNORE, /* ignore collision while probing frames */ +++ DX_HASH_COL_YES, /* there is collision and it does matter */ +++ DX_HASH_COL_NO, /* there is no collision */ +++}; +++ +++static int dx_probe_hash_collision(struct htree_lock *lck, +++ struct dx_entry *entries, +++ struct dx_entry *at, u32 hash) +++{ +++ if (!(lck && ext4_htree_lock_data(lck)->ld_flags & EXT4_LB_EXACT)) { +++ return DX_HASH_COL_IGNORE; /* don't care about collision */ +++ +++ } else if (at == entries + dx_get_count(entries) - 1) { +++ return DX_HASH_COL_IGNORE; /* not in any leaf of this DX */ +++ +++ } else { /* hash collision? */ +++ return ((dx_get_hash(at + 1) & ~1) == hash) ? +++ DX_HASH_COL_YES : DX_HASH_COL_NO; +++ } +++} +++ ++ /* ++ * Probe for a directory leaf block to search. 
++ * ++@@ -795,10 +1030,11 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir, ++ */ ++ static struct dx_frame * ++ dx_probe(struct ext4_filename *fname, struct inode *dir, ++- struct dx_hash_info *hinfo, struct dx_frame *frame_in) +++ struct dx_hash_info *hinfo, struct dx_frame *frame_in, +++ struct htree_lock *lck) ++ { ++ unsigned count, indirect, level, i; ++- struct dx_entry *at, *entries, *p, *q, *m; +++ struct dx_entry *at, *entries, *p, *q, *m, *dx = NULL; ++ struct dx_root_info *info; ++ struct dx_frame *frame = frame_in; ++ struct dx_frame *ret_err = ERR_PTR(ERR_BAD_DX_DIR); ++@@ -864,8 +1100,16 @@ dx_probe(struct ext4_filename *fname, struct inode *dir, ++ level = 0; ++ blocks[0] = 0; ++ while (1) { +++ if (indirect == level) { /* the last index level */ +++ /* NB: ext4_htree_dx_lock() could be noop if +++ * DX-lock flag is not set for current operation +++ */ +++ ext4_htree_dx_lock(lck, dx); +++ ext4_htree_spin_lock(lck, dx, NULL); +++ } ++ count = dx_get_count(entries); ++ if (!count || count > dx_get_limit(entries)) { +++ ext4_htree_spin_unlock(lck); /* release spin */ ++ ext4_warning_inode(dir, ++ "dx entry: count %u beyond limit %u", ++ count, dx_get_limit(entries)); ++@@ -914,8 +1158,75 @@ dx_probe(struct ext4_filename *fname, struct inode *dir, ++ goto fail; ++ } ++ } ++- if (++level > indirect) +++ +++ if (indirect == level) { /* the last index level */ +++ struct ext4_dir_lock_data *ld; +++ u64 myblock; +++ +++ /* By default we only lock DE-block, however, we will +++ * also lock the last level DX-block if: +++ * a) there is hash collision +++ * we will set DX-lock flag (a few lines below) +++ * and redo to lock DX-block +++ * see detail in dx_probe_hash_collision() +++ * b) it's a retry from splitting +++ * we need to lock the last level DX-block so nobody +++ * else can split any leaf blocks under the same +++ * DX-block, see detail in ext4_dx_add_entry() +++ */ +++ if (ext4_htree_dx_locked(lck)) { +++ /* DX-block is 
locked, just lock DE-block +++ * and return +++ */ +++ ext4_htree_spin_unlock(lck); +++ if (!ext4_htree_safe_locked(lck)) +++ ext4_htree_de_lock(lck, frame->at); +++ return frame; +++ } +++ /* it's pdirop and no DX lock */ +++ if (dx_probe_hash_collision(lck, entries, at, hash) == +++ DX_HASH_COL_YES) { +++ /* found hash collision, set DX-lock flag +++ * and retry to obtain DX-lock +++ */ +++ ext4_htree_spin_unlock(lck); +++ ext4_htree_dx_need_lock(lck); +++ continue; +++ } +++ ld = ext4_htree_lock_data(lck); +++ /* because I don't lock DX, so @at can't be trusted +++ * after I release spinlock so I have to save it +++ */ +++ ld->ld_at = at; +++ ld->ld_at_entry = *at; +++ ld->ld_count = dx_get_count(entries); +++ +++ frame->at = &ld->ld_at_entry; +++ myblock = dx_get_block(at); +++ +++ /* NB: ordering locking */ +++ ext4_htree_spin_unlock_listen(lck, &myblock); +++ /* other thread can split this DE-block because: +++ * a) I don't have lock for the DE-block yet +++ * b) I released spinlock on DX-block +++ * if it happened I can detect it by listening +++ * splitting event on this DE-block +++ */ +++ ext4_htree_de_lock(lck, frame->at); +++ ext4_htree_spin_stop_listen(lck); +++ +++ if (myblock == EXT4_HTREE_NODE_CHANGED) { +++ /* someone split this DE-block before +++ * I locked it, I need to retry and lock +++ * valid DE-block +++ */ +++ ext4_htree_de_unlock(lck); +++ continue; +++ } ++ return frame; +++ } +++ dx = at; +++ ++level; ++ blocks[level] = block; ++ frame++; ++ frame->bh = ext4_read_dirblock(dir, block, INDEX); ++@@ -986,7 +1297,7 @@ static void dx_release(struct dx_frame *frames) ++ static int ext4_htree_next_block(struct inode *dir, __u32 hash, ++ struct dx_frame *frame, ++ struct dx_frame *frames, ++- __u32 *start_hash) +++ __u32 *start_hash, struct htree_lock *lck) ++ { ++ struct dx_frame *p; ++ struct buffer_head *bh; ++@@ -1001,12 +1312,22 @@ static int ext4_htree_next_block(struct inode *dir, __u32 hash, ++ * this loop, num_frames indicates the 
number of interior ++ * nodes need to be read. ++ */ +++ ext4_htree_de_unlock(lck); ++ while (1) { ++- if (++(p->at) < p->entries + dx_get_count(p->entries)) ++- break; +++ if (num_frames > 0 || ext4_htree_dx_locked(lck)) { +++ /* num_frames > 0 : +++ * DX block +++ * ext4_htree_dx_locked: +++ * frame->at is reliable pointer returned by dx_probe, +++ * otherwise dx_probe already knew no collision */ +++ if (++(p->at) < p->entries + dx_get_count(p->entries)) +++ break; +++ } ++ if (p == frames) ++ return 0; ++ num_frames++; +++ if (num_frames == 1) +++ ext4_htree_dx_unlock(lck); ++ p--; ++ } ++ ++@@ -1029,6 +1350,13 @@ static int ext4_htree_next_block(struct inode *dir, __u32 hash, ++ * block so no check is necessary ++ */ ++ while (num_frames--) { +++ if (num_frames == 0) { +++ /* it's not always necessary, we just don't want to +++ * detect hash collision again */ +++ ext4_htree_dx_need_lock(lck); +++ ext4_htree_dx_lock(lck, p->at); +++ } +++ ++ bh = ext4_read_dirblock(dir, dx_get_block(p->at), INDEX); ++ if (IS_ERR(bh)) ++ return PTR_ERR(bh); ++@@ -1037,6 +1365,7 @@ static int ext4_htree_next_block(struct inode *dir, __u32 hash, ++ p->bh = bh; ++ p->at = p->entries = ((struct dx_node *) bh->b_data)->entries; ++ } +++ ext4_htree_de_lock(lck, p->at); ++ return 1; ++ } ++ ++@@ -1181,10 +1510,10 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, ++ } ++ hinfo.hash = start_hash; ++ hinfo.minor_hash = 0; ++- frame = dx_probe(NULL, dir, &hinfo, frames); +++ /* assume it's PR locked */ +++ frame = dx_probe(NULL, dir, &hinfo, frames, NULL); ++ if (IS_ERR(frame)) ++ return PTR_ERR(frame); ++- ++ /* Add '.' and '..' 
from the htree header */ ++ if (!start_hash && !start_minor_hash) { ++ de = (struct ext4_dir_entry_2 *) frames[0].bh->b_data; ++@@ -1224,7 +1553,7 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, ++ count += ret; ++ hashval = ~0; ++ ret = ext4_htree_next_block(dir, HASH_NB_ALWAYS, ++- frame, frames, &hashval); +++ frame, frames, &hashval, NULL); ++ *next_hash = hashval; ++ if (ret < 0) { ++ err = ret; ++@@ -1507,7 +1836,7 @@ static int is_dx_internal_node(struct inode *dir, ext4_lblk_t block, ++ static struct buffer_head *__ext4_find_entry(struct inode *dir, ++ struct ext4_filename *fname, ++ struct ext4_dir_entry_2 **res_dir, ++- int *inlined) +++ int *inlined, struct htree_lock *lck) ++ { ++ struct super_block *sb; ++ struct buffer_head *bh_use[NAMEI_RA_SIZE]; ++@@ -1549,7 +1878,7 @@ static struct buffer_head *__ext4_find_entry(struct inode *dir, ++ goto restart; ++ } ++ if (is_dx(dir)) { ++- ret = ext4_dx_find_entry(dir, fname, res_dir); +++ ret = ext4_dx_find_entry(dir, fname, res_dir, lck); ++ /* ++ * On success, or if the error was file not found, ++ * return. 
Otherwise, fall back to doing a search the ++@@ -1559,6 +1888,7 @@ static struct buffer_head *__ext4_find_entry(struct inode *dir, ++ goto cleanup_and_exit; ++ dxtrace(printk(KERN_DEBUG "ext4_find_entry: dx failed, " ++ "falling back\n")); +++ ext4_htree_safe_relock(lck); ++ ret = NULL; ++ } ++ nblocks = dir->i_size >> EXT4_BLOCK_SIZE_BITS(sb); ++@@ -1649,10 +1979,10 @@ cleanup_and_exit: ++ return ret; ++ } ++ ++-static struct buffer_head *ext4_find_entry(struct inode *dir, +++struct buffer_head *ext4_find_entry_locked(struct inode *dir, ++ const struct qstr *d_name, ++ struct ext4_dir_entry_2 **res_dir, ++- int *inlined) +++ int *inlined, struct htree_lock *lck) ++ { ++ int err; ++ struct ext4_filename fname; ++@@ -1664,12 +1994,14 @@ static struct buffer_head *ext4_find_entry(struct inode *dir, ++ if (err) ++ return ERR_PTR(err); ++ ++- bh = __ext4_find_entry(dir, &fname, res_dir, inlined); +++ bh = __ext4_find_entry(dir, &fname, res_dir, inlined, lck); ++ ++ ext4_fname_free_filename(&fname); ++ return bh; ++ } ++ +++EXPORT_SYMBOL(ext4_find_entry_locked); +++ ++ static struct buffer_head *ext4_lookup_entry(struct inode *dir, ++ struct dentry *dentry, ++ struct ext4_dir_entry_2 **res_dir) ++@@ -1684,7 +2016,7 @@ static struct buffer_head *ext4_lookup_entry(struct inode *dir, ++ if (err) ++ return ERR_PTR(err); ++ ++- bh = __ext4_find_entry(dir, &fname, res_dir, NULL); +++ bh = __ext4_find_entry(dir, &fname, res_dir, NULL, NULL); ++ ++ ext4_fname_free_filename(&fname); ++ return bh; ++@@ -1692,7 +2024,8 @@ static struct buffer_head *ext4_lookup_entry(struct inode *dir, ++ ++ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, ++ struct ext4_filename *fname, ++- struct ext4_dir_entry_2 **res_dir) +++ struct ext4_dir_entry_2 **res_dir, +++ struct htree_lock *lck) ++ { ++ struct super_block * sb = dir->i_sb; ++ struct dx_frame frames[EXT4_HTREE_LEVEL], *frame; ++@@ -1703,7 +2036,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, ++ 
#ifdef CONFIG_FS_ENCRYPTION ++ *res_dir = NULL; ++ #endif ++- frame = dx_probe(fname, dir, NULL, frames); +++ frame = dx_probe(fname, dir, NULL, frames, lck); ++ if (IS_ERR(frame)) ++ return (struct buffer_head *) frame; ++ do { ++@@ -1725,7 +2058,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, ++ ++ /* Check to see if we should continue to search */ ++ retval = ext4_htree_next_block(dir, fname->hinfo.hash, frame, ++- frames, NULL); +++ frames, NULL, lck); ++ if (retval < 0) { ++ ext4_warning_inode(dir, ++ "error %d reading directory index block", ++@@ -1912,8 +2245,9 @@ static struct ext4_dir_entry_2* dx_pack_dirents(char *base, unsigned blocksize) ++ * Returns pointer to de in block into which the new entry will be inserted. ++ */ ++ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, ++- struct buffer_head **bh,struct dx_frame *frame, ++- struct dx_hash_info *hinfo) +++ struct buffer_head **bh, struct dx_frame *frames, +++ struct dx_frame *frame, struct dx_hash_info *hinfo, +++ struct htree_lock *lck) ++ { ++ unsigned blocksize = dir->i_sb->s_blocksize; ++ unsigned continued; ++@@ -1988,8 +2322,14 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, ++ hash2, split, count-split)); ++ ++ /* Fancy dance to stay within two buffers */ ++- de2 = dx_move_dirents(data1, data2, map + split, count - split, ++- blocksize); +++ if (hinfo->hash < hash2) { +++ de2 = dx_move_dirents(data1, data2, map + split, +++ count - split, blocksize); +++ } else { +++ /* make sure we will add entry to the same block which +++ * we have already locked */ +++ de2 = dx_move_dirents(data1, data2, map, split, blocksize); +++ } ++ de = dx_pack_dirents(data1, blocksize); ++ de->rec_len = ext4_rec_len_to_disk(data1 + (blocksize - csum_size) - ++ (char *) de, ++@@ -2007,12 +2347,21 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, ++ dxtrace(dx_show_leaf(dir, hinfo, (struct ext4_dir_entry_2 
*) data2, ++ blocksize, 1)); ++ ++- /* Which block gets the new entry? */ ++- if (hinfo->hash >= hash2) { ++- swap(*bh, bh2); ++- de = de2; +++ ext4_htree_spin_lock(lck, frame > frames ? (frame - 1)->at : NULL, +++ frame->at); /* notify block is being split */ +++ if (hinfo->hash < hash2) { +++ dx_insert_block(frame, hash2 + continued, newblock); +++ +++ } else { +++ /* switch block number */ +++ dx_insert_block(frame, hash2 + continued, +++ dx_get_block(frame->at)); +++ dx_set_block(frame->at, newblock); +++ (frame->at)++; ++ } ++- dx_insert_block(frame, hash2 + continued, newblock); +++ ext4_htree_spin_unlock(lck); +++ ext4_htree_dx_unlock(lck); +++ ++ err = ext4_handle_dirty_dirblock(handle, dir, bh2); ++ if (err) ++ goto journal_error; ++@@ -2283,7 +2632,7 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname, ++ if (retval) ++ goto out_frames; ++ ++- de = do_split(handle,dir, &bh2, frame, &fname->hinfo); +++ de = do_split(handle, dir, &bh2, frames, frame, &fname->hinfo, NULL); ++ if (IS_ERR(de)) { ++ retval = PTR_ERR(de); ++ goto out_frames; ++@@ -2393,8 +2742,8 @@ out: ++ * may not sleep between calling this and putting something into ++ * the entry, as someone else might have used it while you slept. 
++ */ ++-static int ext4_add_entry(handle_t *handle, struct dentry *dentry, ++- struct inode *inode) +++int ext4_add_entry_locked(handle_t *handle, struct dentry *dentry, +++ struct inode *inode, struct htree_lock *lck) ++ { ++ struct inode *dir = d_inode(dentry->d_parent); ++ struct buffer_head *bh = NULL; ++@@ -2443,9 +2792,10 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, ++ if (dentry->d_name.len == 2 && ++ memcmp(dentry->d_name.name, "..", 2) == 0) ++ return ext4_update_dotdot(handle, dentry, inode); ++- retval = ext4_dx_add_entry(handle, &fname, dir, inode); +++ retval = ext4_dx_add_entry(handle, &fname, dir, inode, lck); ++ if (!retval || (retval != ERR_BAD_DX_DIR)) ++ goto out; +++ ext4_htree_safe_relock(lck); ++ /* Can we just ignore htree data? */ ++ if (ext4_has_metadata_csum(sb)) { ++ EXT4_ERROR_INODE(dir, ++@@ -2508,12 +2858,14 @@ out: ++ ext4_set_inode_state(inode, EXT4_STATE_NEWENTRY); ++ return retval; ++ } +++EXPORT_SYMBOL(ext4_add_entry_locked); ++ ++ /* ++ * Returns 0 for success, or a negative error value ++ */ ++ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, ++- struct inode *dir, struct inode *inode) +++ struct inode *dir, struct inode *inode, +++ struct htree_lock *lck) ++ { ++ struct dx_frame frames[EXT4_HTREE_LEVEL], *frame; ++ struct dx_entry *entries, *at; ++@@ -2525,7 +2877,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, ++ ++ again: ++ restart = 0; ++- frame = dx_probe(fname, dir, NULL, frames); +++ frame = dx_probe(fname, dir, NULL, frames, lck); ++ if (IS_ERR(frame)) ++ return PTR_ERR(frame); ++ entries = frame->entries; ++@@ -2560,6 +2912,12 @@ again: ++ struct dx_node *node2; ++ struct buffer_head *bh2; ++ +++ if (!ext4_htree_safe_locked(lck)) { /* retry with EX lock */ +++ ext4_htree_safe_relock(lck); +++ restart = 1; +++ goto cleanup; +++ } +++ ++ while (frame > frames) { ++ if (dx_get_count((frame - 1)->entries) < ++ dx_get_limit((frame - 
1)->entries)) { ++@@ -2661,8 +3019,32 @@ again: ++ restart = 1; ++ goto journal_error; ++ } +++ } else if (!ext4_htree_dx_locked(lck)) { +++ struct ext4_dir_lock_data *ld = ext4_htree_lock_data(lck); +++ +++ /* not well protected, require DX lock */ +++ ext4_htree_dx_need_lock(lck); +++ at = frame > frames ? (frame - 1)->at : NULL; +++ +++ /* NB: no risk of deadlock because it's just a try. +++ * +++ * NB: we check ld_count for twice, the first time before +++ * having DX lock, the second time after holding DX lock. +++ * +++ * NB: We never free blocks for directory so far, which +++ * means value returned by dx_get_count() should equal to +++ * ld->ld_count if nobody split any DE-block under @at, +++ * and ld->ld_at still points to valid dx_entry. */ +++ if ((ld->ld_count != dx_get_count(entries)) || +++ !ext4_htree_dx_lock_try(lck, at) || +++ (ld->ld_count != dx_get_count(entries))) { +++ restart = 1; +++ goto cleanup; +++ } +++ /* OK, I've got DX lock and nothing changed */ +++ frame->at = ld->ld_at; ++ } ++- de = do_split(handle, dir, &bh, frame, &fname->hinfo); +++ de = do_split(handle, dir, &bh, frames, frame, &fname->hinfo, lck); ++ if (IS_ERR(de)) { ++ err = PTR_ERR(de); ++ goto cleanup; ++@@ -2673,6 +3055,8 @@ again: ++ journal_error: ++ ext4_std_error(dir->i_sb, err); /* this is a no-op if err == 0 */ ++ cleanup: +++ ext4_htree_dx_unlock(lck); +++ ext4_htree_de_unlock(lck); ++ brelse(bh); ++ dx_release(frames); ++ /* @restart is true means htree-path has been changed, we need to ++diff --git a/fs/ext4/super.c b/fs/ext4/super.c ++index f7614a5..3af5d10 100644 ++--- a/fs/ext4/super.c +++++ b/fs/ext4/super.c ++@@ -1336,6 +1336,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) ++ ++ inode_set_iversion(&ei->vfs_inode, 1); ++ spin_lock_init(&ei->i_raw_lock); +++ sema_init(&ei->i_append_sem, 1); ++ INIT_LIST_HEAD(&ei->i_prealloc_list); ++ atomic_set(&ei->i_prealloc_active, 0); ++ spin_lock_init(&ei->i_prealloc_lock); ++-- ++2.33.0 ++ +diff --git 
a/ldiskfs/kernel_patches/patches/oe2203/ext4-simple-blockalloc.patch b/ldiskfs/kernel_patches/patches/oe2203/ext4-simple-blockalloc.patch +new file mode 100644 +index 0000000000..66603b7a49 +--- /dev/null ++++ b/ldiskfs/kernel_patches/patches/oe2203/ext4-simple-blockalloc.patch +@@ -0,0 +1,340 @@ ++diff -ur a/fs/ext4/ext4.h b/fs/ext4/ext4.h ++--- a/fs/ext4/ext4.h 2021-12-02 15:38:37.084207460 -0700 +++++ b/fs/ext4/ext4.h 2021-12-02 15:41:51.939182417 -0700 ++@@ -1554,6 +1554,9 @@ ++ unsigned int s_mb_min_to_scan; ++ unsigned int s_mb_stats; ++ unsigned int s_mb_order2_reqs; +++ ext4_fsblk_t s_mb_c1_blocks; +++ ext4_fsblk_t s_mb_c2_blocks; +++ ext4_fsblk_t s_mb_c3_blocks; ++ unsigned long *s_mb_prealloc_table; ++ unsigned int s_mb_group_prealloc; ++ unsigned int s_mb_max_inode_prealloc; ++@@ -1573,6 +1576,9 @@ ++ atomic_t s_bal_goals; /* goal hits */ ++ atomic_t s_bal_breaks; /* too long searches */ ++ atomic_t s_bal_2orders; /* 2^order hits */ +++ /* cX loop didn't find blocks */ +++ atomic64_t s_bal_cX_failed[3]; +++ atomic64_t s_bal_cX_skipped[3]; ++ spinlock_t s_bal_lock; ++ unsigned long s_mb_buddies_generated; ++ unsigned long long s_mb_generation_time; ++@@ -2977,6 +2983,7 @@ ++ /* mballoc.c */ ++ extern const struct proc_ops ext4_seq_prealloc_table_fops; ++ extern const struct seq_operations ext4_mb_seq_groups_ops; +++extern const struct proc_ops ext4_mb_seq_alloc_fops; ++ extern const struct proc_ops ext4_seq_mb_last_group_fops; ++ extern int ext4_mb_seq_last_start_seq_show(struct seq_file *m, void *v); ++ extern long ext4_mb_stats; ++diff -ur a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c ++--- a/fs/ext4/mballoc.c 2021-12-02 15:38:37.044207688 -0700 +++++ b/fs/ext4/mballoc.c 2021-12-02 15:41:51.943182397 -0700 ++@@ -2281,6 +2281,20 @@ ++ } ++ } ++ +++static u64 available_blocks_count(struct ext4_sb_info *sbi) +++{ +++ ext4_fsblk_t resv_blocks; +++ u64 bfree; +++ struct ext4_super_block *es = sbi->s_es; +++ +++ resv_blocks = EXT4_C2B(sbi, 
atomic64_read(&sbi->s_resv_clusters)); +++ bfree = percpu_counter_sum_positive(&sbi->s_freeclusters_counter) - +++ percpu_counter_sum_positive(&sbi->s_dirtyclusters_counter); +++ +++ bfree = EXT4_C2B(sbi, max_t(s64, bfree, 0)); +++ return bfree - (ext4_r_blocks_count(es) + resv_blocks); +++} +++ ++ static noinline_for_stack int ++ ext4_mb_regular_allocator(struct ext4_allocation_context *ac) ++ { ++@@ -2291,6 +2305,7 @@ ++ struct ext4_sb_info *sbi; ++ struct super_block *sb; ++ struct ext4_buddy e4b; +++ ext4_fsblk_t avail_blocks; ++ int lost; ++ ++ sb = ac->ac_sb; ++@@ -2344,6 +2359,21 @@ ++ ++ /* Let's just scan groups to find more-less suitable blocks */ ++ cr = ac->ac_2order ? 0 : 1; +++ +++ /* Choose what loop to pass based on disk fullness */ +++ avail_blocks = available_blocks_count(sbi) ; +++ +++ if (avail_blocks < sbi->s_mb_c3_blocks) { +++ cr = 3; +++ atomic64_inc(&sbi->s_bal_cX_skipped[2]); +++ } else if(avail_blocks < sbi->s_mb_c2_blocks) { +++ cr = 2; +++ atomic64_inc(&sbi->s_bal_cX_skipped[1]); +++ } else if(avail_blocks < sbi->s_mb_c1_blocks) { +++ cr = 1; +++ atomic64_inc(&sbi->s_bal_cX_skipped[0]); +++ } +++ ++ /* ++ * cr == 0 try to get exact allocation, ++ * cr == 3 try to get anything ++@@ -2431,6 +2461,9 @@ ++ if (ac->ac_status != AC_STATUS_CONTINUE) ++ break; ++ } +++ /* Processed all groups and haven't found blocks */ +++ if (i == ngroups) +++ atomic64_inc(&sbi->s_bal_cX_failed[cr]); ++ } ++ ++ if (ac->ac_b_ex.fe_len > 0 && ac->ac_status != AC_STATUS_FOUND && ++@@ -2719,6 +2752,92 @@ ++ .proc_write = ext4_mb_last_group_write, ++ }; ++ +++static int mb_seq_alloc_show(struct seq_file *seq, void *v) +++{ +++ struct super_block *sb = seq->private; +++ struct ext4_sb_info *sbi = EXT4_SB(sb); +++ +++ seq_printf(seq, "mballoc:\n"); +++ seq_printf(seq, "\tblocks: %u\n", atomic_read(&sbi->s_bal_allocated)); +++ seq_printf(seq, "\treqs: %u\n", atomic_read(&sbi->s_bal_reqs)); +++ seq_printf(seq, "\tsuccess: %u\n", atomic_read(&sbi->s_bal_success)); +++ 
+++ seq_printf(seq, "\textents_scanned: %u\n", +++ atomic_read(&sbi->s_bal_ex_scanned)); +++ seq_printf(seq, "\t\tgoal_hits: %u\n", atomic_read(&sbi->s_bal_goals)); +++ seq_printf(seq, "\t\t2^n_hits: %u\n", atomic_read(&sbi->s_bal_2orders)); +++ seq_printf(seq, "\t\tbreaks: %u\n", atomic_read(&sbi->s_bal_breaks)); +++ seq_printf(seq, "\t\tlost: %u\n", atomic_read(&sbi->s_mb_lost_chunks)); +++ +++ seq_printf(seq, "\tuseless_c1_loops: %llu\n", +++ atomic64_read(&sbi->s_bal_cX_failed[0])); +++ seq_printf(seq, "\tuseless_c2_loops: %llu\n", +++ atomic64_read(&sbi->s_bal_cX_failed[1])); +++ seq_printf(seq, "\tuseless_c3_loops: %llu\n", +++ atomic64_read(&sbi->s_bal_cX_failed[2])); +++ seq_printf(seq, "\tskipped_c1_loops: %llu\n", +++ atomic64_read(&sbi->s_bal_cX_skipped[0])); +++ seq_printf(seq, "\tskipped_c2_loops: %llu\n", +++ atomic64_read(&sbi->s_bal_cX_skipped[1])); +++ seq_printf(seq, "\tskipped_c3_loops: %llu\n", +++ atomic64_read(&sbi->s_bal_cX_skipped[2])); +++ seq_printf(seq, "\tbuddies_generated: %lu\n", +++ sbi->s_mb_buddies_generated); +++ seq_printf(seq, "\tbuddies_time_used: %llu\n", sbi->s_mb_generation_time); +++ seq_printf(seq, "\tpreallocated: %u\n", +++ atomic_read(&sbi->s_mb_preallocated)); +++ seq_printf(seq, "\tdiscarded: %u\n", +++ atomic_read(&sbi->s_mb_discarded)); +++ return 0; +++} +++ +++static ssize_t mb_seq_alloc_write(struct file *file, +++ const char __user *buf, +++ size_t cnt, loff_t *pos) +++{ +++ struct ext4_sb_info *sbi = EXT4_SB(PDE_DATA(file_inode(file))); +++ +++ atomic_set(&sbi->s_bal_allocated, 0), +++ atomic_set(&sbi->s_bal_reqs, 0), +++ atomic_set(&sbi->s_bal_success, 0); +++ +++ atomic_set(&sbi->s_bal_ex_scanned, 0), +++ atomic_set(&sbi->s_bal_goals, 0), +++ atomic_set(&sbi->s_bal_2orders, 0), +++ atomic_set(&sbi->s_bal_breaks, 0), +++ atomic_set(&sbi->s_mb_lost_chunks, 0); +++ +++ atomic64_set(&sbi->s_bal_cX_failed[0], 0), +++ atomic64_set(&sbi->s_bal_cX_failed[1], 0), +++ atomic64_set(&sbi->s_bal_cX_failed[2], 0); +++ +++ 
atomic64_set(&sbi->s_bal_cX_skipped[0], 0), +++ atomic64_set(&sbi->s_bal_cX_skipped[1], 0), +++ atomic64_set(&sbi->s_bal_cX_skipped[2], 0); +++ +++ +++ sbi->s_mb_buddies_generated = 0; +++ sbi->s_mb_generation_time = 0; +++ +++ atomic_set(&sbi->s_mb_preallocated, 0), +++ atomic_set(&sbi->s_mb_discarded, 0); +++ +++ return cnt; +++} +++ +++static int mb_seq_alloc_open(struct inode *inode, struct file *file) +++{ +++ return single_open(file, mb_seq_alloc_show, PDE_DATA(inode)); +++} +++ +++const struct proc_ops ext4_mb_seq_alloc_fops = { +++ .proc_open = mb_seq_alloc_open, +++ .proc_read = seq_read, +++ .proc_lseek = seq_lseek, +++ .proc_release = single_release, +++ .proc_write = mb_seq_alloc_write, +++}; +++ ++ int ext4_mb_seq_last_start_seq_show(struct seq_file *m, void *v) ++ { ++ struct ext4_sb_info *sbi = EXT4_SB(m->private); ++@@ -2973,6 +3092,7 @@ ++ return 0; ++ } ++ +++#define THRESHOLD_BLOCKS(ts) (ext4_blocks_count(sbi->s_es) / 100 * ts) ++ int ext4_mb_init(struct super_block *sb) ++ { ++ struct ext4_sb_info *sbi = EXT4_SB(sb); ++@@ -3027,6 +3147,9 @@ ++ sbi->s_mb_stats = MB_DEFAULT_STATS; ++ sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS; ++ sbi->s_mb_max_inode_prealloc = MB_DEFAULT_MAX_INODE_PREALLOC; +++ sbi->s_mb_c1_blocks = THRESHOLD_BLOCKS(MB_DEFAULT_C1_THRESHOLD); +++ sbi->s_mb_c2_blocks = THRESHOLD_BLOCKS(MB_DEFAULT_C2_THRESHOLD); +++ sbi->s_mb_c3_blocks = THRESHOLD_BLOCKS(MB_DEFAULT_C3_THRESHOLD); ++ /* ++ * The default group preallocation is 512, which for 4k block ++ * sizes translates to 2 megabytes. 
However for bigalloc file ++@@ -3166,6 +3289,16 @@ ++ atomic_read(&sbi->s_bal_reqs), ++ atomic_read(&sbi->s_bal_success)); ++ ext4_msg(sb, KERN_INFO, +++ "mballoc: (%llu, %llu, %llu) useless c(0,1,2) loops", +++ atomic64_read(&sbi->s_bal_cX_failed[0]), +++ atomic64_read(&sbi->s_bal_cX_failed[1]), +++ atomic64_read(&sbi->s_bal_cX_failed[2])); +++ ext4_msg(sb, KERN_INFO, +++ "mballoc: (%llu, %llu, %llu) skipped c(0,1,2) loops", +++ atomic64_read(&sbi->s_bal_cX_skipped[0]), +++ atomic64_read(&sbi->s_bal_cX_skipped[1]), +++ atomic64_read(&sbi->s_bal_cX_skipped[2])); +++ ext4_msg(sb, KERN_INFO, ++ "mballoc: %u extents scanned, %u goal hits, " ++ "%u 2^N hits, %u breaks, %u lost", ++ atomic_read(&sbi->s_bal_ex_scanned), ++diff -ur a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h ++--- a/fs/ext4/mballoc.h 2021-12-02 15:38:36.772209242 -0700 +++++ b/fs/ext4/mballoc.h 2021-12-02 15:41:51.943182397 -0700 ++@@ -68,6 +68,9 @@ ++ * for which requests use 2^N search using buddies ++ */ ++ #define MB_DEFAULT_ORDER2_REQS 8 +++#define MB_DEFAULT_C1_THRESHOLD 25 +++#define MB_DEFAULT_C2_THRESHOLD 15 +++#define MB_DEFAULT_C3_THRESHOLD 5 ++ ++ /* ++ * default group prealloc size 512 blocks ++diff -ur a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c ++--- a/fs/ext4/sysfs.c 2021-12-02 15:38:37.044207688 -0700 +++++ b/fs/ext4/sysfs.c 2021-12-02 15:43:17.050780832 -0700 ++@@ -21,6 +21,9 @@ ++ typedef enum { ++ attr_noop, ++ attr_delayed_allocation_blocks, +++ attr_mb_c1_threshold, +++ attr_mb_c2_threshold, +++ attr_mb_c3_threshold, ++ attr_session_write_kbytes, ++ attr_lifetime_write_kbytes, ++ attr_reserved_clusters, ++@@ -135,6 +138,32 @@ ++ task_pid_vnr(sbi->s_journal->j_task)); ++ } ++ +++#define THRESHOLD_PERCENT(ts) (ts * 100 / ext4_blocks_count(sbi->s_es)) +++ +++static int save_threshold_percent(struct ext4_sb_info *sbi, const char *buf, +++ ext4_fsblk_t *blocks) +++{ +++ unsigned long long val; +++ +++ int ret; +++ +++ ret = kstrtoull(skip_spaces(buf), 0, &val); +++ if (ret || val > 100) +++ return 
-EINVAL; +++ +++ *blocks = val * ext4_blocks_count(sbi->s_es) / 100; +++ return 0; +++} +++ +++static ssize_t mb_threshold_store(struct ext4_sb_info *sbi, +++ const char *buf, size_t count, +++ ext4_fsblk_t *blocks) +++{ +++ int ret = save_threshold_percent(sbi, buf, blocks); +++ +++ return ret ?: count; +++} +++ ++ #define EXT4_ATTR(_name,_mode,_id) \ ++ static struct ext4_attr ext4_attr_##_name = { \ ++ .attr = {.name = __stringify(_name), .mode = _mode }, \ ++@@ -204,6 +233,9 @@ ++ EXT4_ATTR_FUNC(lifetime_write_kbytes, 0444); ++ EXT4_ATTR_FUNC(reserved_clusters, 0644); ++ EXT4_ATTR_FUNC(sra_exceeded_retry_limit, 0444); +++EXT4_ATTR_FUNC(mb_c1_threshold, 0644); +++EXT4_ATTR_FUNC(mb_c2_threshold, 0644); +++EXT4_ATTR_FUNC(mb_c3_threshold, 0644); ++ ++ EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, inode_readahead, ++ ext4_sb_info, s_inode_readahead_blks); ++@@ -258,6 +290,9 @@ ++ ATTR_LIST(lifetime_write_kbytes), ++ ATTR_LIST(reserved_clusters), ++ ATTR_LIST(sra_exceeded_retry_limit), +++ ATTR_LIST(mb_c1_threshold), +++ ATTR_LIST(mb_c2_threshold), +++ ATTR_LIST(mb_c3_threshold), ++ ATTR_LIST(inode_readahead_blks), ++ ATTR_LIST(inode_goal), ++ ATTR_LIST(max_dir_size), ++@@ -377,6 +412,15 @@ ++ return snprintf(buf, PAGE_SIZE, "%llu\n", ++ (s64) EXT4_C2B(sbi, ++ percpu_counter_sum(&sbi->s_dirtyclusters_counter))); +++ case attr_mb_c1_threshold: +++ return scnprintf(buf, PAGE_SIZE, "%llu\n", +++ THRESHOLD_PERCENT(sbi->s_mb_c1_blocks)); +++ case attr_mb_c2_threshold: +++ return scnprintf(buf, PAGE_SIZE, "%llu\n", +++ THRESHOLD_PERCENT(sbi->s_mb_c2_blocks)); +++ case attr_mb_c3_threshold: +++ return scnprintf(buf, PAGE_SIZE, "%llu\n", +++ THRESHOLD_PERCENT(sbi->s_mb_c3_blocks)); ++ case attr_session_write_kbytes: ++ return session_write_kbytes_show(sbi, buf); ++ case attr_lifetime_write_kbytes: ++@@ -482,6 +526,12 @@ ++ return inode_readahead_blks_store(sbi, buf, len); ++ case attr_trigger_test_error: ++ return trigger_test_error(sbi, buf, len); +++ case 
attr_mb_c1_threshold: +++ return mb_threshold_store(sbi, buf, len, &sbi->s_mb_c1_blocks); +++ case attr_mb_c2_threshold: +++ return mb_threshold_store(sbi, buf, len, &sbi->s_mb_c2_blocks); +++ case attr_mb_c3_threshold: +++ return mb_threshold_store(sbi, buf, len, &sbi->s_mb_c3_blocks); ++ } ++ return 0; ++ } ++@@ -546,6 +596,8 @@ ++ &ext4_seq_mb_last_group_fops, sb); ++ proc_create_single_data("mb_last_start", S_IRUGO, sbi->s_proc, ++ ext4_mb_seq_last_start_seq_show, sb); +++ proc_create_data("mb_alloc_stats", S_IFREG | S_IRUGO | S_IWUSR, +++ sbi->s_proc, &ext4_mb_seq_alloc_fops, sb); ++ } ++ return 0; ++ } +diff --git a/ldiskfs/kernel_patches/patches/oe2203/ext4-xattr-disable-credits-check.patch b/ldiskfs/kernel_patches/patches/oe2203/ext4-xattr-disable-credits-check.patch +new file mode 100644 +index 0000000000..e281a7ca91 +--- /dev/null ++++ b/ldiskfs/kernel_patches/patches/oe2203/ext4-xattr-disable-credits-check.patch +@@ -0,0 +1,24 @@ ++Subject: [PATCH] ext4-xattr-disable-credits-check ++ ++--- ++ fs/ext4/xattr.c | 4 ---- ++ 1 file changed, 4 deletions(-) ++ ++diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c ++index 850b9cf..f29de20 100644 ++--- a/fs/ext4/xattr.c +++++ b/fs/ext4/xattr.c ++@@ -2387,10 +2387,6 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, ++ flags & XATTR_CREATE); ++ brelse(bh); ++ ++- if (jbd2_handle_buffer_credits(handle) < credits) { ++- error = -ENOSPC; ++- goto cleanup; ++- } ++ WARN_ON_ONCE(!(current->flags & PF_MEMALLOC_NOFS)); ++ } ++ ++-- ++2.27.0 ++ +diff --git a/ldiskfs/kernel_patches/series/ldiskfs-5.10.0-oe2203.series b/ldiskfs/kernel_patches/series/ldiskfs-5.10.0-oe2203.series +new file mode 100644 +index 0000000000..65d1f3a312 +--- /dev/null ++++ b/ldiskfs/kernel_patches/series/ldiskfs-5.10.0-oe2203.series +@@ -0,0 +1,33 @@ ++oe2203/ext4-inode-version.patch ++linux-5.4/ext4-lookup-dotdot.patch ++suse15/ext4-print-inum-in-htree-warning.patch ++linux-5.8/ext4-prealloc.patch 
++ubuntu18/ext4-osd-iop-common.patch ++oe2203/ext4-misc.patch ++linux-5.8/ext4-mballoc-extra-checks.patch ++linux-5.4/ext4-hash-indexed-dir-dotdot-update.patch ++linux-5.8/ext4-kill-dx-root.patch ++linux-5.8/ext4-mballoc-pa-free-mismatch.patch ++linux-5.10/ext4-data-in-dirent.patch ++rhel8/ext4-nocmtime.patch ++base/ext4-htree-lock.patch ++oe2203/ext4-pdirop.patch ++linux-5.8/ext4-max-dir-size.patch ++linux-5.8/ext4-corrupted-inode-block-bitmaps-handling-patches.patch ++linux-5.10/ext4-give-warning-with-dir-htree-growing.patch ++ubuntu18/ext4-jcb-optimization.patch ++linux-5.10/ext4-attach-jinode-in-writepages.patch ++rhel8/ext4-dont-check-before-replay.patch ++rhel7.6/ext4-use-GFP_NOFS-in-ext4_inode_attach_jinode.patch ++rhel7.6/ext4-export-orphan-add.patch ++linux-5.8/ext4-export-mb-stream-allocator-variables.patch ++ubuntu19/ext4-iget-with-flags.patch ++linux-5.4/export-ext4fs-dirhash-helper.patch ++oe2203/ext4-simple-blockalloc.patch ++oe2203/ext4-xattr-disable-credits-check.patch ++linux-5.8/ext4-no-max-dir-size-limit-for-iam-objects.patch ++rhel8/ext4-ialloc-uid-gid-and-pass-owner-down.patch ++base/ext4-projid-xattrs.patch ++linux-5.8/ext4-enc-flag.patch ++oe2203/ext4-delayed-iput.patch ++rhel8/ext4-old_ea_inodes_handling_fix.patch +diff --git a/lustre/ChangeLog b/lustre/ChangeLog +index 192662875e..9401e95187 100644 +--- a/lustre/ChangeLog ++++ b/lustre/ChangeLog +@@ -100,6 +100,7 @@ TBD Whamcloud + vanilla linux 5.4.0 (ZFS + ldiskfs) + vanilla linux 5.4.21 (ZFS + ldiskfs) + vanilla linux 5.4.136 (ZFS + ldiskfs) ++ 5.10.0-60.94.0.118.oe2203 (openEuler 22.03 LTS) + * ldiskfs needs an ldiskfs patch series for that kernel, ZFS does not + * Client primary kernels built and tested during release cycle: + 5.14.0-284.11.1.el9 (RHEL9.2) +@@ -206,7 +207,7 @@ TBD Whamcloud + 5.8.0-53 (Ubuntu 20.04.2 HWE) + 5.11.0-31 (Ubuntu 20.04.3 HWE) + 5.11.0 (vanilla kernel.org) +- 5.10.0-60.56.0.84.oe2203 (openEuler 22.03 LTS) ++ 5.10.0 (openEuler 22.03 LTS) + * Recommended 
e2fsprogs version: 1.46.5.wc1 or newer + * Recommended ZFS version: 2.1.5 + * NFS export disabled when stack size < 8192 (32-bit Lustre clients), +diff --git a/lustre/kernel_patches/targets/5.10-oe2203.target.in b/lustre/kernel_patches/targets/5.10-oe2203.target.in +index 5ba89d235f..69c7e332d0 100644 +--- a/lustre/kernel_patches/targets/5.10-oe2203.target.in ++++ b/lustre/kernel_patches/targets/5.10-oe2203.target.in +@@ -1,5 +1,5 @@ + lnxmaj="5.10.0" +-lnxrel="60.56.0.84.oe2203" ++lnxrel="60.94.0.118.oe2203" + + KERNEL_SRPM=kernel-${lnxmaj}-${lnxrel}.src.rpm + SERIES="" +-- +2.33.0 + diff --git a/0040-LU-16824-ldiskfs-add-support-for-openEuler-22.03-LTS.patch b/0040-LU-16824-ldiskfs-add-support-for-openEuler-22.03-LTS.patch new file mode 100644 index 0000000000000000000000000000000000000000..2e51ba4e5d2990eaae5679505d8cb82b612534d4 --- /dev/null +++ b/0040-LU-16824-ldiskfs-add-support-for-openEuler-22.03-LTS.patch @@ -0,0 +1,1032 @@ +From 1d3b7b31f3f0f4c9040e16ade9b333e32c94bbae Mon Sep 17 00:00:00 2001 +From: Xinliang Liu +Date: Wed, 10 May 2023 10:08:38 +0000 +Subject: [PATCH 40/61] LU-16824 ldiskfs: add support for openEuler 22.03 LTS + SP1 + +Add openEuler 22.03 LTS SP1 config target file. +Fix tiny conflicts for patch ext4-delayed-iput.patch and +ext4-data-in-dirent.patch. +Add missing patch ext4-encdata.patch. +Add build required pkg kernel-debugsource for ldiskfs build. 
+ +Change-Id: I68314c9df17ce991a5e46f2ed4746ce1703b1587 +Test-Parameters: trivial +Signed-off-by: Xinliang Liu +--- + config/lustre-build-ldiskfs.m4 | 1 + + contrib/lbuild/funcs.sh | 10 + + contrib/lbuild/lbuild | 3 + + contrib/lbuild/lbuild-oe2203 | 12 - + contrib/lbuild/lbuild-oe2203sp1 | 40 + + .../oe2203sp1/ext4-data-in-dirent.patch | 764 ++++++++++++++++++ + .../series/ldiskfs-5.10.0-oe2203sp1.series | 33 + + lustre.spec.in | 3 + + lustre/ChangeLog | 1 + + lustre/autoconf/lustre-core.m4 | 1 + + .../targets/5.10-oe2203sp1.target.in | 21 + + 11 files changed, 877 insertions(+), 12 deletions(-) + create mode 100644 contrib/lbuild/lbuild-oe2203sp1 + create mode 100644 ldiskfs/kernel_patches/patches/oe2203sp1/ext4-data-in-dirent.patch + create mode 100644 ldiskfs/kernel_patches/series/ldiskfs-5.10.0-oe2203sp1.series + create mode 100644 lustre/kernel_patches/targets/5.10-oe2203sp1.target.in + +diff --git a/config/lustre-build-ldiskfs.m4 b/config/lustre-build-ldiskfs.m4 +index 8bc334b951..f0e987b010 100644 +--- a/config/lustre-build-ldiskfs.m4 ++++ b/config/lustre-build-ldiskfs.m4 +@@ -126,6 +126,7 @@ AS_IF([test x$RHEL_KERNEL = xyes], [ + ], [test x$OPENEULER_KERNEL = xyes], [ + case $OPENEULER_VERSION_NO in + 2203.0) LDISKFS_SERIES="5.10.0-oe2203.series" ;; ++ 2203.1) LDISKFS_SERIES="5.10.0-oe2203sp1.series" ;; + esac + ]) + ]) +diff --git a/contrib/lbuild/funcs.sh b/contrib/lbuild/funcs.sh +index 78e2af9468..8db9532b67 100644 +--- a/contrib/lbuild/funcs.sh ++++ b/contrib/lbuild/funcs.sh +@@ -156,8 +156,17 @@ autodetect_distro() { + ;; + "openEuler") + name="oe" ++ # For LTS SP release the codename is 'LTS-SPx' e.g. 'LTS-SP1' ++ # otherwise the codename is 'n/a'. 
++ lts_sp=$(lsb_release -s -c) + # Change from YY.MM to YYMM, let DISTROMAJ contain MM part + version=${version/./} ++ # Append LTS SP ++ if [[ "$lts_sp" != "n/a" ]]; then ++ lts_sp=${lts_sp##*-} ++ lts_sp=${lts_sp,,} ++ version="${version}${lts_sp}" ++ fi + ;; + *) + fatal 1 "I don't know what distro name $name and version $version is.\nEither update autodetect_distro() or use the --distro argument." +@@ -213,6 +222,7 @@ autodetect_target() { + sles15.4) target="$(uname -r | cut -d . -f 1,2)-sles15sp4";; + fc18) target="3.x-fc18";; + oe2203) target="5.10-oe2203";; ++ oe2203sp1) target="5.10-oe2203sp1";; + *) fatal 1 "I don't know what distro $distro is.\nEither update autodetect_target() or use the --target argument.";; + esac + +diff --git a/contrib/lbuild/lbuild b/contrib/lbuild/lbuild +index 0ad235afeb..1170c4ffb3 100755 +--- a/contrib/lbuild/lbuild ++++ b/contrib/lbuild/lbuild +@@ -332,6 +332,9 @@ check_options() { + 5.10-oe2203) + CANONICAL_TARGET="oe2203" + ;; ++ 5.10-oe2203sp1) ++ CANONICAL_TARGET="oe2203sp1" ++ ;; + esac + + local timestampnodig=$(echo $TIMESTAMP | sed -e s/[0-9]*//g) +diff --git a/contrib/lbuild/lbuild-oe2203 b/contrib/lbuild/lbuild-oe2203 +index d49a1e67f8..1f4dc54947 100644 +--- a/contrib/lbuild/lbuild-oe2203 ++++ b/contrib/lbuild/lbuild-oe2203 +@@ -37,15 +37,3 @@ kernel_debuginfo_location() { + + echo "$base_os/update/$TARGET_ARCH/Packages/" + } +- +-cleanup_rpmmacros() { +- sed -i "/^%kernel_module_package/,/^)}$/d" $RMAC +-} +- +-apply_kmod_requires_conflicts() { +- if $PATCHLESS; then +- # don't allow the patched kernel to be considered as +- # a valid kernel for the patchless client +- echo "Conflicts: kernel-lustre" >> rpm/kmp-lustre.preamble +- fi +-} +diff --git a/contrib/lbuild/lbuild-oe2203sp1 b/contrib/lbuild/lbuild-oe2203sp1 +new file mode 100644 +index 0000000000..b115a347fc +--- /dev/null ++++ b/contrib/lbuild/lbuild-oe2203sp1 +@@ -0,0 +1,40 @@ ++source ${LBUILD_DIR}/lbuild-rhel ++ ++# increment this if you have made a 
change that should force a new kernel ++# to build built ++BUILD_GEN+=".0" ++ ++SPEC_NAME="kernel.spec" ++DEVEL_PATH_ARCH_DELIMETER="." ++USE_KABI=false ++RPM_HELPERS_DIR="/usr/lib/rpm/openEuler" ++# Pkg which contains ext4 source code ++KERNEL_DEBUGINFO="kernel-debugsource-${lnxmaj}-${lnxrel}.${TARGET_ARCH}.rpm" ++DISTRO_REPO_MIRROR=${DISTRO_REPO_MIRROR:-"https://repo.openeuler.org"} ++ ++# force local definition of %dist into ~/.rpmmacros ++# to avoid verbose extended strings like ".el9.centos" ++# in kernel version and rpm names ++# ++RMAC=$HOME/.rpmmacros ++grep '^%dist' $RMAC &> /dev/null || echo '%dist .oe2203sp1' >> $RMAC ++ ++unpack_linux_devel_rpm-oe2203sp1() { ++ local callers_rpm="$1" ++ ++ unpack_linux_devel_rpm-rhel "$callers_rpm" ++} ++ ++find_linux_rpm-oe2203sp1() { ++ local prefix="$1" ++ local wanted_kernel="$2" ++ local pathtorpms=${3:-"$KERNELRPMSBASE/$lnxmaj/$DISTROMAJ/$TARGET_ARCH"} ++ ++ find_linux_rpm-rhel "$prefix" "$wanted_kernel" "$pathtorpms" ++} ++ ++kernel_debuginfo_location() { ++ local base_os="$DISTRO_REPO_MIRROR/openEuler-22.03-LTS-SP1" ++ ++ echo "$base_os/update/$TARGET_ARCH/Packages/" ++} +diff --git a/ldiskfs/kernel_patches/patches/oe2203sp1/ext4-data-in-dirent.patch b/ldiskfs/kernel_patches/patches/oe2203sp1/ext4-data-in-dirent.patch +new file mode 100644 +index 0000000000..9512f94634 +--- /dev/null ++++ b/ldiskfs/kernel_patches/patches/oe2203sp1/ext4-data-in-dirent.patch +@@ -0,0 +1,764 @@ ++From ef3b0235735794064352d9b053802b368ecdfcc9 Mon Sep 17 00:00:00 2001 ++From: Xinliang Liu ++Date: Thu, 11 May 2023 09:57:05 +0000 ++Subject: [PATCH] ext4 data in dirent ++ ++this patch implements feature which allows ext4 fs users (e.g. Lustre) ++to store data in ext4 dirent. ++data is stored in ext4 dirent after file-name, this space is accounted ++in de->rec_len. flag EXT4_DIRENT_LUFID added to d_type if extra data ++is present. ++ ++make use of dentry->d_fsdata to pass fid to ext4. 
so no ++changes in ext4_add_entry() interface required. ++ ++Signed-off-by: Xinliang Liu ++--- ++ fs/ext4/dir.c | 13 ++- ++ fs/ext4/ext4.h | 100 +++++++++++++++++++-- ++ fs/ext4/fast_commit.c | 2 +- ++ fs/ext4/inline.c | 8 +- ++ fs/ext4/namei.c | 201 +++++++++++++++++++++++++++++++++--------- ++ fs/ext4/super.c | 4 +- ++ 6 files changed, 270 insertions(+), 58 deletions(-) ++ ++diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c ++index 70a0f5e..ff7d8c3 100644 ++--- a/fs/ext4/dir.c +++++ b/fs/ext4/dir.c ++@@ -78,7 +78,7 @@ int __ext4_check_dir_entry(const char *function, unsigned int line, ++ error_msg = "rec_len is smaller than minimal"; ++ else if (unlikely(rlen % 4 != 0)) ++ error_msg = "rec_len % 4 != 0"; ++- else if (unlikely(rlen < EXT4_DIR_REC_LEN(de->name_len))) +++ else if (unlikely(rlen < EXT4_DIR_ENTRY_LEN(de))) ++ error_msg = "rec_len is too small for name_len"; ++ else if (unlikely(next_offset > size)) ++ error_msg = "directory entry overrun"; ++@@ -226,7 +226,7 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx) ++ * failure will be detected in the ++ * dirent test below. 
*/ ++ if (ext4_rec_len_from_disk(de->rec_len, ++- sb->s_blocksize) < EXT4_DIR_REC_LEN(1)) +++ sb->s_blocksize) < EXT4_DIR_REC_LEN(1)) ++ break; ++ i += ext4_rec_len_from_disk(de->rec_len, ++ sb->s_blocksize); ++@@ -449,12 +449,17 @@ int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, ++ struct fname *fname, *new_fn; ++ struct dir_private_info *info; ++ int len; +++ int extra_data = 0; ++ ++ info = dir_file->private_data; ++ p = &info->root.rb_node; ++ ++ /* Create and allocate the fname structure */ ++- len = sizeof(struct fname) + ent_name->len + 1; +++ if (dirent->file_type & EXT4_DIRENT_LUFID) +++ extra_data = ext4_get_dirent_data_len(dirent); +++ +++ len = sizeof(struct fname) + ent_name->len + extra_data + 1; +++ ++ new_fn = kzalloc(len, GFP_KERNEL); ++ if (!new_fn) ++ return -ENOMEM; ++@@ -463,7 +468,7 @@ int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, ++ new_fn->inode = le32_to_cpu(dirent->inode); ++ new_fn->name_len = ent_name->len; ++ new_fn->file_type = dirent->file_type; ++- memcpy(new_fn->name, ent_name->name, ent_name->len); +++ memcpy(new_fn->name, ent_name->name, ent_name->len + extra_data); ++ ++ while (*p) { ++ parent = *p; ++diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h ++index 143ce00..98786d8 100644 ++--- a/fs/ext4/ext4.h +++++ b/fs/ext4/ext4.h ++@@ -1167,6 +1167,7 @@ struct ext4_inode_info { ++ __u32 i_csum_seed; ++ ++ kprojid_t i_projid; +++ void *i_dirdata; ++ ++ /* Protect concurrent add cluster delayed block and remove block */ ++ struct mutex i_clu_lock; ++@@ -1191,6 +1192,7 @@ struct ext4_inode_info { ++ * Mount flags set via mount options or defaults ++ */ ++ #define EXT4_MOUNT_NO_MBCACHE 0x00001 /* Do not use mbcache */ +++#define EXT4_MOUNT_DIRDATA 0x00002 /* Data in directory entries */ ++ #define EXT4_MOUNT_GRPID 0x00004 /* Create files with directory's group */ ++ #define EXT4_MOUNT_DEBUG 0x00008 /* Some debugging messages */ ++ #define EXT4_MOUNT_ERRORS_CONT 0x00010 /* Continue on errors */ ++@@ -2086,6 
+2088,7 @@ EXT4_FEATURE_INCOMPAT_FUNCS(casefold, CASEFOLD) ++ EXT4_FEATURE_INCOMPAT_FLEX_BG| \ ++ EXT4_FEATURE_INCOMPAT_EA_INODE| \ ++ EXT4_FEATURE_INCOMPAT_MMP | \ +++ EXT4_FEATURE_INCOMPAT_DIRDATA| \ ++ EXT4_FEATURE_INCOMPAT_INLINE_DATA | \ ++ EXT4_FEATURE_INCOMPAT_ENCRYPT | \ ++ EXT4_FEATURE_INCOMPAT_CASEFOLD | \ ++@@ -2268,6 +2271,43 @@ struct ext4_dir_entry_tail { ++ #define EXT4_FT_SYMLINK 7 ++ ++ #define EXT4_FT_MAX 8 +++#define EXT4_FT_MASK 0xf +++ +++#if EXT4_FT_MAX > EXT4_FT_MASK +++#error "conflicting EXT4_FT_MAX and EXT4_FT_MASK" +++#endif +++ +++/* +++ * d_type has 4 unused bits, so it can hold four types data. these different +++ * type of data (e.g. lustre data, high 32 bits of 64-bit inode number) can be +++ * stored, in flag order, after file-name in ext4 dirent. +++*/ +++/* +++ * this flag is added to d_type if ext4 dirent has extra data after +++ * filename. this data length is variable and length is stored in first byte +++ * of data. data start after filename NUL byte. +++ * This is used by Lustre FS. 
+++ */ +++#define EXT4_DIRENT_LUFID 0x10 +++ +++#define EXT4_LUFID_MAGIC 0xAD200907UL +++struct ext4_dentry_param { +++ __u32 edp_magic; /* EXT4_LUFID_MAGIC */ +++ char edp_len; /* size of edp_data in bytes */ +++ char edp_data[0]; /* packed array of data */ +++} __packed; +++ +++static inline unsigned char *ext4_dentry_get_data(struct super_block *sb, +++ struct ext4_dentry_param *p) +++ +++{ +++ if (!ext4_has_feature_dirdata(sb)) +++ return NULL; +++ if (p && p->edp_magic == EXT4_LUFID_MAGIC) +++ return &p->edp_len; +++ else +++ return NULL; +++} ++ ++ #define EXT4_FT_DIR_CSUM 0xDE ++ ++@@ -2278,8 +2318,16 @@ struct ext4_dir_entry_tail { ++ */ ++ #define EXT4_DIR_PAD 4 ++ #define EXT4_DIR_ROUND (EXT4_DIR_PAD - 1) ++-#define EXT4_DIR_REC_LEN(name_len) (((name_len) + 8 + EXT4_DIR_ROUND) & \ +++#define EXT4_DIR_REC_LEN_(name_len) (((name_len) + 8 + EXT4_DIR_ROUND) & \ ++ ~EXT4_DIR_ROUND) +++#define EXT4_DIR_ENTRY_LEN_(de) (EXT4_DIR_REC_LEN_((de)->name_len +\ +++ ext4_get_dirent_data_len(de))) +++/* ldiskfs */ +++#define EXT4_DIR_REC_LEN(name_len) EXT4_DIR_REC_LEN_((name_len)) +++#define EXT4_DIR_ENTRY_LEN(de) EXT4_DIR_ENTRY_LEN_((de)) +++/* lustre osd_handler compat */ +++#define __EXT4_DIR_REC_LEN(name_len) EXT4_DIR_REC_LEN_((name_len)) +++ ++ #define EXT4_MAX_REC_LEN ((1<<16)-1) ++ ++ /* ++@@ -2746,11 +2794,11 @@ extern int ext4_find_dest_de(struct inode *dir, struct inode *inode, ++ struct buffer_head *bh, ++ void *buf, int buf_size, ++ struct ext4_filename *fname, ++- struct ext4_dir_entry_2 **dest_de); +++ struct ext4_dir_entry_2 **dest_de, int *dlen); ++ void ext4_insert_dentry(struct inode *inode, ++ struct ext4_dir_entry_2 *de, ++ int buf_size, ++- struct ext4_filename *fname); +++ struct ext4_filename *fname, void *data); ++ static inline void ext4_update_dx_flag(struct inode *inode) ++ { ++ if (!ext4_has_feature_dir_index(inode->i_sb) && ++@@ -2766,10 +2814,17 @@ static const unsigned char ext4_filetype_table[] = { ++ ++ static inline unsigned char 
get_dtype(struct super_block *sb, int filetype) ++ { ++- if (!ext4_has_feature_filetype(sb) || filetype >= EXT4_FT_MAX) +++ int fl_index = filetype & EXT4_FT_MASK; +++ +++ if (!ext4_has_feature_filetype(sb) || fl_index >= EXT4_FT_MAX) ++ return DT_UNKNOWN; ++ ++- return ext4_filetype_table[filetype]; +++ if (!test_opt(sb, DIRDATA)) +++ return ext4_filetype_table[fl_index]; +++ +++ return (ext4_filetype_table[fl_index]) | +++ (filetype & EXT4_DIRENT_LUFID); +++ ++ } ++ extern int ext4_check_all_de(struct inode *dir, struct buffer_head *bh, ++ void *buf, int buf_size); ++@@ -2965,7 +3020,8 @@ extern int ext4_ind_migrate(struct inode *inode); ++ ++ /* namei.c */ ++ extern int ext4_init_new_dir(handle_t *handle, struct inode *dir, ++- struct inode *inode); +++ struct inode *inode, +++ const void *data1, const void *data2); ++ extern int ext4_dirblock_csum_verify(struct inode *inode, ++ struct buffer_head *bh); ++ extern int ext4_orphan_add(handle_t *, struct inode *); ++@@ -2976,6 +3032,8 @@ extern struct inode *ext4_create_inode(handle_t *handle, ++ extern int ext4_delete_entry(handle_t *handle, struct inode * dir, ++ struct ext4_dir_entry_2 *de_del, ++ struct buffer_head *bh); +++extern int ext4_add_dot_dotdot(handle_t *handle, struct inode *dir, +++ struct inode *inode, const void *, const void *); ++ extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, ++ __u32 start_minor_hash, __u32 *next_hash); ++ extern int ext4_search_dir(struct buffer_head *bh, ++@@ -3765,6 +3823,36 @@ static inline int ext4_buffer_uptodate(struct buffer_head *bh) ++ return buffer_uptodate(bh); ++ } ++ +++/* +++ * Compute the total directory entry data length. +++ * This includes the filename and an implicit NUL terminator (always present), +++ * and optional extensions. Each extension has a bit set in the high 4 bits of +++ * de->file_type, and the extension length is the first byte in each entry. 
+++ */ +++static inline int ext4_get_dirent_data_len(struct ext4_dir_entry_2 *de) +++{ +++ char *len = de->name + de->name_len + 1 /* NUL terminator */; +++ int dlen = 0; +++ __u8 extra_data_flags = (de->file_type & ~EXT4_FT_MASK) >> 4; +++ struct ext4_dir_entry_tail *t = (struct ext4_dir_entry_tail *)de; +++ +++ if (!t->det_reserved_zero1 && +++ le16_to_cpu(t->det_rec_len) == +++ sizeof(struct ext4_dir_entry_tail) && +++ !t->det_reserved_zero2 && +++ t->det_reserved_ft == EXT4_FT_DIR_CSUM) +++ return 0; +++ +++ while (extra_data_flags) { +++ if (extra_data_flags & 1) { +++ dlen += *len + (dlen == 0); +++ len += *len; +++ } +++ extra_data_flags >>= 1; +++ } +++ return dlen; +++} +++ ++ #endif /* __KERNEL__ */ ++ ++ #define EFSBADCRC EBADMSG /* Bad CRC detected */ ++diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c ++index 41dcf21..1023ace 100644 ++--- a/fs/ext4/fast_commit.c +++++ b/fs/ext4/fast_commit.c ++@@ -1547,7 +1547,7 @@ static int ext4_fc_replay_create(struct super_block *sb, struct ext4_fc_tl *tl, ++ jbd_debug(1, "Dir %d not found.", darg.ino); ++ goto out; ++ } ++- ret = ext4_init_new_dir(NULL, dir, inode); +++ ret = ext4_init_new_dir(NULL, dir, inode, NULL, NULL); ++ iput(dir); ++ if (ret) { ++ ret = 0; ++diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c ++index c2c688c..686d14a 100644 ++--- a/fs/ext4/inline.c +++++ b/fs/ext4/inline.c ++@@ -1033,7 +1033,7 @@ static int ext4_add_dirent_to_inline(handle_t *handle, ++ struct ext4_dir_entry_2 *de; ++ ++ err = ext4_find_dest_de(dir, inode, iloc->bh, inline_start, ++- inline_size, fname, &de); +++ inline_size, fname, &de, NULL); ++ if (err) ++ return err; ++ ++@@ -1041,7 +1041,7 @@ static int ext4_add_dirent_to_inline(handle_t *handle, ++ err = ext4_journal_get_write_access(handle, iloc->bh); ++ if (err) ++ return err; ++- ext4_insert_dentry(inode, de, inline_size, fname); +++ ext4_insert_dentry(inode, de, inline_size, fname, NULL); ++ ++ ext4_show_inline_dir(dir, iloc->bh, inline_start, inline_size); 
++ ++@@ -1398,7 +1398,7 @@ int ext4_inlinedir_to_tree(struct file *dir_file, ++ fake.name_len = 1; ++ strcpy(fake.name, "."); ++ fake.rec_len = ext4_rec_len_to_disk( ++- EXT4_DIR_REC_LEN(fake.name_len), +++ EXT4_DIR_ENTRY_LEN(&fake), ++ inline_size); ++ ext4_set_de_type(inode->i_sb, &fake, S_IFDIR); ++ de = &fake; ++@@ -1408,7 +1408,7 @@ int ext4_inlinedir_to_tree(struct file *dir_file, ++ fake.name_len = 2; ++ strcpy(fake.name, ".."); ++ fake.rec_len = ext4_rec_len_to_disk( ++- EXT4_DIR_REC_LEN(fake.name_len), +++ EXT4_DIR_ENTRY_LEN(&fake), ++ inline_size); ++ ext4_set_de_type(inode->i_sb, &fake, S_IFDIR); ++ de = &fake; ++diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c ++index 1537a76..24e1276 100644 ++--- a/fs/ext4/namei.c +++++ b/fs/ext4/namei.c ++@@ -295,7 +295,8 @@ static unsigned dx_get_count(struct dx_entry *entries); ++ static unsigned dx_get_limit(struct dx_entry *entries); ++ static void dx_set_count(struct dx_entry *entries, unsigned value); ++ static void dx_set_limit(struct dx_entry *entries, unsigned value); ++-static unsigned dx_root_limit(struct inode *dir, unsigned infosize); +++static inline unsigned dx_root_limit(struct inode *dir, +++ struct ext4_dir_entry_2 *dot_de, unsigned infosize); ++ static unsigned dx_node_limit(struct inode *dir); ++ static struct dx_frame *dx_probe(struct ext4_filename *fname, ++ struct inode *dir, ++@@ -439,22 +440,23 @@ static struct dx_countlimit *get_dx_countlimit(struct inode *inode, ++ { ++ struct ext4_dir_entry *dp; ++ struct dx_root_info *root; ++- int count_offset; +++ int count_offset, dot_rec_len, dotdot_rec_len; ++ ++ if (le16_to_cpu(dirent->rec_len) == EXT4_BLOCK_SIZE(inode->i_sb)) ++ count_offset = 8; ++- else if (le16_to_cpu(dirent->rec_len) == 12) { ++- dp = (struct ext4_dir_entry *)(((void *)dirent) + 12); +++ else { +++ dot_rec_len = le16_to_cpu(dirent->rec_len); +++ dp = (struct ext4_dir_entry *)(((void *)dirent) + dot_rec_len); ++ if (le16_to_cpu(dp->rec_len) != ++- EXT4_BLOCK_SIZE(inode->i_sb) - 12) 
+++ EXT4_BLOCK_SIZE(inode->i_sb) - dot_rec_len) ++ return NULL; ++- root = (struct dx_root_info *)(((void *)dp + 12)); +++ dotdot_rec_len = EXT4_DIR_ENTRY_LEN((struct ext4_dir_entry_2 *)dp); +++ root = (struct dx_root_info *)(((void *)dp + dotdot_rec_len)); ++ if (root->reserved_zero || ++ root->info_length != sizeof(struct dx_root_info)) ++ return NULL; ++- count_offset = 32; ++- } else ++- return NULL; +++ count_offset = 8 + dot_rec_len + dotdot_rec_len; +++ } ++ ++ if (offset) ++ *offset = count_offset; ++@@ -559,11 +561,12 @@ ext4_next_entry(struct ext4_dir_entry_2 *p, unsigned long blocksize) ++ */ ++ struct dx_root_info *dx_get_dx_info(struct ext4_dir_entry_2 *de) ++ { +++ BUG_ON(de->name_len != 1); ++ /* get dotdot first */ ++- de = (struct ext4_dir_entry_2 *)((char *)de + EXT4_DIR_REC_LEN(1)); +++ de = (struct ext4_dir_entry_2 *)((char *)de + EXT4_DIR_ENTRY_LEN(de)); ++ ++ /* dx root info is after dotdot entry */ ++- de = (struct ext4_dir_entry_2 *)((char *)de + EXT4_DIR_REC_LEN(2)); +++ de = (struct ext4_dir_entry_2 *)((char *)de + EXT4_DIR_ENTRY_LEN(de)); ++ ++ return (struct dx_root_info *)de; ++ } ++@@ -608,10 +611,16 @@ static inline void dx_set_limit(struct dx_entry *entries, unsigned value) ++ ((struct dx_countlimit *) entries)->limit = cpu_to_le16(value); ++ } ++ ++-static inline unsigned dx_root_limit(struct inode *dir, unsigned infosize) +++static inline unsigned dx_root_limit(struct inode *dir, +++ struct ext4_dir_entry_2 *dot_de, unsigned infosize) ++ { ++- unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(1) - ++- EXT4_DIR_REC_LEN(2) - infosize; +++ struct ext4_dir_entry_2 *dotdot_de; +++ unsigned entry_space; +++ +++ BUG_ON(dot_de->name_len != 1); +++ dotdot_de = ext4_next_entry(dot_de, dir->i_sb->s_blocksize); +++ entry_space = dir->i_sb->s_blocksize - EXT4_DIR_ENTRY_LEN(dot_de) - +++ EXT4_DIR_ENTRY_LEN(dotdot_de) - infosize; ++ ++ if (ext4_has_metadata_csum(dir->i_sb)) ++ entry_space -= sizeof(struct dx_tail); ++@@ -731,7 
+740,7 @@ static struct stats dx_show_leaf(struct inode *dir, ++ (unsigned) ((char *) de - base)); ++ #endif ++ } ++- space += EXT4_DIR_REC_LEN(de->name_len); +++ space += EXT4_DIR_ENTRY_LEN(de); ++ names++; ++ } ++ de = ext4_next_entry(de, size); ++@@ -840,11 +849,14 @@ dx_probe(struct ext4_filename *fname, struct inode *dir, ++ ++ entries = (struct dx_entry *)(((char *)info) + info->info_length); ++ ++- if (dx_get_limit(entries) != dx_root_limit(dir, ++- info->info_length)) { +++ if (dx_get_limit(entries) != +++ dx_root_limit(dir, (struct ext4_dir_entry_2 *)frame->bh->b_data, +++ info->info_length)) { ++ ext4_warning_inode(dir, "dx entry: limit %u != root limit %u", ++ dx_get_limit(entries), ++- dx_root_limit(dir, info->info_length)); +++ dx_root_limit(dir, +++ (struct ext4_dir_entry_2 *)frame->bh->b_data, +++ info->info_length)); ++ goto fail; ++ } ++ ++@@ -1851,7 +1863,7 @@ dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count, ++ while (count--) { ++ struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *) ++ (from + (map->offs<<2)); ++- rec_len = EXT4_DIR_REC_LEN(de->name_len); +++ rec_len = EXT4_DIR_ENTRY_LEN(de); ++ memcpy (to, de, rec_len); ++ ((struct ext4_dir_entry_2 *) to)->rec_len = ++ ext4_rec_len_to_disk(rec_len, blocksize); ++@@ -1882,7 +1894,7 @@ static struct ext4_dir_entry_2* dx_pack_dirents(char *base, unsigned blocksize) ++ while ((char*)de < base + blocksize) { ++ next = ext4_next_entry(de, blocksize); ++ if (de->inode && de->name_len) { ++- rec_len = EXT4_DIR_REC_LEN(de->name_len); +++ rec_len = EXT4_DIR_ENTRY_LEN(de); ++ if (de > to) ++ memmove(to, de, rec_len); ++ to->rec_len = ext4_rec_len_to_disk(rec_len, blocksize); ++@@ -2023,14 +2035,16 @@ int ext4_find_dest_de(struct inode *dir, struct inode *inode, ++ struct buffer_head *bh, ++ void *buf, int buf_size, ++ struct ext4_filename *fname, ++- struct ext4_dir_entry_2 **dest_de) +++ struct ext4_dir_entry_2 **dest_de, int *dlen) ++ { ++ struct ext4_dir_entry_2 *de; ++- 
unsigned short reclen = EXT4_DIR_REC_LEN(fname_len(fname)); +++ unsigned short reclen = EXT4_DIR_REC_LEN(fname_len(fname)) + +++ (dlen ? *dlen : 0); ++ int nlen, rlen; ++ unsigned int offset = 0; ++ char *top; ++ +++ dlen ? *dlen = 0 : 0; /* default set to 0 */ ++ de = (struct ext4_dir_entry_2 *)buf; ++ top = buf + buf_size - reclen; ++ while ((char *) de <= top) { ++@@ -2039,10 +2053,26 @@ int ext4_find_dest_de(struct inode *dir, struct inode *inode, ++ return -EFSCORRUPTED; ++ if (ext4_match(dir, fname, de)) ++ return -EEXIST; ++- nlen = EXT4_DIR_REC_LEN(de->name_len); +++ nlen = EXT4_DIR_ENTRY_LEN(de); ++ rlen = ext4_rec_len_from_disk(de->rec_len, buf_size); ++ if ((de->inode ? rlen - nlen : rlen) >= reclen) ++ break; +++ /* Then for dotdot entries, check for the smaller space +++ * required for just the entry, no FID */ +++ if (fname_len(fname) == 2 && memcmp(fname_name(fname), "..", 2) == 0) { +++ if ((de->inode ? rlen - nlen : rlen) >= +++ EXT4_DIR_REC_LEN(fname_len(fname))) { +++ /* set dlen=1 to indicate not +++ * enough space store fid */ +++ dlen ? *dlen = 1 : 0; +++ break; +++ } +++ /* The new ".." entry must be written over the +++ * previous ".." entry, which is the first +++ * entry traversed by this scan. If it doesn't +++ * fit, something is badly wrong, so -EIO. 
*/ +++ return -EIO; +++ } ++ de = (struct ext4_dir_entry_2 *)((char *)de + rlen); ++ offset += rlen; ++ } ++@@ -2056,12 +2086,12 @@ int ext4_find_dest_de(struct inode *dir, struct inode *inode, ++ void ext4_insert_dentry(struct inode *inode, ++ struct ext4_dir_entry_2 *de, ++ int buf_size, ++- struct ext4_filename *fname) +++ struct ext4_filename *fname, void *data) ++ { ++ ++ int nlen, rlen; ++ ++- nlen = EXT4_DIR_REC_LEN(de->name_len); +++ nlen = EXT4_DIR_ENTRY_LEN(de); ++ rlen = ext4_rec_len_from_disk(de->rec_len, buf_size); ++ if (de->inode) { ++ struct ext4_dir_entry_2 *de1 = ++@@ -2075,6 +2105,11 @@ void ext4_insert_dentry(struct inode *inode, ++ ext4_set_de_type(inode->i_sb, de, inode->i_mode); ++ de->name_len = fname_len(fname); ++ memcpy(de->name, fname_name(fname), fname_len(fname)); +++ if (data) { +++ de->name[fname_len(fname)] = 0; +++ memcpy(&de->name[fname_len(fname) + 1], data, *(char *)data); +++ de->file_type |= EXT4_DIRENT_LUFID; +++ } ++ } ++ ++ /* ++@@ -2092,14 +2127,19 @@ static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname, ++ { ++ unsigned int blocksize = dir->i_sb->s_blocksize; ++ int csum_size = 0; ++- int err, err2; +++ int err, err2, dlen = 0; +++ unsigned char *data; ++ +++ data = ext4_dentry_get_data(inode->i_sb, (struct ext4_dentry_param *) +++ EXT4_I(inode)->i_dirdata); ++ if (ext4_has_metadata_csum(inode->i_sb)) ++ csum_size = sizeof(struct ext4_dir_entry_tail); ++ ++ if (!de) { +++ if (data) +++ dlen = (*data) + 1; ++ err = ext4_find_dest_de(dir, inode, bh, bh->b_data, ++- blocksize - csum_size, fname, &de); +++ blocksize - csum_size, fname, &de, &dlen); ++ if (err) ++ return err; ++ } ++@@ -2111,7 +2151,10 @@ static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname, ++ } ++ ++ /* By now the buffer is marked for journaling */ ++- ext4_insert_dentry(inode, de, blocksize, fname); +++ /* If writing the short form of "dotdot", don't add the data section */ +++ if (dlen == 1) +++ data = NULL; +++ 
ext4_insert_dentry(inode, de, blocksize, fname, data); ++ ++ /* ++ * XXX shouldn't update any times until successful ++@@ -2217,7 +2260,8 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname, ++ ++ dx_set_block(entries, 1); ++ dx_set_count(entries, 1); ++- dx_set_limit(entries, dx_root_limit(dir, sizeof(*dx_info))); +++ dx_set_limit(entries, dx_root_limit(dir, +++ dot_de, sizeof(*dx_info))); ++ ++ /* Initialize as for dx_probe */ ++ fname->hinfo.hash_version = dx_info->hash_version; ++@@ -2267,6 +2311,8 @@ static int ext4_update_dotdot(handle_t *handle, struct dentry *dentry, ++ struct buffer_head *dir_block; ++ struct ext4_dir_entry_2 *de; ++ int len, journal = 0, err = 0; +++ int dlen = 0; +++ char *data; ++ ++ if (IS_ERR(handle)) ++ return PTR_ERR(handle); ++@@ -2292,11 +2338,16 @@ static int ext4_update_dotdot(handle_t *handle, struct dentry *dentry, ++ goto out_journal; ++ ++ journal = 1; ++- de->rec_len = cpu_to_le16(EXT4_DIR_REC_LEN(1)); +++ de->rec_len = cpu_to_le16(EXT4_DIR_ENTRY_LEN(de)); ++ } ++ ++- len -= EXT4_DIR_REC_LEN(1); ++- assert(len == 0 || len >= EXT4_DIR_REC_LEN(2)); +++ len -= EXT4_DIR_ENTRY_LEN(de); +++ data = ext4_dentry_get_data(dir->i_sb, +++ (struct ext4_dentry_param *)dentry->d_fsdata); +++ if (data) +++ dlen = *data + 1; +++ assert(len == 0 || len >= EXT4_DIR_REC_LEN(2 + dlen)); +++ ++ de = (struct ext4_dir_entry_2 *) ++ ((char *) de + le16_to_cpu(de->rec_len)); ++ if (!journal) { ++@@ -2313,7 +2364,12 @@ static int ext4_update_dotdot(handle_t *handle, struct dentry *dentry, ++ assert(le16_to_cpu(de->rec_len) >= EXT4_DIR_REC_LEN(2)); ++ de->name_len = 2; ++ strcpy(de->name, ".."); ++- ext4_set_de_type(dir->i_sb, de, S_IFDIR); +++ if (data != NULL && ext4_get_dirent_data_len(de) >= dlen) { +++ de->name[2] = 0; +++ memcpy(&de->name[2 + 1], data, *data); +++ ext4_set_de_type(dir->i_sb, de, S_IFDIR); +++ de->file_type |= EXT4_DIRENT_LUFID; +++ } ++ ++ out_journal: ++ if (journal) { ++@@ -2351,6 +2407,7 @@ static int 
ext4_add_entry(handle_t *handle, struct dentry *dentry, ++ ext4_lblk_t block, blocks; ++ int csum_size = 0; ++ +++ EXT4_I(inode)->i_dirdata = dentry->d_fsdata; ++ if (ext4_has_metadata_csum(inode->i_sb)) ++ csum_size = sizeof(struct ext4_dir_entry_tail); ++ ++@@ -2918,37 +2975,70 @@ err_unlock_inode: ++ return err; ++ } ++ +++struct tp_block { +++ struct inode *inode; +++ void *data1; +++ void *data2; +++}; +++ ++ struct ext4_dir_entry_2 *ext4_init_dot_dotdot(struct inode *inode, ++ struct ext4_dir_entry_2 *de, ++ int blocksize, int csum_size, ++ unsigned int parent_ino, int dotdot_real_len) ++ { +++ void *data1 = NULL, *data2 = NULL; +++ int dot_reclen = 0; +++ +++ if (dotdot_real_len == 10) { +++ struct tp_block *tpb = (struct tp_block *)inode; +++ data1 = tpb->data1; +++ data2 = tpb->data2; +++ inode = tpb->inode; +++ dotdot_real_len = 0; +++ } ++ de->inode = cpu_to_le32(inode->i_ino); ++ de->name_len = 1; ++- de->rec_len = ext4_rec_len_to_disk(EXT4_DIR_REC_LEN(de->name_len), ++- blocksize); ++ strcpy(de->name, "."); ++ ext4_set_de_type(inode->i_sb, de, S_IFDIR); ++ +++ /* get packed fid data*/ +++ data1 = ext4_dentry_get_data(inode->i_sb, +++ (struct ext4_dentry_param *) data1); +++ if (data1) { +++ de->name[1] = 0; +++ memcpy(&de->name[2], data1, *(char *) data1); +++ de->file_type |= EXT4_DIRENT_LUFID; +++ } +++ de->rec_len = cpu_to_le16(EXT4_DIR_ENTRY_LEN(de)); +++ dot_reclen = cpu_to_le16(de->rec_len); ++ de = ext4_next_entry(de, blocksize); ++ de->inode = cpu_to_le32(parent_ino); ++ de->name_len = 2; +++ strcpy(de->name, ".."); +++ ext4_set_de_type(inode->i_sb, de, S_IFDIR); +++ data2 = ext4_dentry_get_data(inode->i_sb, +++ (struct ext4_dentry_param *) data2); +++ if (data2) { +++ de->name[2] = 0; +++ memcpy(&de->name[3], data2, *(char *) data2); +++ de->file_type |= EXT4_DIRENT_LUFID; +++ } ++ if (!dotdot_real_len) ++ de->rec_len = ext4_rec_len_to_disk(blocksize - ++- (csum_size + EXT4_DIR_REC_LEN(1)), +++ (csum_size + dot_reclen), ++ blocksize); ++ else 
++ de->rec_len = ext4_rec_len_to_disk( ++- EXT4_DIR_REC_LEN(de->name_len), blocksize); ++- strcpy(de->name, ".."); ++- ext4_set_de_type(inode->i_sb, de, S_IFDIR); +++ EXT4_DIR_ENTRY_LEN(de), blocksize); ++ ++ return ext4_next_entry(de, blocksize); ++ } ++ ++ int ext4_init_new_dir(handle_t *handle, struct inode *dir, ++- struct inode *inode) +++ struct inode *inode, +++ const void *data1, const void *data2) ++ { +++ struct tp_block param; ++ struct buffer_head *dir_block = NULL; ++ struct ext4_dir_entry_2 *de; ++ ext4_lblk_t block = 0; ++@@ -2972,7 +3062,11 @@ int ext4_init_new_dir(handle_t *handle, struct inode *dir, ++ if (IS_ERR(dir_block)) ++ return PTR_ERR(dir_block); ++ de = (struct ext4_dir_entry_2 *)dir_block->b_data; ++- ext4_init_dot_dotdot(inode, de, blocksize, csum_size, dir->i_ino, 0); +++ param.inode = inode; +++ param.data1 = (void *)data1; +++ param.data2 = (void *)data2; +++ ext4_init_dot_dotdot((struct inode *)(¶m), de, blocksize, +++ csum_size, dir->i_ino, 10); ++ set_nlink(inode, 2); ++ if (csum_size) ++ ext4_initialize_dirent_tail(dir_block, blocksize); ++@@ -2987,6 +3081,29 @@ out: ++ return err; ++ } ++ +++/* Initialize @inode as a subdirectory of @dir, and add the +++ * "." and ".." entries into the first directory block. 
*/ +++int ext4_add_dot_dotdot(handle_t *handle, struct inode *dir, +++ struct inode *inode, +++ const void *data1, const void *data2) +++{ +++ int rc; +++ +++ if (IS_ERR(handle)) +++ return PTR_ERR(handle); +++ +++ if (IS_DIRSYNC(dir)) +++ ext4_handle_sync(handle); +++ +++ inode->i_op = &ext4_dir_inode_operations; +++ inode->i_fop = &ext4_dir_operations; +++ rc = ext4_init_new_dir(handle, dir, inode, data1, data2); +++ if (!rc) +++ rc = ext4_mark_inode_dirty(handle, inode); +++ return rc; +++} +++EXPORT_SYMBOL(ext4_add_dot_dotdot); +++ ++ static int ext4_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) ++ { ++ handle_t *handle; ++@@ -3013,7 +3130,7 @@ retry: ++ ++ inode->i_op = &ext4_dir_inode_operations; ++ inode->i_fop = &ext4_dir_operations; ++- err = ext4_init_new_dir(handle, dir, inode); +++ err = ext4_init_new_dir(handle, dir, inode, NULL, NULL); ++ if (err) ++ goto out_clear_inode; ++ err = ext4_mark_inode_dirty(handle, inode); ++diff --git a/fs/ext4/super.c b/fs/ext4/super.c ++index 6c33a10..59b87b4 100644 ++--- a/fs/ext4/super.c +++++ b/fs/ext4/super.c ++@@ -1719,7 +1719,7 @@ enum { ++ Opt_inlinecrypt, ++ Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, ++ Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota, ++- Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err, +++ Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err, Opt_dirdata, ++ Opt_usrquota, Opt_grpquota, Opt_prjquota, Opt_i_version, ++ Opt_dax, Opt_dax_always, Opt_dax_inode, Opt_dax_never, ++ Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_warn_on_error, ++@@ -1803,6 +1803,7 @@ static const match_table_t tokens = { ++ {Opt_nolazytime, "nolazytime"}, ++ {Opt_debug_want_extra_isize, "debug_want_extra_isize=%u"}, ++ {Opt_nodelalloc, "nodelalloc"}, +++ {Opt_dirdata, "dirdata"}, ++ {Opt_removed, "mblk_io_submit"}, ++ {Opt_removed, "nomblk_io_submit"}, ++ {Opt_block_validity, "block_validity"}, ++@@ -2043,6 +2044,7 @@ static const struct mount_opts { ++ {Opt_usrjquota, 0, 
MOPT_Q | MOPT_STRING}, ++ {Opt_grpjquota, 0, MOPT_Q | MOPT_STRING}, ++ {Opt_offusrjquota, 0, MOPT_Q}, +++ {Opt_dirdata, EXT4_MOUNT_DIRDATA, MOPT_SET}, ++ {Opt_offgrpjquota, 0, MOPT_Q}, ++ {Opt_jqfmt_vfsold, QFMT_VFS_OLD, MOPT_QFMT}, ++ {Opt_jqfmt_vfsv0, QFMT_VFS_V0, MOPT_QFMT}, ++-- ++2.33.0 ++ +diff --git a/ldiskfs/kernel_patches/series/ldiskfs-5.10.0-oe2203sp1.series b/ldiskfs/kernel_patches/series/ldiskfs-5.10.0-oe2203sp1.series +new file mode 100644 +index 0000000000..1cc20c8e58 +--- /dev/null ++++ b/ldiskfs/kernel_patches/series/ldiskfs-5.10.0-oe2203sp1.series +@@ -0,0 +1,33 @@ ++oe2203/ext4-inode-version.patch ++linux-5.4/ext4-lookup-dotdot.patch ++suse15/ext4-print-inum-in-htree-warning.patch ++linux-5.8/ext4-prealloc.patch ++ubuntu18/ext4-osd-iop-common.patch ++oe2203/ext4-misc.patch ++linux-5.8/ext4-mballoc-extra-checks.patch ++linux-5.4/ext4-hash-indexed-dir-dotdot-update.patch ++linux-5.8/ext4-kill-dx-root.patch ++linux-5.8/ext4-mballoc-pa-free-mismatch.patch ++oe2203sp1/ext4-data-in-dirent.patch ++rhel8/ext4-nocmtime.patch ++base/ext4-htree-lock.patch ++oe2203/ext4-pdirop.patch ++linux-5.8/ext4-max-dir-size.patch ++linux-5.8/ext4-corrupted-inode-block-bitmaps-handling-patches.patch ++linux-5.10/ext4-give-warning-with-dir-htree-growing.patch ++ubuntu18/ext4-jcb-optimization.patch ++linux-5.10/ext4-attach-jinode-in-writepages.patch ++rhel8/ext4-dont-check-before-replay.patch ++rhel7.6/ext4-use-GFP_NOFS-in-ext4_inode_attach_jinode.patch ++rhel7.6/ext4-export-orphan-add.patch ++linux-5.8/ext4-export-mb-stream-allocator-variables.patch ++ubuntu19/ext4-iget-with-flags.patch ++linux-5.4/export-ext4fs-dirhash-helper.patch ++oe2203/ext4-simple-blockalloc.patch ++oe2203/ext4-xattr-disable-credits-check.patch ++linux-5.8/ext4-no-max-dir-size-limit-for-iam-objects.patch ++rhel8/ext4-ialloc-uid-gid-and-pass-owner-down.patch ++base/ext4-projid-xattrs.patch ++linux-5.8/ext4-enc-flag.patch ++oe2203/ext4-delayed-iput.patch ++rhel8/ext4-old_ea_inodes_handling_fix.patch 
+diff --git a/lustre.spec.in b/lustre.spec.in +index a8c0934428..6ef7960300 100644 +--- a/lustre.spec.in ++++ b/lustre.spec.in +@@ -214,6 +214,9 @@ BuildRequires: pkgconfig + %else + %if "%{_vendor}" == "openEuler" + BuildRequires: openEuler-rpm-config ++%if %{with ldiskfs} ++BuildRequires: kernel-debugsource ++%endif + %endif + BuildRequires: pkg-config + %endif +diff --git a/lustre/ChangeLog b/lustre/ChangeLog +index 9401e95187..18c0649a11 100644 +--- a/lustre/ChangeLog ++++ b/lustre/ChangeLog +@@ -101,6 +101,7 @@ TBD Whamcloud + vanilla linux 5.4.21 (ZFS + ldiskfs) + vanilla linux 5.4.136 (ZFS + ldiskfs) + 5.10.0-60.94.0.118.oe2203 (openEuler 22.03 LTS) ++ 5.10.0-136.32.0.108.oe2203sp1 (openEuler 22.03 LTS SP1) + * ldiskfs needs an ldiskfs patch series for that kernel, ZFS does not + * Client primary kernels built and tested during release cycle: + 5.14.0-284.11.1.el9 (RHEL9.2) +diff --git a/lustre/autoconf/lustre-core.m4 b/lustre/autoconf/lustre-core.m4 +index 12afe95fbc..2d4ad45d71 100644 +--- a/lustre/autoconf/lustre-core.m4 ++++ b/lustre/autoconf/lustre-core.m4 +@@ -3497,6 +3497,7 @@ lustre/kernel_patches/targets/5.3-sles15sp3.target + lustre/kernel_patches/targets/5.14-sles15sp4.target + lustre/kernel_patches/targets/3.x-fc18.target + lustre/kernel_patches/targets/5.10-oe2203.target ++lustre/kernel_patches/targets/5.10-oe2203sp1.target + lustre/ldlm/Makefile + lustre/fid/Makefile + lustre/fid/autoMakefile +diff --git a/lustre/kernel_patches/targets/5.10-oe2203sp1.target.in b/lustre/kernel_patches/targets/5.10-oe2203sp1.target.in +new file mode 100644 +index 0000000000..81112d1897 +--- /dev/null ++++ b/lustre/kernel_patches/targets/5.10-oe2203sp1.target.in +@@ -0,0 +1,21 @@ ++lnxmaj="5.10.0" ++lnxrel="136.32.0.108.oe2203sp1" ++ ++KERNEL_SRPM=kernel-${lnxmaj}-${lnxrel}.src.rpm ++SERIES="" ++EXTRA_VERSION=${lnxrel}_lustre.@VERSION@ ++LUSTRE_VERSION=@VERSION@ ++ ++DEVEL_PATH_ARCH_DELIMETER="." 
++OFED_VERSION=inkernel ++ ++#SMP_ARCHS="i686 x86_64 ia64 ppc64" ++# openEuler doesn't use smp specific kernels ++SMP_ARCHS="" ++ ++for cc in gcc ; do ++ if which $cc >/dev/null 2>/dev/null ; then ++ export CC=$cc ++ break ++ fi ++done +-- +2.33.0 + diff --git a/0041-LU-16976-ldiskfs-add-support-for-openEuler-22.03-SP2.patch b/0041-LU-16976-ldiskfs-add-support-for-openEuler-22.03-SP2.patch new file mode 100644 index 0000000000000000000000000000000000000000..26a1bc1b2910293de8a1e9cc50399b186b8a83f4 --- /dev/null +++ b/0041-LU-16976-ldiskfs-add-support-for-openEuler-22.03-SP2.patch @@ -0,0 +1,282 @@ +From 38a03676580e5f2925d30fd1a2459cc9b0237d60 Mon Sep 17 00:00:00 2001 +From: Xinliang Liu +Date: Tue, 18 Jul 2023 03:42:19 +0000 +Subject: [PATCH 41/61] LU-16976 ldiskfs: add support for openEuler 22.03 SP2 + +Add ldiskfs server support for oe2203sp2. +Also refine openEuler lbuild scripts. + +Change-Id: I91841a7140a9f8f3182a4a329b9f04639a85e94d +Test-Parameters: trivial +Signed-off-by: Xinliang Liu +--- + config/lustre-build-ldiskfs.m4 | 2 +- + contrib/lbuild/funcs.sh | 5 ++- + contrib/lbuild/lbuild | 5 +-- + contrib/lbuild/lbuild-oe2203 | 40 +----------------- + contrib/lbuild/lbuild-oe2203sp1 | 40 ------------------ + contrib/lbuild/lbuild-openeuler | 42 +++++++++++++++++++ + lustre/ChangeLog | 1 + + lustre/autoconf/lustre-core.m4 | 1 + + .../targets/5.10-oe2203sp2.target.in | 21 ++++++++++ + 9 files changed, 71 insertions(+), 86 deletions(-) + mode change 100644 => 120000 contrib/lbuild/lbuild-oe2203 + delete mode 100644 contrib/lbuild/lbuild-oe2203sp1 + create mode 100644 contrib/lbuild/lbuild-openeuler + create mode 100644 lustre/kernel_patches/targets/5.10-oe2203sp2.target.in + +diff --git a/config/lustre-build-ldiskfs.m4 b/config/lustre-build-ldiskfs.m4 +index f0e987b010..c225a6abaa 100644 +--- a/config/lustre-build-ldiskfs.m4 ++++ b/config/lustre-build-ldiskfs.m4 +@@ -126,7 +126,7 @@ AS_IF([test x$RHEL_KERNEL = xyes], [ + ], [test x$OPENEULER_KERNEL = xyes], [ + 
case $OPENEULER_VERSION_NO in + 2203.0) LDISKFS_SERIES="5.10.0-oe2203.series" ;; +- 2203.1) LDISKFS_SERIES="5.10.0-oe2203sp1.series" ;; ++ 2203.1|2203.2) LDISKFS_SERIES="5.10.0-oe2203sp1.series" ;; + esac + ]) + ]) +diff --git a/contrib/lbuild/funcs.sh b/contrib/lbuild/funcs.sh +index 8db9532b67..78389a5d96 100644 +--- a/contrib/lbuild/funcs.sh ++++ b/contrib/lbuild/funcs.sh +@@ -165,7 +165,7 @@ autodetect_distro() { + if [[ "$lts_sp" != "n/a" ]]; then + lts_sp=${lts_sp##*-} + lts_sp=${lts_sp,,} +- version="${version}${lts_sp}" ++ version="${version}.${lts_sp}" + fi + ;; + *) +@@ -222,7 +222,8 @@ autodetect_target() { + sles15.4) target="$(uname -r | cut -d . -f 1,2)-sles15sp4";; + fc18) target="3.x-fc18";; + oe2203) target="5.10-oe2203";; +- oe2203sp1) target="5.10-oe2203sp1";; ++ oe2203.sp1) target="5.10-oe2203sp1";; ++ oe2203.sp2) target="5.10-oe2203sp2";; + *) fatal 1 "I don't know what distro $distro is.\nEither update autodetect_target() or use the --target argument.";; + esac + +diff --git a/contrib/lbuild/lbuild b/contrib/lbuild/lbuild +index 1170c4ffb3..e4ccb32879 100755 +--- a/contrib/lbuild/lbuild ++++ b/contrib/lbuild/lbuild +@@ -329,12 +329,9 @@ check_options() { + 3.0-sles11) + CANONICAL_TARGET="sles11" + ;; +- 5.10-oe2203) ++ 5.10-oe2203*) + CANONICAL_TARGET="oe2203" + ;; +- 5.10-oe2203sp1) +- CANONICAL_TARGET="oe2203sp1" +- ;; + esac + + local timestampnodig=$(echo $TIMESTAMP | sed -e s/[0-9]*//g) +diff --git a/contrib/lbuild/lbuild-oe2203 b/contrib/lbuild/lbuild-oe2203 +deleted file mode 100644 +index 1f4dc54947..0000000000 +--- a/contrib/lbuild/lbuild-oe2203 ++++ /dev/null +@@ -1,39 +0,0 @@ +-source ${LBUILD_DIR}/lbuild-rhel +- +-# increment this if you have made a change that should force a new kernel +-# to build built +-BUILD_GEN+=".0" +- +-SPEC_NAME="kernel.spec" +-DEVEL_PATH_ARCH_DELIMETER="." 
+-USE_KABI=false +-RPM_HELPERS_DIR="/usr/lib/rpm/openEuler" +-# Pkg which contains ext4 source code +-KERNEL_DEBUGINFO="kernel-debugsource-${lnxmaj}-${lnxrel}.${TARGET_ARCH}.rpm" +- +-# force local definition of %dist into ~/.rpmmacros +-# to avoid verbose extended strings like ".el9.centos" +-# in kernel version and rpm names +-# +-RMAC=$HOME/.rpmmacros +-grep '^%dist' $RMAC &> /dev/null || echo '%dist .oe2203' >> $RMAC +- +-unpack_linux_devel_rpm-oe2203() { +- local callers_rpm="$1" +- +- unpack_linux_devel_rpm-rhel "$callers_rpm" +-} +- +-find_linux_rpm-oe2203() { +- local prefix="$1" +- local wanted_kernel="$2" +- local pathtorpms=${3:-"$KERNELRPMSBASE/$lnxmaj/$DISTROMAJ/$TARGET_ARCH"} +- +- find_linux_rpm-rhel "$prefix" "$wanted_kernel" "$pathtorpms" +-} +- +-kernel_debuginfo_location() { +- local base_os="https://repo.openeuler.org/openEuler-22.03-LTS" +- +- echo "$base_os/update/$TARGET_ARCH/Packages/" +-} +diff --git a/contrib/lbuild/lbuild-oe2203 b/contrib/lbuild/lbuild-oe2203 +new file mode 120000 +index 0000000000..4f83bd2f91 +--- /dev/null ++++ b/contrib/lbuild/lbuild-oe2203 +@@ -0,0 +1 @@ ++lbuild-openeuler +\ No newline at end of file +diff --git a/contrib/lbuild/lbuild-oe2203sp1 b/contrib/lbuild/lbuild-oe2203sp1 +deleted file mode 100644 +index b115a347fc..0000000000 +--- a/contrib/lbuild/lbuild-oe2203sp1 ++++ /dev/null +@@ -1,40 +0,0 @@ +-source ${LBUILD_DIR}/lbuild-rhel +- +-# increment this if you have made a change that should force a new kernel +-# to build built +-BUILD_GEN+=".0" +- +-SPEC_NAME="kernel.spec" +-DEVEL_PATH_ARCH_DELIMETER="." 
+-USE_KABI=false +-RPM_HELPERS_DIR="/usr/lib/rpm/openEuler" +-# Pkg which contains ext4 source code +-KERNEL_DEBUGINFO="kernel-debugsource-${lnxmaj}-${lnxrel}.${TARGET_ARCH}.rpm" +-DISTRO_REPO_MIRROR=${DISTRO_REPO_MIRROR:-"https://repo.openeuler.org"} +- +-# force local definition of %dist into ~/.rpmmacros +-# to avoid verbose extended strings like ".el9.centos" +-# in kernel version and rpm names +-# +-RMAC=$HOME/.rpmmacros +-grep '^%dist' $RMAC &> /dev/null || echo '%dist .oe2203sp1' >> $RMAC +- +-unpack_linux_devel_rpm-oe2203sp1() { +- local callers_rpm="$1" +- +- unpack_linux_devel_rpm-rhel "$callers_rpm" +-} +- +-find_linux_rpm-oe2203sp1() { +- local prefix="$1" +- local wanted_kernel="$2" +- local pathtorpms=${3:-"$KERNELRPMSBASE/$lnxmaj/$DISTROMAJ/$TARGET_ARCH"} +- +- find_linux_rpm-rhel "$prefix" "$wanted_kernel" "$pathtorpms" +-} +- +-kernel_debuginfo_location() { +- local base_os="$DISTRO_REPO_MIRROR/openEuler-22.03-LTS-SP1" +- +- echo "$base_os/update/$TARGET_ARCH/Packages/" +-} +diff --git a/contrib/lbuild/lbuild-openeuler b/contrib/lbuild/lbuild-openeuler +new file mode 100644 +index 0000000000..35845f336b +--- /dev/null ++++ b/contrib/lbuild/lbuild-openeuler +@@ -0,0 +1,42 @@ ++# vim:expandtab:shiftwidth=4:softtabstop=4:tabstop=4: ++source ${LBUILD_DIR}/lbuild-rhel ++ ++DEVEL_PATH_ARCH_DELIMETER="." 
++RPM_HELPERS_DIR="/usr/lib/rpm/openEuler" ++# Pkg which contains ext4 source code ++KERNEL_DEBUGINFO="kernel-debugsource-${lnxmaj}-${lnxrel}.${TARGET_ARCH}.rpm" ++ ++# force local definition of %dist into ~/.rpmmacros ++# to avoid verbose extended strings like ".el9.centos" ++# in kernel version and rpm names ++RMAC=$HOME/.rpmmacros ++grep '^%dist' $RMAC &> /dev/null || echo "%dist .${DISTRO/./}" >> $RMAC ++ ++kernel_debuginfo_location() { ++ local base_url="https://repo.openeuler.org" ++ local distro=${DISTRO^^} ++ # convert OEYYMM.SPx to openEuler-YY.MM-LTS-SPx ++ distro=$(echo $distro | sed -E -e 's/OE/openEuler-/' \ ++ -e 's/([0-9]{2})([0-9]{2})/\1.\2-/' \ ++ -e 's/.(SP[0-9]+)/LTS-\1/') ++ ++ echo "${base_url}/${distro}/update/$TARGET_ARCH/Packages" ++} ++ ++ ++eval "$(cat </dev/null 2>/dev/null ; then ++ export CC=$cc ++ break ++ fi ++done +-- +2.33.0 + diff --git a/0042-lustre.spec.in-match-rpm-macro-openEuler-for-openEul.patch b/0042-lustre.spec.in-match-rpm-macro-openEuler-for-openEul.patch new file mode 100644 index 0000000000000000000000000000000000000000..0b8d21c49bb91236186d2b9fc399a342aa883fe2 --- /dev/null +++ b/0042-lustre.spec.in-match-rpm-macro-openEuler-for-openEul.patch @@ -0,0 +1,98 @@ +From bf5b9a174b2ec1d1a26c09e7c5d4b928a24aaa8c Mon Sep 17 00:00:00 2001 +From: Xinliang Liu +Date: Mon, 7 Aug 2023 10:18:49 +0000 +Subject: [PATCH 42/61] lustre.spec.in: match rpm macro openEuler for openEuler + Linux + +So that it can handle openEuler derived OSes, like KylinOS. 
+ +Signed-off-by: Xinliang Liu +--- + lustre.spec.in | 18 +++++++++--------- + 1 file changed, 9 insertions(+), 9 deletions(-) + +diff --git a/lustre.spec.in b/lustre.spec.in +index 6ef7960300..9aa271f504 100644 +--- a/lustre.spec.in ++++ b/lustre.spec.in +@@ -136,7 +136,7 @@ + # requires want to set a version including epoch + %global krequires %(echo %{kver} | sed -e 's/\.x86_64$//' -e 's/\.i[3456]86$//' -e 's/-smp$//' -e 's/-bigsmp$//' -e 's/[-.]ppc64$//' -e 's/\.aarch64$//' -e 's/-default$//' -e 's/-%{_flavor}//') + +-%if "%{_vendor}" == "redhat" || "%{_vendor}" == "fedora" || "%{_vendor}" == "openEuler" ++%if "%{_vendor}" == "redhat" || "%{_vendor}" == "fedora" || 0%{?openEuler} + %global requires_kmod_name kmod-%{lustre_name} + %global requires_kmod_osd_zfs_name kmod-%{lustre_name}-osd-zfs + %if %{with lustre_tests} +@@ -176,7 +176,7 @@ + %endif + + # openEuler comes with systemd +-%if "%{_vendor}" == "openEuler" ++%if 0%{?openEuler} + %define with_systemd 1 + %endif + +@@ -212,7 +212,7 @@ BuildRequires: libtool libyaml-devel zlib-devel libnl3-devel flex bison + BuildRequires: redhat-rpm-config + BuildRequires: pkgconfig + %else +-%if "%{_vendor}" == "openEuler" ++%if 0%{?openEuler} + BuildRequires: openEuler-rpm-config + %if %{with ldiskfs} + BuildRequires: kernel-debugsource +@@ -231,20 +231,20 @@ Provides: lustre-server = %{version}-%{release} + %endif + Obsoletes: lustre-client < %{version} + Provides: lustre-client = %{version}-%{release} +-%if "%{_vendor}" == "redhat" || "%{_vendor}" == "fedora" || "%{_vendor}" == "openEuler" ++%if "%{_vendor}" == "redhat" || "%{_vendor}" == "fedora" || 0%{?openEuler} + #suse don't support selinux + BuildRequires: libselinux-devel + %endif + %if %{with lustre_modules} + %if %{with mofed} + BuildRequires: mlnx-ofa_kernel-devel +-%if "%{_vendor}" == "redhat" || "%{_vendor}" == "openEuler" ++%if "%{_vendor}" == "redhat" || 0%{?openEuler} + Requires: kmod-mlnx-ofa_kernel + %else + Requires: mlnx-ofa_kernel-kmp + %endif + 
%endif +-%if 0%{?rhel} >= 8 || "%{_vendor}" == "openEuler" ++%if 0%{?rhel} >= 8 || 0%{?openEuler} + BuildRequires: kernel-rpm-macros + %endif + BuildRequires: %kernel_module_package_buildreqs +@@ -369,7 +369,7 @@ Provides: lustre-tests = %{version} + Requires: lustre-iokit + %endif + Requires: lustre-devel = %{version} +-%if 0%{?rhel} >= 8 || 0%{?suse_version} >= 1500 ++%if 0%{?rhel} >= 8 || 0%{?suse_version} >= 1500 || 0%{?openEuler} + Requires: python3 >= 3.6.0, python3-PyYAML + %endif + %if %{with lustre_modules} +@@ -385,7 +385,7 @@ Requires: attr, rsync, perl, lsof, /usr/bin/getconf + BuildRequires: mpich-devel + %endif + %if "%{mpi_name}" == "openmpi" +-%if "%{_vendor}" == "redhat" || "%{_vendor}" == "openEuler" || 0%{?suse_version} < 1500 ++%if "%{_vendor}" == "redhat" || 0%{?openEuler} || 0%{?suse_version} < 1500 + BuildRequires: openmpi-devel + %else + BuildRequires: openmpi2-devel +@@ -637,7 +637,7 @@ echo '%{_sysconfdir}/ha.d/resource.d/Lustre' >>lustre.files + echo '%{_unitdir}/lnet.service' >>lustre.files + %endif + +-%if "%{_vendor}" == "redhat" || "%{_vendor}" == "openEuler" ++%if "%{_vendor}" == "redhat" || 0%{?openEuler} + # The following scripts are Red Hat specific + %if %{with servers} + echo '%{_sysconfdir}/init.d/lustre' >>lustre.files +-- +2.33.0 + diff --git a/0043-LU-15722-osd-ldiskfs-fix-IO-write-gets-stuck-for-64K.patch b/0043-LU-15722-osd-ldiskfs-fix-IO-write-gets-stuck-for-64K.patch new file mode 100644 index 0000000000000000000000000000000000000000..a86193740904d83d1284ac9de0c2a949b8f81b01 --- /dev/null +++ b/0043-LU-15722-osd-ldiskfs-fix-IO-write-gets-stuck-for-64K.patch @@ -0,0 +1,118 @@ +From deacd917d127dda1ec2f22d8f4451205b03860ae Mon Sep 17 00:00:00 2001 +From: Xinliang Liu +Date: Wed, 6 Apr 2022 08:06:33 +0000 +Subject: [PATCH 43/61] LU-15722 osd-ldiskfs: fix IO write gets stuck for 64K + PAGE_SIZE + +This fixes below IO write stuck issue: +----- +[606895.151765] LustreError: +334886:0:(ofd_io.c:1389:ofd_commitrw_write()) 
lustre-OST0000: restart IO +write too many times: 10000 +[606895.207345] LustreError: +334886:0:(ofd_io.c:1389:ofd_commitrw_write()) Skipped 8 previous similar +messages +------- + +Which goes into an infinite loop: +ofd_commitrw_write()->osd_write_commit()->osd_ldiskfs_map_inode_pages() + ->ldiskfs_map_blocks()->ofd_commitrw_write() + +The cause is that: +For 64K PAGE_SIZE blocks allocation/mapping. m_lblk should be the +first un-allocated block if m_lblk points at an already allocated +block when create = 1, ldiskfs_map_blocks() will just return with +already allocated blocks and without allocating any new requested +blocks for the extent. + +This stuck issue won't happen on 4K PAGE_SIZE. Because for +PAGE_SIZE = blocksize case, if m_lblk points at an already +allocated block it will point at an un-allocated block in next +restart transaction, because the already mapped block/page will +be filtered out in next restart transaction via flag +OBD_BRW_DONE in osd_declare_write_commit(). + +Change-Id: Iadba0be8875a15a2e2f158ec9571f5ece5637ae0 +Signed-off-by: Xinliang Liu +Reviewed-on: https://review.whamcloud.com/47004 +Tested-by: jenkins +Tested-by: Maloo +Reviewed-by: Alex Zhuravlev +Reviewed-by: James Simmons +Reviewed-by: Oleg Drokin +--- + lustre/osd-ldiskfs/osd_io.c | 44 +++++++++++++++++++++++++++++++++++++ + 1 file changed, 44 insertions(+) + +diff --git a/lustre/osd-ldiskfs/osd_io.c b/lustre/osd-ldiskfs/osd_io.c +index 7131add58d..dcc1081e0a 100644 +--- a/lustre/osd-ldiskfs/osd_io.c ++++ b/lustre/osd-ldiskfs/osd_io.c +@@ -1091,6 +1091,7 @@ static int osd_ldiskfs_map_inode_pages(struct inode *inode, + struct thandle *thandle) + { + int blocks_per_page = PAGE_SIZE >> inode->i_blkbits; ++ int blocksize = 1 << inode->i_blkbits; + int rc = 0, i = 0, mapped_index = 0; + struct page *fp = NULL; + int clen = 0; +@@ -1147,6 +1148,48 @@ static int osd_ldiskfs_map_inode_pages(struct inode *inode, + /* process found extent */ + map.m_lblk = fp->index * blocks_per_page; + 
map.m_len = blen = clen * blocks_per_page; ++ ++ /* ++ * For PAGE_SIZE > blocksize block allocation mapping, the ++ * ldiskfs_map_blocks() aims at looking up already mapped ++ * blocks, recording them to iobuf->dr_blocks and fixing up ++ * m_lblk, m_len for un-allocated blocks to be created/mapped ++ * in the second ldiskfs_map_blocks(). ++ * ++ * M_lblk should be the first un-allocated block if m_lblk ++ * points at an already allocated block when create = 1, ++ * ldiskfs_map_blocks() will just return with already ++ * allocated blocks and without allocating any requested ++ * new blocks for the extent. For PAGE_SIZE = blocksize ++ * case, if m_lblk points at an already allocated block it ++ * will point at an un-allocated block in next restart ++ * transaction, because the already mapped block/page will ++ * be filtered out in next restart transaction via flag ++ * OBD_BRW_DONE in osd_declare_write_commit(). ++ */ ++ if (create && PAGE_SIZE > blocksize) { ++ /* With flags=0 just for already mapped blocks lookup */ ++ rc = ldiskfs_map_blocks(handle, inode, &map, 0); ++ if (rc > 0 && map.m_flags & LDISKFS_MAP_MAPPED) { ++ for (; total < blen && total < map.m_len; ++ total++) ++ *(blocks + total) = map.m_pblk + total; ++ ++ /* The extent is already full mapped */ ++ if (total == blen) { ++ rc = 0; ++ goto ext_already_mapped; ++ } ++ } ++ /* ++ * Fixup or reset m_lblk and m_len for un-mapped blocks. ++ * The second ldiskfs_map_blocks() will create and map ++ * them. 
++ */ ++ map.m_lblk = fp->index * blocks_per_page + total; ++ map.m_len = blen - total; ++ } ++ + cont_map: + /** + * We might restart transaction for block allocations, +@@ -1211,6 +1254,7 @@ cont_map: + rc = 0; + } + ++ext_already_mapped: + if (rc == 0 && create) { + count += (total - previous_total); + mapped_index = (count + blocks_per_page - +-- +2.33.0 + diff --git a/0044-LU-15722-osd-ldiskfs-fix-write-stuck-for-64K-PAGE_SI.patch b/0044-LU-15722-osd-ldiskfs-fix-write-stuck-for-64K-PAGE_SI.patch new file mode 100644 index 0000000000000000000000000000000000000000..33a5ba1b9a891e5cf454c12433d2c6e3facd6ccb --- /dev/null +++ b/0044-LU-15722-osd-ldiskfs-fix-write-stuck-for-64K-PAGE_SI.patch @@ -0,0 +1,179 @@ +From 7b3bbf6fb4dace0af738bc69500558180a8e7227 Mon Sep 17 00:00:00 2001 +From: Xinliang Liu +Date: Mon, 6 Jun 2022 08:59:54 +0000 +Subject: [PATCH 44/61] LU-15722 osd-ldiskfs: fix write stuck for 64K PAGE_SIZE + +This reverts a previous commit for large PAGE_SIZE to fix a stuck IO +issue in another way. + +One more ldiskfs_map_blocks() can't fix the write stuck for PAGE_SIZE +> BLOCK_SIZE. It still gets stuck in some tests like sanity-dom fsx, +because each ldiskfs_map_blocks() lookup only returns one +contiguous range of physical blocks. If a page has multiple contiguous +block ranges, then it needs multiple ldiskfs_map_blocks() lookups to +find out all the already mapped blocks. + +The idea of this fix is to record the already written blocks of the +start page and skip them at the next write retry. + +This also fixes and cleans up osd_mark_page_io_done() when start_blocks +is non-zero. 
+ +Fixes: 176ea3a4599e ("LU-15722 osd-ldiskfs: fix IO write gets stuck for 64K PAGE_SIZE") +Change-Id: I9c14d5d0aa23e81837dacb01d050c091e6a79148 +Signed-off-by: Xinliang Liu +--- + lustre/osd-ldiskfs/osd_internal.h | 2 + + lustre/osd-ldiskfs/osd_io.c | 83 +++++++++++-------------------- + 2 files changed, 31 insertions(+), 54 deletions(-) + +diff --git a/lustre/osd-ldiskfs/osd_internal.h b/lustre/osd-ldiskfs/osd_internal.h +index f06dde1b00..7300278b80 100644 +--- a/lustre/osd-ldiskfs/osd_internal.h ++++ b/lustre/osd-ldiskfs/osd_internal.h +@@ -594,6 +594,8 @@ struct osd_iobuf { + ktime_t dr_elapsed; /* how long io took */ + struct osd_device *dr_dev; + unsigned int dr_init_at; /* the line iobuf was initialized */ ++ /* Already written blocks of the start page */ ++ unsigned int dr_start_pg_wblks; + }; + + #define osd_dirty_inode(inode, flag) (inode)->i_sb->s_op->dirty_inode((inode), flag) +diff --git a/lustre/osd-ldiskfs/osd_io.c b/lustre/osd-ldiskfs/osd_io.c +index dcc1081e0a..233d88397c 100644 +--- a/lustre/osd-ldiskfs/osd_io.c ++++ b/lustre/osd-ldiskfs/osd_io.c +@@ -90,6 +90,14 @@ static int __osd_init_iobuf(struct osd_device *d, struct osd_iobuf *iobuf, + iobuf->dr_rw = rw; + iobuf->dr_init_at = line; + ++ /* Init dr_start_pg_wblks to 0 for osd_read/write_prep(). ++ * For osd_write_commit() need to keep the value assigned in ++ * osd_ldiskfs_map_inode_pages() during retries, and before it , ++ * init dr_start_pg_wblks to 0 in osd_write_prep() is sufficient. 
++ */ ++ if (rw == 0) ++ iobuf->dr_start_pg_wblks = 0; ++ + blocks = pages * (PAGE_SIZE >> osd_sb(d)->s_blocksize_bits); + if (iobuf->dr_bl_buf.lb_len >= blocks * sizeof(iobuf->dr_blocks[0])) { + LASSERT(iobuf->dr_pg_buf.lb_len >= +@@ -465,22 +473,14 @@ static void osd_mark_page_io_done(struct osd_iobuf *iobuf, + sector_t start_blocks, + sector_t count) + { +- struct niobuf_local *lnb; ++ struct niobuf_local **lnbs = iobuf->dr_lnbs; + int blocks_per_page = PAGE_SIZE >> inode->i_blkbits; +- pgoff_t pg_start, pg_end; +- +- pg_start = start_blocks / blocks_per_page; +- if (start_blocks % blocks_per_page) +- pg_start++; +- if (count >= blocks_per_page) +- pg_end = (start_blocks + count - +- blocks_per_page) / blocks_per_page; +- else +- return; /* nothing to mark */ +- for ( ; pg_start <= pg_end; pg_start++) { +- lnb = iobuf->dr_lnbs[pg_start]; +- lnb->lnb_flags |= OBD_BRW_DONE; +- } ++ int i, end; ++ ++ i = start_blocks / blocks_per_page; ++ end = (start_blocks + count) / blocks_per_page; ++ for ( ; i < end; i++) ++ lnbs[i]->lnb_flags |= OBD_BRW_DONE; + } + + static int osd_do_bio(struct osd_device *osd, struct inode *inode, +@@ -1091,7 +1091,6 @@ static int osd_ldiskfs_map_inode_pages(struct inode *inode, + struct thandle *thandle) + { + int blocks_per_page = PAGE_SIZE >> inode->i_blkbits; +- int blocksize = 1 << inode->i_blkbits; + int rc = 0, i = 0, mapped_index = 0; + struct page *fp = NULL; + int clen = 0; +@@ -1150,44 +1149,19 @@ static int osd_ldiskfs_map_inode_pages(struct inode *inode, + map.m_len = blen = clen * blocks_per_page; + + /* +- * For PAGE_SIZE > blocksize block allocation mapping, the +- * ldiskfs_map_blocks() aims at looking up already mapped +- * blocks, recording them to iobuf->dr_blocks and fixing up +- * m_lblk, m_len for un-allocated blocks to be created/mapped +- * in the second ldiskfs_map_blocks(). 
+- * +- * M_lblk should be the first un-allocated block if m_lblk +- * points at an already allocated block when create = 1, +- * ldiskfs_map_blocks() will just return with already +- * allocated blocks and without allocating any requested +- * new blocks for the extent. For PAGE_SIZE = blocksize +- * case, if m_lblk points at an already allocated block it +- * will point at an un-allocated block in next restart +- * transaction, because the already mapped block/page will +- * be filtered out in next restart transaction via flag +- * OBD_BRW_DONE in osd_declare_write_commit(). ++ * Skip already written blocks of the start page. ++ * Note that this branch will not go into for 4K PAGE_SIZE. ++ * Because dr_start_pg_wblks is always 0 for 4K PAGE_SIZE. ++ * iobuf->dr_start_pg_wblks = (start_blocks + count) % ++ * blocks_per_page. + */ +- if (create && PAGE_SIZE > blocksize) { +- /* With flags=0 just for already mapped blocks lookup */ +- rc = ldiskfs_map_blocks(handle, inode, &map, 0); +- if (rc > 0 && map.m_flags & LDISKFS_MAP_MAPPED) { +- for (; total < blen && total < map.m_len; +- total++) +- *(blocks + total) = map.m_pblk + total; +- +- /* The extent is already full mapped */ +- if (total == blen) { +- rc = 0; +- goto ext_already_mapped; +- } +- } +- /* +- * Fixup or reset m_lblk and m_len for un-mapped blocks. +- * The second ldiskfs_map_blocks() will create and map +- * them. 
+- */ +- map.m_lblk = fp->index * blocks_per_page + total; ++ if (iobuf->dr_start_pg_wblks > 0) { ++ total = previous_total = start_blocks = ++ iobuf->dr_start_pg_wblks; ++ map.m_lblk = fp->index * blocks_per_page + ++ total; + map.m_len = blen - total; ++ iobuf->dr_start_pg_wblks = 0; + } + + cont_map: +@@ -1215,6 +1189,8 @@ cont_map: + if (rc) + GOTO(cleanup, rc); + thandle->th_restart_tran = 1; ++ iobuf->dr_start_pg_wblks = (start_blocks + ++ count) % blocks_per_page; + GOTO(cleanup, rc = -EAGAIN); + } + +@@ -1254,10 +1230,9 @@ cont_map: + rc = 0; + } + +-ext_already_mapped: + if (rc == 0 && create) { + count += (total - previous_total); +- mapped_index = (count + blocks_per_page - ++ mapped_index = (start_blocks + count + blocks_per_page - + 1) / blocks_per_page - 1; + lnb1 = iobuf->dr_lnbs[i - clen]; + lnb2 = iobuf->dr_lnbs[mapped_index]; +-- +2.33.0 + diff --git a/0045-LU-15978-osp-fix-striped-directory-deletion-fails-fo.patch b/0045-LU-15978-osp-fix-striped-directory-deletion-fails-fo.patch new file mode 100644 index 0000000000000000000000000000000000000000..eec8f3a7972faf3d18effba28be949b6067227c7 --- /dev/null +++ b/0045-LU-15978-osp-fix-striped-directory-deletion-fails-fo.patch @@ -0,0 +1,223 @@ +From 3ef6abe3067ba1cb61d655d33ef129cf004a7ebc Mon Sep 17 00:00:00 2001 +From: Xinliang Liu +Date: Tue, 28 Jun 2022 08:34:46 +0000 +Subject: [PATCH 45/61] LU-15978 osp: fix striped directory deletion fails for + 64K PAGE_SIZE + +This fixes the rmdir errors below: +rmdir: failed to remove '/mnt/lustre/d1.sanity/d2': Invalid argument +LustreError: 381691:0:(osp_object.c:1998:osp_it_next_page()) +lustre-MDT0000-osp-MDT0001: invalid magic (0 != 8a6d6b6c) for page 0/1 +while read layout orphan index. + +For 64K PAGE_SIZE, when created an striped directory, e.g. created +with function test_mkdir() defined in test-framework.sh when MDSCOUNT +>= 2, deleting it will fail. 
+ +For PAGE_SIZE > LU_PAGE_SIZE, if the end system page fills less than +LU_PAGE_COUNT lu_idxpages, init the header of the remain lu_idxpages. +So that the clients handle this partial filling correctly. + +Also make goto labels meaningful and avoid not freeing pages for +lip_nr == 0 in osp_it_next_page(). + +This patch also fixes wrong page idx for page kunmap in +dt_index_walk(). + +This server end fix also necessary for the idxpage reading clients +nodemap_process_idx_pages() and qsd_reint_entries(). So this patch also +includes fix for LU-15992: nodemap create and check failed on 64K page +size. + +Fixes: 77eea1985bb1 ("LU-3336 lfsck: orphan OST-objects iteration") +Change-Id: I75bd9603c31bed8ea15fdba693677d41affaf61c +Signed-off-by: Xinliang Liu +Co-authored-by: Kevin Zhao +--- + lustre/include/dt_object.h | 2 ++ + lustre/obdclass/dt_object.c | 51 ++++++++++++++++++++++++++++++++- + lustre/osp/osp_object.c | 15 +++++----- + lustre/ptlrpc/nodemap_storage.c | 9 ++++++ + 4 files changed, 69 insertions(+), 8 deletions(-) + +diff --git a/lustre/include/dt_object.h b/lustre/include/dt_object.h +index f24d7d3594..3957f2cdbb 100644 +--- a/lustre/include/dt_object.h ++++ b/lustre/include/dt_object.h +@@ -2255,6 +2255,8 @@ int dt_index_walk(const struct lu_env *env, struct dt_object *obj, + void *arg); + int dt_index_read(const struct lu_env *env, struct dt_device *dev, + struct idx_info *ii, const struct lu_rdpg *rdpg); ++void dt_index_page_adjust(struct page **pages, const u32 npages, ++ const size_t nlupgs); + + static inline struct thandle *dt_trans_create(const struct lu_env *env, + struct dt_device *d) +diff --git a/lustre/obdclass/dt_object.c b/lustre/obdclass/dt_object.c +index ee17b36c9b..ef1da47aed 100644 +--- a/lustre/obdclass/dt_object.c ++++ b/lustre/obdclass/dt_object.c +@@ -882,7 +882,7 @@ int dt_index_walk(const struct lu_env *env, struct dt_object *obj, + /* end of index */ + break; + } +- kunmap(rdpg->rp_pages[i]); ++ kunmap(rdpg->rp_pages[pageidx]); + 
} + + out: +@@ -980,6 +980,14 @@ int dt_index_read(const struct lu_env *env, struct dt_device *dev, + ii->ii_hash_end = II_END_OFF; + } + ++ /* ++ * For partial lu_idxpage filling of the end system page, ++ * init the header of the remain lu_idxpages. ++ */ ++ if (rc > 0) ++ dt_index_page_adjust(rdpg->rp_pages, rdpg->rp_npages, ++ ii->ii_count); ++ + GOTO(out, rc); + out: + dt_object_put(env, obj); +@@ -987,6 +995,47 @@ out: + } + EXPORT_SYMBOL(dt_index_read); + ++#if PAGE_SIZE > LU_PAGE_SIZE ++/* ++ * For partial lu_idxpage filling of the end system page, init the header of the ++ * remain lu_idxpages. So that the clients handle partial filling correctly. ++ * Current lu_idxpage read clients are osp_it_next_page(), ++ * nodemap_process_idx_pages() and qsd_reint_entries(). ++ */ ++void dt_index_page_adjust(struct page **pages, const u32 npages, ++ const size_t nlupgs) ++{ ++ u32 nlupgs_mod = nlupgs % LU_PAGE_COUNT; ++ u32 remain_nlupgs; ++ u32 pgidx; ++ struct lu_idxpage *lip; ++ union lu_page *lp; ++ int i; ++ ++ if (nlupgs_mod) { ++ pgidx = nlupgs / LU_PAGE_COUNT; ++ LASSERT(pgidx < npages); ++ lp = kmap(pages[pgidx]); ++ remain_nlupgs = LU_PAGE_COUNT - nlupgs_mod; ++ ++ /* initialize the header for the remain lu_pages */ ++ for (i = 0, lp += nlupgs_mod; i < remain_nlupgs; i++, lp++) { ++ lip = &lp->lp_idx; ++ memset(lip, 0, LIP_HDR_SIZE); ++ lip->lip_magic = LIP_MAGIC; ++ } ++ ++ kunmap(pages[pgidx]); ++ } ++} ++#else ++void dt_index_page_adjust(struct page **pages, const u32 npages, ++ const size_t nlupgs) ++{ ++} ++#endif ++EXPORT_SYMBOL(dt_index_page_adjust); ++ + #ifdef CONFIG_PROC_FS + int lprocfs_dt_blksize_seq_show(struct seq_file *m, void *v) + { +diff --git a/lustre/osp/osp_object.c b/lustre/osp/osp_object.c +index a2f245b801..a5a53a3405 100644 +--- a/lustre/osp/osp_object.c ++++ b/lustre/osp/osp_object.c +@@ -1970,21 +1970,22 @@ int osp_it_next_page(const struct lu_env *env, struct dt_it *di) + int i; + ENTRY; + +-again2: ++process_idxpage: + idxpage = 
it->ooi_cur_idxpage; + if (idxpage != NULL) { + if (idxpage->lip_nr == 0) +- RETURN(1); ++ goto finish_cur_idxpage; + + if (it->ooi_pos_ent < idxpage->lip_nr) { + CDEBUG(D_INFO, "ooi_pos %d nr %d\n", + (int)it->ooi_pos_ent, (int)idxpage->lip_nr); + RETURN(0); + } ++finish_cur_idxpage: + it->ooi_cur_idxpage = NULL; + it->ooi_pos_lu_page++; + +-again1: ++process_page: + if (it->ooi_pos_lu_page < LU_PAGE_COUNT) { + it->ooi_cur_idxpage = (void *)it->ooi_cur_page + + LU_PAGE_SIZE * it->ooi_pos_lu_page; +@@ -2005,19 +2006,19 @@ again1: + RETURN(-EINVAL); + } + it->ooi_pos_ent = -1; +- goto again2; ++ goto process_idxpage; + } + + kunmap(it->ooi_cur_page); + it->ooi_cur_page = NULL; + it->ooi_pos_page++; + +-again0: ++start: + pages = it->ooi_pages; + if (it->ooi_pos_page < it->ooi_valid_npages) { + it->ooi_cur_page = kmap(pages[it->ooi_pos_page]); + it->ooi_pos_lu_page = 0; +- goto again1; ++ goto process_page; + } + + for (i = 0; i < it->ooi_total_npages; i++) { +@@ -2041,7 +2042,7 @@ again0: + + rc = osp_it_fetch(env, it); + if (rc == 0) +- goto again0; ++ goto start; + + RETURN(rc); + } +diff --git a/lustre/ptlrpc/nodemap_storage.c b/lustre/ptlrpc/nodemap_storage.c +index 387c2026c3..9c9d95416d 100644 +--- a/lustre/ptlrpc/nodemap_storage.c ++++ b/lustre/ptlrpc/nodemap_storage.c +@@ -1503,6 +1503,14 @@ int nodemap_index_read(struct lu_env *env, + if (rc >= 0) + ii->ii_version = version; + ++ /* ++ * For partial lu_idxpage filling of the end system page, ++ * init the header of the remain lu_idxpages. 
++ */ ++ if (rc > 0) ++ dt_index_page_adjust(rdpg->rp_pages, rdpg->rp_npages, ++ ii->ii_count); ++ + dt_read_unlock(env, nodemap_idx); + return rc; + } +@@ -1565,6 +1573,7 @@ int nodemap_get_config_req(struct obd_device *mgs_obd, + nodemap_ii.ii_flags = II_FL_NOHASH; + nodemap_ii.ii_version = rqexp_ted->ted_nodemap_version; + nodemap_ii.ii_attrs = body->mcb_nm_cur_pass; ++ nodemap_ii.ii_count = 0; + + bytes = nodemap_index_read(req->rq_svc_thread->t_env, + mgs_obd->u.obt.obt_nodemap_config_file, +-- +2.33.0 + diff --git a/0046-ldiskfs-add-support-for-oe2003.patch b/0046-ldiskfs-add-support-for-oe2003.patch new file mode 100644 index 0000000000000000000000000000000000000000..70416f1ff726cd168f8736224a76869068b572a9 --- /dev/null +++ b/0046-ldiskfs-add-support-for-oe2003.patch @@ -0,0 +1,2512 @@ +From 7325c6cf0aeb80aeefe1cc864decc4ce02a9fe01 Mon Sep 17 00:00:00 2001 +From: Xinliang Liu +Date: Thu, 10 Aug 2023 09:34:02 +0000 +Subject: [PATCH 46/61] ldiskfs: add support for oe2003 + +Signed-off-by: Xinliang Liu +--- + config/lustre-build-ldiskfs.m4 | 6 + + config/lustre-build-linux.m4 | 16 +- + contrib/lbuild/funcs.sh | 1 + + contrib/lbuild/lbuild | 3 + + contrib/lbuild/lbuild-oe2003 | 1 + + .../patches/oe2003/ext4-filename-encode.patch | 407 ++++++++ + .../oe2003/ext4-mballoc-extra-checks.patch | 317 ++++++ + .../ext4-mballoc-pa-free-mismatch.patch | 125 +++ + .../oe2003/ext4-mballoc-prefetch.patch | 288 ++++++ + .../patches/oe2003/ext4-misc.patch | 210 ++++ + .../patches/oe2003/ext4-pdirop.patch | 907 ++++++++++++++++++ + .../series/ldiskfs-4.19.90-oe2003.series | 36 + + lustre/ChangeLog | 2 + + lustre/autoconf/lustre-core.m4 | 1 + + .../targets/4.19-oe2003sp3.target.in | 21 + + 15 files changed, 2340 insertions(+), 1 deletion(-) + create mode 120000 contrib/lbuild/lbuild-oe2003 + create mode 100644 ldiskfs/kernel_patches/patches/oe2003/ext4-filename-encode.patch + create mode 100644 ldiskfs/kernel_patches/patches/oe2003/ext4-mballoc-extra-checks.patch + create mode 
100644 ldiskfs/kernel_patches/patches/oe2003/ext4-mballoc-pa-free-mismatch.patch + create mode 100644 ldiskfs/kernel_patches/patches/oe2003/ext4-mballoc-prefetch.patch + create mode 100644 ldiskfs/kernel_patches/patches/oe2003/ext4-misc.patch + create mode 100644 ldiskfs/kernel_patches/patches/oe2003/ext4-pdirop.patch + create mode 100644 ldiskfs/kernel_patches/series/ldiskfs-4.19.90-oe2003.series + create mode 100644 lustre/kernel_patches/targets/4.19-oe2003sp3.target.in + +diff --git a/config/lustre-build-ldiskfs.m4 b/config/lustre-build-ldiskfs.m4 +index c225a6abaa..51ddcd07e1 100644 +--- a/config/lustre-build-ldiskfs.m4 ++++ b/config/lustre-build-ldiskfs.m4 +@@ -128,6 +128,12 @@ AS_IF([test x$RHEL_KERNEL = xyes], [ + 2203.0) LDISKFS_SERIES="5.10.0-oe2203.series" ;; + 2203.1|2203.2) LDISKFS_SERIES="5.10.0-oe2203sp1.series" ;; + esac ++ ++ AS_IF([test -z "$OPENEULER_VERSION_NO"], [ ++ case $LINUXRELEASE in ++ 4.19.90-*) LDISKFS_SERIES="4.19.90-oe2003.series" ;; ++ esac ++ ]) + ]) + ]) + # Not RHEL/SLES/openEuler or Ubuntu .. 
probably mainline +diff --git a/config/lustre-build-linux.m4 b/config/lustre-build-linux.m4 +index 1ff6c47856..6cdcba0b60 100644 +--- a/config/lustre-build-linux.m4 ++++ b/config/lustre-build-linux.m4 +@@ -172,7 +172,7 @@ AC_DEFUN([LB_LINUX_RELEASE], [ + ]) + ]) + +- # Check for openEuler ++ # Check for openEuler 22.03+ kernel which comes with an openEuler version number + AS_IF([test "x$KERNEL_FOUND" = "xno"], [ + AC_CACHE_CHECK([for openEuler kernel version number], lb_cv_openeuler_kernel_version, [ + lb_cv_openeuler_kernel_version="" +@@ -190,6 +190,20 @@ AC_DEFUN([LB_LINUX_RELEASE], [ + ]) + ]) + ++ # Check for openEuler 20.03 kernel which comes without an openEuler version number ++ AS_IF([test "x$KERNEL_FOUND" = "xno"], [ ++ AC_CACHE_CHECK([for openEuler kernel signature], lb_cv_openeuler_kernel_sig, [ ++ lb_cv_openeuler_kernel_sig="" ++ AS_IF([fgrep -q Euler $LINUX_OBJ/include/linux/kabi.h], [ ++ lb_cv_openeuler_kernel_sig="yes" ++ ]) ++ ]) ++ AS_IF([test "x$lb_cv_openeuler_kernel_sig" = "xyes"], [ ++ OPENEULER_KERNEL="yes" ++ KERNEL_FOUND="yes" ++ ]) ++ ]) ++ + # If still no kernel was found, a warning is issued + AS_IF([test "x$KERNEL_FOUND" = "xno"], [ + AC_MSG_WARN([Kernel Distro seems to be neither RedHat, SuSE, openEuler nor Ubuntu]) +diff --git a/contrib/lbuild/funcs.sh b/contrib/lbuild/funcs.sh +index 78389a5d96..7889ff072e 100644 +--- a/contrib/lbuild/funcs.sh ++++ b/contrib/lbuild/funcs.sh +@@ -221,6 +221,7 @@ autodetect_target() { + sles15.3) target="$(uname -r | cut -d . -f 1,2)-sles15sp3";; + sles15.4) target="$(uname -r | cut -d . 
-f 1,2)-sles15sp4";; + fc18) target="3.x-fc18";; ++ oe2003.sp3) target="4.19-oe2003sp3";; + oe2203) target="5.10-oe2203";; + oe2203.sp1) target="5.10-oe2203sp1";; + oe2203.sp2) target="5.10-oe2203sp2";; +diff --git a/contrib/lbuild/lbuild b/contrib/lbuild/lbuild +index e4ccb32879..f6f5b79cfc 100755 +--- a/contrib/lbuild/lbuild ++++ b/contrib/lbuild/lbuild +@@ -329,6 +329,9 @@ check_options() { + 3.0-sles11) + CANONICAL_TARGET="sles11" + ;; ++ 4.19-oe2003*) ++ CANONICAL_TARGET="oe2003" ++ ;; + 5.10-oe2203*) + CANONICAL_TARGET="oe2203" + ;; +diff --git a/contrib/lbuild/lbuild-oe2003 b/contrib/lbuild/lbuild-oe2003 +new file mode 120000 +index 0000000000..4f83bd2f91 +--- /dev/null ++++ b/contrib/lbuild/lbuild-oe2003 +@@ -0,0 +1 @@ ++lbuild-openeuler +\ No newline at end of file +diff --git a/ldiskfs/kernel_patches/patches/oe2003/ext4-filename-encode.patch b/ldiskfs/kernel_patches/patches/oe2003/ext4-filename-encode.patch +new file mode 100644 +index 0000000000..7a35072d3d +--- /dev/null ++++ b/ldiskfs/kernel_patches/patches/oe2003/ext4-filename-encode.patch +@@ -0,0 +1,407 @@ ++From b9fa1d821cddbc0bf9762fb1cbc7c765cf1ce433 Mon Sep 17 00:00:00 2001 ++From: Xinliang Liu ++Date: Thu, 10 Aug 2023 02:32:11 +0000 ++Subject: [PATCH] ext4 filename encode ++ ++Signed-off-by: Xinliang Liu ++--- ++ fs/ext4/critical_encode.h | 166 ++++++++++++++++++++++++++++++++++++++ ++ fs/ext4/dir.c | 27 +++++-- ++ fs/ext4/ialloc.c | 1 + ++ fs/ext4/namei.c | 49 ++++++++--- ++ 4 files changed, 224 insertions(+), 19 deletions(-) ++ create mode 100644 fs/ext4/critical_encode.h ++ ++diff --git a/fs/ext4/critical_encode.h b/fs/ext4/critical_encode.h ++new file mode 100644 ++index 0000000..71318cf ++--- /dev/null +++++ b/fs/ext4/critical_encode.h ++@@ -0,0 +1,166 @@ +++/* +++ * critical_encode.h +++ * +++ * Copyright (c) 2022 Whamcloud +++ */ +++ +++#ifndef _CRITICAL_ENCODE_H +++#define _CRITICAL_ENCODE_H +++ +++#include +++ +++/* Encoding/decoding routines inspired from yEnc principles. 
+++ * We just take care of a few critical characters: +++ * NULL, LF, CR, /, DEL and =. +++ * If such a char is found, it is replaced with '=' followed by +++ * the char value + 64. +++ * All other chars are left untouched. +++ * Efficiency of this encoding depends on the occurrences of the +++ * critical chars, but statistically on binary data it can be much higher +++ * than base64 for instance. +++ */ +++static inline int critical_encode(const u8 *src, int len, char *dst) +++{ +++ u8 *p = (u8 *)src, *q = dst; +++ +++ while (p - src < len) { +++ /* escape NULL, LF, CR, /, DEL and = */ +++ if (unlikely(*p == 0x0 || *p == 0xA || *p == 0xD || +++ *p == '/' || *p == 0x7F || *p == '=')) { +++ *(q++) = '='; +++ *(q++) = *(p++) + 64; +++ } else { +++ *(q++) = *(p++); +++ } +++ } +++ +++ return (char *)q - dst; +++} +++ +++/* returns the number of chars encoding would produce */ +++static inline int critical_chars(const u8 *src, int len) +++{ +++ u8 *p = (u8 *)src; +++ int newlen = len; +++ +++ while (p - src < len) { +++ /* NULL, LF, CR, /, DEL and = cost an additional '=' */ +++ if (unlikely(*p == 0x0 || *p == 0xA || *p == 0xD || +++ *p == '/' || *p == 0x7F || *p == '=')) +++ newlen++; +++ p++; +++ } +++ +++ return newlen; +++} +++ +++/* decoding routine - returns the number of chars in output */ +++static inline int critical_decode(const u8 *src, int len, char *dst) +++{ +++ u8 *p = (u8 *)src, *q = dst; +++ +++ while (p - src < len) { +++ if (unlikely(*p == '=')) { +++ *(q++) = *(++p) - 64; +++ p++; +++ } else { +++ *(q++) = *(p++); +++ } +++ } +++ +++ return (char *)q - dst; +++} +++ +++#define fscrypt_get_encryption_info(inode) \ +++ (unlikely(!IS_LUSTRE_MOUNT(inode->i_sb)) ? 
0 : -EOPNOTSUPP) +++ +++static inline int ext4_has_permitted_context(struct inode *parent, +++ struct inode *child) +++{ +++ if (unlikely(!IS_LUSTRE_MOUNT(parent->i_sb))) +++ return 1; +++ return fscrypt_has_permitted_context(parent, child); +++} +++ +++static inline int ext4_prepare_lookup(struct inode *dir, +++ struct dentry *dentry, +++ unsigned int flags) +++{ +++ if (unlikely(!IS_LUSTRE_MOUNT(dir->i_sb))) +++ return 0; +++ return fscrypt_prepare_lookup(dir, dentry, flags); +++} +++ +++static inline int ext4_fname_alloc_buffer(const struct inode *inode, +++ u32 max_encrypted_len, +++ struct fscrypt_str *crypto_str) +++{ +++ crypto_str->name = kmalloc(max_encrypted_len + 1, GFP_NOFS); +++ if (!crypto_str->name) +++ return -ENOMEM; +++ crypto_str->len = max_encrypted_len; +++ return 0; +++} +++ +++static inline void ext4_fname_free_buffer(struct fscrypt_str *crypto_str) +++{ +++ if (!crypto_str) +++ return; +++ kfree(crypto_str->name); +++ crypto_str->name = NULL; +++} +++ +++static inline int ext4_fname_disk_to_usr(struct inode *inode, +++ u32 hash, u32 minor_hash, +++ const struct fscrypt_str *iname, +++ struct fscrypt_str *oname) +++{ +++ int presented_len; +++ +++ presented_len = critical_encode(iname->name, iname->len, oname->name); +++ if (presented_len > NAME_MAX) { +++ /* truncate at NAME_MAX, +++ * or NAME_MAX-1 if name ends with '=' to avoid decoding issue +++ */ +++ presented_len = NAME_MAX; +++ if (oname->name[presented_len - 1] == '=') +++ presented_len--; +++ oname->len = presented_len; +++ } +++ oname->name[presented_len] = '\0'; +++ +++ return 0; +++} +++ +++static inline int ext4_setup_filename(struct inode *dir, +++ const struct qstr *iname, +++ int lookup, +++ struct ext4_filename *fname) +++{ +++ fname->usr_fname = iname; +++ +++ if (lookup && IS_ENCRYPTED(dir) && +++ unlikely(!IS_LUSTRE_MOUNT(dir->i_sb) && +++ strnchr(iname->name, iname->len, '='))) { +++ /* Only proceed to critical decode if +++ * iname contains escape char '='. 
+++ */ +++ int len = iname->len; +++ char *buf; +++ +++ buf = kmalloc(len, GFP_NOFS); +++ if (!buf) +++ return -ENOMEM; +++ +++ len = critical_decode(iname->name, len, buf); +++ fname->disk_name.name = (unsigned char *)buf; +++ fname->disk_name.len = len; +++ return 0; +++ } +++ +++ fname->disk_name.name = (unsigned char *) iname->name; +++ fname->disk_name.len = iname->len; +++ return 0; +++} +++ +++#endif /* _CRITICAL_ENCODE_H */ ++diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c ++index fe7149b..ed2ecb5 100644 ++--- a/fs/ext4/dir.c +++++ b/fs/ext4/dir.c ++@@ -28,6 +28,7 @@ ++ #include ++ #include "ext4.h" ++ #include "xattr.h" +++#include "critical_encode.h" ++ ++ static int ext4_dx_readdir(struct file *, struct dir_context *); ++ ++@@ -144,7 +145,8 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx) ++ return err; ++ } ++ ++- if (ext4_encrypted_inode(inode)) { +++ /* disable decryption of filename, present only escaped name */ +++ if (0 && ext4_encrypted_inode(inode)) { ++ err = fscrypt_fname_alloc_buffer(inode, EXT4_NAME_LEN, &fstr); ++ if (err < 0) ++ return err; ++@@ -258,22 +260,33 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx) ++ get_dtype(sb, de->file_type))) ++ goto done; ++ } else { ++- int save_len = fstr.len; ++ struct fscrypt_str de_name = ++ FSTR_INIT(de->name, ++ de->name_len); +++ int presented_len; ++ ++ /* Directory is encrypted */ ++- err = fscrypt_fname_disk_to_usr(inode, +++ presented_len = critical_chars(de->name, +++ de->name_len); +++ err = ext4_fname_alloc_buffer(inode, +++ presented_len, +++ &fstr); +++ if (err) +++ goto errout; +++ +++ err = ext4_fname_disk_to_usr(inode, ++ 0, 0, &de_name, &fstr); ++ de_name = fstr; ++- fstr.len = save_len; ++- if (err) +++ if (err) { +++ ext4_fname_free_buffer(&fstr); ++ goto errout; ++- if (!dir_emit(ctx, +++ } +++ err = dir_emit(ctx, ++ de_name.name, de_name.len, ++ le32_to_cpu(de->inode), ++- get_dtype(sb, de->file_type))) +++ get_dtype(sb, de->file_type)); +++ 
ext4_fname_free_buffer(&fstr); +++ if (!err) ++ goto done; ++ } ++ } ++diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c ++index 82a5bd6..2a592b3 100644 ++--- a/fs/ext4/ialloc.c +++++ b/fs/ext4/ialloc.c ++@@ -30,6 +30,7 @@ ++ #include "ext4_jbd2.h" ++ #include "xattr.h" ++ #include "acl.h" +++#include "critical_encode.h" ++ ++ #include ++ ++diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c ++index 9a3db3c..7e22eae 100644 ++--- a/fs/ext4/namei.c +++++ b/fs/ext4/namei.c ++@@ -40,6 +40,7 @@ ++ ++ #include "xattr.h" ++ #include "acl.h" +++#include "critical_encode.h" ++ ++ #include ++ /* ++@@ -1429,22 +1430,31 @@ static int htree_dirblock_to_tree(struct file *dir_file, ++ hinfo->hash, hinfo->minor_hash, de, ++ &tmp_str); ++ } else { ++- int save_len = fname_crypto_str.len; ++ struct fscrypt_str de_name = FSTR_INIT(de->name, ++ de->name_len); +++ int presented_len; ++ ++ /* Directory is encrypted */ ++- err = fscrypt_fname_disk_to_usr(dir, hinfo->hash, +++ presented_len = critical_chars(de->name, de->name_len); +++ err = ext4_fname_alloc_buffer(dir, presented_len, +++ &fname_crypto_str); +++ if (err) { +++ count = err; +++ goto errout; +++ } +++ +++ err = ext4_fname_disk_to_usr(dir, hinfo->hash, ++ hinfo->minor_hash, &de_name, ++ &fname_crypto_str); ++ if (err) { +++ ext4_fname_free_buffer(&fname_crypto_str); ++ count = err; ++ goto errout; ++ } ++ err = ext4_htree_store_dirent(dir_file, ++ hinfo->hash, hinfo->minor_hash, de, ++ &fname_crypto_str); ++- fname_crypto_str.len = save_len; +++ ext4_fname_free_buffer(&fname_crypto_str); ++ } ++ if (err != 0) { ++ count = err; ++@@ -1672,7 +1682,7 @@ static void dx_insert_block(struct dx_frame *frame, u32 hash, ext4_lblk_t block) ++ * Return: %true if the directory entry matches, otherwise %false. 
++ */ ++ static inline bool ext4_match(const struct ext4_filename *fname, ++- const struct ext4_dir_entry_2 *de) +++ const struct ext4_dir_entry_2 *de, int denamelen) ++ { ++ struct fscrypt_name f; ++ ++@@ -1684,7 +1694,7 @@ static inline bool ext4_match(const struct ext4_filename *fname, ++ #ifdef CONFIG_EXT4_FS_ENCRYPTION ++ f.crypto_buf = fname->crypto_buf; ++ #endif ++- return fscrypt_match_name(&f, de->name, de->name_len); +++ return fscrypt_match_name(&f, de->name, denamelen); ++ } ++ ++ /* ++@@ -1695,16 +1705,30 @@ int ext4_search_dir(struct buffer_head *bh, char *search_buf, int buf_size, ++ unsigned int offset, struct ext4_dir_entry_2 **res_dir) ++ { ++ struct ext4_dir_entry_2 * de; +++ bool probablytrunc; ++ char * dlimit; ++- int de_len; +++ int de_len, denamelen; ++ ++ de = (struct ext4_dir_entry_2 *)search_buf; ++ dlimit = search_buf + buf_size; +++ /* fname is probably truncated if it is the decoded representation of +++ * an encrypted filename not aligned on a 32-byte boundary +++ */ +++ probablytrunc = !IS_LUSTRE_MOUNT(dir->i_sb) && IS_ENCRYPTED(dir) && +++ fname->disk_name.len & 31; ++ while ((char *) de < dlimit - EXT4_BASE_DIR_LEN) { ++ /* this code is executed quadratically often */ ++ /* do minimal checking `by hand' */ +++ denamelen = de->name_len; +++ if (unlikely(probablytrunc) && +++ de->name_len > fname->disk_name.len) +++ /* Adjust name len to look for a partial match. +++ * Since it is binary encrypted names, there +++ * should not be any collision between names. 
+++ */ +++ denamelen = fname->disk_name.len; ++ if (de->name + de->name_len <= dlimit && ++- ext4_match(fname, de)) { +++ ext4_match(fname, de, denamelen)) { ++ /* found a match - just to be sure, do ++ * a full check */ ++ if (ext4_check_dir_entry(dir, NULL, de, bh, search_buf, ++@@ -1775,7 +1799,7 @@ struct buffer_head *__ext4_find_entry(struct inode *dir, ++ if (namelen > EXT4_NAME_LEN) ++ return NULL; ++ ++- retval = ext4_fname_setup_filename(dir, d_name, 1, &fname); +++ retval = ext4_setup_filename(dir, d_name, 1, &fname); ++ if (retval == -ENOENT) ++ return NULL; ++ if (retval) ++@@ -1902,7 +1926,8 @@ cleanup_and_exit: ++ /* Clean up the read-ahead blocks */ ++ for (; ra_ptr < ra_max; ra_ptr++) ++ brelse(bh_use[ra_ptr]); ++- ext4_fname_free_filename(&fname); +++ if (fname.disk_name.name != d_name->name) +++ kfree(fname.disk_name.name); ++ return ret; ++ } ++ EXPORT_SYMBOL(__ext4_find_entry); ++@@ -1968,7 +1993,7 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi ++ struct buffer_head *bh; ++ int err; ++ ++- err = fscrypt_prepare_lookup(dir, dentry, flags); +++ err = ext4_prepare_lookup(dir, dentry, flags); ++ if (err) ++ return ERR_PTR(err); ++ ++@@ -2000,7 +2025,7 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi ++ } ++ if (!IS_ERR(inode) && ext4_encrypted_inode(dir) && ++ (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) && ++- !fscrypt_has_permitted_context(dir, inode)) { +++ !ext4_has_permitted_context(dir, inode)) { ++ ext4_warning(inode->i_sb, ++ "Inconsistent encryption contexts: %lu/%lu", ++ dir->i_ino, inode->i_ino); ++@@ -2277,7 +2302,7 @@ int ext4_find_dest_de(struct inode *dir, struct inode *inode, ++ if (ext4_check_dir_entry(dir, NULL, de, bh, ++ buf, buf_size, offset)) ++ return -EFSCORRUPTED; ++- if (ext4_match(fname, de)) +++ if (ext4_match(fname, de, de->name_len)) ++ return -EEXIST; ++ nlen = EXT4_DIR_ENTRY_LEN(de); ++ rlen = ext4_rec_len_from_disk(de->rec_len, buf_size); 
++-- ++2.27.0 ++ +diff --git a/ldiskfs/kernel_patches/patches/oe2003/ext4-mballoc-extra-checks.patch b/ldiskfs/kernel_patches/patches/oe2003/ext4-mballoc-extra-checks.patch +new file mode 100644 +index 0000000000..d483d6892e +--- /dev/null ++++ b/ldiskfs/kernel_patches/patches/oe2003/ext4-mballoc-extra-checks.patch +@@ -0,0 +1,317 @@ ++From 5b0eb8877650160e4f7357be60cd35a2ab2f2e01 Mon Sep 17 00:00:00 2001 ++From: Xinliang Liu ++Date: Wed, 9 Aug 2023 10:39:28 +0000 ++Subject: [PATCH] ext4 mballoc extra checks ++ ++Signed-off-by: Xinliang Liu ++--- ++ fs/ext4/ext4.h | 1 + ++ fs/ext4/mballoc.c | 103 ++++++++++++++++++++++++++++++++++++++++------ ++ fs/ext4/mballoc.h | 2 +- ++ 3 files changed, 93 insertions(+), 13 deletions(-) ++ ++diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h ++index b9757e2..a488545 100644 ++--- a/fs/ext4/ext4.h +++++ b/fs/ext4/ext4.h ++@@ -3037,6 +3037,7 @@ struct ext4_group_info { ++ ext4_grpblk_t bb_fragments; /* nr of freespace fragments */ ++ ext4_grpblk_t bb_largest_free_order;/* order of largest frag in BG */ ++ struct list_head bb_prealloc_list; +++ unsigned long bb_prealloc_nr; ++ #ifdef DOUBLE_CHECK ++ void *bb_bitmap; ++ #endif ++diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c ++index f0797b0..e6b94cd 100644 ++--- a/fs/ext4/mballoc.c +++++ b/fs/ext4/mballoc.c ++@@ -352,7 +352,7 @@ static const char * const ext4_groupinfo_slab_names[NR_GRPINFO_CACHES] = { ++ "ext4_groupinfo_64k", "ext4_groupinfo_128k" ++ }; ++ ++-static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, +++static int ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, ++ ext4_group_t group); ++ static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, ++ ext4_group_t group); ++@@ -708,7 +708,7 @@ mb_set_largest_free_order(struct super_block *sb, struct ext4_group_info *grp) ++ } ++ ++ static noinline_for_stack ++-void ext4_mb_generate_buddy(struct super_block *sb, +++int ext4_mb_generate_buddy(struct super_block *sb, ++ 
void *buddy, void *bitmap, ext4_group_t group) ++ { ++ struct ext4_group_info *grp = ext4_get_group_info(sb, group); ++@@ -752,6 +752,7 @@ void ext4_mb_generate_buddy(struct super_block *sb, ++ grp->bb_free = free; ++ ext4_mark_group_bitmap_corrupted(sb, group, ++ EXT4_GROUP_INFO_BBITMAP_CORRUPT); +++ return -EIO; ++ } ++ mb_set_largest_free_order(sb, grp); ++ ++@@ -762,6 +763,8 @@ void ext4_mb_generate_buddy(struct super_block *sb, ++ sbi->s_mb_buddies_generated++; ++ sbi->s_mb_generation_time += period; ++ spin_unlock(&sbi->s_bal_lock); +++ +++ return 0; ++ } ++ ++ static void mb_regenerate_buddy(struct ext4_buddy *e4b) ++@@ -882,7 +885,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp) ++ } ++ ++ first_block = page->index * blocks_per_page; ++- for (i = 0; i < blocks_per_page; i++) { +++ for (i = 0; i < blocks_per_page && err == 0; i++) { ++ group = (first_block + i) >> 1; ++ if (group >= ngroups) ++ break; ++@@ -926,7 +929,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp) ++ ext4_lock_group(sb, group); ++ /* init the buddy */ ++ memset(data, 0xff, blocksize); ++- ext4_mb_generate_buddy(sb, data, incore, group); +++ err = ext4_mb_generate_buddy(sb, data, incore, group); ++ ext4_unlock_group(sb, group); ++ incore = NULL; ++ } else { ++@@ -941,7 +944,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp) ++ memcpy(data, bitmap, blocksize); ++ ++ /* mark all preallocated blks used in in-core bitmap */ ++- ext4_mb_generate_from_pa(sb, data, group); +++ err = ext4_mb_generate_from_pa(sb, data, group); ++ ext4_mb_generate_from_freelist(sb, data, group); ++ ext4_unlock_group(sb, group); ++ ++@@ -951,7 +954,8 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp) ++ incore = data; ++ } ++ } ++- SetPageUptodate(page); +++ if (likely(err == 0)) +++ SetPageUptodate(page); ++ ++ out: ++ if (bh) { ++@@ -2290,9 +2294,11 @@ static void *ext4_mb_seq_groups_next(struct seq_file 
*seq, void *v, loff_t *pos) ++ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) ++ { ++ struct super_block *sb = PDE_DATA(file_inode(seq->file)); +++ struct ext4_group_desc *gdp; ++ ext4_group_t group = (ext4_group_t) ((unsigned long) v); ++ int i; ++ int err, buddy_loaded = 0; +++ int free = 0; ++ struct ext4_buddy e4b; ++ struct ext4_group_info *grinfo; ++ unsigned char blocksize_bits = min_t(unsigned char, ++@@ -2305,7 +2311,7 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) ++ ++ group--; ++ if (group == 0) ++- seq_puts(seq, "#group: free frags first [" +++ seq_puts(seq, "#group: bfree gfree frags first pa [" ++ " 2^0 2^1 2^2 2^3 2^4 2^5 2^6 " ++ " 2^7 2^8 2^9 2^10 2^11 2^12 2^13 ]\n"); ++ ++@@ -2323,13 +2329,19 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) ++ buddy_loaded = 1; ++ } ++ +++ gdp = ext4_get_group_desc(sb, group, NULL); +++ if (gdp != NULL) +++ free = ext4_free_group_clusters(sb, gdp); +++ ++ memcpy(&sg, ext4_get_group_info(sb, group), i); ++ ++ if (buddy_loaded) ++ ext4_mb_unload_buddy(&e4b); ++ ++- seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg.info.bb_free, ++- sg.info.bb_fragments, sg.info.bb_first_free); +++ seq_printf(seq, "#%-5lu: %-5u %-5u %-5u %-5u %-5lu [", +++ (long unsigned int)group, sg.info.bb_free, free, +++ sg.info.bb_fragments, sg.info.bb_first_free, +++ sg.info.bb_prealloc_nr); ++ for (i = 0; i <= 13; i++) ++ seq_printf(seq, " %-5u", i <= blocksize_bits + 1 ? ++ sg.info.bb_counters[i] : 0); ++@@ -3668,23 +3680,72 @@ static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, ++ return; ++ } ++ +++/* +++ * check free blocks in bitmap match free block in group descriptor +++ * do this before taking preallocated blocks into account to be able +++ * to detect on-disk corruptions. The group lock should be hold by the +++ * caller. 
+++ */ +++int ext4_mb_check_ondisk_bitmap(struct super_block *sb, void *bitmap, +++ struct ext4_group_desc *gdp, int group) +++{ +++ unsigned short max = EXT4_CLUSTERS_PER_GROUP(sb); +++ unsigned short i, first, free = 0; +++ unsigned short free_in_gdp = ext4_free_group_clusters(sb, gdp); +++ +++ if (free_in_gdp == 0 && gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) +++ return 0; +++ +++ i = mb_find_next_zero_bit(bitmap, max, 0); +++ +++ while (i < max) { +++ first = i; +++ i = mb_find_next_bit(bitmap, max, i); +++ if (i > max) +++ i = max; +++ free += i - first; +++ if (i < max) +++ i = mb_find_next_zero_bit(bitmap, max, i); +++ } +++ +++ if (free != free_in_gdp) { +++ ext4_error(sb, "on-disk bitmap for group %d" +++ "corrupted: %u blocks free in bitmap, %u - in gd\n", +++ group, free, free_in_gdp); +++ return -EIO; +++ } +++ return 0; +++} +++ ++ /* ++ * the function goes through all preallocation in this group and marks them ++ * used in in-core bitmap. buddy must be generated from this bitmap ++ * Need to be called with ext4 group lock held ++ */ ++ static noinline_for_stack ++-void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, +++int ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, ++ ext4_group_t group) ++ { ++ struct ext4_group_info *grp = ext4_get_group_info(sb, group); ++ struct ext4_prealloc_space *pa; +++ struct ext4_group_desc *gdp; ++ struct list_head *cur; ++ ext4_group_t groupnr; ++ ext4_grpblk_t start; ++ int preallocated = 0; +++ int skip = 0, count = 0; +++ int err; ++ int len; ++ +++ gdp = ext4_get_group_desc(sb, group, NULL); +++ if (gdp == NULL) +++ return -EIO; +++ +++ /* before applying preallocations, check bitmap consistency */ +++ err = ext4_mb_check_ondisk_bitmap(sb, bitmap, gdp, group); +++ if (err) +++ return err; +++ ++ /* all form of preallocation discards first load group, ++ * so the only competing code is preallocation use. 
++ * we don't need any locking here ++@@ -3700,13 +3761,23 @@ void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, ++ &groupnr, &start); ++ len = pa->pa_len; ++ spin_unlock(&pa->pa_lock); ++- if (unlikely(len == 0)) +++ if (unlikely(len == 0)) { +++ skip++; ++ continue; +++ } ++ BUG_ON(groupnr != group); ++ ext4_set_bits(bitmap, start, len); ++ preallocated += len; +++ count++; ++ } +++ if (count + skip != grp->bb_prealloc_nr) { +++ ext4_error(sb, "lost preallocations: " +++ "count %d, bb_prealloc_nr %lu, skip %d\n", +++ count, grp->bb_prealloc_nr, skip); +++ return -EIO; +++ } ++ mb_debug(1, "preallocated %u for group %u\n", preallocated, group); +++ return 0; ++ } ++ ++ static void ext4_mb_pa_callback(struct rcu_head *head) ++@@ -3770,6 +3841,7 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac, ++ */ ++ ext4_lock_group(sb, grp); ++ list_del(&pa->pa_group_list); +++ ext4_get_group_info(sb, grp)->bb_prealloc_nr--; ++ ext4_unlock_group(sb, grp); ++ ++ spin_lock(pa->pa_obj_lock); ++@@ -3864,6 +3936,7 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) ++ ++ ext4_lock_group(sb, ac->ac_b_ex.fe_group); ++ list_add(&pa->pa_group_list, &grp->bb_prealloc_list); +++ grp->bb_prealloc_nr++; ++ ext4_unlock_group(sb, ac->ac_b_ex.fe_group); ++ ++ spin_lock(pa->pa_obj_lock); ++@@ -3925,6 +3998,7 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac) ++ ++ ext4_lock_group(sb, ac->ac_b_ex.fe_group); ++ list_add(&pa->pa_group_list, &grp->bb_prealloc_list); +++ grp->bb_prealloc_nr++; ++ ext4_unlock_group(sb, ac->ac_b_ex.fe_group); ++ ++ /* ++@@ -4097,6 +4171,8 @@ repeat: ++ ++ spin_unlock(&pa->pa_lock); ++ +++ BUG_ON(grp->bb_prealloc_nr == 0); +++ grp->bb_prealloc_nr--; ++ list_del(&pa->pa_group_list); ++ list_add(&pa->u.pa_tmp_list, &list); ++ } ++@@ -4227,7 +4303,7 @@ repeat: ++ if (err) { ++ ext4_error_err(sb, -err, "Error %d loading buddy information for %u", ++ err, group); ++- continue; +++ return; ++ } ++ ++ bitmap_bh = 
ext4_read_block_bitmap(sb, group); ++@@ -4240,6 +4316,8 @@ repeat: ++ } ++ ++ ext4_lock_group(sb, group); +++ BUG_ON(e4b.bd_info->bb_prealloc_nr == 0); +++ e4b.bd_info->bb_prealloc_nr--; ++ list_del(&pa->pa_group_list); ++ ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa); ++ ext4_unlock_group(sb, group); ++@@ -4501,6 +4579,7 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb, ++ } ++ ext4_lock_group(sb, group); ++ list_del(&pa->pa_group_list); +++ ext4_get_group_info(sb, group)->bb_prealloc_nr--; ++ ext4_mb_release_group_pa(&e4b, pa); ++ ext4_unlock_group(sb, group); ++ ++diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h ++index 88c98f1..8325ad9 100644 ++--- a/fs/ext4/mballoc.h +++++ b/fs/ext4/mballoc.h ++@@ -70,7 +70,7 @@ do { \ ++ /* ++ * for which requests use 2^N search using buddies ++ */ ++-#define MB_DEFAULT_ORDER2_REQS 2 +++#define MB_DEFAULT_ORDER2_REQS 8 ++ ++ /* ++ * default group prealloc size 512 blocks ++-- ++2.27.0 ++ +diff --git a/ldiskfs/kernel_patches/patches/oe2003/ext4-mballoc-pa-free-mismatch.patch b/ldiskfs/kernel_patches/patches/oe2003/ext4-mballoc-pa-free-mismatch.patch +new file mode 100644 +index 0000000000..852bae0e29 +--- /dev/null ++++ b/ldiskfs/kernel_patches/patches/oe2003/ext4-mballoc-pa-free-mismatch.patch +@@ -0,0 +1,125 @@ ++From 697cee365004fd4f0438c265895d23320007603e Mon Sep 17 00:00:00 2001 ++From: Xinliang Liu ++Date: Wed, 9 Aug 2023 10:54:47 +0000 ++Subject: [PATCH] ext4 mballoc pa free mismatch ++ ++Signed-off-by: Xinliang Liu ++--- ++ fs/ext4/mballoc.c | 44 ++++++++++++++++++++++++++++++++++++++------ ++ fs/ext4/mballoc.h | 2 ++ ++ 2 files changed, 40 insertions(+), 6 deletions(-) ++ ++diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c ++index e6b94cd..3afef88 100644 ++--- a/fs/ext4/mballoc.c +++++ b/fs/ext4/mballoc.c ++@@ -3920,6 +3920,7 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) ++ INIT_LIST_HEAD(&pa->pa_group_list); ++ pa->pa_deleted = 0; ++ pa->pa_type = MB_INODE_PA; +++ pa->pa_error = 0; ++ ++ 
mb_debug(1, "new inode pa %p: %llu/%u for %u\n", pa, ++ pa->pa_pstart, pa->pa_len, pa->pa_lstart); ++@@ -3981,6 +3982,7 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac) ++ INIT_LIST_HEAD(&pa->pa_group_list); ++ pa->pa_deleted = 0; ++ pa->pa_type = MB_GROUP_PA; +++ pa->pa_error = 0; ++ ++ mb_debug(1, "new group pa %p: %llu/%u for %u\n", pa, ++ pa->pa_pstart, pa->pa_len, pa->pa_lstart); ++@@ -4040,7 +4042,9 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, ++ unsigned long long grp_blk_start; ++ int free = 0; ++ +++ assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group)); ++ BUG_ON(pa->pa_deleted == 0); +++ BUG_ON(pa->pa_inode == NULL); ++ ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit); ++ grp_blk_start = pa->pa_pstart - EXT4_C2B(sbi, bit); ++ BUG_ON(group != e4b->bd_group && pa->pa_len != 0); ++@@ -4063,12 +4067,19 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, ++ mb_free_blocks(pa->pa_inode, e4b, bit, next - bit); ++ bit = next + 1; ++ } ++- if (free != pa->pa_free) { ++- ext4_msg(e4b->bd_sb, KERN_CRIT, ++- "pa %p: logic %lu, phys. 
%lu, len %lu", ++- pa, (unsigned long) pa->pa_lstart, ++- (unsigned long) pa->pa_pstart, ++- (unsigned long) pa->pa_len); +++ +++ /* "free < pa->pa_free" means we maybe double alloc the same blocks, +++ * otherwise maybe leave some free blocks unavailable, no need to BUG.*/ +++ if (((free > pa->pa_free && !pa->pa_error) || (free < pa->pa_free)) && +++ atomic_read(&sb->s_active) > 0) { +++ ext4_error(sb, "pa free mismatch: [pa %p] " +++ "[phy %lu] [logic %lu] [len %u] [free %u] " +++ "[error %u] [inode %lu] [freed %u]", pa, +++ (unsigned long)pa->pa_pstart, +++ (unsigned long)pa->pa_lstart, +++ (unsigned)pa->pa_len, (unsigned)pa->pa_free, +++ (unsigned)pa->pa_error, pa->pa_inode->i_ino, +++ free); ++ ext4_grp_locked_error(sb, group, 0, 0, "free %u, pa_free %u", ++ free, pa->pa_free); ++ /* ++@@ -4076,6 +4087,8 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, ++ * from the bitmap and continue. ++ */ ++ } +++ /* do not verify if the file system is being umounted */ +++ BUG_ON(pa->pa_free != free && atomic_read(&sb->s_active) > 0); ++ atomic_add(free, &sbi->s_mb_discarded); ++ ++ return 0; ++@@ -4822,6 +4835,25 @@ errout: ++ ac->ac_b_ex.fe_len = 0; ++ ar->len = 0; ++ ext4_mb_show_ac(ac); +++ if (ac->ac_pa) { +++ struct ext4_prealloc_space *pa = ac->ac_pa; +++ +++ /* We can not make sure whether the bitmap has +++ * been updated or not when fail case. So can +++ * not revert pa_free back, just mark pa_error*/ +++ pa->pa_error++; +++ ext4_error(sb, +++ "Updating bitmap error: [err %d] " +++ "[pa %p] [phy %lu] [logic %lu] " +++ "[len %u] [free %u] [error %u] " +++ "[inode %lu]", *errp, pa, +++ (unsigned long)pa->pa_pstart, +++ (unsigned long)pa->pa_lstart, +++ (unsigned)pa->pa_len, +++ (unsigned)pa->pa_free, +++ (unsigned)pa->pa_error, +++ pa->pa_inode ? 
pa->pa_inode->i_ino : 0); +++ } ++ } ++ ext4_mb_release_context(ac); ++ out: ++diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h ++index 8325ad9..e00c3b7 100644 ++--- a/fs/ext4/mballoc.h +++++ b/fs/ext4/mballoc.h ++@@ -20,6 +20,7 @@ ++ #include ++ #include ++ #include +++#include ++ #include "ext4_jbd2.h" ++ #include "ext4.h" ++ ++@@ -111,6 +112,7 @@ struct ext4_prealloc_space { ++ ext4_grpblk_t pa_len; /* len of preallocated chunk */ ++ ext4_grpblk_t pa_free; /* how many blocks are free */ ++ unsigned short pa_type; /* pa type. inode or group */ +++ unsigned short pa_error; ++ spinlock_t *pa_obj_lock; ++ struct inode *pa_inode; /* hack, for history only */ ++ }; ++-- ++2.27.0 ++ +diff --git a/ldiskfs/kernel_patches/patches/oe2003/ext4-mballoc-prefetch.patch b/ldiskfs/kernel_patches/patches/oe2003/ext4-mballoc-prefetch.patch +new file mode 100644 +index 0000000000..2506b56895 +--- /dev/null ++++ b/ldiskfs/kernel_patches/patches/oe2003/ext4-mballoc-prefetch.patch +@@ -0,0 +1,288 @@ ++From 9e2e2f2b16f9693deb69b98e15d72112b75cc086 Mon Sep 17 00:00:00 2001 ++From: Xinliang Liu ++Date: Thu, 10 Aug 2023 02:06:24 +0000 ++Subject: [PATCH] ext4 mballoc prefetch ++ ++Signed-off-by: Xinliang Liu ++--- ++ fs/ext4/balloc.c | 12 ++++- ++ fs/ext4/ext4.h | 10 ++++- ++ fs/ext4/mballoc.c | 111 +++++++++++++++++++++++++++++++++++++++++++++- ++ fs/ext4/mballoc.h | 2 + ++ fs/ext4/sysfs.c | 4 ++ ++ 5 files changed, 135 insertions(+), 4 deletions(-) ++ ++diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c ++index bd8a47e..504833d 100644 ++--- a/fs/ext4/balloc.c +++++ b/fs/ext4/balloc.c ++@@ -437,7 +437,8 @@ verified: ++ * Return buffer_head on success or NULL in case of failure. 
++ */ ++ struct buffer_head * ++-ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group) +++ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group, +++ int ignore_locked) ++ { ++ struct ext4_group_desc *desc; ++ struct ext4_sb_info *sbi = EXT4_SB(sb); ++@@ -468,6 +469,13 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group) ++ if (bitmap_uptodate(bh)) ++ goto verify; ++ +++ if (ignore_locked && buffer_locked(bh)) { +++ /* buffer under IO already, do not wait +++ * if called for prefetching */ +++ put_bh(bh); +++ return NULL; +++ } +++ ++ lock_buffer(bh); ++ if (bitmap_uptodate(bh)) { ++ unlock_buffer(bh); ++@@ -555,7 +563,7 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group) ++ struct buffer_head *bh; ++ int err; ++ ++- bh = ext4_read_block_bitmap_nowait(sb, block_group); +++ bh = ext4_read_block_bitmap_nowait(sb, block_group, 0); ++ if (IS_ERR(bh)) ++ return bh; ++ err = ext4_wait_block_bitmap(sb, block_group, bh); ++diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h ++index 5f0dd85..42f8f7d 100644 ++--- a/fs/ext4/ext4.h +++++ b/fs/ext4/ext4.h ++@@ -1481,6 +1481,8 @@ struct ext4_sb_info { ++ /* where last allocation was done - for stream allocation */ ++ unsigned long s_mb_last_group; ++ unsigned long s_mb_last_start; +++ unsigned int s_mb_prefetch; +++ unsigned int s_mb_prefetch_limit; ++ ++ /* stats for buddy allocator */ ++ atomic_t s_bal_reqs; /* number of reqs with len > 1 */ ++@@ -2529,7 +2531,8 @@ extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb, ++ extern int ext4_should_retry_alloc(struct super_block *sb, int *retries); ++ ++ extern struct buffer_head *ext4_read_block_bitmap_nowait(struct super_block *sb, ++- ext4_group_t block_group); +++ ext4_group_t block_group, +++ int ignore_locked); ++ extern int ext4_wait_block_bitmap(struct super_block *sb, ++ ext4_group_t block_group, ++ struct buffer_head *bh); ++@@ -3211,6 +3214,7 @@ struct 
ext4_group_info { ++ (1 << EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT) ++ #define EXT4_GROUP_INFO_IBITMAP_CORRUPT \ ++ (1 << EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT) +++#define EXT4_GROUP_INFO_BBITMAP_READ_BIT 4 ++ ++ #define EXT4_MB_GRP_NEED_INIT(grp) \ ++ (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state))) ++@@ -3225,6 +3229,10 @@ struct ext4_group_info { ++ (set_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state))) ++ #define EXT4_MB_GRP_CLEAR_TRIMMED(grp) \ ++ (clear_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state))) +++#define EXT4_MB_GRP_TEST(grp) \ +++ (test_bit(EXT4_GROUP_INFO_BBITMAP_READ_BIT, &((grp)->bb_state))) +++#define EXT4_MB_GRP_TEST_AND_SET_READ(grp) \ +++ (test_and_set_bit(EXT4_GROUP_INFO_BBITMAP_READ_BIT, &((grp)->bb_state))) ++ ++ #define EXT4_MAX_CONTENTION 8 ++ #define EXT4_CONTENTION_THRESHOLD 2 ++diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c ++index 5868b3f..e827c0c 100644 ++--- a/fs/ext4/mballoc.c +++++ b/fs/ext4/mballoc.c ++@@ -868,7 +868,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp) ++ bh[i] = NULL; ++ continue; ++ } ++- bh[i] = ext4_read_block_bitmap_nowait(sb, group); +++ bh[i] = ext4_read_block_bitmap_nowait(sb, group, 0); ++ if (IS_ERR(bh[i])) { ++ err = PTR_ERR(bh[i]); ++ bh[i] = NULL; ++@@ -2151,6 +2151,92 @@ static u64 available_blocks_count(struct ext4_sb_info *sbi) ++ return bfree - (ext4_r_blocks_count(es) + resv_blocks); ++ } ++ +++/* +++ * each allocation context (i.e. a thread doing allocation) has own +++ * sliding prefetch window of @s_mb_prefetch size which starts at the +++ * very first goal and moves ahead of scanning. +++ * a side effect is that subsequent allocations will likely find +++ * the bitmaps in cache or at least in-flight. 
+++ */ +++static void +++ext4_mb_prefetch(struct ext4_allocation_context *ac, +++ ext4_group_t start) +++{ +++ struct super_block *sb = ac->ac_sb; +++ ext4_group_t ngroups = ext4_get_groups_count(sb); +++ struct ext4_sb_info *sbi = EXT4_SB(sb); +++ struct ext4_group_info *grp; +++ ext4_group_t group = start; +++ struct buffer_head *bh; +++ int nr; +++ +++ /* limit prefetching at cr=0, otherwise mballoc can +++ * spend a lot of time loading imperfect groups */ +++ if (ac->ac_criteria < 2 && ac->ac_prefetch_ios >= sbi->s_mb_prefetch_limit) +++ return; +++ +++ /* batch prefetching to get few READs in flight */ +++ nr = ac->ac_prefetch - group; +++ if (ac->ac_prefetch < group) +++ /* wrapped to the first groups */ +++ nr += ngroups; +++ if (nr > 0) +++ return; +++ BUG_ON(nr < 0); +++ +++ nr = sbi->s_mb_prefetch; +++ if (ext4_has_feature_flex_bg(sb)) { +++ /* align to flex_bg to get more bitmas with a single IO */ +++ nr = (group / sbi->s_mb_prefetch) * sbi->s_mb_prefetch; +++ nr = nr + sbi->s_mb_prefetch - group; +++ } +++ while (nr-- > 0) { +++ grp = ext4_get_group_info(sb, group); +++ /* prevent expensive getblk() on groups w/ IO in progress */ +++ if (EXT4_MB_GRP_TEST(grp) || EXT4_MB_GRP_TEST_AND_SET_READ(grp)) +++ goto next; +++ +++ /* ignore empty groups - those will be skipped +++ * during the scanning as well */ +++ if (grp->bb_free > 0 && EXT4_MB_GRP_NEED_INIT(grp)) { +++ bh = ext4_read_block_bitmap_nowait(sb, group, 1); +++ if (bh && !IS_ERR(bh)) { +++ if (!buffer_uptodate(bh)) +++ ac->ac_prefetch_ios++; +++ brelse(bh); +++ } +++ } +++next: +++ if (++group >= ngroups) +++ group = 0; +++ } +++ ac->ac_prefetch = group; +++} +++ +++static void +++ext4_mb_prefetch_fini(struct ext4_allocation_context *ac) +++{ +++ struct ext4_group_info *grp; +++ ext4_group_t group; +++ int nr, rc; +++ +++ /* initialize last window of prefetched groups */ +++ nr = ac->ac_prefetch_ios; +++ if (nr > EXT4_SB(ac->ac_sb)->s_mb_prefetch) +++ nr = EXT4_SB(ac->ac_sb)->s_mb_prefetch; +++ 
group = ac->ac_prefetch; +++ while (nr-- > 0) { +++ grp = ext4_get_group_info(ac->ac_sb, group); +++ if (grp->bb_free > 0 && EXT4_MB_GRP_NEED_INIT(grp)) { +++ rc = ext4_mb_init_group(ac->ac_sb, group, GFP_NOFS); +++ if (rc) +++ break; +++ } +++ if (group-- == 0) +++ group = ext4_get_groups_count(ac->ac_sb) - 1; +++ } +++} +++ ++ static noinline_for_stack int ++ ext4_mb_regular_allocator(struct ext4_allocation_context *ac) ++ { ++@@ -2240,6 +2326,7 @@ repeat: ++ * from the goal value specified ++ */ ++ group = ac->ac_g_ex.fe_group; +++ ac->ac_prefetch = group; ++ ++ for (i = 0; i < ngroups; group++, i++) { ++ int ret = 0; ++@@ -2251,6 +2338,8 @@ repeat: ++ if (group >= ngroups) ++ group = 0; ++ +++ ext4_mb_prefetch(ac, group); +++ ++ /* This now checks without needing the buddy page */ ++ ret = ext4_mb_good_group(ac, group, cr); ++ if (ret <= 0) { ++@@ -2326,6 +2415,8 @@ repeat: ++ out: ++ if (!err && ac->ac_status != AC_STATUS_FOUND && first_err) ++ err = first_err; +++ /* use prefetched bitmaps to init buddy so that read info is not lost */ +++ ext4_mb_prefetch_fini(ac); ++ return err; ++ } ++ ++@@ -3012,6 +3103,24 @@ int ext4_mb_init(struct super_block *sb) ++ sbi->s_mb_large_req = sbi->s_stripe * 8; ++ sbi->s_mb_group_prealloc = sbi->s_stripe * 4; ++ } +++ if (ext4_has_feature_flex_bg(sb)) { +++ /* a single flex group is supposed to be read by a single IO */ +++ sbi->s_mb_prefetch = 1 << sbi->s_es->s_log_groups_per_flex; +++ sbi->s_mb_prefetch *= 8; /* 8 prefetch IOs in flight at most */ +++ } else { +++ sbi->s_mb_prefetch = 32; +++ } +++ if (sbi->s_mb_prefetch > ext4_get_groups_count(sb)) +++ sbi->s_mb_prefetch = ext4_get_groups_count(sb); +++ /* now many real IOs to prefetch within a single allocation at cr=0 +++ * given cr=0 is an CPU-related optimization we shouldn't try to +++ * load too many groups, at some point we should start to use what +++ * we've got in memory. 
+++ * with an average random access time 5ms, it'd take a second to get +++ * 200 groups (* N with flex_bg), so let's make this limit 4 */ +++ sbi->s_mb_prefetch_limit = sbi->s_mb_prefetch * 4; +++ if (sbi->s_mb_prefetch_limit > ext4_get_groups_count(sb)) +++ sbi->s_mb_prefetch_limit = ext4_get_groups_count(sb); ++ ++ sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group); ++ if (sbi->s_locality_groups == NULL) { ++diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h ++index d02daaf..608a702 100644 ++--- a/fs/ext4/mballoc.h +++++ b/fs/ext4/mballoc.h ++@@ -180,6 +180,8 @@ struct ext4_allocation_context { ++ struct page *ac_buddy_page; ++ struct ext4_prealloc_space *ac_pa; ++ struct ext4_locality_group *ac_lg; +++ ext4_group_t ac_prefetch; +++ int ac_prefetch_ios; /* number of initialied prefetch IO */ ++ }; ++ ++ #define AC_STATUS_CONTINUE 1 ++diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c ++index abb9ee6..3ddd1fe 100644 ++--- a/fs/ext4/sysfs.c +++++ b/fs/ext4/sysfs.c ++@@ -224,6 +224,8 @@ EXT4_RW_ATTR_SBI_UI(msg_ratelimit_burst, s_msg_ratelimit_state.burst); ++ EXT4_RO_ATTR_ES_UI(errors_count, s_error_count); ++ EXT4_ATTR(first_error_time, 0444, first_error_time); ++ EXT4_ATTR(last_error_time, 0444, last_error_time); +++EXT4_RW_ATTR_SBI_UI(mb_prefetch, s_mb_prefetch); +++EXT4_RW_ATTR_SBI_UI(mb_prefetch_limit, s_mb_prefetch_limit); ++ ++ static unsigned int old_bump_val = 128; ++ EXT4_ATTR_PTR(max_writeback_mb_bump, 0444, pointer_ui, &old_bump_val); ++@@ -260,6 +262,8 @@ static struct attribute *ext4_attrs[] = { ++ ATTR_LIST(errors_count), ++ ATTR_LIST(first_error_time), ++ ATTR_LIST(last_error_time), +++ ATTR_LIST(mb_prefetch), +++ ATTR_LIST(mb_prefetch_limit), ++ NULL, ++ }; ++ ++-- ++2.27.0 ++ +diff --git a/ldiskfs/kernel_patches/patches/oe2003/ext4-misc.patch b/ldiskfs/kernel_patches/patches/oe2003/ext4-misc.patch +new file mode 100644 +index 0000000000..4beabb2690 +--- /dev/null ++++ b/ldiskfs/kernel_patches/patches/oe2003/ext4-misc.patch +@@ -0,0 
+1,210 @@ ++From cca6adb175c514457f9284e2d0c6a9645a84f8f6 Mon Sep 17 00:00:00 2001 ++From: Xinliang Liu ++Date: Wed, 9 Aug 2023 10:26:06 +0000 ++Subject: [PATCH] ext4 misc ++ ++Signed-off-by: Xinliang Liu ++--- ++ fs/ext4/ext4.h | 25 ++++++++++++++++++++++++- ++ fs/ext4/ialloc.c | 3 ++- ++ fs/ext4/inode.c | 16 ++++++++++++++++ ++ fs/ext4/namei.c | 9 ++++++--- ++ fs/ext4/super.c | 10 ++-------- ++ 5 files changed, 50 insertions(+), 13 deletions(-) ++ ++diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h ++index 1f6f586..b9757e2 100644 ++--- a/fs/ext4/ext4.h +++++ b/fs/ext4/ext4.h ++@@ -1699,6 +1699,8 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) ++ */ ++ #define NEXT_ORPHAN(inode) EXT4_I(inode)->i_dtime ++ +++#define JOURNAL_START_HAS_3ARGS 1 +++ ++ /* ++ * Codes for operating systems ++ */ ++@@ -1918,7 +1920,21 @@ static inline bool ext4_has_unknown_ext##ver##_incompat_features(struct super_bl ++ ++ EXTN_FEATURE_FUNCS(2) ++ EXTN_FEATURE_FUNCS(3) ++-EXTN_FEATURE_FUNCS(4) +++static inline bool ext4_has_unknown_ext4_compat_features(struct super_block *sb) +++{ +++ return ((EXT4_SB(sb)->s_es->s_feature_compat & +++ cpu_to_le32(~EXT4_FEATURE_COMPAT_SUPP)) != 0); +++} +++static inline bool ext4_has_unknown_ext4_ro_compat_features(struct super_block *sb) +++{ +++ return ((EXT4_SB(sb)->s_es->s_feature_ro_compat & +++ cpu_to_le32(~EXT4_FEATURE_RO_COMPAT_SUPP)) != 0); +++} +++static inline bool ext4_has_unknown_ext4_incompat_features(struct super_block *sb) +++{ +++ return ((EXT4_SB(sb)->s_es->s_feature_incompat & +++ cpu_to_le32(~EXT4_FEATURE_INCOMPAT_SUPP)) != 0); +++} ++ ++ static inline bool ext4_has_compat_features(struct super_block *sb) ++ { ++@@ -3263,6 +3279,13 @@ struct ext4_extent; ++ #define EXT_MAX_BLOCKS 0xffffffff ++ ++ extern int ext4_ext_tree_init(handle_t *handle, struct inode *); +++extern struct buffer_head *ext4_read_inode_bitmap(struct super_block *sb, +++ ext4_group_t block_group); +++extern void ext4_inc_count(handle_t *handle, 
struct inode *inode); +++extern void ext4_dec_count(handle_t *handle, struct inode *inode); +++extern struct buffer_head *ext4_append(handle_t *handle, +++ struct inode *inode, +++ ext4_lblk_t *block); ++ extern int ext4_ext_writepage_trans_blocks(struct inode *, int); ++ extern int ext4_ext_index_trans_blocks(struct inode *inode, int extents); ++ extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, ++diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c ++index 835109f..86ad508 100644 ++--- a/fs/ext4/ialloc.c +++++ b/fs/ext4/ialloc.c ++@@ -114,7 +114,7 @@ verified: ++ * ++ * Return buffer_head of bitmap on success or NULL. ++ */ ++-static struct buffer_head * +++struct buffer_head * ++ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) ++ { ++ struct ext4_group_desc *desc; ++@@ -208,6 +208,7 @@ out: ++ put_bh(bh); ++ return ERR_PTR(err); ++ } +++EXPORT_SYMBOL(ext4_read_inode_bitmap); ++ ++ /* ++ * NOTE! When we get the inode, we're the only people ++diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c ++index 216f96b..15426d6 100644 ++--- a/fs/ext4/inode.c +++++ b/fs/ext4/inode.c ++@@ -6449,3 +6449,19 @@ int ext4_filemap_fault(struct vm_fault *vmf) ++ ++ return err; ++ } +++EXPORT_SYMBOL(ext4_map_blocks); +++EXPORT_SYMBOL(ext4_truncate); +++EXPORT_SYMBOL(__ext4_iget); +++EXPORT_SYMBOL(ext4_bread); +++EXPORT_SYMBOL(ext4_itable_unused_count); +++EXPORT_SYMBOL(ext4_force_commit); +++EXPORT_SYMBOL(ext4_mark_inode_dirty); +++EXPORT_SYMBOL(ext4_get_group_desc); +++EXPORT_SYMBOL(__ext4_journal_get_write_access); +++EXPORT_SYMBOL(__ext4_journal_start_sb); +++EXPORT_SYMBOL(__ext4_journal_stop); +++EXPORT_SYMBOL(__ext4_handle_dirty_metadata); +++EXPORT_SYMBOL(__ext4_std_error); +++EXPORT_SYMBOL(ext4fs_dirhash); +++EXPORT_SYMBOL(ext4_get_inode_loc); +++EXPORT_SYMBOL(__ext4_journal_ensure_credits); ++diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c ++index 5a42348..006c19a 100644 ++--- a/fs/ext4/namei.c +++++ b/fs/ext4/namei.c ++@@ -49,7 +49,7 @@ ++ 
#define NAMEI_RA_BLOCKS 4 ++ #define NAMEI_RA_SIZE (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS) ++ ++-static struct buffer_head *ext4_append(handle_t *handle, +++struct buffer_head *ext4_append(handle_t *handle, ++ struct inode *inode, ++ ext4_lblk_t *block) ++ { ++@@ -203,6 +203,7 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode, ++ } ++ return bh; ++ } +++EXPORT_SYMBOL(ext4_append); ++ ++ #ifndef assert ++ #define assert(test) J_ASSERT(test) ++@@ -2513,23 +2514,25 @@ EXPORT_SYMBOL(ext4_delete_entry); ++ * for checking S_ISDIR(inode) (since the INODE_INDEX feature will not be set ++ * on regular files) and to avoid creating huge/slow non-HTREE directories. ++ */ ++-static void ext4_inc_count(handle_t *handle, struct inode *inode) +++void ext4_inc_count(handle_t *handle, struct inode *inode) ++ { ++ inc_nlink(inode); ++ if (is_dx(inode) && ++ (inode->i_nlink > EXT4_LINK_MAX || inode->i_nlink == 2)) ++ set_nlink(inode, 1); ++ } +++EXPORT_SYMBOL(ext4_inc_count); ++ ++ /* ++ * If a directory had nlink == 1, then we should let it be 1. This indicates ++ * directory has >EXT4_LINK_MAX subdirs. 
++ */ ++-static void ext4_dec_count(handle_t *handle, struct inode *inode) +++void ext4_dec_count(handle_t *handle, struct inode *inode) ++ { ++ if (!S_ISDIR(inode->i_mode) || inode->i_nlink > 2) ++ drop_nlink(inode); ++ } +++EXPORT_SYMBOL(ext4_dec_count); ++ ++ ++ /* ++diff --git a/fs/ext4/super.c b/fs/ext4/super.c ++index 571eaf1..18dc174 100644 ++--- a/fs/ext4/super.c +++++ b/fs/ext4/super.c ++@@ -5253,7 +5253,7 @@ static void ext4_update_super(struct super_block *sb) ++ __ext4_update_tstamp(&es->s_first_error_time, ++ &es->s_first_error_time_hi, ++ sbi->s_first_error_time); ++- strncpy(es->s_first_error_func, sbi->s_first_error_func, +++ strlcpy(es->s_first_error_func, sbi->s_first_error_func, ++ sizeof(es->s_first_error_func)); ++ es->s_first_error_line = ++ cpu_to_le32(sbi->s_first_error_line); ++@@ -5267,7 +5267,7 @@ static void ext4_update_super(struct super_block *sb) ++ __ext4_update_tstamp(&es->s_last_error_time, ++ &es->s_last_error_time_hi, ++ sbi->s_last_error_time); ++- strncpy(es->s_last_error_func, sbi->s_last_error_func, +++ strlcpy(es->s_last_error_func, sbi->s_last_error_func, ++ sizeof(es->s_last_error_func)); ++ es->s_last_error_line = cpu_to_le32(sbi->s_last_error_line); ++ es->s_last_error_ino = cpu_to_le32(sbi->s_last_error_ino); ++@@ -6546,8 +6546,6 @@ static int __init ext4_init_fs(void) ++ err = init_inodecache(); ++ if (err) ++ goto out1; ++- register_as_ext3(); ++- register_as_ext2(); ++ err = register_filesystem(&ext4_fs_type); ++ if (err) ++ goto out; ++@@ -6557,8 +6555,6 @@ static int __init ext4_init_fs(void) ++ printk(KERN_ERR "EXT4-fs: Cannot create netlink socket.\n"); ++ return 0; ++ out: ++- unregister_as_ext2(); ++- unregister_as_ext3(); ++ destroy_inodecache(); ++ out1: ++ ext4_exit_mballoc(); ++@@ -6577,8 +6573,6 @@ out5: ++ static void __exit ext4_exit_fs(void) ++ { ++ ext4_destroy_lazyinit_thread(); ++- unregister_as_ext2(); ++- unregister_as_ext3(); ++ unregister_filesystem(&ext4_fs_type); ++ destroy_inodecache(); ++ 
ext4_exit_mballoc(); ++-- ++2.27.0 ++ +diff --git a/ldiskfs/kernel_patches/patches/oe2003/ext4-pdirop.patch b/ldiskfs/kernel_patches/patches/oe2003/ext4-pdirop.patch +new file mode 100644 +index 0000000000..efee644f0e +--- /dev/null ++++ b/ldiskfs/kernel_patches/patches/oe2003/ext4-pdirop.patch +@@ -0,0 +1,907 @@ ++From 3e80311540dbe7839cc21a20e4c398882191bbdf Mon Sep 17 00:00:00 2001 ++From: Xinliang Liu ++Date: Thu, 10 Aug 2023 01:45:25 +0000 ++Subject: [PATCH] ext4 pdirop ++ ++Single directory performance is critical for HPC workloads. In a ++typical use case an application creates a separate output file for ++each node and task in a job. As nodes and tasks increase, hundreds ++of thousands of files may be created in a single directory within ++a short window of time. ++Today, both filename lookup and file system modifying operations ++(such as create and unlink) are protected with a single lock for ++an entire ldiskfs directory. PDO project will remove this ++bottleneck by introducing a parallel locking mechanism for entire ++ldiskfs directories. This work will enable multiple application ++threads to simultaneously lookup, create and unlink in parallel.
++ ++This patch contains: ++ - pdirops support for ldiskfs ++ - integrate with osd-ldiskfs ++ ++Signed-off-by: Xinliang Liu ++--- ++ fs/ext4/Makefile | 1 + ++ fs/ext4/ext4.h | 78 ++++++++ ++ fs/ext4/namei.c | 455 +++++++++++++++++++++++++++++++++++++++++++---- ++ fs/ext4/super.c | 1 + ++ 4 files changed, 496 insertions(+), 39 deletions(-) ++ ++diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile ++index 8fdfcd3..771652e 100644 ++--- a/fs/ext4/Makefile +++++ b/fs/ext4/Makefile ++@@ -7,6 +7,7 @@ obj-$(CONFIG_EXT4_FS) += ext4.o ++ ++ ext4-y := balloc.o bitmap.o block_validity.o dir.o ext4_jbd2.o extents.o \ ++ extents_status.o file.o fsmap.o fsync.o hash.o ialloc.o \ +++ htree_lock.o \ ++ indirect.o inline.o inode.o ioctl.o mballoc.o migrate.o \ ++ mmp.o move_extent.o namei.o page-io.o readpage.o resize.o \ ++ super.o symlink.o sysfs.o xattr.o xattr_trusted.o xattr_user.o ++diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h ++index a45cc92..01ee209 100644 ++--- a/fs/ext4/ext4.h +++++ b/fs/ext4/ext4.h ++@@ -30,6 +30,7 @@ ++ #include ++ #include ++ #include +++#include ++ #include ++ #include ++ #include ++@@ -958,6 +959,9 @@ struct ext4_inode_info { ++ __u32 i_dtime; ++ ext4_fsblk_t i_file_acl; ++ +++ /* following fields for parallel directory operations -bzzz */ +++ struct semaphore i_append_sem; +++ ++ /* ++ * i_block_group is the number of the block group which contains ++ * this file's inode. 
Constant across the lifetime of the inode, ++@@ -2231,6 +2235,72 @@ struct dx_hash_info ++ */ ++ #define HASH_NB_ALWAYS 1 ++ +++/* assume name-hash is protected by upper layer */ +++#define EXT4_HTREE_LOCK_HASH 0 +++ +++enum ext4_pdo_lk_types { +++#if EXT4_HTREE_LOCK_HASH +++ EXT4_LK_HASH, +++#endif +++ EXT4_LK_DX, /* index block */ +++ EXT4_LK_DE, /* directory entry block */ +++ EXT4_LK_SPIN, /* spinlock */ +++ EXT4_LK_MAX, +++}; +++ +++/* read-only bit */ +++#define EXT4_LB_RO(b) (1 << (b)) +++/* read + write, high bits for writer */ +++#define EXT4_LB_RW(b) ((1 << (b)) | (1 << (EXT4_LK_MAX + (b)))) +++ +++enum ext4_pdo_lock_bits { +++ /* DX lock bits */ +++ EXT4_LB_DX_RO = EXT4_LB_RO(EXT4_LK_DX), +++ EXT4_LB_DX = EXT4_LB_RW(EXT4_LK_DX), +++ /* DE lock bits */ +++ EXT4_LB_DE_RO = EXT4_LB_RO(EXT4_LK_DE), +++ EXT4_LB_DE = EXT4_LB_RW(EXT4_LK_DE), +++ /* DX spinlock bits */ +++ EXT4_LB_SPIN_RO = EXT4_LB_RO(EXT4_LK_SPIN), +++ EXT4_LB_SPIN = EXT4_LB_RW(EXT4_LK_SPIN), +++ /* accurate searching */ +++ EXT4_LB_EXACT = EXT4_LB_RO(EXT4_LK_MAX << 1), +++}; +++ +++enum ext4_pdo_lock_opc { +++ /* external */ +++ EXT4_HLOCK_READDIR = (EXT4_LB_DE_RO | EXT4_LB_DX_RO), +++ EXT4_HLOCK_LOOKUP = (EXT4_LB_DE_RO | EXT4_LB_SPIN_RO | +++ EXT4_LB_EXACT), +++ EXT4_HLOCK_DEL = (EXT4_LB_DE | EXT4_LB_SPIN_RO | +++ EXT4_LB_EXACT), +++ EXT4_HLOCK_ADD = (EXT4_LB_DE | EXT4_LB_SPIN_RO), +++ +++ /* internal */ +++ EXT4_HLOCK_LOOKUP_SAFE = (EXT4_LB_DE_RO | EXT4_LB_DX_RO | +++ EXT4_LB_EXACT), +++ EXT4_HLOCK_DEL_SAFE = (EXT4_LB_DE | EXT4_LB_DX_RO | EXT4_LB_EXACT), +++ EXT4_HLOCK_SPLIT = (EXT4_LB_DE | EXT4_LB_DX | EXT4_LB_SPIN), +++}; +++ +++extern struct htree_lock_head *ext4_htree_lock_head_alloc(unsigned hbits); +++#define ext4_htree_lock_head_free(lhead) htree_lock_head_free(lhead) +++ +++extern struct htree_lock *ext4_htree_lock_alloc(void); +++#define ext4_htree_lock_free(lck) htree_lock_free(lck) +++ +++extern void ext4_htree_lock(struct htree_lock *lck, +++ struct htree_lock_head *lhead, +++ 
struct inode *dir, unsigned flags); +++#define ext4_htree_unlock(lck) htree_unlock(lck) +++ +++extern struct buffer_head *__ext4_find_entry(struct inode *dir, +++ const struct qstr *d_name, +++ struct ext4_dir_entry_2 **res_dir, +++ int *inlined, struct htree_lock *lck); +++extern int __ext4_add_entry(handle_t *handle, struct dentry *dentry, +++ struct inode *inode, struct htree_lock *lck); +++ ++ struct ext4_filename { ++ const struct qstr *usr_fname; ++ struct fscrypt_str disk_name; ++@@ -2538,12 +2608,20 @@ void ext4_insert_dentry(struct inode *inode, ++ struct ext4_filename *fname, void *data); ++ static inline void ext4_update_dx_flag(struct inode *inode) ++ { +++ /* Disable it for ldiskfs, because going from a DX directory to +++ * a non-DX directory while it is in use will completely break +++ * the htree-locking. +++ * If we really want to support this operation in the future, +++ * we need to exclusively lock the directory at here which will +++ * increase complexity of code */ +++#if 0 ++ if (!ext4_has_feature_dir_index(inode->i_sb) && ++ ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) { ++ /* ext4_iget() should have caught this... 
*/ ++ WARN_ON_ONCE(ext4_has_feature_metadata_csum(inode->i_sb)); ++ ext4_clear_inode_flag(inode, EXT4_INODE_INDEX); ++ } +++#endif ++ } ++ static const unsigned char ext4_filetype_table[] = { ++ DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK ++diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c ++index 278e83d..12ba206 100644 ++--- a/fs/ext4/namei.c +++++ b/fs/ext4/namei.c ++@@ -55,6 +55,7 @@ struct buffer_head *ext4_append(handle_t *handle, ++ { ++ struct ext4_map_blocks map; ++ struct buffer_head *bh; +++ struct ext4_inode_info *ei = EXT4_I(inode); ++ int err; ++ ++ if (unlikely(EXT4_SB(inode->i_sb)->s_max_dir_size_kb && ++@@ -62,6 +63,10 @@ struct buffer_head *ext4_append(handle_t *handle, ++ EXT4_SB(inode->i_sb)->s_max_dir_size_kb))) ++ return ERR_PTR(-ENOSPC); ++ +++ /* with parallel dir operations all appends +++ * have to be serialized -bzzz */ +++ down(&ei->i_append_sem); +++ ++ *block = inode->i_size >> inode->i_sb->s_blocksize_bits; ++ map.m_lblk = *block; ++ map.m_len = 1; ++@@ -73,15 +78,18 @@ struct buffer_head *ext4_append(handle_t *handle, ++ */ ++ err = ext4_map_blocks(NULL, inode, &map, 0); ++ if (err < 0) ++- return ERR_PTR(err); +++ goto err_unlock; ++ if (err) { ++ EXT4_ERROR_INODE(inode, "Logical block already allocated"); ++- return ERR_PTR(-EFSCORRUPTED); +++ err = -EFSCORRUPTED; +++ goto err_unlock; ++ } ++ ++ bh = ext4_bread(handle, inode, *block, EXT4_GET_BLOCKS_CREATE); ++- if (IS_ERR(bh)) +++ if (IS_ERR(bh)) { +++ up(&ei->i_append_sem); ++ return bh; +++ } ++ inode->i_size += inode->i_sb->s_blocksize; ++ EXT4_I(inode)->i_disksize = inode->i_size; ++ err = ext4_mark_inode_dirty(handle, inode); ++@@ -91,11 +99,14 @@ struct buffer_head *ext4_append(handle_t *handle, ++ err = ext4_journal_get_write_access(handle, bh); ++ if (err) ++ goto out; +++ up(&ei->i_append_sem); ++ return bh; ++ ++ out: ++ brelse(bh); ++ ext4_std_error(inode->i_sb, err); +++err_unlock: +++ up(&ei->i_append_sem); ++ return ERR_PTR(err); ++ } ++ ++@@ 
-293,7 +304,8 @@ static unsigned dx_node_limit(struct inode *dir); ++ static struct dx_frame *dx_probe(struct ext4_filename *fname, ++ struct inode *dir, ++ struct dx_hash_info *hinfo, ++- struct dx_frame *frame); +++ struct dx_frame *frame, +++ struct htree_lock *lck); ++ static void dx_release(struct dx_frame *frames); ++ static int dx_make_map(struct inode *dir, struct buffer_head *bh, ++ struct dx_hash_info *hinfo, ++@@ -307,12 +319,13 @@ static void dx_insert_block(struct dx_frame *frame, ++ static int ext4_htree_next_block(struct inode *dir, __u32 hash, ++ struct dx_frame *frame, ++ struct dx_frame *frames, ++- __u32 *start_hash); +++ __u32 *start_hash, struct htree_lock *lck); ++ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, ++ struct ext4_filename *fname, ++- struct ext4_dir_entry_2 **res_dir); +++ struct ext4_dir_entry_2 **res_dir, struct htree_lock *lck); ++ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, ++- struct inode *dir, struct inode *inode); +++ struct inode *dir, struct inode *inode, +++ struct htree_lock *lck); ++ ++ /* checksumming functions */ ++ void initialize_dirent_tail(struct ext4_dir_entry_tail *t, ++@@ -776,6 +789,227 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir, ++ } ++ #endif /* DX_DEBUG */ ++ +++/* private data for htree_lock */ +++struct ext4_dir_lock_data { +++ unsigned ld_flags; /* bits-map for lock types */ +++ unsigned ld_count; /* # entries of the last DX block */ +++ struct dx_entry ld_at_entry; /* copy of leaf dx_entry */ +++ struct dx_entry *ld_at; /* position of leaf dx_entry */ +++}; +++ +++#define ext4_htree_lock_data(l) ((struct ext4_dir_lock_data *)(l)->lk_private) +++#define ext4_find_entry(dir, name, dirent, inline) \ +++ __ext4_find_entry(dir, name, dirent, inline, NULL) +++#define ext4_add_entry(handle, dentry, inode) \ +++ __ext4_add_entry(handle, dentry, inode, NULL) +++ +++/* NB: ext4_lblk_t is 32 bits so we use high bits to identify 
invalid blk */ +++#define EXT4_HTREE_NODE_CHANGED (0xcafeULL << 32) +++ +++static void ext4_htree_event_cb(void *target, void *event) +++{ +++ u64 *block = (u64 *)target; +++ +++ if (*block == dx_get_block((struct dx_entry *)event)) +++ *block = EXT4_HTREE_NODE_CHANGED; +++} +++ +++struct htree_lock_head *ext4_htree_lock_head_alloc(unsigned hbits) +++{ +++ struct htree_lock_head *lhead; +++ +++ lhead = htree_lock_head_alloc(EXT4_LK_MAX, hbits, 0); +++ if (lhead != NULL) { +++ htree_lock_event_attach(lhead, EXT4_LK_SPIN, HTREE_EVENT_WR, +++ ext4_htree_event_cb); +++ } +++ return lhead; +++} +++EXPORT_SYMBOL(ext4_htree_lock_head_alloc); +++ +++struct htree_lock *ext4_htree_lock_alloc(void) +++{ +++ return htree_lock_alloc(EXT4_LK_MAX, +++ sizeof(struct ext4_dir_lock_data)); +++} +++EXPORT_SYMBOL(ext4_htree_lock_alloc); +++ +++static htree_lock_mode_t ext4_htree_mode(unsigned flags) +++{ +++ switch (flags) { +++ default: /* 0 or unknown flags require EX lock */ +++ return HTREE_LOCK_EX; +++ case EXT4_HLOCK_READDIR: +++ return HTREE_LOCK_PR; +++ case EXT4_HLOCK_LOOKUP: +++ return HTREE_LOCK_CR; +++ case EXT4_HLOCK_DEL: +++ case EXT4_HLOCK_ADD: +++ return HTREE_LOCK_CW; +++ } +++} +++ +++/* return PR for read-only operations, otherwise return EX */ +++static inline htree_lock_mode_t ext4_htree_safe_mode(unsigned flags) +++{ +++ int writer = (flags & EXT4_LB_DE) == EXT4_LB_DE; +++ +++ /* 0 requires EX lock */ +++ return (flags == 0 || writer) ? HTREE_LOCK_EX : HTREE_LOCK_PR; +++} +++ +++static int ext4_htree_safe_locked(struct htree_lock *lck) +++{ +++ int writer; +++ +++ if (lck == NULL || lck->lk_mode == HTREE_LOCK_EX) +++ return 1; +++ +++ writer = (ext4_htree_lock_data(lck)->ld_flags & EXT4_LB_DE) == +++ EXT4_LB_DE; +++ if (writer) /* all readers & writers are excluded? */ +++ return lck->lk_mode == HTREE_LOCK_EX; +++ +++ /* all writers are excluded? 
*/ +++ return lck->lk_mode == HTREE_LOCK_PR || +++ lck->lk_mode == HTREE_LOCK_PW || +++ lck->lk_mode == HTREE_LOCK_EX; +++} +++ +++/* relock htree_lock with EX mode if it's change operation, otherwise +++ * relock it with PR mode. It's noop if PDO is disabled. */ +++static void ext4_htree_safe_relock(struct htree_lock *lck) +++{ +++ if (!ext4_htree_safe_locked(lck)) { +++ unsigned flags = ext4_htree_lock_data(lck)->ld_flags; +++ +++ htree_change_lock(lck, ext4_htree_safe_mode(flags)); +++ } +++} +++ +++void ext4_htree_lock(struct htree_lock *lck, struct htree_lock_head *lhead, +++ struct inode *dir, unsigned flags) +++{ +++ htree_lock_mode_t mode = is_dx(dir) ? ext4_htree_mode(flags) : +++ ext4_htree_safe_mode(flags); +++ +++ ext4_htree_lock_data(lck)->ld_flags = flags; +++ htree_lock(lck, lhead, mode); +++ if (!is_dx(dir)) +++ ext4_htree_safe_relock(lck); /* make sure it's safe locked */ +++} +++EXPORT_SYMBOL(ext4_htree_lock); +++ +++static int ext4_htree_node_lock(struct htree_lock *lck, struct dx_entry *at, +++ unsigned lmask, int wait, void *ev) +++{ +++ u32 key = (at == NULL) ? 0 : dx_get_block(at); +++ u32 mode; +++ +++ /* NOOP if htree is well protected or caller doesn't require the lock */ +++ if (ext4_htree_safe_locked(lck) || +++ !(ext4_htree_lock_data(lck)->ld_flags & lmask)) +++ return 1; +++ +++ mode = (ext4_htree_lock_data(lck)->ld_flags & lmask) == lmask ? 
+++ HTREE_LOCK_PW : HTREE_LOCK_PR; +++ while (1) { +++ if (htree_node_lock_try(lck, mode, key, ffz(~lmask), wait, ev)) +++ return 1; +++ if (!(lmask & EXT4_LB_SPIN)) /* not a spinlock */ +++ return 0; +++ cpu_relax(); /* spin until granted */ +++ } +++} +++ +++static int ext4_htree_node_locked(struct htree_lock *lck, unsigned lmask) +++{ +++ return ext4_htree_safe_locked(lck) || +++ htree_node_is_granted(lck, ffz(~lmask)); +++} +++ +++static void ext4_htree_node_unlock(struct htree_lock *lck, +++ unsigned lmask, void *buf) +++{ +++ /* NB: it's safe to call mutiple times or even it's not locked */ +++ if (!ext4_htree_safe_locked(lck) && +++ htree_node_is_granted(lck, ffz(~lmask))) +++ htree_node_unlock(lck, ffz(~lmask), buf); +++} +++ +++#define ext4_htree_dx_lock(lck, key) \ +++ ext4_htree_node_lock(lck, key, EXT4_LB_DX, 1, NULL) +++#define ext4_htree_dx_lock_try(lck, key) \ +++ ext4_htree_node_lock(lck, key, EXT4_LB_DX, 0, NULL) +++#define ext4_htree_dx_unlock(lck) \ +++ ext4_htree_node_unlock(lck, EXT4_LB_DX, NULL) +++#define ext4_htree_dx_locked(lck) \ +++ ext4_htree_node_locked(lck, EXT4_LB_DX) +++ +++static void ext4_htree_dx_need_lock(struct htree_lock *lck) +++{ +++ struct ext4_dir_lock_data *ld; +++ +++ if (ext4_htree_safe_locked(lck)) +++ return; +++ +++ ld = ext4_htree_lock_data(lck); +++ switch (ld->ld_flags) { +++ default: +++ return; +++ case EXT4_HLOCK_LOOKUP: +++ ld->ld_flags = EXT4_HLOCK_LOOKUP_SAFE; +++ return; +++ case EXT4_HLOCK_DEL: +++ ld->ld_flags = EXT4_HLOCK_DEL_SAFE; +++ return; +++ case EXT4_HLOCK_ADD: +++ ld->ld_flags = EXT4_HLOCK_SPLIT; +++ return; +++ } +++} +++ +++#define ext4_htree_de_lock(lck, key) \ +++ ext4_htree_node_lock(lck, key, EXT4_LB_DE, 1, NULL) +++#define ext4_htree_de_unlock(lck) \ +++ ext4_htree_node_unlock(lck, EXT4_LB_DE, NULL) +++ +++#define ext4_htree_spin_lock(lck, key, event) \ +++ ext4_htree_node_lock(lck, key, EXT4_LB_SPIN, 0, event) +++#define ext4_htree_spin_unlock(lck) \ +++ ext4_htree_node_unlock(lck, 
EXT4_LB_SPIN, NULL) +++#define ext4_htree_spin_unlock_listen(lck, p) \ +++ ext4_htree_node_unlock(lck, EXT4_LB_SPIN, p) +++ +++static void ext4_htree_spin_stop_listen(struct htree_lock *lck) +++{ +++ if (!ext4_htree_safe_locked(lck) && +++ htree_node_is_listening(lck, ffz(~EXT4_LB_SPIN))) +++ htree_node_stop_listen(lck, ffz(~EXT4_LB_SPIN)); +++} +++ +++enum { +++ DX_HASH_COL_IGNORE, /* ignore collision while probing frames */ +++ DX_HASH_COL_YES, /* there is collision and it does matter */ +++ DX_HASH_COL_NO, /* there is no collision */ +++}; +++ +++static int dx_probe_hash_collision(struct htree_lock *lck, +++ struct dx_entry *entries, +++ struct dx_entry *at, u32 hash) +++{ +++ if (!(lck && ext4_htree_lock_data(lck)->ld_flags & EXT4_LB_EXACT)) { +++ return DX_HASH_COL_IGNORE; /* don't care about collision */ +++ +++ } else if (at == entries + dx_get_count(entries) - 1) { +++ return DX_HASH_COL_IGNORE; /* not in any leaf of this DX */ +++ +++ } else { /* hash collision? */ +++ return ((dx_get_hash(at + 1) & ~1) == hash) ? +++ DX_HASH_COL_YES : DX_HASH_COL_NO; +++ } +++} +++ ++ /* ++ * Probe for a directory leaf block to search. 
++ * ++@@ -787,10 +1021,11 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir, ++ */ ++ static struct dx_frame * ++ dx_probe(struct ext4_filename *fname, struct inode *dir, ++- struct dx_hash_info *hinfo, struct dx_frame *frame_in) +++ struct dx_hash_info *hinfo, struct dx_frame *frame_in, +++ struct htree_lock *lck) ++ { ++ unsigned count, indirect, level, i; ++- struct dx_entry *at, *entries, *p, *q, *m; +++ struct dx_entry *at, *entries, *p, *q, *m, *dx = NULL; ++ struct dx_root_info *info; ++ struct dx_frame *frame = frame_in; ++ struct dx_frame *ret_err = ERR_PTR(ERR_BAD_DX_DIR); ++@@ -856,8 +1091,16 @@ dx_probe(struct ext4_filename *fname, struct inode *dir, ++ level = 0; ++ blocks[0] = 0; ++ while (1) { +++ if (indirect == level) { /* the last index level */ +++ /* NB: ext4_htree_dx_lock() could be noop if +++ * DX-lock flag is not set for current operation */ +++ ext4_htree_dx_lock(lck, dx); +++ ext4_htree_spin_lock(lck, dx, NULL); +++ } +++ ++ count = dx_get_count(entries); ++ if (!count || count > dx_get_limit(entries)) { +++ ext4_htree_spin_unlock(lck); /* release spin */ ++ ext4_warning_inode(dir, ++ "dx entry: count %u beyond limit %u", ++ count, dx_get_limit(entries)); ++@@ -897,6 +1140,69 @@ dx_probe(struct ext4_filename *fname, struct inode *dir, ++ frame->entries = entries; ++ frame->at = at; ++ +++ if (indirect == level) { /* the last index level */ +++ struct ext4_dir_lock_data *ld; +++ u64 myblock; +++ +++ /* By default we only lock DE-block, however, we will +++ * also lock the last level DX-block if: +++ * a) there is hash collision +++ * we will set DX-lock flag (a few lines below) +++ * and redo to lock DX-block +++ * see detail in dx_probe_hash_collision() +++ * b) it's a retry from splitting +++ * we need to lock the last level DX-block so nobody +++ * else can split any leaf blocks under the same +++ * DX-block, see detail in ext4_dx_add_entry() +++ */ +++ if (ext4_htree_dx_locked(lck)) { +++ /* DX-block is 
locked, just lock DE-block +++ * and return */ +++ ext4_htree_spin_unlock(lck); +++ if (!ext4_htree_safe_locked(lck)) +++ ext4_htree_de_lock(lck, frame->at); +++ return frame; +++ } +++ /* it's pdirop and no DX lock */ +++ if (dx_probe_hash_collision(lck, entries, at, hash) == +++ DX_HASH_COL_YES) { +++ /* found hash collision, set DX-lock flag +++ * and retry to abtain DX-lock */ +++ ext4_htree_spin_unlock(lck); +++ ext4_htree_dx_need_lock(lck); +++ continue; +++ } +++ ld = ext4_htree_lock_data(lck); +++ /* because I don't lock DX, so @at can't be trusted +++ * after I release spinlock so I have to save it */ +++ ld->ld_at = at; +++ ld->ld_at_entry = *at; +++ ld->ld_count = dx_get_count(entries); +++ +++ frame->at = &ld->ld_at_entry; +++ myblock = dx_get_block(at); +++ +++ /* NB: ordering locking */ +++ ext4_htree_spin_unlock_listen(lck, &myblock); +++ /* other thread can split this DE-block because: +++ * a) I don't have lock for the DE-block yet +++ * b) I released spinlock on DX-block +++ * if it happened I can detect it by listening +++ * splitting event on this DE-block */ +++ ext4_htree_de_lock(lck, frame->at); +++ ext4_htree_spin_stop_listen(lck); +++ +++ if (myblock == EXT4_HTREE_NODE_CHANGED) { +++ /* someone split this DE-block before +++ * I locked it, I need to retry and lock +++ * valid DE-block */ +++ ext4_htree_de_unlock(lck); +++ continue; +++ } +++ return frame; +++ } +++ dx = at; +++ ++ block = dx_get_block(at); ++ for (i = 0; i <= level; i++) { ++ if (blocks[i] == block) { ++@@ -906,8 +1212,8 @@ dx_probe(struct ext4_filename *fname, struct inode *dir, ++ goto fail; ++ } ++ } ++- if (++level > indirect) ++- return frame; +++ +++ ++level; ++ blocks[level] = block; ++ frame++; ++ frame->bh = ext4_read_dirblock(dir, block, INDEX); ++@@ -978,7 +1284,7 @@ static void dx_release(struct dx_frame *frames) ++ static int ext4_htree_next_block(struct inode *dir, __u32 hash, ++ struct dx_frame *frame, ++ struct dx_frame *frames, ++- __u32 *start_hash) +++ 
__u32 *start_hash, struct htree_lock *lck) ++ { ++ struct dx_frame *p; ++ struct buffer_head *bh; ++@@ -993,12 +1299,22 @@ static int ext4_htree_next_block(struct inode *dir, __u32 hash, ++ * this loop, num_frames indicates the number of interior ++ * nodes need to be read. ++ */ +++ ext4_htree_de_unlock(lck); ++ while (1) { ++- if (++(p->at) < p->entries + dx_get_count(p->entries)) ++- break; +++ if (num_frames > 0 || ext4_htree_dx_locked(lck)) { +++ /* num_frames > 0 : +++ * DX block +++ * ext4_htree_dx_locked: +++ * frame->at is reliable pointer returned by dx_probe, +++ * otherwise dx_probe already knew no collision */ +++ if (++(p->at) < p->entries + dx_get_count(p->entries)) +++ break; +++ } ++ if (p == frames) ++ return 0; ++ num_frames++; +++ if (num_frames == 1) +++ ext4_htree_dx_unlock(lck); ++ p--; ++ } ++ ++@@ -1021,6 +1337,13 @@ static int ext4_htree_next_block(struct inode *dir, __u32 hash, ++ * block so no check is necessary ++ */ ++ while (num_frames--) { +++ if (num_frames == 0) { +++ /* it's not always necessary, we just don't want to +++ * detect hash collision again */ +++ ext4_htree_dx_need_lock(lck); +++ ext4_htree_dx_lock(lck, p->at); +++ } +++ ++ bh = ext4_read_dirblock(dir, dx_get_block(p->at), INDEX); ++ if (IS_ERR(bh)) ++ return PTR_ERR(bh); ++@@ -1029,6 +1352,7 @@ static int ext4_htree_next_block(struct inode *dir, __u32 hash, ++ p->bh = bh; ++ p->at = p->entries = ((struct dx_node *) bh->b_data)->entries; ++ } +++ ext4_htree_de_lock(lck, p->at); ++ return 1; ++ } ++ ++@@ -1176,10 +1500,10 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, ++ } ++ hinfo.hash = start_hash; ++ hinfo.minor_hash = 0; ++- frame = dx_probe(NULL, dir, &hinfo, frames); +++ /* assume it's PR locked */ +++ frame = dx_probe(NULL, dir, &hinfo, frames, NULL); ++ if (IS_ERR(frame)) ++ return PTR_ERR(frame); ++- ++ /* Add '.' and '..' 
from the htree header */ ++ if (!start_hash && !start_minor_hash) { ++ de = (struct ext4_dir_entry_2 *) frames[0].bh->b_data; ++@@ -1219,7 +1543,7 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, ++ count += ret; ++ hashval = ~0; ++ ret = ext4_htree_next_block(dir, HASH_NB_ALWAYS, ++- frame, frames, &hashval); +++ frame, frames, &hashval, NULL); ++ *next_hash = hashval; ++ if (ret < 0) { ++ err = ret; ++@@ -1418,10 +1742,10 @@ static int is_dx_internal_node(struct inode *dir, ext4_lblk_t block, ++ * The returned buffer_head has ->b_count elevated. The caller is expected ++ * to brelse() it when appropriate. ++ */ ++-static struct buffer_head * ext4_find_entry (struct inode *dir, +++struct buffer_head *__ext4_find_entry(struct inode *dir, ++ const struct qstr *d_name, ++ struct ext4_dir_entry_2 **res_dir, ++- int *inlined) +++ int *inlined, struct htree_lock *lck) ++ { ++ struct super_block *sb; ++ struct buffer_head *bh_use[NAMEI_RA_SIZE]; ++@@ -1470,7 +1794,7 @@ static struct buffer_head * ext4_find_entry (struct inode *dir, ++ goto restart; ++ } ++ if (is_dx(dir)) { ++- ret = ext4_dx_find_entry(dir, &fname, res_dir); +++ ret = ext4_dx_find_entry(dir, &fname, res_dir, lck); ++ /* ++ * On success, or if the error was file not found, ++ * return. 
Otherwise, fall back to doing a search the ++@@ -1480,6 +1804,7 @@ static struct buffer_head * ext4_find_entry (struct inode *dir, ++ goto cleanup_and_exit; ++ dxtrace(printk(KERN_DEBUG "ext4_find_entry: dx failed, " ++ "falling back\n")); +++ ext4_htree_safe_relock(lck); ++ ret = NULL; ++ } ++ nblocks = dir->i_size >> EXT4_BLOCK_SIZE_BITS(sb); ++@@ -1571,10 +1896,12 @@ cleanup_and_exit: ++ ext4_fname_free_filename(&fname); ++ return ret; ++ } +++EXPORT_SYMBOL(__ext4_find_entry); ++ ++ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, ++ struct ext4_filename *fname, ++- struct ext4_dir_entry_2 **res_dir) +++ struct ext4_dir_entry_2 **res_dir, +++ struct htree_lock *lck) ++ { ++ struct super_block * sb = dir->i_sb; ++ struct dx_frame frames[EXT4_HTREE_LEVEL], *frame; ++@@ -1585,7 +1912,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, ++ #ifdef CONFIG_EXT4_FS_ENCRYPTION ++ *res_dir = NULL; ++ #endif ++- frame = dx_probe(fname, dir, NULL, frames); +++ frame = dx_probe(fname, dir, NULL, frames, lck); ++ if (IS_ERR(frame)) ++ return (struct buffer_head *) frame; ++ do { ++@@ -1607,7 +1934,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, ++ ++ /* Check to see if we should continue to search */ ++ retval = ext4_htree_next_block(dir, fname->hinfo.hash, frame, ++- frames, NULL); +++ frames, NULL, lck); ++ if (retval < 0) { ++ ext4_warning_inode(dir, ++ "error %d reading directory index block", ++@@ -1782,8 +2109,9 @@ static struct ext4_dir_entry_2* dx_pack_dirents(char *base, unsigned blocksize) ++ * Returns pointer to de in block into which the new entry will be inserted. 
++ */ ++ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, ++- struct buffer_head **bh,struct dx_frame *frame, ++- struct dx_hash_info *hinfo) +++ struct buffer_head **bh, struct dx_frame *frames, +++ struct dx_frame *frame, struct dx_hash_info *hinfo, +++ struct htree_lock *lck) ++ { ++ unsigned blocksize = dir->i_sb->s_blocksize; ++ unsigned continued; ++@@ -1859,8 +2187,14 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, ++ hash2, split, count-split)); ++ ++ /* Fancy dance to stay within two buffers */ ++- de2 = dx_move_dirents(data1, data2, map + split, count - split, ++- blocksize); +++ if (hinfo->hash < hash2) { +++ de2 = dx_move_dirents(data1, data2, map + split, +++ count - split, blocksize); +++ } else { +++ /* make sure we will add entry to the same block which +++ * we have already locked */ +++ de2 = dx_move_dirents(data1, data2, map, split, blocksize); +++ } ++ de = dx_pack_dirents(data1, blocksize); ++ de->rec_len = ext4_rec_len_to_disk(data1 + (blocksize - csum_size) - ++ (char *) de, ++@@ -1881,12 +2215,21 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, ++ dxtrace(dx_show_leaf(dir, hinfo, (struct ext4_dir_entry_2 *) data2, ++ blocksize, 1)); ++ ++- /* Which block gets the new entry? */ ++- if (hinfo->hash >= hash2) { ++- swap(*bh, bh2); ++- de = de2; +++ ext4_htree_spin_lock(lck, frame > frames ? 
(frame - 1)->at : NULL, +++ frame->at); /* notify block is being split */ +++ if (hinfo->hash < hash2) { +++ dx_insert_block(frame, hash2 + continued, newblock); +++ +++ } else { +++ /* switch block number */ +++ dx_insert_block(frame, hash2 + continued, +++ dx_get_block(frame->at)); +++ dx_set_block(frame->at, newblock); +++ (frame->at)++; ++ } ++- dx_insert_block(frame, hash2 + continued, newblock); +++ ext4_htree_spin_unlock(lck); +++ ext4_htree_dx_unlock(lck); +++ ++ err = ext4_handle_dirty_dirent_node(handle, dir, bh2); ++ if (err) ++ goto journal_error; ++@@ -2160,7 +2503,7 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname, ++ if (retval) ++ goto out_frames; ++ ++- de = do_split(handle,dir, &bh2, frame, &fname->hinfo); +++ de = do_split(handle, dir, &bh2, frames, frame, &fname->hinfo, NULL); ++ if (IS_ERR(de)) { ++ retval = PTR_ERR(de); ++ goto out_frames; ++@@ -2270,8 +2613,8 @@ out: ++ * may not sleep between calling this and putting something into ++ * the entry, as someone else might have used it while you slept. ++ */ ++-static int ext4_add_entry(handle_t *handle, struct dentry *dentry, ++- struct inode *inode) +++int __ext4_add_entry(handle_t *handle, struct dentry *dentry, +++ struct inode *inode, struct htree_lock *lck) ++ { ++ struct inode *dir = d_inode(dentry->d_parent); ++ struct buffer_head *bh = NULL; ++@@ -2312,9 +2655,10 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, ++ if (dentry->d_name.len == 2 && ++ memcmp(dentry->d_name.name, "..", 2) == 0) ++ return ext4_update_dotdot(handle, dentry, inode); ++- retval = ext4_dx_add_entry(handle, &fname, dir, inode); +++ retval = ext4_dx_add_entry(handle, &fname, dir, inode, lck); ++ if (!retval || (retval != ERR_BAD_DX_DIR)) ++ goto out; +++ ext4_htree_safe_relock(lck); ++ /* Can we just ignore htree data? 
*/ ++ if (ext4_has_metadata_csum(sb)) { ++ EXT4_ERROR_INODE(dir, ++@@ -2377,12 +2721,14 @@ out: ++ ext4_set_inode_state(inode, EXT4_STATE_NEWENTRY); ++ return retval; ++ } +++EXPORT_SYMBOL(__ext4_add_entry); ++ ++ /* ++ * Returns 0 for success, or a negative error value ++ */ ++ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, ++- struct inode *dir, struct inode *inode) +++ struct inode *dir, struct inode *inode, +++ struct htree_lock *lck) ++ { ++ struct dx_frame frames[EXT4_HTREE_LEVEL], *frame; ++ struct dx_entry *entries, *at; ++@@ -2394,7 +2740,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, ++ ++ again: ++ restart = 0; ++- frame = dx_probe(fname, dir, NULL, frames); +++ frame = dx_probe(fname, dir, NULL, frames, lck); ++ if (IS_ERR(frame)) ++ return PTR_ERR(frame); ++ entries = frame->entries; ++@@ -2429,6 +2775,11 @@ again: ++ struct dx_node *node2; ++ struct buffer_head *bh2; ++ +++ if (!ext4_htree_safe_locked(lck)) { /* retry with EX lock */ +++ ext4_htree_safe_relock(lck); +++ restart = 1; +++ goto cleanup; +++ } ++ while (frame > frames) { ++ if (dx_get_count((frame - 1)->entries) < ++ dx_get_limit((frame - 1)->entries)) { ++@@ -2530,8 +2881,32 @@ again: ++ restart = 1; ++ goto journal_error; ++ } +++ } else if (!ext4_htree_dx_locked(lck)) { +++ struct ext4_dir_lock_data *ld = ext4_htree_lock_data(lck); +++ +++ /* not well protected, require DX lock */ +++ ext4_htree_dx_need_lock(lck); +++ at = frame > frames ? (frame - 1)->at : NULL; +++ +++ /* NB: no risk of deadlock because it's just a try. +++ * +++ * NB: we check ld_count for twice, the first time before +++ * having DX lock, the second time after holding DX lock. +++ * +++ * NB: We never free blocks for directory so far, which +++ * means value returned by dx_get_count() should equal to +++ * ld->ld_count if nobody split any DE-block under @at, +++ * and ld->ld_at still points to valid dx_entry. 
*/ +++ if ((ld->ld_count != dx_get_count(entries)) || +++ !ext4_htree_dx_lock_try(lck, at) || +++ (ld->ld_count != dx_get_count(entries))) { +++ restart = 1; +++ goto cleanup; +++ } +++ /* OK, I've got DX lock and nothing changed */ +++ frame->at = ld->ld_at; ++ } ++- de = do_split(handle, dir, &bh, frame, &fname->hinfo); +++ de = do_split(handle, dir, &bh, frames, frame, &fname->hinfo, lck); ++ if (IS_ERR(de)) { ++ err = PTR_ERR(de); ++ goto cleanup; ++@@ -2542,6 +2917,8 @@ again: ++ journal_error: ++ ext4_std_error(dir->i_sb, err); /* this is a no-op if err == 0 */ ++ cleanup: +++ ext4_htree_dx_unlock(lck); +++ ext4_htree_de_unlock(lck); ++ brelse(bh); ++ dx_release(frames); ++ /* @restart is true means htree-path has been changed, we need to ++diff --git a/fs/ext4/super.c b/fs/ext4/super.c ++index 89f39f9..768e6a5 100644 ++--- a/fs/ext4/super.c +++++ b/fs/ext4/super.c ++@@ -1271,6 +1271,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) ++ ++ inode_set_iversion(&ei->vfs_inode, 1); ++ spin_lock_init(&ei->i_raw_lock); +++ sema_init(&ei->i_append_sem, 1); ++ INIT_LIST_HEAD(&ei->i_prealloc_list); ++ spin_lock_init(&ei->i_prealloc_lock); ++ ext4_es_init_tree(&ei->i_es_tree); ++-- ++2.27.0 ++ +diff --git a/ldiskfs/kernel_patches/series/ldiskfs-4.19.90-oe2003.series b/ldiskfs/kernel_patches/series/ldiskfs-4.19.90-oe2003.series +new file mode 100644 +index 0000000000..ec19e1e90f +--- /dev/null ++++ b/ldiskfs/kernel_patches/series/ldiskfs-4.19.90-oe2003.series +@@ -0,0 +1,36 @@ ++oe2203/ext4-inode-version.patch ++suse15/ext4-lookup-dotdot.patch ++suse15/ext4-print-inum-in-htree-warning.patch ++rhel8/ext4-prealloc.patch ++ubuntu18/ext4-osd-iop-common.patch ++oe2003/ext4-misc.patch ++oe2003/ext4-mballoc-extra-checks.patch ++ubuntu18/ext4-hash-indexed-dir-dotdot-update.patch ++rhel8.1/ext4-kill-dx-root.patch ++oe2003/ext4-mballoc-pa-free-mismatch.patch ++linux-5.4/ext4-data-in-dirent.patch ++rhel8/ext4-nocmtime.patch ++base/ext4-htree-lock.patch 
++oe2003/ext4-pdirop.patch ++rhel8/ext4-max-dir-size.patch ++rhel8.3/ext4-corrupted-inode-block-bitmaps-handling-patches.patch ++ubuntu18/ext4-give-warning-with-dir-htree-growing.patch ++ubuntu18/ext4-jcb-optimization.patch ++rhel8.2/ext4-attach-jinode-in-writepages.patch ++rhel8/ext4-dont-check-before-replay.patch ++rhel7.6/ext4-use-GFP_NOFS-in-ext4_inode_attach_jinode.patch ++rhel7.6/ext4-export-orphan-add.patch ++rhel8/ext4-export-mb-stream-allocator-variables.patch ++rhel8/ext4-simple-blockalloc.patch ++rhel8/ext4-mballoc-skip-uninit-groups-cr0.patch ++oe2003/ext4-mballoc-prefetch.patch ++ubuntu2004/ext4-xattr-disable-credits-check.patch ++base/ext4-no-max-dir-size-limit-for-iam-objects.patch ++rhel7.6/ext4-dquot-commit-speedup.patch ++rhel8/ext4-ialloc-uid-gid-and-pass-owner-down.patch ++base/ext4-projid-xattrs.patch ++rhel8/ext4-enc-flag.patch ++oe2203/ext4-delayed-iput.patch ++oe2003/ext4-filename-encode.patch ++rhel8/ext4-old_ea_inodes_handling_fix.patch ++rhel8.4/ext4-optimize-find_delayed_extent.patch +diff --git a/lustre/ChangeLog b/lustre/ChangeLog +index 3f03823e3d..6706ff15b1 100644 +--- a/lustre/ChangeLog ++++ b/lustre/ChangeLog +@@ -100,6 +100,7 @@ TBD Whamcloud + vanilla linux 5.4.0 (ZFS + ldiskfs) + vanilla linux 5.4.21 (ZFS + ldiskfs) + vanilla linux 5.4.136 (ZFS + ldiskfs) ++ 4.19.90-2308.1.0.0212.oe1 (openEuler 20.03 LTS SP3) + 5.10.0-60.94.0.118.oe2203 (openEuler 22.03 LTS) + 5.10.0-136.32.0.108.oe2203sp1 (openEuler 22.03 LTS SP1) + 5.10.0-153.19.0.95.oe2203sp2 (openEuler 22.03 LTS SP2) +@@ -209,6 +210,7 @@ TBD Whamcloud + 5.8.0-53 (Ubuntu 20.04.2 HWE) + 5.11.0-31 (Ubuntu 20.04.3 HWE) + 5.11.0 (vanilla kernel.org) ++ 4.19.90 (openEuler 20.03 LTS) + 5.10.0 (openEuler 22.03 LTS) + * Recommended e2fsprogs version: 1.46.5.wc1 or newer + * Recommended ZFS version: 2.1.5 +diff --git a/lustre/autoconf/lustre-core.m4 b/lustre/autoconf/lustre-core.m4 +index feed23e01f..251c977ea4 100644 +--- a/lustre/autoconf/lustre-core.m4 ++++ 
b/lustre/autoconf/lustre-core.m4 +@@ -3496,6 +3496,7 @@ lustre/kernel_patches/targets/5.3-sles15sp2.target + lustre/kernel_patches/targets/5.3-sles15sp3.target + lustre/kernel_patches/targets/5.14-sles15sp4.target + lustre/kernel_patches/targets/3.x-fc18.target ++lustre/kernel_patches/targets/4.19-oe2003sp3.target + lustre/kernel_patches/targets/5.10-oe2203.target + lustre/kernel_patches/targets/5.10-oe2203sp1.target + lustre/kernel_patches/targets/5.10-oe2203sp2.target +diff --git a/lustre/kernel_patches/targets/4.19-oe2003sp3.target.in b/lustre/kernel_patches/targets/4.19-oe2003sp3.target.in +new file mode 100644 +index 0000000000..dd8deef6f2 +--- /dev/null ++++ b/lustre/kernel_patches/targets/4.19-oe2003sp3.target.in +@@ -0,0 +1,21 @@ ++lnxmaj="4.19.90" ++lnxrel="2308.1.0.0212.oe1" ++ ++KERNEL_SRPM=kernel-${lnxmaj}-${lnxrel}.src.rpm ++SERIES="" ++EXTRA_VERSION=${lnxrel}_lustre.@VERSION@ ++LUSTRE_VERSION=@VERSION@ ++ ++DEVEL_PATH_ARCH_DELIMETER="." ++OFED_VERSION=inkernel ++ ++#SMP_ARCHS="i686 x86_64 ia64 ppc64" ++# openEuler doesn't use smp specific kernels ++SMP_ARCHS="" ++ ++for cc in gcc ; do ++ if which $cc >/dev/null 2>/dev/null ; then ++ export CC=$cc ++ break ++ fi ++done +-- +2.33.0 + diff --git a/0047-lustre.spec.in-Add-gcc-option-Wno-stringop-overflow.patch b/0047-lustre.spec.in-Add-gcc-option-Wno-stringop-overflow.patch new file mode 100644 index 0000000000000000000000000000000000000000..9e4f6fad5933055e9a32dce82d8ec47e6dcc7ad8 --- /dev/null +++ b/0047-lustre.spec.in-Add-gcc-option-Wno-stringop-overflow.patch @@ -0,0 +1,28 @@ +From 9021e6e81632d116c9bc0e5d59ce2b1a47cc1559 Mon Sep 17 00:00:00 2001 +From: Xinliang Liu +Date: Fri, 24 Mar 2023 08:21:06 +0000 +Subject: [PATCH 47/61] lustre.spec.in: Add gcc option -Wno-stringop-overflow + +See openEuler gcc bug: https://gitee.com/openeuler/gcc/issues/I5XMD0 + +Signed-off-by: Xinliang Liu +--- + lustre.spec.in | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/lustre.spec.in 
b/lustre.spec.in +index 9aa271f504..8f9882f989 100644 +--- a/lustre.spec.in ++++ b/lustre.spec.in +@@ -472,7 +472,7 @@ export UTILS_CFLAGS="${UTILS_CFLAGS} -D__SANE_USERSPACE_TYPES__=1" + # Disable any hardening or annotation since this doesn't make sense for + # kernel code, and reset "optflags" so that the vendor's overzealous flags don't + # create build failures. +-%define optflags -g -O2 -Werror ++%define optflags -g -O2 -Werror -Wno-stringop-overflow + %undefine _annotated_build + %undefine _hardened_build + +-- +2.33.0 + diff --git a/0048-LU-16321-osd-Allow-fiemap-on-kernel-buffers.patch b/0048-LU-16321-osd-Allow-fiemap-on-kernel-buffers.patch new file mode 100644 index 0000000000000000000000000000000000000000..d37d58edd91b60c145dbf21704d88c5ccf24a9cd --- /dev/null +++ b/0048-LU-16321-osd-Allow-fiemap-on-kernel-buffers.patch @@ -0,0 +1,453 @@ +From e55361ee38116437b8b6529368f6a25160bfa410 Mon Sep 17 00:00:00 2001 +From: Shaun Tancheff +Date: Fri, 2 Dec 2022 04:19:59 -0600 +Subject: [PATCH 48/61] LU-16321 osd: Allow fiemap on kernel buffers + +Linux commit v5.17-rc3-19-g967747bbc084 + uaccess: remove CONFIG_SET_FS + +When KERNEL_DS gone lustre needs an alternative for fiemap to +copy extents to kernel space memory. + +Direct in-kernel calls to inode->f_ops->fiemap() can utilize +an otherwise unused flag on fiemap_extent_info fi_flags +to indicate the fiemap extent buffer is allocated in kernel space. + +Include ldiskfs patches for ldiskfs_fiemap() to +define EXT4_FIEMAP_FLAG_MEMCPY and utilize it. 
+ +HPE-bug-id: LUS-11337 +Fixes: d0337cab8e ("LU-14195 osd: don't use set_fs() for ->fiemap() calls.") +Signed-off-by: Shaun Tancheff +Change-Id: I7a8edb481833fd1bdcf7b6cd6e08397c1754baee +Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/49190 +Tested-by: jenkins +Tested-by: Maloo +Reviewed-by: Neil Brown +Reviewed-by: Alexander Boyko +Reviewed-by: Petros Koutoupis +Reviewed-by: Oleg Drokin +Signed-off-by: Xinliang Liu +--- + .../linux-5.10/ext4-fiemap-kernel-data.patch | 312 ++++++++++++++++++ + .../series/ldiskfs-5.10.0-ml.series | 1 + + .../series/ldiskfs-5.10.0-oe2203.series | 1 + + .../series/ldiskfs-5.10.0-oe2203sp1.series | 1 + + lustre/osd-ldiskfs/osd_io.c | 23 +- + 5 files changed, 337 insertions(+), 1 deletion(-) + create mode 100644 ldiskfs/kernel_patches/patches/linux-5.10/ext4-fiemap-kernel-data.patch + +diff --git a/ldiskfs/kernel_patches/patches/linux-5.10/ext4-fiemap-kernel-data.patch b/ldiskfs/kernel_patches/patches/linux-5.10/ext4-fiemap-kernel-data.patch +new file mode 100644 +index 0000000000..4f7cbc9bdf +--- /dev/null ++++ b/ldiskfs/kernel_patches/patches/linux-5.10/ext4-fiemap-kernel-data.patch +@@ -0,0 +1,312 @@ ++Subject: [PATCH] ext4-fiemap-kernel-data ++ ++Pull in enough upstream fiemap handling to conditionally use ++memcpy instead of copy_to_user in fiemap_fill_next_extent. 
++Common kernel functions prefixed with ext4_ or _ext4_ ++ ++--- ++ fs/ext4/ext4.h | 3 + ++ fs/ext4/extents.c | 265 +++++++++++++++++++++++++++++++++++++++++++++- ++ 2 files changed, 264 insertions(+), 4 deletions(-) ++ ++diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h ++index c931e3a..0bb54fe 100644 ++--- a/fs/ext4/ext4.h +++++ b/fs/ext4/ext4.h ++@@ -766,6 +766,9 @@ enum { ++ */ ++ #define EXT4_FIEMAP_EXTENT_HOLE 0x08000000 ++ +++/* Otherwise unused fi_flags ext4 use memcpy instead of copy_[to|from]_uiser */ +++#define EXT4_FIEMAP_FLAG_MEMCPY 0x80000000 +++ ++ /* Max physical block we can address w/o extents */ ++ #define EXT4_MAX_BLOCK_FILE_PHYS 0xFFFFFFFF ++ ++diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c ++index 2e62f83..176d2b8 100644 ++--- a/fs/ext4/extents.c +++++ b/fs/ext4/extents.c ++@@ -2138,6 +2138,264 @@ cleanup: ++ return err; ++ } ++ +++#ifdef KERNEL_DS +++#define ext4_iomap_fiemap(i, f, s, l, ops) \ +++ iomap_fiemap((i), (f), (s), (l), (ops)) +++#else +++/* +++ * linux: +++ * ext4_fiemap_fill_next_extent <--- fiemap_fill_next_extent +++ * ext4_iomap_to_fiemap <----------- iomap_to_fiemap +++ * ext4_iomap_fiemap_actor <-------- iomap_fiemap_actor +++ * ext4_iomap_apply <----------------iomap_apply +++ * _ext4_iomap_fiemap <------------- iomap_fiemap +++ */ +++/** +++ * ext4_fiemap_fill_next_extent - Fiemap helper function +++ * @fieinfo: Fiemap context passed into ->fiemap +++ * @logical: Extent logical start offset, in bytes +++ * @phys: Extent physical start offset, in bytes +++ * @len: Extent length, in bytes +++ * @flags: FIEMAP_EXTENT flags that describe this extent +++ * +++ * Called from file system ->fiemap callback. Will populate extent +++ * info as passed in via arguments and copy to user memory. On +++ * success, extent count on fieinfo is incremented. +++ * +++ * Returns 0 on success, -errno on error, 1 if this was the last +++ * extent that will fit in user array. 
+++ */ +++#define SET_UNKNOWN_FLAGS (FIEMAP_EXTENT_DELALLOC) +++#define SET_NO_UNMOUNTED_IO_FLAGS (FIEMAP_EXTENT_DATA_ENCRYPTED) +++#define SET_NOT_ALIGNED_FLAGS (FIEMAP_EXTENT_DATA_TAIL|FIEMAP_EXTENT_DATA_INLINE) +++static int ext4_fiemap_fill_next_extent(struct fiemap_extent_info *fieinfo, +++ u64 logical, u64 phys, u64 len, +++ u32 flags) +++{ +++ struct fiemap_extent extent; +++ struct fiemap_extent __user *dest = fieinfo->fi_extents_start; +++ +++ /* only count the extents */ +++ if (fieinfo->fi_extents_max == 0) { +++ fieinfo->fi_extents_mapped++; +++ return (flags & FIEMAP_EXTENT_LAST) ? 1 : 0; +++ } +++ +++ if (fieinfo->fi_extents_mapped >= fieinfo->fi_extents_max) +++ return 1; +++ +++ if (flags & SET_UNKNOWN_FLAGS) +++ flags |= FIEMAP_EXTENT_UNKNOWN; +++ if (flags & SET_NO_UNMOUNTED_IO_FLAGS) +++ flags |= FIEMAP_EXTENT_ENCODED; +++ if (flags & SET_NOT_ALIGNED_FLAGS) +++ flags |= FIEMAP_EXTENT_NOT_ALIGNED; +++ +++ memset(&extent, 0, sizeof(extent)); +++ extent.fe_logical = logical; +++ extent.fe_physical = phys; +++ extent.fe_length = len; +++ extent.fe_flags = flags; +++ +++ dest += fieinfo->fi_extents_mapped; +++ if (fieinfo->fi_flags & EXT4_FIEMAP_FLAG_MEMCPY) +++ memcpy((__force void *)dest, &extent, sizeof(extent)); +++ else if (copy_to_user(dest, &extent, sizeof(extent))) +++ return -EFAULT; +++ +++ fieinfo->fi_extents_mapped++; +++ if (fieinfo->fi_extents_mapped == fieinfo->fi_extents_max) +++ return 1; +++ return (flags & FIEMAP_EXTENT_LAST) ? 
1 : 0; +++} +++ +++static int ext4_iomap_to_fiemap(struct fiemap_extent_info *fi, +++ struct iomap *iomap, u32 flags) +++{ +++ switch (iomap->type) { +++ case IOMAP_HOLE: +++ /* skip holes */ +++ return 0; +++ case IOMAP_DELALLOC: +++ flags |= FIEMAP_EXTENT_DELALLOC | FIEMAP_EXTENT_UNKNOWN; +++ break; +++ case IOMAP_MAPPED: +++ break; +++ case IOMAP_UNWRITTEN: +++ flags |= FIEMAP_EXTENT_UNWRITTEN; +++ break; +++ case IOMAP_INLINE: +++ flags |= FIEMAP_EXTENT_DATA_INLINE; +++ break; +++ } +++ +++ if (iomap->flags & IOMAP_F_MERGED) +++ flags |= FIEMAP_EXTENT_MERGED; +++ if (iomap->flags & IOMAP_F_SHARED) +++ flags |= FIEMAP_EXTENT_SHARED; +++ +++ return ext4_fiemap_fill_next_extent(fi, iomap->offset, +++ iomap->addr != IOMAP_NULL_ADDR ? iomap->addr : 0, +++ iomap->length, flags); +++} +++ +++struct fiemap_ctx { +++ struct fiemap_extent_info *fi; +++ struct iomap prev; +++}; +++ +++static loff_t +++ext4_iomap_fiemap_actor(struct inode *inode, loff_t pos, loff_t length, +++ void *data, struct iomap *iomap, struct iomap *srcmap) +++{ +++ struct fiemap_ctx *ctx = data; +++ loff_t ret = length; +++ +++ if (iomap->type == IOMAP_HOLE) +++ return length; +++ +++ ret = ext4_iomap_to_fiemap(ctx->fi, &ctx->prev, 0); +++ ctx->prev = *iomap; +++ switch (ret) { +++ case 0: /* success */ +++ return length; +++ case 1: /* extent array full */ +++ return 0; +++ default: +++ return ret; +++ } +++} +++ +++/* +++ * Execute a iomap write on a segment of the mapping that spans a +++ * contiguous range of pages that have identical block mapping state. +++ * +++ * This avoids the need to map pages individually, do individual allocations +++ * for each page and most importantly avoid the need for filesystem specific +++ * locking per page. Instead, all the operations are amortised over the entire +++ * range of pages. It is assumed that the filesystems will lock whatever +++ * resources they require in the iomap_begin call, and release them in the +++ * iomap_end call. 
+++ */ +++static loff_t +++ext4_iomap_apply(struct inode *inode, loff_t pos, loff_t length, unsigned flags, +++ const struct iomap_ops *ops, void *data, iomap_actor_t actor) +++{ +++ struct iomap iomap = { .type = IOMAP_HOLE }; +++ struct iomap srcmap = { .type = IOMAP_HOLE }; +++ loff_t written = 0, ret; +++ u64 end; +++ +++ /* +++ * Need to map a range from start position for length bytes. This can +++ * span multiple pages - it is only guaranteed to return a range of a +++ * single type of pages (e.g. all into a hole, all mapped or all +++ * unwritten). Failure at this point has nothing to undo. +++ * +++ * If allocation is required for this range, reserve the space now so +++ * that the allocation is guaranteed to succeed later on. Once we copy +++ * the data into the page cache pages, then we cannot fail otherwise we +++ * expose transient stale data. If the reserve fails, we can safely +++ * back out at this point as there is nothing to undo. +++ */ +++ ret = ops->iomap_begin(inode, pos, length, flags, &iomap, &srcmap); +++ if (ret) +++ return ret; +++ if (WARN_ON(iomap.offset > pos)) { +++ written = -EIO; +++ goto out; +++ } +++ if (WARN_ON(iomap.length == 0)) { +++ written = -EIO; +++ goto out; +++ } +++ +++ /* +++ * Cut down the length to the one actually provided by the filesystem, +++ * as it might not be able to give us the whole size that we requested. +++ */ +++ end = iomap.offset + iomap.length; +++ if (srcmap.type != IOMAP_HOLE) +++ end = min(end, srcmap.offset + srcmap.length); +++ if (pos + length > end) +++ length = end - pos; +++ +++ /* +++ * Now that we have guaranteed that the space allocation will succeed, +++ * we can do the copy-in page by page without having to worry about +++ * failures exposing transient data. +++ * +++ * To support COW operations, we read in data for partially blocks from +++ * the srcmap if the file system filled it in. In that case we the +++ * length needs to be limited to the earlier of the ends of the iomaps. 
+++ * If the file system did not provide a srcmap we pass in the normal +++ * iomap into the actors so that they don't need to have special +++ * handling for the two cases. +++ */ +++ written = actor(inode, pos, length, data, &iomap, +++ srcmap.type != IOMAP_HOLE ? &srcmap : &iomap); +++ +++out: +++ /* +++ * Now the data has been copied, commit the range we've copied. This +++ * should not fail unless the filesystem has had a fatal error. +++ */ +++ if (ops->iomap_end) { +++ ret = ops->iomap_end(inode, pos, length, +++ written > 0 ? written : 0, +++ flags, &iomap); +++ } +++ +++ return written ? written : ret; +++} +++ +++static +++int _ext4_iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fi, +++ u64 start, u64 len, const struct iomap_ops *ops) +++{ +++ struct fiemap_ctx ctx; +++ loff_t ret; +++ bool in_kernel = fi->fi_flags & EXT4_FIEMAP_FLAG_MEMCPY; +++ +++ memset(&ctx, 0, sizeof(ctx)); +++ ctx.fi = fi; +++ ctx.prev.type = IOMAP_HOLE; +++ +++ fi->fi_flags &= ~EXT4_FIEMAP_FLAG_MEMCPY; +++ ret = fiemap_prep(inode, fi, start, &len, 0); +++ if (in_kernel) +++ fi->fi_flags |= EXT4_FIEMAP_FLAG_MEMCPY; +++ if (ret) +++ return ret; +++ +++ while (len > 0) { +++ ret = ext4_iomap_apply(inode, start, len, IOMAP_REPORT, ops, +++ &ctx, ext4_iomap_fiemap_actor); +++ /* inode with no (attribute) mapping will give ENOENT */ +++ if (ret == -ENOENT) +++ break; +++ if (ret < 0) +++ return ret; +++ if (ret == 0) +++ break; +++ +++ start += ret; +++ len -= ret; +++ } +++ +++ if (ctx.prev.type != IOMAP_HOLE) { +++ ret = ext4_iomap_to_fiemap(fi, &ctx.prev, FIEMAP_EXTENT_LAST); +++ if (ret < 0) +++ return ret; +++ } +++ +++ return 0; +++} +++ +++#define ext4_iomap_fiemap(i, f, s, l, ops) \ +++ _ext4_iomap_fiemap((i), (f), (s), (l), (ops)) +++#endif /* KERNEL_DS */ +++ ++ static int ext4_fill_es_cache_info(struct inode *inode, ++ ext4_lblk_t block, ext4_lblk_t num, ++ struct fiemap_extent_info *fieinfo) ++@@ -4918,11 +5176,10 @@ int ext4_fiemap(struct inode *inode, struct 
fiemap_extent_info *fieinfo, ++ ++ if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) { ++ fieinfo->fi_flags &= ~FIEMAP_FLAG_XATTR; ++- return iomap_fiemap(inode, fieinfo, start, len, ++- &ext4_iomap_xattr_ops); +++ return ext4_iomap_fiemap(inode, fieinfo, start, len, +++ &ext4_iomap_xattr_ops); ++ } ++- ++- return iomap_fiemap(inode, fieinfo, start, len, &ext4_iomap_report_ops); +++ return ext4_iomap_fiemap(inode, fieinfo, start, len, &ext4_iomap_report_ops); ++ } ++ ++ int ext4_get_es_cache(struct inode *inode, struct fiemap_extent_info *fieinfo, ++-- ++2.34.1 ++ +diff --git a/ldiskfs/kernel_patches/series/ldiskfs-5.10.0-ml.series b/ldiskfs/kernel_patches/series/ldiskfs-5.10.0-ml.series +index 74d865d9d8..3e706f8afb 100644 +--- a/ldiskfs/kernel_patches/series/ldiskfs-5.10.0-ml.series ++++ b/ldiskfs/kernel_patches/series/ldiskfs-5.10.0-ml.series +@@ -29,3 +29,4 @@ linux-5.8/ext4-no-max-dir-size-limit-for-iam-objects.patch + rhel8/ext4-ialloc-uid-gid-and-pass-owner-down.patch + base/ext4-projid-xattrs.patch + linux-5.8/ext4-enc-flag.patch ++linux-5.10/ext4-fiemap-kernel-data.patch +diff --git a/ldiskfs/kernel_patches/series/ldiskfs-5.10.0-oe2203.series b/ldiskfs/kernel_patches/series/ldiskfs-5.10.0-oe2203.series +index 65d1f3a312..178033b03d 100644 +--- a/ldiskfs/kernel_patches/series/ldiskfs-5.10.0-oe2203.series ++++ b/ldiskfs/kernel_patches/series/ldiskfs-5.10.0-oe2203.series +@@ -30,4 +30,5 @@ rhel8/ext4-ialloc-uid-gid-and-pass-owner-down.patch + base/ext4-projid-xattrs.patch + linux-5.8/ext4-enc-flag.patch + oe2203/ext4-delayed-iput.patch ++linux-5.10/ext4-fiemap-kernel-data.patch + rhel8/ext4-old_ea_inodes_handling_fix.patch +diff --git a/ldiskfs/kernel_patches/series/ldiskfs-5.10.0-oe2203sp1.series b/ldiskfs/kernel_patches/series/ldiskfs-5.10.0-oe2203sp1.series +index 1cc20c8e58..82711ec7a5 100644 +--- a/ldiskfs/kernel_patches/series/ldiskfs-5.10.0-oe2203sp1.series ++++ b/ldiskfs/kernel_patches/series/ldiskfs-5.10.0-oe2203sp1.series +@@ -30,4 +30,5 @@ 
rhel8/ext4-ialloc-uid-gid-and-pass-owner-down.patch + base/ext4-projid-xattrs.patch + linux-5.8/ext4-enc-flag.patch + oe2203/ext4-delayed-iput.patch ++linux-5.10/ext4-fiemap-kernel-data.patch + rhel8/ext4-old_ea_inodes_handling_fix.patch +diff --git a/lustre/osd-ldiskfs/osd_io.c b/lustre/osd-ldiskfs/osd_io.c +index 233d88397c..19b5da7bd1 100644 +--- a/lustre/osd-ldiskfs/osd_io.c ++++ b/lustre/osd-ldiskfs/osd_io.c +@@ -1357,6 +1357,22 @@ struct osd_fextent { + unsigned int mapped:1; + }; + ++#ifdef KERNEL_DS ++#define DECLARE_MM_SEGMENT_T(name) mm_segment_t name ++#define access_set_kernel(saved_fs, fei) \ ++do { \ ++ saved_fs = get_fs(); \ ++ set_fs(KERNEL_DS); \ ++} while (0) ++#define access_unset_kernel(saved_fs, fei) set_fs((saved_fs)) ++#else ++#define DECLARE_MM_SEGMENT_T(name) ++#define access_set_kernel(saved_fs, fei) \ ++ (fei)->fi_flags |= LDISKFS_FIEMAP_FLAG_MEMCPY ++#define access_unset_kernel(saved_fs, fei) \ ++ (fei)->fi_flags &= ~(LDISKFS_FIEMAP_FLAG_MEMCPY) ++#endif /* KERNEL_DS */ ++ + static int osd_is_mapped(struct dt_object *dt, __u64 offset, + struct osd_fextent *cached_extent) + { +@@ -1366,6 +1382,7 @@ static int osd_is_mapped(struct dt_object *dt, __u64 offset, + struct fiemap_extent_info fei = { 0 }; + struct fiemap_extent fe = { 0 }; + int rc; ++ DECLARE_MM_SEGMENT_T(saved_fs); + + if (block >= cached_extent->start && block < cached_extent->end) + return cached_extent->mapped; +@@ -1379,8 +1396,9 @@ static int osd_is_mapped(struct dt_object *dt, __u64 offset, + + fei.fi_extents_max = 1; + fei.fi_extents_start = &fe; +- ++ access_set_kernel(saved_fs, &fei); + rc = inode->i_op->fiemap(inode, &fei, offset, FIEMAP_MAX_OFFSET-offset); ++ access_unset_kernel(saved_fs, &fei); + if (rc != 0) + return 0; + +@@ -2633,6 +2651,7 @@ static int osd_fiemap_get(const struct lu_env *env, struct dt_object *dt, + struct inode *inode = osd_dt_obj(dt)->oo_inode; + u64 len; + int rc; ++ DECLARE_MM_SEGMENT_T(saved_fs); + + LASSERT(inode); + if 
(inode->i_op->fiemap == NULL) +@@ -2652,7 +2671,9 @@ static int osd_fiemap_get(const struct lu_env *env, struct dt_object *dt, + if (fieinfo.fi_flags & FIEMAP_FLAG_SYNC) + filemap_write_and_wait(inode->i_mapping); + ++ access_set_kernel(saved_fs, &fieinfo); + rc = inode->i_op->fiemap(inode, &fieinfo, fm->fm_start, len); ++ access_unset_kernel(saved_fs, &fieinfo); + fm->fm_flags = fieinfo.fi_flags; + fm->fm_mapped_extents = fieinfo.fi_extents_mapped; + +-- +2.33.0 + diff --git a/0049-LU-13135-quota-improve-checks-in-OSDs-to-ignore-quot.patch b/0049-LU-13135-quota-improve-checks-in-OSDs-to-ignore-quot.patch new file mode 100644 index 0000000000000000000000000000000000000000..39711236229e29225c36d1d975d9ba9753a8f712 --- /dev/null +++ b/0049-LU-13135-quota-improve-checks-in-OSDs-to-ignore-quot.patch @@ -0,0 +1,105 @@ +From 7495ac45845283a6abf5cb12baf42cb876159081 Mon Sep 17 00:00:00 2001 +From: Alex Zhuravlev +Date: Tue, 14 Jan 2020 22:38:51 +0300 +Subject: [PATCH 49/61] LU-13135 quota: improve checks in OSDs to ignore quota + +for root-owned files. 
+ +sanity/60a: + zfs before 80s, after 66s + ldiskfs before 65s, after 38s + +ave.write declaration in sanity/60a: + zfs before 3.21 usec, after 1.16 usec + ldiskfs before 4.06 usec, after 0.66 usec + +Change-Id: Ib9ba50d260eac408f1f5e43c4d722ff5024135cf +Signed-off-by: Alex Zhuravlev +Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/37232 +Reviewed-by: Oleg Drokin +Reviewed-by: Andreas Dilger +Reviewed-by: Wang Shilong +Tested-by: jenkins +Tested-by: Maloo +Signed-off-by: Xinliang Liu +--- + lustre/osd-ldiskfs/osd_handler.c | 6 ++++++ + lustre/osd-ldiskfs/osd_quota.c | 4 ++++ + lustre/osd-zfs/osd_quota.c | 4 ++++ + lustre/quota/qsd_handler.c | 6 +++++- + 4 files changed, 19 insertions(+), 1 deletion(-) + +diff --git a/lustre/osd-ldiskfs/osd_handler.c b/lustre/osd-ldiskfs/osd_handler.c +index cada2dd9c2..d14b22374d 100644 +--- a/lustre/osd-ldiskfs/osd_handler.c ++++ b/lustre/osd-ldiskfs/osd_handler.c +@@ -1975,6 +1975,12 @@ static int osd_trans_start(const struct lu_env *env, struct dt_device *d, + if (OBD_FAIL_CHECK(OBD_FAIL_OSD_TXN_START)) + GOTO(out, rc = -EIO); + ++ /* ++ * we ignore quota checks for system-owned files, but still ++ * need to count blocks for uid/gid/projid ++ */ ++ osd_trans_declare_op(env, oh, OSD_OT_QUOTA, 3); ++ + /* + * XXX temporary stuff. Some abstraction layer should + * be used. 
+diff --git a/lustre/osd-ldiskfs/osd_quota.c b/lustre/osd-ldiskfs/osd_quota.c +index 530e6fc068..57349d525e 100644 +--- a/lustre/osd-ldiskfs/osd_quota.c ++++ b/lustre/osd-ldiskfs/osd_quota.c +@@ -645,6 +645,10 @@ int osd_declare_inode_qid(const struct lu_env *env, qid_t uid, qid_t gid, + th->th_ignore_quota; + ENTRY; + ++ /* very fast path for special files like llog */ ++ if (uid == 0 && gid == 0 && projid == 0) ++ return 0; ++ + /* let's start with user quota */ + qi->lqi_id.qid_uid = uid; + qi->lqi_type = USRQUOTA; +diff --git a/lustre/osd-zfs/osd_quota.c b/lustre/osd-zfs/osd_quota.c +index 2e75b8c940..7296f3eb59 100644 +--- a/lustre/osd-zfs/osd_quota.c ++++ b/lustre/osd-zfs/osd_quota.c +@@ -529,6 +529,10 @@ int osd_declare_quota(const struct lu_env *env, struct osd_device *osd, + th->th_ignore_quota; + ENTRY; + ++ /* very fast path for special files like llog */ ++ if (uid == 0 && gid == 0 && projid == 0) ++ return 0; ++ + if (osd_qid_declare_flags & OSD_QID_INODE) + qsd = osd->od_quota_slave_md; + else if (osd_qid_declare_flags & OSD_QID_BLK) +diff --git a/lustre/quota/qsd_handler.c b/lustre/quota/qsd_handler.c +index 1c8e43a2d6..f9f8987fc6 100644 +--- a/lustre/quota/qsd_handler.c ++++ b/lustre/quota/qsd_handler.c +@@ -857,6 +857,10 @@ int qsd_op_begin(const struct lu_env *env, struct qsd_instance *qsd, + bool found = false; + ENTRY; + ++ /* fast path, ignore quota enforcement request for root owned files */ ++ if (qi->lqi_id.qid_uid == 0) ++ return 0; ++ + if (unlikely(qsd == NULL)) + RETURN(0); + +@@ -880,7 +884,7 @@ int qsd_op_begin(const struct lu_env *env, struct qsd_instance *qsd, + * - quota isn't enforced for this quota type + * or - the user/group is root + * or - quota accounting isn't enabled */ +- if (!qsd_type_enabled(qsd, qi->lqi_type) || qi->lqi_id.qid_uid == 0 || ++ if (!qsd_type_enabled(qsd, qi->lqi_type) || + (qsd->qsd_type_array[qi->lqi_type])->qqi_acct_failed) + RETURN(0); + +-- +2.33.0 + diff --git 
a/0050-LU-16893-libcfs-Remove-force_sig-usage-from-lfsck.patch b/0050-LU-16893-libcfs-Remove-force_sig-usage-from-lfsck.patch new file mode 100644 index 0000000000000000000000000000000000000000..0ad2da1c62721192f1faa210c35afb2c8c624a19 --- /dev/null +++ b/0050-LU-16893-libcfs-Remove-force_sig-usage-from-lfsck.patch @@ -0,0 +1,172 @@ +From df7fe641427f384e33d06cb837b09af59dd2830e Mon Sep 17 00:00:00 2001 +From: Shaun Tancheff +Date: Tue, 27 Jun 2023 15:10:19 +0700 +Subject: [PATCH 50/61] LU-16893 libcfs: Remove force_sig usage from lfsck + +The lfsck pool of kernel threads uses force_sig() to signal +the worker threads to stop. A signal is used here as the +lfsck workers may be waiting in various, and possibly +nested, states. + +As force_sig() has been removed let us simply enable SIGINT +to be passed to the worker threads using send_sig(). + +Test-parameters: testlist=sanity-lfsck,lfsck-performance +HPE-bug-id: LUS-11670 +Fixes: db9f9543ec ("LU-12634 libcfs: force_sig() removed task parameter") +Signed-off-by: Shaun Tancheff +Change-Id: Ibf6a67f43687960b3eff9cb9a7c7dc8b1be1da63 +Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/51470 +Tested-by: jenkins +Tested-by: Maloo +Reviewed-by: James Simmons +Reviewed-by: Oleg Drokin +Reviewed-by: Neil Brown +Signed-off-by: Xinliang Liu +--- + libcfs/autoconf/lustre-libcfs.m4 | 23 ----------------------- + libcfs/include/libcfs/linux/linux-misc.h | 14 -------------- + lustre/lfsck/lfsck_engine.c | 10 ++++++++++ + lustre/lfsck/lfsck_lib.c | 12 ++++++------ + 4 files changed, 16 insertions(+), 43 deletions(-) + +diff --git a/libcfs/autoconf/lustre-libcfs.m4 b/libcfs/autoconf/lustre-libcfs.m4 +index 2e99cff179..34726fe8dc 100644 +--- a/libcfs/autoconf/lustre-libcfs.m4 ++++ b/libcfs/autoconf/lustre-libcfs.m4 +@@ -1742,27 +1742,6 @@ AC_DEFUN([LIBCFS_LOOKUP_USER_KEY], [ + ]) + ]) # LIBCFS_LOOKUP_USER_KEY + +-# +-# LIBCFS_FORCE_SIG_WITH_TASK +-# +-# kernel 5.3 commit 3cf5d076fb4d48979f382bc9452765bf8b79e740 +-# 
signal: Remove task parameter from force_sig +-# +-AC_DEFUN([LIBCFS_SRC_FORCE_SIG_WITH_TASK], [ +- LB2_LINUX_TEST_SRC([force_sig_with_task], [ +- #include +- ],[ +- force_sig(SIGINT, NULL); +- ],[-Werror]) +-]) +-AC_DEFUN([LIBCFS_FORCE_SIG_WITH_TASK], [ +- AC_MSG_CHECKING([if force_sig has task parameter]) +- LB2_LINUX_TEST_RESULT([force_sig_with_task], [ +- AC_DEFINE(HAVE_FORCE_SIG_WITH_TASK, 1, +- [force_sig() has task parameter]) +- ]) +-]) # LIBCFS_FORCE_SIG_WITH_TASK +- + # + # LIBCFS_CACHE_DETAIL_WRITERS + # +@@ -2305,7 +2284,6 @@ AC_DEFUN([LIBCFS_PROG_LINUX_SRC], [ + LIBCFS_SRC_KOBJ_TYPE_DEFAULT_GROUPS + # 5.3 + LIBCFS_SRC_LOOKUP_USER_KEY +- LIBCFS_SRC_FORCE_SIG_WITH_TASK + LIBCFS_SRC_CACHE_DETAIL_WRITERS + # 5.4 + LIBCFS_SRC_GENL_DUMPIT_INFO +@@ -2444,7 +2422,6 @@ AC_DEFUN([LIBCFS_PROG_LINUX_RESULTS], [ + LIBCFS_KOBJ_TYPE_DEFAULT_GROUPS + # 5.3 + LIBCFS_LOOKUP_USER_KEY +- LIBCFS_FORCE_SIG_WITH_TASK + LIBCFS_CACHE_DETAIL_WRITERS + # 5.4 + LIBCFS_GENL_DUMPIT_INFO +diff --git a/libcfs/include/libcfs/linux/linux-misc.h b/libcfs/include/libcfs/linux/linux-misc.h +index 841db69e28..02c645aa66 100644 +--- a/libcfs/include/libcfs/linux/linux-misc.h ++++ b/libcfs/include/libcfs/linux/linux-misc.h +@@ -108,20 +108,6 @@ static inline int kref_read(const struct kref *kref) + } + #endif /* HAVE_KREF_READ */ + +-#ifdef HAVE_FORCE_SIG_WITH_TASK +-#define cfs_force_sig(sig, task) force_sig((sig), (task)) +-#else +-#define cfs_force_sig(sig, task) \ +-do { \ +- unsigned long flags; \ +- \ +- spin_lock_irqsave(&task->sighand->siglock, flags); \ +- task->sighand->action[sig - 1].sa.sa_handler = SIG_DFL; \ +- send_sig(sig, task, 1); \ +- spin_unlock_irqrestore(&task->sighand->siglock, flags); \ +-} while (0) +-#endif +- + void cfs_arch_init(void); + + #ifndef container_of_safe +diff --git a/lustre/lfsck/lfsck_engine.c b/lustre/lfsck/lfsck_engine.c +index 2c2d90bb06..d33c4eeeb2 100644 +--- a/lustre/lfsck/lfsck_engine.c ++++ b/lustre/lfsck/lfsck_engine.c +@@ -1016,6 +1016,11 @@ 
int lfsck_master_engine(void *args) + int rc; + ENTRY; + ++ /* ++ * thread is spawned with all signals set to SIG_IGN, re-enable ++ * SIGINT for lfsck_stop() to awaken and stop the thread. ++ */ ++ allow_signal(SIGINT); + spin_lock(&lfsck->li_lock); + lfsck->li_task = current; + spin_unlock(&lfsck->li_lock); +@@ -1594,6 +1599,11 @@ int lfsck_assistant_engine(void *args) + GOTO(fini, rc); + } + ++ /* ++ * thread is spawned with all signals set to SIG_IGN, re-enable ++ * SIGINT for lfsck_stop() to awaken and stop the thread. ++ */ ++ allow_signal(SIGINT); + spin_lock(&lad->lad_lock); + lad->lad_task = current; + thread_set_flags(athread, SVC_RUNNING); +diff --git a/lustre/lfsck/lfsck_lib.c b/lustre/lfsck/lfsck_lib.c +index 4302f4bff9..effad952d8 100644 +--- a/lustre/lfsck/lfsck_lib.c ++++ b/lustre/lfsck/lfsck_lib.c +@@ -3410,8 +3410,8 @@ int lfsck_stop(const struct lu_env *env, struct dt_device *key, + + thread_set_flags(thread, SVC_STOPPING); + +- LASSERT(lfsck->li_task != NULL); +- cfs_force_sig(SIGINT, lfsck->li_task); ++ LASSERT(lfsck->li_task); ++ send_sig(SIGINT, lfsck->li_task, 1); + + if (lfsck->li_master) { + struct lfsck_component *com; +@@ -3420,16 +3420,16 @@ int lfsck_stop(const struct lu_env *env, struct dt_device *key, + list_for_each_entry(com, &lfsck->li_list_scan, lc_link) { + lad = com->lc_data; + spin_lock(&lad->lad_lock); +- if (lad->lad_task != NULL) +- cfs_force_sig(SIGINT, lad->lad_task); ++ if (lad->lad_task) ++ send_sig(SIGINT, lad->lad_task, 1); + spin_unlock(&lad->lad_lock); + } + + list_for_each_entry(com, &lfsck->li_list_double_scan, lc_link) { + lad = com->lc_data; + spin_lock(&lad->lad_lock); +- if (lad->lad_task != NULL) +- cfs_force_sig(SIGINT, lad->lad_task); ++ if (lad->lad_task) ++ send_sig(SIGINT, lad->lad_task, 1); + spin_unlock(&lad->lad_lock); + } + } +-- +2.33.0 + diff --git a/0051-LU-16534-build-Prefer-timer_delete-_sync.patch b/0051-LU-16534-build-Prefer-timer_delete-_sync.patch new file mode 100644 index 
0000000000000000000000000000000000000000..5733d496dd548d6d3f0a16d68fec21db48e0190b --- /dev/null +++ b/0051-LU-16534-build-Prefer-timer_delete-_sync.patch @@ -0,0 +1,1978 @@ +From 929ab1c506f1f46a1860422e084545734819301a Mon Sep 17 00:00:00 2001 +From: Shaun Tancheff +Date: Tue, 7 Feb 2023 02:18:36 -0600 +Subject: [PATCH 51/61] LU-16534 build: Prefer timer_delete[_sync] + +Linux commit v6.1-rc1-7-g9a5a30568697 + timers: Get rid of del_singleshot_timer_sync() +Linux commit v6.1-rc1-11-g9b13df3fb64e + timers: Rename del_timer_sync() to timer_delete_sync() +Linux commit v6.1-rc1-12-gbb663f0f3c39 + timers: Rename del_timer() to timer_delete() + +Prefer timer_delete_sync() to del_singleshot_timer_sync() +Prefer timer_delete_sync() to del_timer_sync() +Prefer timer_delete() to del_timer() + +Provide del_timer and del_timer_sync when +timer_delete[_sync] is not available + +Test-Parameters: trivial +HPE-bug-id: LUS-11470 +Signed-off-by: Shaun Tancheff +Change-Id: I4c946c315a83482dd0bd69e5e89f0302a67bf81c +Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/49922 +Tested-by: jenkins +Tested-by: Maloo +Reviewed-by: jsimmons +Reviewed-by: Andreas Dilger +Reviewed-by: Oleg Drokin +Signed-off-by: Xinliang Liu + + Conflicts: + contrib/scripts/spelling.txt + libcfs/autoconf/lustre-libcfs.m4 + lnet/klnds/kfilnd/kfilnd_tn.c +--- + contrib/scripts/spelling.txt | 3 + + libcfs/autoconf/lustre-libcfs.m4 | 58 ++ + lnet/klnds/gnilnd/gnilnd_cb.c | 8 +- + lnet/klnds/gnilnd/gnilnd_conn.c | 2 +- + lnet/klnds/kfilnd/kfilnd_tn.c | 1646 ++++++++++++++++++++++++++++++ + lnet/lnet/net_fault.c | 4 +- + lustre/ldlm/ldlm_lockd.c | 2 +- + lustre/lod/lod_qos.c | 2 +- + lustre/osp/osp_precreate.c | 6 +- + lustre/ptlrpc/gss/gss_keyring.c | 2 +- + lustre/ptlrpc/service.c | 6 +- + 11 files changed, 1723 insertions(+), 16 deletions(-) + create mode 100644 lnet/klnds/kfilnd/kfilnd_tn.c + +diff --git a/contrib/scripts/spelling.txt b/contrib/scripts/spelling.txt +index b812339ea3..5e34d04572
100644 +--- a/contrib/scripts/spelling.txt ++++ b/contrib/scripts/spelling.txt +@@ -100,6 +100,9 @@ cfs_time_current_sec||ktime_get_real_seconds + CLASSERT||BUILD_BUG_ON() + msecs_to_jiffies||cfs_time_seconds + DEFINE_TIMER||CFS_DEFINE_TIMER ++del_timer||timer_delete ++del_timer_sync||timer_delete_sync ++del_singleshot_timer_sync||timer_delete_sync + EWOULDBLOCK||EAGAIN + container_of0||container_of_safe + DN_MAX_BONUSLEN||DN_BONUS_SIZE(dnodesize) +diff --git a/libcfs/autoconf/lustre-libcfs.m4 b/libcfs/autoconf/lustre-libcfs.m4 +index 34726fe8dc..ec169c9b28 100644 +--- a/libcfs/autoconf/lustre-libcfs.m4 ++++ b/libcfs/autoconf/lustre-libcfs.m4 +@@ -2176,6 +2176,58 @@ AC_DEFUN([LIBCFS_PDE_DATA_EXISTS],[ + ]) + ]) # LIBCFS_PDE_DATA_EXISTS + ++# ++# LIBCFS_TIMER_DELETE_SYNC ++# ++# Linux commit v6.1-rc1-7-g9a5a30568697 ++# timers: Get rid of del_singleshot_timer_sync() ++# Linux commit v6.1-rc1-11-g9b13df3fb64e ++# timers: Rename del_timer_sync() to timer_delete_sync() ++# ++AC_DEFUN([LIBCFS_SRC_TIMER_DELETE_SYNC],[ ++ LB2_LINUX_TEST_SRC([timer_delete_sync], [ ++ #include ++ ],[ ++ struct timer_list *timer = NULL; ++ (void)timer_delete_sync(timer); ++ ],[]) ++]) ++AC_DEFUN([LIBCFS_TIMER_DELETE_SYNC],[ ++ AC_MSG_CHECKING([is timer_delete_sync() available]) ++ LB2_LINUX_TEST_RESULT([timer_delete_sync], [ ++ AC_DEFINE(HAVE_TIMER_DELETE_SYNC, 1, ++ [timer_delete_sync() is available]) ++ ],[ ++ AC_DEFINE(timer_delete_sync(t), del_timer_sync(t), ++ [timer_delete_sync() is not available]) ++ ]) ++]) # LIBCFS_TIMER_DELETE_SYNC ++ ++# ++# LIBCFS_TIMER_DELETE ++# ++# Linux commit v6.1-rc1-12-gbb663f0f3c39 ++# timers: Rename del_timer() to timer_delete() ++# ++AC_DEFUN([LIBCFS_SRC_TIMER_DELETE],[ ++ LB2_LINUX_TEST_SRC([timer_delete], [ ++ #include ++ ],[ ++ struct timer_list *timer = NULL; ++ (void)timer_delete(timer); ++ ],[]) ++]) ++AC_DEFUN([LIBCFS_TIMER_DELETE],[ ++ AC_MSG_CHECKING([is timer_delete() available]) ++ LB2_LINUX_TEST_RESULT([timer_delete], [ ++
AC_DEFINE(HAVE_TIMER_DELETE, 1, ++ [timer_delete() is available]) ++ ],[ ++ AC_DEFINE(timer_delete(t), del_timer(t), ++ [timer_delete() is not available]) ++ ]) ++]) # LIBCFS_TIMER_DELETE ++ + dnl # + dnl # Generate and compile all of the kernel API test cases to determine + dnl # which interfaces are available. By invoking the kernel build system +@@ -2312,6 +2364,9 @@ AC_DEFUN([LIBCFS_PROG_LINUX_SRC], [ + LIBCFS_SRC_PARAM_SET_UINT_MINMAX + # 5.17 + LIBCFS_SRC_PDE_DATA_EXISTS ++ # 6.2 ++ LIBCFS_SRC_TIMER_DELETE_SYNC ++ LIBCFS_SRC_TIMER_DELETE + + LB2_LINUX_TEST_COMPILE_ALL([libcfs], + [for available kernel interfaces to libcfs]) +@@ -2450,6 +2505,9 @@ AC_DEFUN([LIBCFS_PROG_LINUX_RESULTS], [ + LIBCFS_PARAM_SET_UINT_MINMAX + # 5.17 + LIBCFS_PDE_DATA_EXISTS ++ # 6.2 ++ LIBCFS_TIMER_DELETE_SYNC ++ LIBCFS_TIMER_DELETE + ]) + + # +diff --git a/lnet/klnds/gnilnd/gnilnd_cb.c b/lnet/klnds/gnilnd/gnilnd_cb.c +index 6be0b2a74b..abf88d3928 100644 +--- a/lnet/klnds/gnilnd/gnilnd_cb.c ++++ b/lnet/klnds/gnilnd/gnilnd_cb.c +@@ -2988,7 +2988,7 @@ kgnilnd_reaper(void *arg) + CDEBUG(D_INFO, "awake after schedule\n"); + } + +- del_singleshot_timer_sync(&timer); ++ timer_delete_sync(&timer); + spin_lock(&kgnilnd_data.kgn_reaper_lock); + finish_wait(&kgnilnd_data.kgn_reaper_waitq, &wait); + continue; +@@ -3805,7 +3805,7 @@ kgnilnd_process_rdmaq(kgn_device_t *dev) + /* if we think we need to adjust, take lock to serialize and recheck */ + spin_lock(&dev->gnd_rdmaq_lock); + if (time_after_eq(jiffies, dev->gnd_rdmaq_deadline)) { +- del_singleshot_timer_sync(&dev->gnd_rdmaq_timer); ++ timer_delete_sync(&dev->gnd_rdmaq_timer); + + dead_bump = cfs_time_seconds(1) / *kgnilnd_tunables.kgn_rdmaq_intervals; + +@@ -4636,7 +4636,7 @@ kgnilnd_process_mapped_tx(kgn_device_t *dev) + spin_lock(&dev->gnd_lock); + if (list_empty(&dev->gnd_map_tx)) { + /* if the list is empty make sure we dont have a timer running */ +- del_singleshot_timer_sync(&dev->gnd_map_timer); ++
timer_delete_sync(&dev->gnd_map_timer); + spin_unlock(&dev->gnd_lock); + RETURN(0); + } +@@ -4663,7 +4663,7 @@ kgnilnd_process_mapped_tx(kgn_device_t *dev) + } + + /* delete the previous timer if it exists */ +- del_singleshot_timer_sync(&dev->gnd_map_timer); ++ timer_delete_sync(&dev->gnd_map_timer); + /* stash the last map version to let us know when a good one was seen */ + last_map_version = dev->gnd_map_version; + +diff --git a/lnet/klnds/gnilnd/gnilnd_conn.c b/lnet/klnds/gnilnd/gnilnd_conn.c +index 38dce1d475..9ac954957f 100644 +--- a/lnet/klnds/gnilnd/gnilnd_conn.c ++++ b/lnet/klnds/gnilnd/gnilnd_conn.c +@@ -2545,7 +2545,7 @@ kgnilnd_dgram_mover(void *arg) + deadline = jiffies + cfs_time_seconds(*kgnilnd_tunables.kgn_dgram_timeout); + } + +- del_singleshot_timer_sync(&timer.timer); ++ timer_delete_sync(&timer.timer); + finish_wait(&dev->gnd_dgram_waitq, &wait); + } + +diff --git a/lnet/klnds/kfilnd/kfilnd_tn.c b/lnet/klnds/kfilnd/kfilnd_tn.c +new file mode 100644 +index 0000000000..59fe030fb4 +--- /dev/null ++++ b/lnet/klnds/kfilnd/kfilnd_tn.c +@@ -0,0 +1,1646 @@ ++/* ++ * GPL HEADER START ++ * ++ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 only, ++ * as published by the Free Software Foundation. ++ * ++ * This program is distributed in the hope that it will be useful, but ++ * WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * General Public License version 2 for more details (a copy is included ++ * in the LICENSE file that accompanied this code). 
++ * ++ * You should have received a copy of the GNU General Public License ++ * version 2 along with this program; If not, see ++ * http://www.gnu.org/licenses/gpl-2.0.html ++ * ++ * GPL HEADER END ++ */ ++/* ++ * Copyright 2022 Hewlett Packard Enterprise Development LP ++ */ ++/* ++ * This file is part of Lustre, http://www.lustre.org/ ++ */ ++/* ++ * kfilnd transaction and state machine processing. ++ */ ++ ++#include "kfilnd_tn.h" ++#include "kfilnd_ep.h" ++#include "kfilnd_dev.h" ++#include "kfilnd_dom.h" ++#include "kfilnd_peer.h" ++#include ++ ++static struct kmem_cache *tn_cache; ++static struct kmem_cache *imm_buf_cache; ++ ++static __sum16 kfilnd_tn_cksum(void *ptr, int nob) ++{ ++ if (cksum) ++ return csum_fold(csum_partial(ptr, nob, 0)); ++ return NO_CHECKSUM; ++} ++ ++static int kfilnd_tn_msgtype2size(enum kfilnd_msg_type type) ++{ ++ const int hdr_size = offsetof(struct kfilnd_msg, proto); ++ ++ switch (type) { ++ case KFILND_MSG_IMMEDIATE: ++ return offsetof(struct kfilnd_msg, proto.immed.payload[0]); ++ ++ case KFILND_MSG_BULK_PUT_REQ: ++ case KFILND_MSG_BULK_GET_REQ: ++ return hdr_size + sizeof(struct kfilnd_bulk_req_msg); ++ ++ default: ++ return -1; ++ } ++} ++ ++static void kfilnd_tn_pack_hello_req(struct kfilnd_transaction *tn) ++{ ++ struct kfilnd_msg *msg = tn->tn_tx_msg.msg; ++ ++ /* Pack the protocol header and payload. */ ++ msg->proto.hello.version = KFILND_MSG_VERSION; ++ msg->proto.hello.rx_base = kfilnd_peer_target_rx_base(tn->tn_kp); ++ msg->proto.hello.session_key = tn->tn_kp->kp_local_session_key; ++ ++ /* TODO: Support multiple RX contexts per peer. */ ++ msg->proto.hello.rx_count = 1; ++ ++ /* Pack the transport header. */ ++ msg->magic = KFILND_MSG_MAGIC; ++ ++ /* Message version zero is only valid for hello requests.
*/ ++ msg->version = 0; ++ msg->type = KFILND_MSG_HELLO_REQ; ++ msg->nob = sizeof(struct kfilnd_hello_msg) + ++ offsetof(struct kfilnd_msg, proto); ++ msg->cksum = NO_CHECKSUM; ++ msg->srcnid = lnet_nid_to_nid4(&tn->tn_ep->end_dev->kfd_ni->ni_nid); ++ msg->dstnid = tn->tn_kp->kp_nid; ++ ++ /* Checksum entire message. */ ++ msg->cksum = kfilnd_tn_cksum(msg, msg->nob); ++ ++ tn->tn_tx_msg.length = msg->nob; ++} ++ ++static void kfilnd_tn_pack_hello_rsp(struct kfilnd_transaction *tn) ++{ ++ struct kfilnd_msg *msg = tn->tn_tx_msg.msg; ++ ++ /* Pack the protocol header and payload. */ ++ msg->proto.hello.version = tn->tn_kp->kp_version; ++ msg->proto.hello.rx_base = kfilnd_peer_target_rx_base(tn->tn_kp); ++ msg->proto.hello.session_key = tn->tn_kp->kp_local_session_key; ++ ++ /* TODO: Support multiple RX contexts per peer. */ ++ msg->proto.hello.rx_count = 1; ++ ++ /* Pack the transport header. */ ++ msg->magic = KFILND_MSG_MAGIC; ++ ++ /* Message version zero is only valid for hello messages. */ ++ msg->version = 0; ++ msg->type = KFILND_MSG_HELLO_RSP; ++ msg->nob = sizeof(struct kfilnd_hello_msg) + ++ offsetof(struct kfilnd_msg, proto); ++ msg->cksum = NO_CHECKSUM; ++ msg->srcnid = lnet_nid_to_nid4(&tn->tn_ep->end_dev->kfd_ni->ni_nid); ++ msg->dstnid = tn->tn_kp->kp_nid; ++ ++ /* Checksum entire message. */ ++ msg->cksum = kfilnd_tn_cksum(msg, msg->nob); ++ ++ tn->tn_tx_msg.length = msg->nob; ++} ++ ++static void kfilnd_tn_pack_bulk_req(struct kfilnd_transaction *tn) ++{ ++ struct kfilnd_msg *msg = tn->tn_tx_msg.msg; ++ ++ /* Pack the protocol header and payload. */ ++ lnet_hdr_to_nid4(&tn->tn_lntmsg->msg_hdr, &msg->proto.bulk_req.hdr); ++ msg->proto.bulk_req.key = tn->tn_mr_key; ++ msg->proto.bulk_req.response_rx = tn->tn_response_rx; ++ ++ /* Pack the transport header.
*/ ++ msg->magic = KFILND_MSG_MAGIC; ++ msg->version = KFILND_MSG_VERSION; ++ msg->type = tn->msg_type; ++ msg->nob = sizeof(struct kfilnd_bulk_req_msg) + ++ offsetof(struct kfilnd_msg, proto); ++ msg->cksum = NO_CHECKSUM; ++ msg->srcnid = lnet_nid_to_nid4(&tn->tn_ep->end_dev->kfd_ni->ni_nid); ++ msg->dstnid = tn->tn_kp->kp_nid; ++ ++ /* Checksum entire message. */ ++ msg->cksum = kfilnd_tn_cksum(msg, msg->nob); ++ ++ tn->tn_tx_msg.length = msg->nob; ++} ++ ++static void kfilnd_tn_pack_immed_msg(struct kfilnd_transaction *tn) ++{ ++ struct kfilnd_msg *msg = tn->tn_tx_msg.msg; ++ ++ /* Pack the protocol header and payload. */ ++ lnet_hdr_to_nid4(&tn->tn_lntmsg->msg_hdr, &msg->proto.immed.hdr); ++ ++ lnet_copy_kiov2flat(KFILND_IMMEDIATE_MSG_SIZE, ++ msg, ++ offsetof(struct kfilnd_msg, ++ proto.immed.payload), ++ tn->tn_num_iovec, tn->tn_kiov, 0, ++ tn->tn_nob); ++ ++ /* Pack the transport header. */ ++ msg->magic = KFILND_MSG_MAGIC; ++ msg->version = KFILND_MSG_VERSION; ++ msg->type = tn->msg_type; ++ msg->nob = offsetof(struct kfilnd_msg, proto.immed.payload[tn->tn_nob]); ++ msg->cksum = NO_CHECKSUM; ++ msg->srcnid = lnet_nid_to_nid4(&tn->tn_ep->end_dev->kfd_ni->ni_nid); ++ msg->dstnid = tn->tn_kp->kp_nid; ++ ++ /* Checksum entire message. */ ++ msg->cksum = kfilnd_tn_cksum(msg, msg->nob); ++ ++ tn->tn_tx_msg.length = msg->nob; ++} ++ ++static int kfilnd_tn_unpack_msg(struct kfilnd_ep *ep, struct kfilnd_msg *msg, ++ unsigned int nob) ++{ ++ const unsigned int hdr_size = offsetof(struct kfilnd_msg, proto); ++ ++ if (nob < hdr_size) { ++ KFILND_EP_ERROR(ep, "Short message: %u", nob); ++ return -EPROTO; ++ } ++ ++ /* TODO: Support byte swapping on mixed endian systems. */ ++ if (msg->magic != KFILND_MSG_MAGIC) { ++ KFILND_EP_ERROR(ep, "Bad magic: %#x", msg->magic); ++ return -EPROTO; ++ } ++ ++ /* TODO: Allow for older versions. 
*/ ++ if (msg->version > KFILND_MSG_VERSION) { ++ KFILND_EP_ERROR(ep, "Bad version: %#x", msg->version); ++ return -EPROTO; ++ } ++ ++ if (msg->nob > nob) { ++ KFILND_EP_ERROR(ep, "Short message: got=%u, expected=%u", nob, ++ msg->nob); ++ return -EPROTO; ++ } ++ ++ /* If kfilnd_tn_cksum() returns a non-zero value, checksum is bad. */ ++ if (msg->cksum != NO_CHECKSUM && kfilnd_tn_cksum(msg, msg->nob)) { ++ KFILND_EP_ERROR(ep, "Bad checksum"); ++ return -EPROTO; ++ } ++ ++ if (msg->dstnid != lnet_nid_to_nid4(&ep->end_dev->kfd_ni->ni_nid)) { ++ KFILND_EP_ERROR(ep, "Bad destination nid: %s", ++ libcfs_nid2str(msg->dstnid)); ++ return -EPROTO; ++ } ++ ++ if (msg->srcnid == LNET_NID_ANY) { ++ KFILND_EP_ERROR(ep, "Bad source nid: %s", ++ libcfs_nid2str(msg->srcnid)); ++ return -EPROTO; ++ } ++ ++ if (msg->nob < kfilnd_tn_msgtype2size(msg->type)) { ++ KFILND_EP_ERROR(ep, "Short %s: %d(%d)\n", ++ msg_type_to_str(msg->type), ++ msg->nob, kfilnd_tn_msgtype2size(msg->type)); ++ return -EPROTO; ++ } ++ ++ switch ((enum kfilnd_msg_type)msg->type) { ++ case KFILND_MSG_IMMEDIATE: ++ case KFILND_MSG_BULK_PUT_REQ: ++ case KFILND_MSG_BULK_GET_REQ: ++ if (msg->version == 0) { ++ KFILND_EP_ERROR(ep, ++ "Bad message type and version: type=%s version=%u", ++ msg_type_to_str(msg->type), ++ msg->version); ++ return -EPROTO; ++ } ++ break; ++ ++ case KFILND_MSG_HELLO_REQ: ++ case KFILND_MSG_HELLO_RSP: ++ if (msg->version != 0) { ++ KFILND_EP_ERROR(ep, ++ "Bad message type and version: type=%s version=%u", ++ msg_type_to_str(msg->type), ++ msg->version); ++ return -EPROTO; ++ } ++ break; ++ ++ default: ++ CERROR("Unknown message type %x\n", msg->type); ++ return -EPROTO; ++ } ++ return 0; ++} ++ ++static void kfilnd_tn_record_state_change(struct kfilnd_transaction *tn) ++{ ++ unsigned int data_size_bucket = ++ kfilnd_msg_len_to_data_size_bucket(tn->lnet_msg_len); ++ struct kfilnd_tn_duration_stat *stat; ++ ++ if (tn->is_initiator) ++ stat = 
&tn->tn_ep->end_dev->initiator_state_stats.state[tn->tn_state].data_size[data_size_bucket]; ++ else ++ stat = &tn->tn_ep->end_dev->target_state_stats.state[tn->tn_state].data_size[data_size_bucket]; ++ ++ atomic64_add(ktime_to_ns(ktime_sub(ktime_get(), tn->tn_state_ts)), ++ &stat->accumulated_duration); ++ atomic_inc(&stat->accumulated_count); ++} ++ ++static void kfilnd_tn_state_change(struct kfilnd_transaction *tn, ++ enum tn_states new_state) ++{ ++ KFILND_TN_DEBUG(tn, "%s -> %s state change", ++ tn_state_to_str(tn->tn_state), ++ tn_state_to_str(new_state)); ++ ++ kfilnd_tn_record_state_change(tn); ++ ++ tn->tn_state = new_state; ++ tn->tn_state_ts = ktime_get(); ++} ++ ++static void kfilnd_tn_status_update(struct kfilnd_transaction *tn, int status, ++ enum lnet_msg_hstatus hstatus) ++{ ++ /* Only the first non-ok status will take. */ ++ if (tn->tn_status == 0) { ++ KFILND_TN_DEBUG(tn, "%d -> %d status change", tn->tn_status, ++ status); ++ tn->tn_status = status; ++ } ++ ++ if (tn->hstatus == LNET_MSG_STATUS_OK) { ++ KFILND_TN_DEBUG(tn, "%d -> %d health status change", ++ tn->hstatus, hstatus); ++ tn->hstatus = hstatus; ++ } ++} ++ ++static bool kfilnd_tn_has_failed(struct kfilnd_transaction *tn) ++{ ++ return tn->tn_status != 0; ++} ++ ++/** ++ * kfilnd_tn_process_rx_event() - Process an immediate receive event. ++ * ++ * For each immediate receive, a transaction structure needs to be allocated to ++ * process the receive. 
++ */ ++void kfilnd_tn_process_rx_event(struct kfilnd_immediate_buffer *bufdesc, ++ struct kfilnd_msg *rx_msg, int msg_size) ++{ ++ struct kfilnd_transaction *tn; ++ bool alloc_msg = true; ++ int rc; ++ enum tn_events event = TN_EVENT_RX_HELLO; ++ ++ /* Increment buf ref count for this work */ ++ atomic_inc(&bufdesc->immed_ref); ++ ++ /* Unpack the message */ ++ rc = kfilnd_tn_unpack_msg(bufdesc->immed_end, rx_msg, msg_size); ++ if (rc || CFS_FAIL_CHECK(CFS_KFI_FAIL_MSG_UNPACK)) { ++ kfilnd_ep_imm_buffer_put(bufdesc); ++ KFILND_EP_ERROR(bufdesc->immed_end, ++ "Failed to unpack message %d", rc); ++ return; ++ } ++ ++ switch ((enum kfilnd_msg_type)rx_msg->type) { ++ case KFILND_MSG_IMMEDIATE: ++ case KFILND_MSG_BULK_PUT_REQ: ++ case KFILND_MSG_BULK_GET_REQ: ++ event = TN_EVENT_RX_OK; ++ fallthrough; ++ case KFILND_MSG_HELLO_RSP: ++ alloc_msg = false; ++ fallthrough; ++ case KFILND_MSG_HELLO_REQ: ++ /* Context points to a received buffer and status is the length. ++ * Allocate a Tn structure, set its values, then launch the ++ * receive. 
++ */ ++ tn = kfilnd_tn_alloc(bufdesc->immed_end->end_dev, ++ bufdesc->immed_end->end_cpt, ++ rx_msg->srcnid, alloc_msg, false, ++ false); ++ if (IS_ERR(tn)) { ++ kfilnd_ep_imm_buffer_put(bufdesc); ++ KFILND_EP_ERROR(bufdesc->immed_end, ++ "Failed to allocate transaction struct: rc=%ld", ++ PTR_ERR(tn)); ++ return; ++ } ++ ++ tn->tn_rx_msg.msg = rx_msg; ++ tn->tn_rx_msg.length = msg_size; ++ tn->tn_posted_buf = bufdesc; ++ ++ KFILND_EP_DEBUG(bufdesc->immed_end, "%s transaction ID %u", ++ msg_type_to_str((enum kfilnd_msg_type)rx_msg->type), ++ tn->tn_mr_key); ++ break; ++ ++ default: ++ KFILND_EP_ERROR(bufdesc->immed_end, ++ "Unhandled kfilnd message type: %d", ++ (enum kfilnd_msg_type)rx_msg->type); ++ LBUG(); ++ }; ++ ++ kfilnd_tn_event_handler(tn, event, 0); ++} ++ ++static void kfilnd_tn_record_duration(struct kfilnd_transaction *tn) ++{ ++ unsigned int data_size_bucket = ++ kfilnd_msg_len_to_data_size_bucket(tn->lnet_msg_len); ++ struct kfilnd_tn_duration_stat *stat; ++ ++ if (tn->is_initiator) ++ stat = &tn->tn_ep->end_dev->initiator_stats.data_size[data_size_bucket]; ++ else ++ stat = &tn->tn_ep->end_dev->target_stats.data_size[data_size_bucket]; ++ ++ atomic64_add(ktime_to_ns(ktime_sub(ktime_get(), tn->tn_alloc_ts)), ++ &stat->accumulated_duration); ++ atomic_inc(&stat->accumulated_count); ++} ++ ++/** ++ * kfilnd_tn_finalize() - Cleanup resources and finalize LNet operation. ++ * ++ * All state machine functions should call kfilnd_tn_finalize() instead of ++ * kfilnd_tn_free(). Once all expected asynchronous events have been received, ++ * if the transaction lock has not been released, it will now be released, ++ * transaction resources cleaned up, and LNet finalized will be called. ++ */ ++static void kfilnd_tn_finalize(struct kfilnd_transaction *tn, bool *tn_released) ++{ ++ if (!*tn_released) { ++ mutex_unlock(&tn->tn_lock); ++ *tn_released = true; ++ } ++ ++ /* Release the reference on the multi-receive buffer. 
*/ ++ if (tn->tn_posted_buf) ++ kfilnd_ep_imm_buffer_put(tn->tn_posted_buf); ++ ++ /* Finalize LNet operation. */ ++ if (tn->tn_lntmsg) { ++ tn->tn_lntmsg->msg_health_status = tn->hstatus; ++ lnet_finalize(tn->tn_lntmsg, tn->tn_status); ++ } ++ ++ if (tn->tn_getreply) { ++ tn->tn_getreply->msg_health_status = tn->hstatus; ++ lnet_set_reply_msg_len(tn->tn_ep->end_dev->kfd_ni, ++ tn->tn_getreply, ++ tn->tn_status ? 0 : tn->tn_nob); ++ lnet_finalize(tn->tn_getreply, tn->tn_status); ++ } ++ ++ if (KFILND_TN_PEER_VALID(tn)) ++ kfilnd_peer_put(tn->tn_kp); ++ ++ kfilnd_tn_record_state_change(tn); ++ kfilnd_tn_record_duration(tn); ++ ++ kfilnd_tn_free(tn); ++} ++ ++/** ++ * kfilnd_tn_cancel_tag_recv() - Attempt to cancel a tagged receive. ++ * @tn: Transaction to have tagged received cancelled. ++ * ++ * Return: 0 on success. Else, negative errno. If an error occurs, resources may ++ * be leaked. ++ */ ++static int kfilnd_tn_cancel_tag_recv(struct kfilnd_transaction *tn) ++{ ++ int rc; ++ ++ /* Issue a cancel. A return code of zero means the operation issued an ++ * async cancel. A return code of -ENOENT means the tagged receive was ++ * not found. The assumption here is that a tagged send landed thus ++ * removing the tagged receive buffer from hardware. For both cases, ++ * async events should occur. ++ */ ++ rc = kfilnd_ep_cancel_tagged_recv(tn->tn_ep, tn); ++ if (rc != 0 && rc != -ENOENT) { ++ KFILND_TN_ERROR(tn, "Failed to cancel tag receive. 
Resources may leak."); ++ return rc; ++ } ++ ++ return 0; ++} ++ ++static void kfilnd_tn_timeout_work(struct work_struct *work) ++{ ++ struct kfilnd_transaction *tn = ++ container_of(work, struct kfilnd_transaction, timeout_work); ++ ++ KFILND_TN_ERROR(tn, "Bulk operation timeout"); ++ kfilnd_tn_event_handler(tn, TN_EVENT_TIMEOUT, 0); ++} ++ ++static void kfilnd_tn_timeout(cfs_timer_cb_arg_t data) ++{ ++ struct kfilnd_transaction *tn = cfs_from_timer(tn, data, timeout_timer); ++ ++ queue_work(kfilnd_wq, &tn->timeout_work); ++} ++ ++static bool kfilnd_tn_timeout_cancel(struct kfilnd_transaction *tn) ++{ ++ return timer_delete(&tn->timeout_timer); ++} ++ ++static void kfilnd_tn_timeout_enable(struct kfilnd_transaction *tn) ++{ ++ ktime_t remaining_time = max_t(ktime_t, 0, ++ tn->deadline - ktime_get_seconds()); ++ unsigned long expires = remaining_time * HZ + jiffies; ++ ++ if (CFS_FAIL_CHECK(CFS_KFI_FAIL_BULK_TIMEOUT)) ++ expires = jiffies; ++ ++ cfs_timer_setup(&tn->timeout_timer, kfilnd_tn_timeout, ++ (unsigned long)tn, 0); ++ mod_timer(&tn->timeout_timer, expires); ++} ++ ++/* The following are the state machine routines for the transactions. */ ++static int kfilnd_tn_state_send_failed(struct kfilnd_transaction *tn, ++ enum tn_events event, int status, ++ bool *tn_released) ++{ ++ int rc; ++ ++ KFILND_TN_DEBUG(tn, "%s event status %d", tn_event_to_str(event), ++ status); ++ ++ switch (event) { ++ case TN_EVENT_INIT_BULK: ++ /* Need to cancel the tagged receive to prevent resources from ++ * being leaked. ++ */ ++ rc = kfilnd_tn_cancel_tag_recv(tn); ++ ++ switch (rc) { ++ /* Async event will progress transaction. */ ++ case 0: ++ kfilnd_tn_state_change(tn, TN_STATE_FAIL); ++ return 0; ++ ++ /* Need to replay TN_EVENT_INIT_BULK event while in the ++ * TN_STATE_SEND_FAILED state. 
++ */ ++ case -EAGAIN: ++ KFILND_TN_DEBUG(tn, ++ "Need to replay cancel tagged recv"); ++ return -EAGAIN; ++ ++ default: ++ KFILND_TN_ERROR(tn, ++ "Unexpected error during cancel tagged receive: rc=%d", ++ rc); ++ LBUG(); ++ } ++ break; ++ ++ default: ++ KFILND_TN_ERROR(tn, "Invalid %s event", tn_event_to_str(event)); ++ LBUG(); ++ } ++} ++ ++static int kfilnd_tn_state_tagged_recv_posted(struct kfilnd_transaction *tn, ++ enum tn_events event, int status, ++ bool *tn_released) ++{ ++ int rc; ++ ++ KFILND_TN_DEBUG(tn, "%s event status %d", tn_event_to_str(event), ++ status); ++ ++ switch (event) { ++ case TN_EVENT_INIT_BULK: ++ tn->tn_target_addr = kfilnd_peer_get_kfi_addr(tn->tn_kp); ++ KFILND_TN_DEBUG(tn, "Using peer %s(%#llx)", ++ libcfs_nid2str(tn->tn_kp->kp_nid), ++ tn->tn_target_addr); ++ ++ kfilnd_tn_pack_bulk_req(tn); ++ ++ rc = kfilnd_ep_post_send(tn->tn_ep, tn); ++ switch (rc) { ++ /* Async event will progress immediate send. */ ++ case 0: ++ kfilnd_tn_state_change(tn, TN_STATE_WAIT_COMP); ++ return 0; ++ ++ /* Need to replay TN_EVENT_INIT_BULK event while in the ++ * TN_STATE_TAGGED_RECV_POSTED state. ++ */ ++ case -EAGAIN: ++ KFILND_TN_DEBUG(tn, ++ "Need to replay post send to %s(%#llx)", ++ libcfs_nid2str(tn->tn_kp->kp_nid), ++ tn->tn_target_addr); ++ return -EAGAIN; ++ ++ /* Need to transition to the TN_STATE_SEND_FAILED to cleanup ++ * posted tagged receive buffer. ++ */ ++ default: ++ KFILND_TN_ERROR(tn, ++ "Failed to post send to %s(%#llx): rc=%d", ++ libcfs_nid2str(tn->tn_kp->kp_nid), ++ tn->tn_target_addr, rc); ++ kfilnd_tn_status_update(tn, rc, ++ LNET_MSG_STATUS_LOCAL_ERROR); ++ kfilnd_tn_state_change(tn, TN_STATE_SEND_FAILED); ++ ++ /* Propagate TN_EVENT_INIT_BULK event to ++ * TN_STATE_SEND_FAILED handler.
++ */ ++ return kfilnd_tn_state_send_failed(tn, event, rc, ++ tn_released); ++ } ++ ++ default: ++ KFILND_TN_ERROR(tn, "Invalid %s event", tn_event_to_str(event)); ++ LBUG(); ++ } ++} ++ ++static int kfilnd_tn_state_idle(struct kfilnd_transaction *tn, ++ enum tn_events event, int status, ++ bool *tn_released) ++{ ++ struct kfilnd_msg *msg; ++ int rc = 0; ++ bool finalize = false; ++ struct lnet_hdr hdr; ++ struct lnet_nid srcnid; ++ ++ KFILND_TN_DEBUG(tn, "%s event status %d", tn_event_to_str(event), ++ status); ++ ++ /* For new peers, send a hello request message and queue the true LNet ++ * message for replay. ++ */ ++ if (kfilnd_peer_needs_throttle(tn->tn_kp) && ++ (event == TN_EVENT_INIT_IMMEDIATE || event == TN_EVENT_INIT_BULK)) { ++ if (kfilnd_peer_deleted(tn->tn_kp)) { ++ /* We'll assign a NETWORK_TIMEOUT message health status ++ * below because we don't know why this peer was marked ++ * for removal ++ */ ++ rc = -ESTALE; ++ KFILND_TN_DEBUG(tn, "Drop message to deleted peer"); ++ } else if (kfilnd_peer_needs_hello(tn->tn_kp, false)) { ++ /* We're throttling transactions to this peer until ++ * a handshake can be completed, but there is no HELLO ++ * currently in flight. This implies the HELLO has ++ * failed, and we should cancel this TN. Otherwise we ++ * are stuck waiting for the TN deadline. ++ * ++ * We assign NETWORK_TIMEOUT health status below because ++ * we do not know why the HELLO failed. ++ */ ++ rc = -ECANCELED; ++ KFILND_TN_DEBUG(tn, "Cancel throttled TN"); ++ } else if (ktime_before(ktime_get_seconds(), ++ tn->tn_replay_deadline)) { ++ /* If the transaction replay deadline has not been met, ++ * then return -EAGAIN. This will cause this transaction ++ * event to be replayed. 
During this time, an async ++ * hello message from the peer should occur at which ++ * point we can resume sending new messages to this peer ++ */ ++ KFILND_TN_DEBUG(tn, "hello response pending"); ++ return -EAGAIN; ++ } else { ++ rc = -ETIMEDOUT; ++ } ++ ++ kfilnd_tn_status_update(tn, rc, ++ LNET_MSG_STATUS_NETWORK_TIMEOUT); ++ rc = 0; ++ goto out; ++ } ++ ++ if ((event == TN_EVENT_INIT_IMMEDIATE || event == TN_EVENT_INIT_BULK) && ++ ktime_after(ktime_get_seconds(), tn->tn_replay_deadline)) { ++ kfilnd_tn_status_update(tn, -ETIMEDOUT, ++ LNET_MSG_STATUS_NETWORK_TIMEOUT); ++ rc = 0; ++ goto out; ++ } ++ ++ switch (event) { ++ case TN_EVENT_INIT_IMMEDIATE: ++ case TN_EVENT_TX_HELLO: ++ tn->tn_target_addr = kfilnd_peer_get_kfi_addr(tn->tn_kp); ++ KFILND_TN_DEBUG(tn, "Using peer %s(%#llx)", ++ libcfs_nid2str(tn->tn_kp->kp_nid), ++ tn->tn_target_addr); ++ ++ if (event == TN_EVENT_INIT_IMMEDIATE) ++ kfilnd_tn_pack_immed_msg(tn); ++ else ++ kfilnd_tn_pack_hello_req(tn); ++ ++ /* Send immediate message. */ ++ rc = kfilnd_ep_post_send(tn->tn_ep, tn); ++ switch (rc) { ++ /* Async event will progress immediate send. */ ++ case 0: ++ kfilnd_tn_state_change(tn, TN_STATE_IMM_SEND); ++ return 0; ++ ++ /* Need to TN_EVENT_INIT_IMMEDIATE event while in TN_STATE_IDLE ++ * state. ++ */ ++ case -EAGAIN: ++ KFILND_TN_DEBUG(tn, "Need to replay send to %s(%#llx)", ++ libcfs_nid2str(tn->tn_kp->kp_nid), ++ tn->tn_target_addr); ++ return -EAGAIN; ++ ++ default: ++ KFILND_TN_ERROR(tn, ++ "Failed to post send to %s(%#llx): rc=%d", ++ libcfs_nid2str(tn->tn_kp->kp_nid), ++ tn->tn_target_addr, rc); ++ if (event == TN_EVENT_TX_HELLO) ++ kfilnd_peer_clear_hello_pending(tn->tn_kp); ++ kfilnd_tn_status_update(tn, rc, ++ LNET_MSG_STATUS_LOCAL_ERROR); ++ } ++ break; ++ ++ case TN_EVENT_INIT_BULK: ++ /* Post tagged receive buffer used to land bulk response. */ ++ rc = kfilnd_ep_post_tagged_recv(tn->tn_ep, tn); ++ ++ switch (rc) { ++ /* Transition to TN_STATE_TAGGED_RECV_POSTED on success. 
*/ ++ case 0: ++ kfilnd_tn_state_change(tn, TN_STATE_TAGGED_RECV_POSTED); ++ ++ /* Propogate TN_EVENT_INIT_BULK event to ++ * TN_STATE_TAGGED_RECV_POSTED handler. ++ */ ++ return kfilnd_tn_state_tagged_recv_posted(tn, event, ++ rc, ++ tn_released); ++ ++ /* Need to replay TN_EVENT_INIT_BULK event in the TN_STATE_IDLE ++ * state. ++ */ ++ case -EAGAIN: ++ KFILND_TN_DEBUG(tn, "Need to replay tagged recv"); ++ return -EAGAIN; ++ ++ default: ++ KFILND_TN_ERROR(tn, "Failed to post tagged recv %d", ++ rc); ++ kfilnd_tn_status_update(tn, rc, ++ LNET_MSG_STATUS_LOCAL_ERROR); ++ } ++ break; ++ ++ case TN_EVENT_RX_OK: ++ if (kfilnd_peer_needs_hello(tn->tn_kp, false)) { ++ rc = kfilnd_send_hello_request(tn->tn_ep->end_dev, ++ tn->tn_ep->end_cpt, ++ tn->tn_kp); ++ if (rc) ++ KFILND_TN_ERROR(tn, ++ "Failed to send hello request: rc=%d", ++ rc); ++ rc = 0; ++ } ++ ++ /* If this is a new peer then we cannot progress the transaction ++ * and must drop it ++ */ ++ if (kfilnd_peer_is_new_peer(tn->tn_kp)) { ++ KFILND_TN_ERROR(tn, ++ "Dropping message from %s due to stale peer", ++ libcfs_nid2str(tn->tn_kp->kp_nid)); ++ kfilnd_tn_status_update(tn, -EPROTO, ++ LNET_MSG_STATUS_LOCAL_DROPPED); ++ rc = 0; ++ goto out; ++ } ++ ++ LASSERT(kfilnd_peer_is_new_peer(tn->tn_kp) == false); ++ msg = tn->tn_rx_msg.msg; ++ ++ /* Update the NID address with the new preferred RX context. */ ++ kfilnd_peer_alive(tn->tn_kp); ++ ++ /* Pass message up to LNet ++ * The TN will be reused in this call chain so we need to ++ * release the lock on the TN before proceeding. ++ */ ++ KFILND_TN_DEBUG(tn, "%s -> TN_STATE_IMM_RECV state change", ++ tn_state_to_str(tn->tn_state)); ++ ++ /* TODO: Do not manually update this state change. 
*/ ++ tn->tn_state = TN_STATE_IMM_RECV; ++ mutex_unlock(&tn->tn_lock); ++ *tn_released = true; ++ lnet_nid4_to_nid(msg->srcnid, &srcnid); ++ if (msg->type == KFILND_MSG_IMMEDIATE) { ++ lnet_hdr_from_nid4(&hdr, &msg->proto.immed.hdr); ++ rc = lnet_parse(tn->tn_ep->end_dev->kfd_ni, ++ &hdr, &srcnid, tn, 0); ++ } else { ++ lnet_hdr_from_nid4(&hdr, &msg->proto.bulk_req.hdr); ++ rc = lnet_parse(tn->tn_ep->end_dev->kfd_ni, ++ &hdr, &srcnid, tn, 1); ++ } ++ ++ /* If successful, transaction has been accepted by LNet and we ++ * cannot process the transaction anymore within this context. ++ */ ++ if (!rc) ++ return 0; ++ ++ KFILND_TN_ERROR(tn, "Failed to parse LNet message: rc=%d", rc); ++ kfilnd_tn_status_update(tn, rc, LNET_MSG_STATUS_LOCAL_ERROR); ++ break; ++ ++ case TN_EVENT_RX_HELLO: ++ msg = tn->tn_rx_msg.msg; ++ ++ kfilnd_peer_alive(tn->tn_kp); ++ ++ switch (msg->type) { ++ case KFILND_MSG_HELLO_REQ: ++ kfilnd_peer_process_hello(tn->tn_kp, msg); ++ tn->tn_target_addr = kfilnd_peer_get_kfi_addr(tn->tn_kp); ++ KFILND_TN_DEBUG(tn, "Using peer %s(%#llx)", ++ libcfs_nid2str(tn->tn_kp->kp_nid), ++ tn->tn_target_addr); ++ ++ kfilnd_tn_pack_hello_rsp(tn); ++ ++ /* Send immediate message. 
*/ ++ rc = kfilnd_ep_post_send(tn->tn_ep, tn); ++ switch (rc) { ++ case 0: ++ kfilnd_tn_state_change(tn, TN_STATE_IMM_SEND); ++ return 0; ++ ++ case -EAGAIN: ++ KFILND_TN_DEBUG(tn, "Need to replay send to %s(%#llx)", ++ libcfs_nid2str(tn->tn_kp->kp_nid), ++ tn->tn_target_addr); ++ return -EAGAIN; ++ ++ default: ++ KFILND_TN_ERROR(tn, ++ "Failed to post send to %s(%#llx): rc=%d", ++ libcfs_nid2str(tn->tn_kp->kp_nid), ++ tn->tn_target_addr, rc); ++ kfilnd_tn_status_update(tn, rc, ++ LNET_MSG_STATUS_LOCAL_ERROR); ++ } ++ break; ++ ++ case KFILND_MSG_HELLO_RSP: ++ rc = 0; ++ kfilnd_peer_process_hello(tn->tn_kp, msg); ++ finalize = true; ++ break; ++ ++ default: ++ KFILND_TN_ERROR(tn, "Invalid message type: %s", ++ msg_type_to_str(msg->type)); ++ LBUG(); ++ } ++ break; ++ ++ default: ++ KFILND_TN_ERROR(tn, "Invalid %s event", tn_event_to_str(event)); ++ LBUG(); ++ } ++ ++out: ++ if (kfilnd_tn_has_failed(tn)) ++ finalize = true; ++ ++ if (finalize) ++ kfilnd_tn_finalize(tn, tn_released); ++ ++ return rc; ++} ++ ++static int kfilnd_tn_state_imm_send(struct kfilnd_transaction *tn, ++ enum tn_events event, int status, ++ bool *tn_released) ++{ ++ enum lnet_msg_hstatus hstatus; ++ ++ KFILND_TN_DEBUG(tn, "%s event status %d", tn_event_to_str(event), ++ status); ++ ++ switch (event) { ++ case TN_EVENT_TX_FAIL: ++ if (status == -ETIMEDOUT || status == -EIO) ++ hstatus = LNET_MSG_STATUS_NETWORK_TIMEOUT; ++ else ++ hstatus = LNET_MSG_STATUS_REMOTE_ERROR; ++ ++ kfilnd_tn_status_update(tn, status, hstatus); ++ kfilnd_peer_tn_failed(tn->tn_kp, status); ++ if (tn->msg_type == KFILND_MSG_HELLO_REQ) ++ kfilnd_peer_clear_hello_pending(tn->tn_kp); ++ break; ++ ++ case TN_EVENT_TX_OK: ++ kfilnd_peer_alive(tn->tn_kp); ++ break; ++ ++ default: ++ KFILND_TN_ERROR(tn, "Invalid %s event", tn_event_to_str(event)); ++ LBUG(); ++ } ++ ++ kfilnd_tn_finalize(tn, tn_released); ++ ++ return 0; ++} ++ ++static int kfilnd_tn_state_imm_recv(struct kfilnd_transaction *tn, ++ enum tn_events event, int 
status, ++ bool *tn_released) ++{ ++ int rc = 0; ++ bool finalize = false; ++ ++ KFILND_TN_DEBUG(tn, "%s event status %d", tn_event_to_str(event), ++ status); ++ ++ switch (event) { ++ case TN_EVENT_INIT_TAG_RMA: ++ case TN_EVENT_SKIP_TAG_RMA: ++ /* Release the buffer we received the request on. All relevant ++ * information to perform the RMA operation is stored in the ++ * transaction structure. This should be done before the RMA ++ * operation to prevent two contexts from potentially processing ++ * the same transaction. ++ * ++ * TODO: Prevent this from returning -EAGAIN. ++ */ ++ if (tn->tn_posted_buf) { ++ kfilnd_ep_imm_buffer_put(tn->tn_posted_buf); ++ tn->tn_posted_buf = NULL; ++ } ++ ++ /* Update the KFI address to use the response RX context. */ ++ tn->tn_target_addr = ++ kfi_rx_addr(KFILND_BASE_ADDR(tn->tn_kp->kp_addr), ++ tn->tn_response_rx, KFILND_FAB_RX_CTX_BITS); ++ KFILND_TN_DEBUG(tn, "Using peer %s(0x%llx)", ++ libcfs_nid2str(tn->tn_kp->kp_nid), ++ tn->tn_target_addr); ++ ++ /* Initiate the RMA operation to push/pull the LNet payload or ++ * send a tagged message to finalize the bulk operation if the ++ * RMA operation should be skipped. ++ */ ++ if (event == TN_EVENT_INIT_TAG_RMA) { ++ if (tn->sink_buffer) ++ rc = kfilnd_ep_post_read(tn->tn_ep, tn); ++ else ++ rc = kfilnd_ep_post_write(tn->tn_ep, tn); ++ ++ switch (rc) { ++ /* Async tagged RMA event will progress transaction. */ ++ case 0: ++ kfilnd_tn_state_change(tn, ++ TN_STATE_WAIT_TAG_RMA_COMP); ++ return 0; ++ ++ /* Need to replay TN_EVENT_INIT_TAG_RMA event while in ++ * the TN_STATE_IMM_RECV state. ++ */ ++ case -EAGAIN: ++ KFILND_TN_DEBUG(tn, ++ "Need to replay tagged %s to %s(%#llx)", ++ tn->sink_buffer ? "read" : "write", ++ libcfs_nid2str(tn->tn_kp->kp_nid), ++ tn->tn_target_addr); ++ return -EAGAIN; ++ ++ default: ++ KFILND_TN_ERROR(tn, ++ "Failed to post tagged %s to %s(%#llx): rc=%d", ++ tn->sink_buffer ? 
"read" : "write", ++ libcfs_nid2str(tn->tn_kp->kp_nid), ++ tn->tn_target_addr, rc); ++ kfilnd_tn_status_update(tn, rc, ++ LNET_MSG_STATUS_LOCAL_ERROR); ++ } ++ } else { ++ kfilnd_tn_status_update(tn, status, ++ LNET_MSG_STATUS_OK); ++ ++ /* Since the LNet initiator has posted a unique tagged ++ * buffer specific for this LNet transaction and the ++ * LNet target has decide not to push/pull to/for the ++ * LNet initiator tagged buffer, a noop operation is ++ * done to this tagged buffer (i/e payload transfer size ++ * is zero). But, immediate data, which contains the ++ * LNet target status for the transaction, is sent to ++ * the LNet initiator. Immediate data only appears in ++ * the completion event at the LNet initiator and not in ++ * the tagged buffer. ++ */ ++ tn->tagged_data = cpu_to_be64(abs(tn->tn_status)); ++ ++ rc = kfilnd_ep_post_tagged_send(tn->tn_ep, tn); ++ switch (rc) { ++ /* Async tagged RMA event will progress transaction. */ ++ case 0: ++ kfilnd_tn_state_change(tn, ++ TN_STATE_WAIT_TAG_COMP); ++ return 0; ++ ++ /* Need to replay TN_EVENT_SKIP_TAG_RMA event while in ++ * the TN_STATE_IMM_RECV state. 
++ */ ++ case -EAGAIN: ++ KFILND_TN_DEBUG(tn, ++ "Need to replay tagged send to %s(%#llx)", ++ libcfs_nid2str(tn->tn_kp->kp_nid), ++ tn->tn_target_addr); ++ return -EAGAIN; ++ ++ default: ++ KFILND_TN_ERROR(tn, ++ "Failed to post tagged send to %s(%#llx): rc=%d", ++ libcfs_nid2str(tn->tn_kp->kp_nid), ++ tn->tn_target_addr, rc); ++ kfilnd_tn_status_update(tn, rc, ++ LNET_MSG_STATUS_LOCAL_ERROR); ++ } ++ } ++ break; ++ ++ case TN_EVENT_RX_OK: ++ finalize = true; ++ break; ++ ++ default: ++ KFILND_TN_ERROR(tn, "Invalid %s event", tn_event_to_str(event)); ++ LBUG(); ++ } ++ ++ if (kfilnd_tn_has_failed(tn)) ++ finalize = true; ++ ++ if (finalize) ++ kfilnd_tn_finalize(tn, tn_released); ++ ++ return rc; ++} ++ ++static int kfilnd_tn_state_wait_comp(struct kfilnd_transaction *tn, ++ enum tn_events event, int status, ++ bool *tn_released) ++{ ++ int rc; ++ enum lnet_msg_hstatus hstatus; ++ ++ KFILND_TN_DEBUG(tn, "%s event status %d", tn_event_to_str(event), ++ status); ++ ++ switch (event) { ++ case TN_EVENT_TX_OK: ++ kfilnd_peer_alive(tn->tn_kp); ++ kfilnd_tn_timeout_enable(tn); ++ kfilnd_tn_state_change(tn, TN_STATE_WAIT_TAG_COMP); ++ break; ++ ++ case TN_EVENT_TAG_RX_OK: ++ kfilnd_tn_state_change(tn, TN_STATE_WAIT_SEND_COMP); ++ break; ++ ++ case TN_EVENT_TX_FAIL: ++ if (status == -ETIMEDOUT) ++ hstatus = LNET_MSG_STATUS_NETWORK_TIMEOUT; ++ else ++ hstatus = LNET_MSG_STATUS_REMOTE_ERROR; ++ ++ kfilnd_tn_status_update(tn, status, hstatus); ++ kfilnd_peer_tn_failed(tn->tn_kp, status); ++ ++ /* Need to cancel the tagged receive to prevent resources from ++ * being leaked. ++ */ ++ rc = kfilnd_tn_cancel_tag_recv(tn); ++ ++ switch (rc) { ++ /* Async cancel event will progress transaction. */ ++ case 0: ++ kfilnd_tn_status_update(tn, status, ++ LNET_MSG_STATUS_LOCAL_ERROR); ++ kfilnd_tn_state_change(tn, TN_STATE_FAIL); ++ return 0; ++ ++ /* Need to replay TN_EVENT_INIT_BULK event while in the ++ * TN_STATE_SEND_FAILED state. 
++ */ ++ case -EAGAIN: ++ KFILND_TN_DEBUG(tn, ++ "Need to replay cancel tagged recv"); ++ return -EAGAIN; ++ ++ default: ++ KFILND_TN_ERROR(tn, ++ "Unexpected error during cancel tagged receive: rc=%d", ++ rc); ++ LBUG(); ++ } ++ break; ++ ++ case TN_EVENT_TAG_RX_FAIL: ++ kfilnd_tn_status_update(tn, status, ++ LNET_MSG_STATUS_LOCAL_ERROR); ++ kfilnd_tn_state_change(tn, TN_STATE_FAIL); ++ break; ++ ++ default: ++ KFILND_TN_ERROR(tn, "Invalid %s event", tn_event_to_str(event)); ++ LBUG(); ++ } ++ ++ return 0; ++} ++ ++static int kfilnd_tn_state_wait_send_comp(struct kfilnd_transaction *tn, ++ enum tn_events event, int status, ++ bool *tn_released) ++{ ++ KFILND_TN_DEBUG(tn, "%s event status %d", tn_event_to_str(event), ++ status); ++ ++ if (event == TN_EVENT_TX_OK) { ++ kfilnd_peer_alive(tn->tn_kp); ++ kfilnd_tn_finalize(tn, tn_released); ++ } else { ++ KFILND_TN_ERROR(tn, "Invalid %s event", tn_event_to_str(event)); ++ LBUG(); ++ } ++ ++ return 0; ++} ++ ++static int kfilnd_tn_state_wait_tag_rma_comp(struct kfilnd_transaction *tn, ++ enum tn_events event, int status, ++ bool *tn_released) ++{ ++ enum lnet_msg_hstatus hstatus; ++ ++ KFILND_TN_DEBUG(tn, "%s event status %d", tn_event_to_str(event), ++ status); ++ ++ switch (event) { ++ case TN_EVENT_TAG_TX_OK: ++ kfilnd_peer_alive(tn->tn_kp); ++ break; ++ ++ case TN_EVENT_TAG_TX_FAIL: ++ if (status == -ETIMEDOUT) ++ hstatus = LNET_MSG_STATUS_NETWORK_TIMEOUT; ++ else ++ hstatus = LNET_MSG_STATUS_REMOTE_ERROR; ++ ++ kfilnd_tn_status_update(tn, status, hstatus); ++ kfilnd_peer_tn_failed(tn->tn_kp, status); ++ break; ++ ++ default: ++ KFILND_TN_ERROR(tn, "Invalid %s event", tn_event_to_str(event)); ++ LBUG(); ++ } ++ ++ kfilnd_tn_finalize(tn, tn_released); ++ ++ return 0; ++} ++ ++static int kfilnd_tn_state_wait_tag_comp(struct kfilnd_transaction *tn, ++ enum tn_events event, int status, ++ bool *tn_released) ++{ ++ int rc; ++ enum lnet_msg_hstatus hstatus; ++ ++ KFILND_TN_DEBUG(tn, "%s event status %d", 
tn_event_to_str(event), ++ status); ++ ++ switch (event) { ++ case TN_EVENT_TAG_RX_FAIL: ++ case TN_EVENT_TAG_RX_OK: ++ /* Status can be set for both TN_EVENT_TAG_RX_FAIL and ++ * TN_EVENT_TAG_RX_OK. For TN_EVENT_TAG_RX_OK, if status is set, ++ * LNet target returned -ENODATA. ++ */ ++ if (status) { ++ if (event == TN_EVENT_TAG_RX_FAIL) ++ kfilnd_tn_status_update(tn, status, ++ LNET_MSG_STATUS_LOCAL_ERROR); ++ else ++ kfilnd_tn_status_update(tn, status, ++ LNET_MSG_STATUS_OK); ++ } ++ ++ if (!kfilnd_tn_timeout_cancel(tn)) { ++ kfilnd_tn_state_change(tn, TN_STATE_WAIT_TIMEOUT_COMP); ++ return 0; ++ } ++ break; ++ ++ case TN_EVENT_TIMEOUT: ++ /* Need to cancel the tagged receive to prevent resources from ++ * being leaked. ++ */ ++ rc = kfilnd_tn_cancel_tag_recv(tn); ++ ++ switch (rc) { ++ /* Async cancel event will progress transaction. */ ++ case 0: ++ kfilnd_tn_state_change(tn, ++ TN_STATE_WAIT_TIMEOUT_TAG_COMP); ++ return 0; ++ ++ /* Need to replay TN_EVENT_INIT_BULK event while in the ++ * TN_STATE_WAIT_TAG_COMP state. 
++ */ ++ case -EAGAIN: ++ KFILND_TN_DEBUG(tn, ++ "Need to replay cancel tagged recv"); ++ return -EAGAIN; ++ ++ default: ++ KFILND_TN_ERROR(tn, ++ "Unexpected error during cancel tagged receive: rc=%d", ++ rc); ++ LBUG(); ++ } ++ break; ++ ++ case TN_EVENT_TAG_TX_FAIL: ++ if (status == -ETIMEDOUT) ++ hstatus = LNET_MSG_STATUS_NETWORK_TIMEOUT; ++ else ++ hstatus = LNET_MSG_STATUS_REMOTE_ERROR; ++ ++ kfilnd_tn_status_update(tn, status, hstatus); ++ kfilnd_peer_tn_failed(tn->tn_kp, status); ++ break; ++ ++ case TN_EVENT_TAG_TX_OK: ++ kfilnd_peer_alive(tn->tn_kp); ++ break; ++ ++ default: ++ KFILND_TN_ERROR(tn, "Invalid %s event", tn_event_to_str(event)); ++ LBUG(); ++ } ++ ++ kfilnd_tn_finalize(tn, tn_released); ++ ++ return 0; ++} ++ ++static int kfilnd_tn_state_fail(struct kfilnd_transaction *tn, ++ enum tn_events event, int status, ++ bool *tn_released) ++{ ++ KFILND_TN_DEBUG(tn, "%s event status %d", tn_event_to_str(event), ++ status); ++ ++ switch (event) { ++ case TN_EVENT_TX_FAIL: ++ kfilnd_peer_tn_failed(tn->tn_kp, status); ++ break; ++ ++ case TN_EVENT_TX_OK: ++ kfilnd_peer_alive(tn->tn_kp); ++ break; ++ ++ case TN_EVENT_TAG_RX_FAIL: ++ case TN_EVENT_TAG_RX_CANCEL: ++ break; ++ ++ default: ++ KFILND_TN_ERROR(tn, "Invalid %s event", tn_event_to_str(event)); ++ LBUG(); ++ } ++ ++ kfilnd_tn_finalize(tn, tn_released); ++ ++ return 0; ++} ++ ++static int kfilnd_tn_state_wait_timeout_tag_comp(struct kfilnd_transaction *tn, ++ enum tn_events event, ++ int status, bool *tn_released) ++{ ++ KFILND_TN_DEBUG(tn, "%s event status %d", tn_event_to_str(event), ++ status); ++ ++ switch (event) { ++ case TN_EVENT_TAG_RX_CANCEL: ++ kfilnd_tn_status_update(tn, -ETIMEDOUT, ++ LNET_MSG_STATUS_REMOTE_TIMEOUT); ++ kfilnd_peer_tn_failed(tn->tn_kp, -ETIMEDOUT); ++ break; ++ ++ case TN_EVENT_TAG_RX_FAIL: ++ kfilnd_tn_status_update(tn, status, ++ LNET_MSG_STATUS_LOCAL_ERROR); ++ break; ++ ++ case TN_EVENT_TAG_RX_OK: ++ break; ++ ++ default: ++ KFILND_TN_ERROR(tn, "Invalid %s event", 
tn_event_to_str(event)); ++ LBUG(); ++ } ++ ++ kfilnd_tn_finalize(tn, tn_released); ++ ++ return 0; ++} ++ ++static int kfilnd_tn_state_wait_timeout_comp(struct kfilnd_transaction *tn, ++ enum tn_events event, int status, ++ bool *tn_released) ++{ ++ KFILND_TN_DEBUG(tn, "%s event status %d", tn_event_to_str(event), ++ status); ++ ++ if (event == TN_EVENT_TIMEOUT) { ++ kfilnd_tn_finalize(tn, tn_released); ++ } else { ++ KFILND_TN_ERROR(tn, "Invalid %s event", tn_event_to_str(event)); ++ LBUG(); ++ } ++ ++ return 0; ++} ++ ++static int ++(* const kfilnd_tn_state_dispatch_table[TN_STATE_MAX])(struct kfilnd_transaction *tn, ++ enum tn_events event, ++ int status, ++ bool *tn_released) = { ++ [TN_STATE_IDLE] = kfilnd_tn_state_idle, ++ [TN_STATE_WAIT_TAG_COMP] = kfilnd_tn_state_wait_tag_comp, ++ [TN_STATE_IMM_SEND] = kfilnd_tn_state_imm_send, ++ [TN_STATE_TAGGED_RECV_POSTED] = kfilnd_tn_state_tagged_recv_posted, ++ [TN_STATE_SEND_FAILED] = kfilnd_tn_state_send_failed, ++ [TN_STATE_WAIT_COMP] = kfilnd_tn_state_wait_comp, ++ [TN_STATE_WAIT_TIMEOUT_COMP] = kfilnd_tn_state_wait_timeout_comp, ++ [TN_STATE_WAIT_SEND_COMP] = kfilnd_tn_state_wait_send_comp, ++ [TN_STATE_WAIT_TIMEOUT_TAG_COMP] = ++ kfilnd_tn_state_wait_timeout_tag_comp, ++ [TN_STATE_FAIL] = kfilnd_tn_state_fail, ++ [TN_STATE_IMM_RECV] = kfilnd_tn_state_imm_recv, ++ [TN_STATE_WAIT_TAG_RMA_COMP] = kfilnd_tn_state_wait_tag_rma_comp, ++}; ++ ++/** ++ * kfilnd_tn_event_handler() - Update transaction state machine with an event. ++ * @tn: Transaction to be updated. ++ * @event: Transaction event. ++ * @status: Errno status associated with the event. ++ * ++ * When the transaction event handler is first called on a new transaction, the ++ * transaction is now own by the transaction system. This means that will be ++ * freed by the system as the transaction is progressed through the state ++ * machine. 
++ */ ++void kfilnd_tn_event_handler(struct kfilnd_transaction *tn, ++ enum tn_events event, int status) ++{ ++ bool tn_released = false; ++ int rc; ++ ++ if (!tn) ++ return; ++ ++ mutex_lock(&tn->tn_lock); ++ rc = kfilnd_tn_state_dispatch_table[tn->tn_state](tn, event, status, ++ &tn_released); ++ if (rc == -EAGAIN) { ++ tn->replay_event = event; ++ tn->replay_status = status; ++ kfilnd_ep_queue_tn_replay(tn->tn_ep, tn); ++ } ++ ++ if (!tn_released) ++ mutex_unlock(&tn->tn_lock); ++} ++ ++/** ++ * kfilnd_tn_free() - Free a transaction. ++ */ ++void kfilnd_tn_free(struct kfilnd_transaction *tn) ++{ ++ spin_lock(&tn->tn_ep->tn_list_lock); ++ list_del(&tn->tn_entry); ++ spin_unlock(&tn->tn_ep->tn_list_lock); ++ ++ KFILND_TN_DEBUG(tn, "Transaction freed"); ++ ++ if (tn->tn_mr_key) ++ kfilnd_ep_put_key(tn->tn_ep, tn->tn_mr_key); ++ ++ /* Free send message buffer if needed. */ ++ if (tn->tn_tx_msg.msg) ++ kmem_cache_free(imm_buf_cache, tn->tn_tx_msg.msg); ++ ++ kmem_cache_free(tn_cache, tn); ++} ++ ++/** ++ * kfilnd_tn_alloc() - Allocate a new KFI LND transaction. ++ * @dev: KFI LND device used to look the KFI LND endpoint to associate with the ++ * transaction. ++ * @cpt: CPT of the transaction. ++ * @target_nid: Target NID of the transaction. ++ * @alloc_msg: Allocate an immediate message for the transaction. ++ * @is_initiator: Is initiator of LNet transaction. ++ * @key: Is transaction memory region key need. ++ * ++ * During transaction allocation, each transaction is associated with a KFI LND ++ * endpoint use to post data transfer operations. The CPT argument is used to ++ * lookup the KFI LND endpoint within the KFI LND device. ++ * ++ * Return: On success, valid pointer. Else, negative errno pointer. 
++ */ ++struct kfilnd_transaction *kfilnd_tn_alloc(struct kfilnd_dev *dev, int cpt, ++ lnet_nid_t target_nid, ++ bool alloc_msg, bool is_initiator, ++ bool key) ++{ ++ struct kfilnd_transaction *tn; ++ struct kfilnd_peer *kp; ++ int rc; ++ ++ if (!dev) { ++ rc = -EINVAL; ++ goto err; ++ } ++ ++ kp = kfilnd_peer_get(dev, target_nid); ++ if (IS_ERR(kp)) { ++ rc = PTR_ERR(kp); ++ goto err; ++ } ++ ++ tn = kfilnd_tn_alloc_for_peer(dev, cpt, kp, alloc_msg, is_initiator, ++ key); ++ if (IS_ERR(tn)) { ++ rc = PTR_ERR(tn); ++ kfilnd_peer_put(kp); ++ goto err; ++ } ++ ++ return tn; ++ ++err: ++ return ERR_PTR(rc); ++} ++ ++/* See kfilnd_tn_alloc() ++ * Note: Caller must have a reference on @kp ++ */ ++struct kfilnd_transaction *kfilnd_tn_alloc_for_peer(struct kfilnd_dev *dev, ++ int cpt, ++ struct kfilnd_peer *kp, ++ bool alloc_msg, ++ bool is_initiator, ++ bool key) ++{ ++ struct kfilnd_transaction *tn; ++ struct kfilnd_ep *ep; ++ int rc; ++ ktime_t tn_alloc_ts; ++ ++ if (!dev) { ++ rc = -EINVAL; ++ goto err; ++ } ++ ++ tn_alloc_ts = ktime_get(); ++ ++ /* If the CPT does not fall into the LNet NI CPT range, force the CPT ++ * into the LNet NI CPT range. This should never happen. 
++ */ ++ ep = dev->cpt_to_endpoint[cpt]; ++ if (!ep) { ++ CWARN("%s used invalid cpt=%d\n", ++ libcfs_nidstr(&dev->kfd_ni->ni_nid), cpt); ++ ep = dev->kfd_endpoints[0]; ++ } ++ ++ tn = kmem_cache_zalloc(tn_cache, GFP_KERNEL); ++ if (!tn) { ++ rc = -ENOMEM; ++ goto err; ++ } ++ ++ if (alloc_msg) { ++ tn->tn_tx_msg.msg = kmem_cache_alloc(imm_buf_cache, GFP_KERNEL); ++ if (!tn->tn_tx_msg.msg) { ++ rc = -ENOMEM; ++ goto err_free_tn; ++ } ++ } ++ ++ if (key) { ++ rc = kfilnd_ep_get_key(ep); ++ if (rc < 0) ++ goto err_free_tn; ++ tn->tn_mr_key = rc; ++ } ++ ++ tn->tn_kp = kp; ++ refcount_inc(&kp->kp_cnt); ++ ++ mutex_init(&tn->tn_lock); ++ tn->tn_ep = ep; ++ tn->tn_response_rx = ep->end_context_id; ++ tn->tn_state = TN_STATE_IDLE; ++ tn->hstatus = LNET_MSG_STATUS_OK; ++ tn->deadline = ktime_get_seconds() + lnet_get_lnd_timeout(); ++ tn->tn_replay_deadline = ktime_sub(tn->deadline, ++ (lnet_get_lnd_timeout() / 2)); ++ tn->is_initiator = is_initiator; ++ INIT_WORK(&tn->timeout_work, kfilnd_tn_timeout_work); ++ ++ /* Add the transaction to an endpoint. This is like ++ * incrementing a ref counter. ++ */ ++ spin_lock(&ep->tn_list_lock); ++ list_add_tail(&tn->tn_entry, &ep->tn_list); ++ spin_unlock(&ep->tn_list_lock); ++ ++ tn->tn_alloc_ts = tn_alloc_ts; ++ tn->tn_state_ts = ktime_get(); ++ ++ KFILND_EP_DEBUG(ep, "Transaction ID %u allocated", tn->tn_mr_key); ++ ++ return tn; ++ ++err_free_tn: ++ if (tn->tn_tx_msg.msg) ++ kmem_cache_free(imm_buf_cache, tn->tn_tx_msg.msg); ++ kmem_cache_free(tn_cache, tn); ++err: ++ return ERR_PTR(rc); ++} ++ ++/** ++ * kfilnd_tn_cleanup() - Cleanup KFI LND transaction system. ++ * ++ * This function should only be called when there are no outstanding ++ * transactions. ++ */ ++void kfilnd_tn_cleanup(void) ++{ ++ kmem_cache_destroy(imm_buf_cache); ++ kmem_cache_destroy(tn_cache); ++} ++ ++/** ++ * kfilnd_tn_init() - Initialize KFI LND transaction system. ++ * ++ * Return: On success, zero. Else, negative errno. 
++ */ ++int kfilnd_tn_init(void) ++{ ++ tn_cache = kmem_cache_create("kfilnd_tn", ++ sizeof(struct kfilnd_transaction), 0, ++ SLAB_HWCACHE_ALIGN, NULL); ++ if (!tn_cache) ++ goto err; ++ ++ imm_buf_cache = kmem_cache_create("kfilnd_imm_buf", ++ KFILND_IMMEDIATE_MSG_SIZE, 0, ++ SLAB_HWCACHE_ALIGN, NULL); ++ if (!imm_buf_cache) ++ goto err_tn_cache_destroy; ++ ++ return 0; ++ ++err_tn_cache_destroy: ++ kmem_cache_destroy(tn_cache); ++err: ++ return -ENOMEM; ++} ++ ++/** ++ * kfilnd_tn_set_kiov_buf() - Set the buffer used for a transaction. ++ * @tn: Transaction to have buffer set. ++ * @kiov: LNet KIOV buffer. ++ * @num_iov: Number of IOVs. ++ * @offset: Offset into IOVs where the buffer starts. ++ * @len: Length of the buffer. ++ * ++ * This function takes the user provided IOV, offset, and len, and sets the ++ * transaction buffer. The user provided IOV is an LNet KIOV. When the ++ * transaction buffer is configured, the user provided offset is applied ++ * when the transaction buffer is configured (i.e. the transaction buffer ++ * offset is zero). ++ */ ++int kfilnd_tn_set_kiov_buf(struct kfilnd_transaction *tn, ++ struct bio_vec *kiov, size_t num_iov, ++ size_t offset, size_t len) ++{ ++ size_t i; ++ size_t cur_len = 0; ++ size_t cur_offset = offset; ++ size_t cur_iov = 0; ++ size_t tmp_len; ++ size_t tmp_offset; ++ ++ for (i = 0; (i < num_iov) && (cur_len < len); i++) { ++ /* Skip KIOVs until a KIOV with a length less than the current ++ * offset is found. 
++ */ ++ if (kiov[i].bv_len <= cur_offset) { ++ cur_offset -= kiov[i].bv_len; ++ continue; ++ } ++ ++ tmp_len = kiov[i].bv_len - cur_offset; ++ tmp_offset = kiov[i].bv_len - tmp_len + kiov[i].bv_offset; ++ ++ if (tmp_len + cur_len > len) ++ tmp_len = len - cur_len; ++ ++ /* tn_kiov is an array of size LNET_MAX_IOV */ ++ if (cur_iov >= LNET_MAX_IOV) ++ return -EINVAL; ++ ++ tn->tn_kiov[cur_iov].bv_page = kiov[i].bv_page; ++ tn->tn_kiov[cur_iov].bv_len = tmp_len; ++ tn->tn_kiov[cur_iov].bv_offset = tmp_offset; ++ ++ cur_iov++; ++ cur_len += tmp_len; ++ cur_offset = 0; ++ } ++ ++ tn->tn_num_iovec = cur_iov; ++ tn->tn_nob = cur_len; ++ ++ return 0; ++} +diff --git a/lnet/lnet/net_fault.c b/lnet/lnet/net_fault.c +index 0a201a0b3c..98bdb1664f 100644 +--- a/lnet/lnet/net_fault.c ++++ b/lnet/lnet/net_fault.c +@@ -649,7 +649,7 @@ delayed_msg_check(struct lnet_delay_rule *rule, bool all, + } + + if (list_empty(&rule->dl_msg_list)) { +- del_timer(&rule->dl_timer); ++ timer_delete(&rule->dl_timer); + rule->dl_msg_send = -1; + + } else if (!list_empty(msg_list)) { +@@ -936,7 +936,7 @@ lnet_delay_rule_del(lnet_nid_t src, lnet_nid_t dst, bool shutdown) + list_for_each_entry_safe(rule, tmp, &rule_list, dl_link) { + list_del_init(&rule->dl_link); + +- del_timer_sync(&rule->dl_timer); ++ timer_delete_sync(&rule->dl_timer); + delayed_msg_check(rule, true, &msg_list); + delay_rule_decref(rule); /* -1 for the_lnet.ln_delay_rules */ + n++; +diff --git a/lustre/ldlm/ldlm_lockd.c b/lustre/ldlm/ldlm_lockd.c +index 7e59709ea6..7a42d5d3df 100644 +--- a/lustre/ldlm/ldlm_lockd.c ++++ b/lustre/ldlm/ldlm_lockd.c +@@ -511,7 +511,7 @@ static int __ldlm_del_waiting_lock(struct ldlm_lock *lock) + /* Removing the head of the list, adjust timer. */ + if (list_next == &waiting_locks_list) { + /* No more, just cancel. 
*/ +- del_timer(&waiting_locks_timer); ++ timer_delete(&waiting_locks_timer); + } else { + time64_t now = ktime_get_seconds(); + struct ldlm_lock *next; +diff --git a/lustre/lod/lod_qos.c b/lustre/lod/lod_qos.c +index 6447dd90db..a4ba6786b0 100644 +--- a/lustre/lod/lod_qos.c ++++ b/lustre/lod/lod_qos.c +@@ -1488,7 +1488,7 @@ static int lod_ost_alloc_qos(const struct lu_env *env, struct lod_object *lo, + /* Do actual allocation, use write lock here. */ + rc = down_write_killable(&lod->lod_ost_descs.ltd_qos.lq_rw_sem); + +- del_singleshot_timer_sync(&timer.timer); ++ timer_delete_sync(&timer.timer); + kernel_sigaction(SIGKILL, SIG_IGN); + if (rc) { + flush_signals(current); +diff --git a/lustre/osp/osp_precreate.c b/lustre/osp/osp_precreate.c +index 5585ce1bf4..d9d64ff536 100644 +--- a/lustre/osp/osp_precreate.c ++++ b/lustre/osp/osp_precreate.c +@@ -239,7 +239,7 @@ static int osp_statfs_update(const struct lu_env *env, struct osp_device *d) + /* + * no updates till reply + */ +- del_timer(&d->opd_statfs_timer); ++ timer_delete(&d->opd_statfs_timer); + d->opd_statfs_fresh_till = ktime_add_ns(ktime_get(), expire); + d->opd_statfs_update_in_progress = 1; + +@@ -293,7 +293,7 @@ void osp_statfs_need_now(struct osp_device *d) + * is replied + */ + d->opd_statfs_fresh_till = ktime_sub_ns(ktime_get(), NSEC_PER_SEC); +- del_timer(&d->opd_statfs_timer); ++ timer_delete(&d->opd_statfs_timer); + wake_up(&d->opd_pre_waitq); + } + } +@@ -1837,7 +1837,7 @@ void osp_statfs_fini(struct osp_device *d) + struct task_struct *task = d->opd_pre_task; + ENTRY; + +- del_timer(&d->opd_statfs_timer); ++ timer_delete(&d->opd_statfs_timer); + + d->opd_pre_task = NULL; + if (task) +diff --git a/lustre/ptlrpc/gss/gss_keyring.c b/lustre/ptlrpc/gss/gss_keyring.c +index 124ebe1dc1..5d4e3c19a0 100644 +--- a/lustre/ptlrpc/gss/gss_keyring.c ++++ b/lustre/ptlrpc/gss/gss_keyring.c +@@ -149,7 +149,7 @@ void ctx_clear_timer_kr(struct ptlrpc_cli_ctx *ctx) + + CDEBUG(D_SEC, "ctx %p, key %p\n", ctx, 
gctx_kr->gck_key); + +- del_singleshot_timer_sync(timer); ++ timer_delete_sync(timer); + } + + static +diff --git a/lustre/ptlrpc/service.c b/lustre/ptlrpc/service.c +index 16fcaba8f0..366dc5a20b 100644 +--- a/lustre/ptlrpc/service.c ++++ b/lustre/ptlrpc/service.c +@@ -1248,7 +1248,7 @@ static void ptlrpc_at_set_timer(struct ptlrpc_service_part *svcpt) + time64_t next; + + if (array->paa_count == 0) { +- del_timer(&svcpt->scp_at_timer); ++ timer_delete(&svcpt->scp_at_timer); + return; + } + +@@ -3390,7 +3390,7 @@ ptlrpc_service_del_atimer(struct ptlrpc_service *svc) + /* early disarm AT timer... */ + ptlrpc_service_for_each_part(svcpt, i, svc) { + if (svcpt->scp_service != NULL) +- del_timer(&svcpt->scp_at_timer); ++ timer_delete(&svcpt->scp_at_timer); + } + } + +@@ -3558,7 +3558,7 @@ ptlrpc_service_free(struct ptlrpc_service *svc) + break; + + /* In case somebody rearmed this in the meantime */ +- del_timer(&svcpt->scp_at_timer); ++ timer_delete(&svcpt->scp_at_timer); + array = &svcpt->scp_at_array; + + if (array->paa_reqs_array != NULL) { +-- +2.33.0 + diff --git a/0052-LU-16541-tests-Improve-test-64f.patch b/0052-LU-16541-tests-Improve-test-64f.patch new file mode 100644 index 0000000000000000000000000000000000000000..be634137910bfe679b129667f6baab841ba3265c --- /dev/null +++ b/0052-LU-16541-tests-Improve-test-64f.patch @@ -0,0 +1,61 @@ +From 53e17f37ab36d4c70cff671115b6dfc5cf9b44db Mon Sep 17 00:00:00 2001 +From: Patrick Farrell +Date: Tue, 22 Aug 2023 12:32:52 -0400 +Subject: [PATCH 52/61] LU-16541 tests: Improve test 64f + +The buffered IO part of test 64f has several timing related +holes and other oddities. The use of multiop in the +background does not guarantee the RPC will not be sent, AND +the test doesn't kill it correctly. + +Clean this up and make a more reliable version of the test. +Hopefully this will resolve the failure issues, if not, a +better version of the test will allow debugging. 
+ +Test-Parameters: trivial +Test-Parameters: testlist=sanity envdefinitions=ONLY=64f,ONLY_REPEAT=20 +Test-Parameters: clientarch=aarch64 testlist=sanity envdefinitions=ONLY=64f,ONLY_REPEAT=20 +Signed-off-by: Patrick Farrell +Change-Id: I25b825e1d9d516635ef8cbd26dd12809625c34df +Signed-off-by: Xinliang Liu +--- + lustre/tests/sanity.sh | 17 +++++++++++------ + 1 file changed, 11 insertions(+), 6 deletions(-) + +diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh +index 7b4d43ef12..c2693abd2d 100755 +--- a/lustre/tests/sanity.sh ++++ b/lustre/tests/sanity.sh +@@ -8953,18 +8953,23 @@ test_64f() { + + $LFS setstripe -c 1 -i 0 $DIR/$tfile || error "lfs setstripe failed" + +- local cmd="oO_WRONLY:w${write_bytes}_yc" ++ # Testing that buffered IO consumes grant on the client + +- $MULTIOP $DIR/$tfile $cmd & +- MULTIPID=$! +- sleep 1 ++ # Delay the RPC on the server so it's guaranteed to not complete even ++ # if the RPC is sent from the client ++ #define OBD_FAIL_PTLRPC_PAUSE_REQ 0x50a ++ $LCTL set_param fail_loc=0x50a fail_val=3 ++ dd if=/dev/zero of=$DIR/$tfile bs=$write_bytes count=1 conv=notrunc || ++ error "error writing to $DIR/$tfile with buffered IO" + + check_grants $osc_tgt $((init_grants - grants)) \ + "buffered io, not write rpc" + +- kill -USR1 $MULTIPID +- wait ++ # Clear the fail loc and do a sync on the client ++ $LCTL set_param fail_loc=0 fail_val=0 ++ sync + ++ # RPC is now known to have sent + check_grants $osc_tgt $((init_grants - grants + chunk)) \ + "buffered io, one RPC" + } +-- +2.33.0 + diff --git a/0053-LU-16788-tests-sanity-should-remove-temp-files.patch b/0053-LU-16788-tests-sanity-should-remove-temp-files.patch new file mode 100644 index 0000000000000000000000000000000000000000..888acce0f7bed8d38e7b0060bbc18e5b1e6e92a9 --- /dev/null +++ b/0053-LU-16788-tests-sanity-should-remove-temp-files.patch @@ -0,0 +1,237 @@ +From bc58052e3b1a1883e4a22eccdd43a3992b99ec42 Mon Sep 17 00:00:00 2001 +From: Alex Zhuravlev +Date: Mon, 1 May 2023 
18:05:35 +0300 +Subject: [PATCH 53/61] LU-16788 tests: sanity should remove temp files + +during the test to fit OSTSIZE + +Test-Parameters: trivial +Signed-off-by: Alex Zhuravlev +Change-Id: I2f1cfe0511061794d81d0349cf36a50f40470553 +Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/50819 +Reviewed-by: Arshad Hussain +Reviewed-by: Oleg Drokin +Reviewed-by: Andreas Dilger +Tested-by: jenkins +Tested-by: Maloo +Signed-off-by: Xinliang Liu +--- + lustre/tests/sanity.sh | 29 ++++++++++++++++++++++++----- + 1 file changed, 24 insertions(+), 5 deletions(-) + +diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh +index c2693abd2d..a004738552 100755 +--- a/lustre/tests/sanity.sh ++++ b/lustre/tests/sanity.sh +@@ -7094,6 +7094,7 @@ test_56w() { + local dir=$DIR/$tdir + + setup_56 $dir $NUMFILES $NUMDIRS "-c $OSTCOUNT" "-c1" ++ stack_trap "rm -rf $dir" + + local stripe_size=$($LFS getstripe -S -d $dir) || + error "$LFS getstripe -S -d $dir failed" +@@ -7695,6 +7696,7 @@ test_56xd() { + local layout_after; + + test_mkdir "$dir" || error "cannot create dir $dir" ++ stack_trap "rm -rf $dir" + $LFS setstripe $layout_yaml $f_yaml || + error "cannot setstripe $f_yaml with layout $layout_yaml" + $LFS getstripe --yaml $f_yaml > $yamlfile +@@ -7731,6 +7733,7 @@ test_56xe() { + local layout_after="" + + test_mkdir "$dir" || error "cannot create dir $dir" ++ stack_trap "rm -rf $dir" + $LFS setstripe $layout $f_comp || + error "cannot setstripe $f_comp with layout $layout" + layout_before=$(get_layout_param $f_comp) +@@ -7763,6 +7766,7 @@ test_56xf() { + local fid_after="" + + test_mkdir "$dir" || error "cannot create dir $dir" ++ stack_trap "rm -rf $dir" + $LFS setstripe $layout $f_comp || + error "cannot setstripe $f_comp with layout $layout" + fid_before=$($LFS getstripe --fid $f_comp) +@@ -7847,6 +7851,7 @@ test_56xg() { + # init the file to migrate + $LFS setstripe -c1 -i1 $DIR/$tfile || + error "Unable to create $tfile on OST1" ++ stack_trap "rm -f 
$DIR/$tfile" + dd if=/dev/urandom of=$DIR/$tfile bs=1M count=4 status=none || + error "Unable to write on $tfile" + +@@ -8884,6 +8889,7 @@ test_64e() { + local grants=$((wb_round_up + extent_tax)) + + $LFS setstripe -c 1 -i 0 $DIR/$tfile || error "lfs setstripe failed" ++ stack_trap "rm -f $DIR/$tfile" + + # define OBD_FAIL_TGT_NO_GRANT 0x725 + # make the server not grant more back +@@ -9042,6 +9048,7 @@ test_64h() { + $LCTL set_param osc.*OST0000*.grant_shrink_interval=10 + + $LFS setstripe -c 1 -i 0 $DIR/$tfile ++ stack_trap "rm -f $DIR/$tfile" + dd if=/dev/zero of=$DIR/$tfile bs=1M count=10 oflag=sync + + # drop cache so that coming read would do rpc +@@ -9081,6 +9088,7 @@ test_64i() { + remote_ost_nodsh && skip "remote OSTs with nodsh" + + $LFS setstripe -c 1 -i 0 $DIR/$tfile ++ stack_trap "rm -f $DIR/$tfile" + + dd if=/dev/zero of=$DIR/$tfile bs=1M count=64 + +@@ -9527,6 +9535,7 @@ test_69() { + + f="$DIR/$tfile" + $LFS setstripe -c 1 -i 0 $f ++ stack_trap "rm -f $f ${f}.2" + + $DIRECTIO write ${f}.2 0 1 || error "directio write error" + +@@ -9544,7 +9553,6 @@ test_69() { + $DIRECTIO read $f 1 1 && error "read succeeded, expect -ENOENT" + + do_facet ost1 lctl set_param fail_loc=0 +- rm -f $f + } + run_test 69 "verify oa2dentry return -ENOENT doesn't LBUG ======" + +@@ -11518,6 +11526,7 @@ test_103b() { + declare -a pids + local U + ++ stack_trap "rm -f $DIR/$tfile.*" + for U in {0..511}; do + { + local O=$(printf "%04o" $U) +@@ -13697,6 +13706,7 @@ OLDIFS="$IFS" + cleanup_130() { + trap 0 + IFS="$OLDIFS" ++ rm -f $DIR/$tfile + } + + test_130a() { +@@ -13942,6 +13952,7 @@ test_130e() { + + local fm_file=$DIR/$tfile + $LFS setstripe -S 131072 -c 2 $fm_file || error "setstripe on $fm_file" ++ stack_trap "rm -f $fm_file" + + NUM_BLKS=512 + EXPECTED_LEN=$(( (NUM_BLKS / 2) * 64 )) +@@ -14793,7 +14804,7 @@ test_150d() { + + [[ "x$DOM" == "xyes" ]] && striping="-L mdt" + +- stack_trap "rm -f $DIR/$tfile; wait_delete_completed" ++ stack_trap "rm -f $DIR/$tdir; 
wait_delete_completed" + $LFS setstripe -E1M $striping -E eof -c $OSTCOUNT -S1M $DIR/$tdir || + error "setstripe failed" + fallocate -o 1G -l ${OSTCOUNT}m $DIR/$tdir || error "fallocate failed" +@@ -18305,14 +18316,12 @@ test_187a() { + + local file=$dir0/file1 + dd if=/dev/urandom of=$file count=10 bs=1M conv=fsync ++ stack_trap "rm -f $file" + local dv1=$($LFS data_version $file) + dd if=/dev/urandom of=$file seek=10 count=1 bs=1M conv=fsync + local dv2=$($LFS data_version $file) + [[ $dv1 != $dv2 ]] || + error "data version did not change on write $dv1 == $dv2" +- +- # clean up +- rm -f $file1 + } + run_test 187a "Test data version change" + +@@ -19331,6 +19340,7 @@ test_224c() { # LU-6441 + #define OBD_FAIL_PTLRPC_CLIENT_BULK_CB3 0x520 + do_facet ost1 "$LCTL set_param fail_loc=0x520" + $LFS setstripe -c 1 -i 0 $DIR/$tfile ++ stack_trap "rm -f $DIR/$tfile" + dd if=/dev/zero of=$DIR/$tfile bs=8MB count=1 + sync + do_facet ost1 "$LCTL set_param fail_loc=0" +@@ -20714,6 +20724,7 @@ test_231a() + mkdir -p $DIR/$tdir + $LFS setstripe -S ${brw_size}M $DIR/$tdir || + error "failed to set stripe with -S ${brw_size}M option" ++ stack_trap "rm -rf $DIR/$tdir" + + # clear the OSC stats + $LCTL set_param osc.*.stats=0 &>/dev/null +@@ -20751,6 +20762,7 @@ run_test 231a "checking that reading/writing of BRW RPC size results in one RPC" + + test_231b() { + mkdir -p $DIR/$tdir ++ stack_trap "rm -rf $DIR/$tdir" + local i + for i in {0..1023}; do + dd if=/dev/zero of=$DIR/$tdir/$tfile conv=notrunc \ +@@ -20770,6 +20782,7 @@ test_232a() { + + # ignore dd failure + dd if=/dev/zero of=$DIR/$tdir/$tfile bs=1M count=1 || true ++ stack_trap "rm -f $DIR/$tdir/$tfile" + + do_facet ost1 $LCTL set_param fail_loc=0 + umount_client $MOUNT || error "umount failed" +@@ -20786,6 +20799,7 @@ test_232b() { + mkdir -p $DIR/$tdir + $LFS setstripe -c1 -i0 $DIR/$tdir/$tfile + dd if=/dev/zero of=$DIR/$tdir/$tfile bs=1M count=1 ++ stack_trap "rm -f $DIR/$tdir/$tfile" + sync + cancel_lru_locks osc + +@@ 
-21549,6 +21563,7 @@ test_253() { + wait_mds_ost_sync + wait_delete_completed + mkdir $DIR/$tdir ++ stack_trap "rm -rf $DIR/$tdir" + + pool_add $TESTNAME || error "Pool creation failed" + pool_add_targets $TESTNAME 0 || error "Pool add targets failed" +@@ -22856,6 +22871,7 @@ test_272b() { + local dom=$DIR/$tdir/dom + mkdir -p $DIR/$tdir + $LFS setstripe -E 1M -L mdt -E -1 -c1 $dom ++ stack_trap "rm -rf $DIR/$tdir" + + local mdtidx=$($LFS getstripe -m $dom) + local mdtname=MDT$(printf %04x $mdtidx) +@@ -22898,6 +22914,7 @@ test_272c() { + local dom=$DIR/$tdir/$tfile + mkdir -p $DIR/$tdir + $LFS setstripe -E 1M -L mdt -E -1 -c1 $dom ++ stack_trap "rm -rf $DIR/$tdir" + + local mdtidx=$($LFS getstripe -m $dom) + local mdtname=MDT$(printf %04x $mdtidx) +@@ -23085,6 +23102,7 @@ test_275() { + + dd if=/dev/urandom of=$file bs=1M count=2 || + error "failed to create a file" ++ stack_trap "rm -f $file" + cancel_lru_locks osc + + #lock 1 +@@ -24439,6 +24457,7 @@ test_398a() { # LU-4198 + cut -d'.' -f2) + + $LFS setstripe -c 1 -i 0 $DIR/$tfile ++ stack_trap "rm -f $DIR/$tfile" + $LCTL set_param ldlm.namespaces.*.lru_size=clear + + # request a new lock on client +-- +2.33.0 + diff --git a/0054-LU-14992-tests-sanity-replay-vbr-mkdir-on-MDT0.patch b/0054-LU-14992-tests-sanity-replay-vbr-mkdir-on-MDT0.patch new file mode 100644 index 0000000000000000000000000000000000000000..66ec0a25a3813268d32a6492c693ab171071b736 --- /dev/null +++ b/0054-LU-14992-tests-sanity-replay-vbr-mkdir-on-MDT0.patch @@ -0,0 +1,152 @@ +From 24eaacb00262fe64001255f274f28e8a3837349a Mon Sep 17 00:00:00 2001 +From: James Nunez +Date: Mon, 13 Sep 2021 10:35:30 -0600 +Subject: [PATCH 54/61] LU-14992 tests: sanity/replay-vbr mkdir on MDT0 + +Replace mkdir with mkdir_on_mdt0() for sanity test 133a +and relay-vbr test 7a. These tests expect the newly +created directory is on MDT0. 
+ +Test-Parameters: trivial mdscount=2 mdtcount=4 testlist=sanity +Test-Parameters: env=SLOW=yes mdscount=2 mdtcount=4 testlist=replay-vbr +Signed-off-by: James Nunez +Change-Id: Icea2923a8d8d3a3aa0ddf0401f0a025480b2f6f0 +Reviewed-on: https://review.whamcloud.com/44902 +Tested-by: jenkins +Tested-by: Maloo +Reviewed-by: Kevin Zhao +Reviewed-by: Andreas Dilger +Reviewed-by: Oleg Drokin +Signed-off-by: Xinliang Liu +(cherry picked from commit f0324c5c2f4390d6d7e93ed799e95d8eef4704f4) +--- + lustre/tests/replay-vbr.sh | 26 +++++++++++------------ + lustre/tests/sanity.sh | 42 +++++++++++++++++++++++++------------- + 2 files changed, 41 insertions(+), 27 deletions(-) + +diff --git a/lustre/tests/replay-vbr.sh b/lustre/tests/replay-vbr.sh +index 4c0f697494..2c785c459e 100755 +--- a/lustre/tests/replay-vbr.sh ++++ b/lustre/tests/replay-vbr.sh +@@ -716,21 +716,21 @@ test_7_cycle() { + } + + test_7a() { +- first="createmany -o $DIR/$tdir/$tfile- 1" +- lost="rm $MOUNT2/$tdir/$tfile-0" +- last="createmany -o $DIR/$tdir/$tfile- 1" +- test_7_cycle "$first" "$lost" "$last" || error "Test 7a.1 failed" ++ first="createmany -o $DIR/$tdir/$tfile- 1" ++ lost="rm $MOUNT2/$tdir/$tfile-0" ++ last="createmany -o $DIR/$tdir/$tfile- 1" ++ test_7_cycle "$first" "$lost" "$last" || error "Test 7a.1 failed" + +- first="createmany -o $DIR/$tdir/$tfile- 1" +- lost="rm $MOUNT2/$tdir/$tfile-0" +- last="mkdir $DIR/$tdir/$tfile-0" +- test_7_cycle "$first" "$lost" "$last" || error "Test 7a.2 failed" ++ first="createmany -o $DIR/$tdir/$tfile- 1" ++ lost="rm $MOUNT2/$tdir/$tfile-0" ++ last="$LFS mkdir -i 0 -c 1 $DIR/$tdir/$tfile-0" ++ test_7_cycle "$first" "$lost" "$last" || error "Test 7a.2 failed" + +- first="mkdir $DIR/$tdir/$tfile-0" +- lost="mv $MOUNT2/$tdir/$tfile-0 $MOUNT2/$tdir/$tfile-1" +- last="createmany -o $DIR/$tdir/$tfile- 1" +- test_7_cycle "$first" "$lost" "$last" || error "Test 7a.3 failed" +- return 0 ++ first="mkdir $DIR/$tdir/$tfile-0" ++ lost="mv $MOUNT2/$tdir/$tfile-0 
$MOUNT2/$tdir/$tfile-1" ++ last="createmany -o $DIR/$tdir/$tfile- 1" ++ test_7_cycle "$first" "$lost" "$last" || error "Test 7a.3 failed" ++ return 0 + } + run_test 7a "create, {lost}, create" + +diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh +index a004738552..3ebb944ab2 100755 +--- a/lustre/tests/sanity.sh ++++ b/lustre/tests/sanity.sh +@@ -14092,21 +14092,31 @@ check_stats() { + local want=${3:-0} + local res + ++ # open 11 samples [usecs] 468 4793 13658 35791898 + case $facet in +- mds*) res=$(do_facet $facet \ +- $LCTL get_param mdt.$FSNAME-MDT0000.md_stats | grep "$op") ++ mds*) res=($(do_facet $facet \ ++ $LCTL get_param mdt.$FSNAME-MDT0000.md_stats | grep "$op")) + ;; +- ost*) res=$(do_facet $facet \ +- $LCTL get_param obdfilter.$FSNAME-OST0000.stats | grep "$op") ++ ost*) res=($(do_facet $facet \ ++ $LCTL get_param obdfilter.$FSNAME-OST0000.stats | grep "$op")) + ;; + *) error "Wrong facet '$facet'" ;; + esac +- [ "$res" ] || error "The counter for $op on $facet was not incremented" +- # if the argument $3 is zero, it means any stat increment is ok. +- if [[ $want -gt 0 ]]; then +- local count=$(echo $res | awk '{ print $2 }') +- [[ $count -ne $want ]] && ++ [[ -n "$res" ]] || error "counter for $op on $facet not incremented" ++ # if $want is zero, it means any stat increment is ok. ++ if (( $want > 0 )); then ++ local count=${res[1]} ++ ++ if (( $count != $want )); then ++ if [[ $facet =~ "mds" ]]; then ++ do_nodes $(comma_list $(mdts_nodes)) \ ++ $LCTL get_param mdt.*.md_stats ++ else ++ do_nodes $(comma_list $(osts_nodes)) \ ++ $LCTL get_param obdfilter.*.stats ++ fi + error "The $op counter on $facet is $count, not $want" ++ fi + fi + } + +@@ -14126,8 +14136,11 @@ test_133a() { + do_facet ost1 $LCTL set_param obdfilter.*.stats=clear + + # verify mdt stats first. 
+- mkdir ${testdir} || error "mkdir failed" ++ mkdir_on_mdt0 ${testdir} || error "mkdir_on_mdt0 failed" + check_stats $SINGLEMDS "mkdir" 1 ++ ++ # clear "open" from "lfs mkdir" above ++ do_facet $SINGLEMDS $LCTL set_param mdt.*.md_stats=clear + touch ${testdir}/${tfile} || error "touch failed" + check_stats $SINGLEMDS "open" 1 + check_stats $SINGLEMDS "close" 1 +@@ -14167,7 +14180,8 @@ test_133b() { + + local testdir=$DIR/${tdir}/stats_testdir + +- mkdir -p ${testdir} || error "mkdir failed" ++ mkdir -p $DIR/$tdir || error "mkdir $tdir failed" ++ mkdir_on_mdt0 ${testdir} || error "mkdir_on_mdt0 failed" + touch ${testdir}/${tfile} || error "touch failed" + cancel_lru_locks mdc + +@@ -14322,12 +14336,12 @@ test_133d() { + + local testdir1=$DIR/${tdir}/stats_testdir1 + local testdir2=$DIR/${tdir}/stats_testdir2 +- mkdir -p $DIR/${tdir} ++ mkdir -p $DIR/${tdir} || error "mkdir $tdir failed" + + do_facet $SINGLEMDS $LCTL set_param mdt.*.rename_stats=clear + +- lfs mkdir -i 0 -c 1 ${testdir1} || error "mkdir failed" +- lfs mkdir -i 0 -c 1 ${testdir2} || error "mkdir failed" ++ mkdir_on_mdt0 ${testdir1} || error "mkdir $testdir1 failed" ++ mkdir_on_mdt0 ${testdir2} || error "mkdir $testdir2 failed" + + createmany -o $testdir1/test 512 || error "createmany failed" + +-- +2.33.0 + diff --git a/0055-LU-14992-tests-add-more-mkdir_on_mdt0-calls.patch b/0055-LU-14992-tests-add-more-mkdir_on_mdt0-calls.patch new file mode 100644 index 0000000000000000000000000000000000000000..8d556ba513b1bcc772cec81357a5d2269671f3c5 --- /dev/null +++ b/0055-LU-14992-tests-add-more-mkdir_on_mdt0-calls.patch @@ -0,0 +1,47 @@ +From e1528526b18088a6c03af79b240e9b41e7cabfea Mon Sep 17 00:00:00 2001 +From: Mr NeilBrown +Date: Mon, 28 Nov 2022 07:49:50 +1100 +Subject: [PATCH 55/61] LU-14992 tests: add more mkdir_on_mdt0 calls + +A previous patch changed some mkdir calls in test_133a to +mkdir_on_mdt0. This allows stats collected from mdt0 to +reflect the mkdir. 
+ +However two mkdir calls were missed, so "crossdir_rename" stats can be +wrong. + +Test-Parameters: trivial mdscount=2 mdtcount=4 testlist=sanity env=ONLY=133a + +Fixes: f0324c5c2f ("LU-14992 tests: sanity/replay-vbr mkdir on MDT0") +Signed-off-by: Mr NeilBrown +Change-Id: I4e5c2e5504307462bff4012a13ef9deb24f8da8c +Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/49252 +Reviewed-by: Andreas Dilger +Reviewed-by: Oleg Drokin +Reviewed-by: Jian Yu +Tested-by: jenkins +Tested-by: Maloo +Signed-off-by: Xinliang Liu +(cherry picked from commit d56ea0c80a959ebd9b393f2da048cc179cb16127) +--- + lustre/tests/sanity.sh | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh +index 3ebb944ab2..5e52f9c868 100755 +--- a/lustre/tests/sanity.sh ++++ b/lustre/tests/sanity.sh +@@ -14158,8 +14158,8 @@ test_133a() { + check_stats $SINGLEMDS "rmdir" 1 + + local testdir1=$DIR/${tdir}/stats_testdir1 +- mkdir -p ${testdir} +- mkdir -p ${testdir1} ++ mkdir_on_mdt0 -p ${testdir} ++ mkdir_on_mdt0 -p ${testdir1} + touch ${testdir1}/test1 + mv ${testdir1}/test1 ${testdir} || error "file crossdir rename" + check_stats $SINGLEMDS "crossdir_rename" 1 +-- +2.33.0 + diff --git a/0056-LU-15816-tests-use-correct-ost-host-to-manage-failur.patch b/0056-LU-15816-tests-use-correct-ost-host-to-manage-failur.patch new file mode 100644 index 0000000000000000000000000000000000000000..afeb9a9ed0cab9f6403fe9abae1d4198cb0b352d --- /dev/null +++ b/0056-LU-15816-tests-use-correct-ost-host-to-manage-failur.patch @@ -0,0 +1,72 @@ +From d64f9f6849d46b98fc1a1fbe52484cf805baf6a4 Mon Sep 17 00:00:00 2001 +From: Mr NeilBrown +Date: Fri, 25 Nov 2022 16:13:20 +1100 +Subject: [PATCH 56/61] LU-15816 tests: use correct ost host to manage failure + +sanity test_398m sets up striping across 2 OSTs. It ensures that +failing IO to either OST individually will fail the total IO. 
+ +However it sends the command to fail IO for the second OST (OST1) to +the host managing the first OST (ost1). If the first 2 OSTs are on +the same host, this works. If not, it fails. + +Also the error messages when testing the second stripe say "first +stripe". + +Test-Parameters: trivial env=ONLY=398m +Signed-off-by: Mr NeilBrown +Change-Id: Ic7085dab2610fa2c044a966fd8de40def0438ca4 +Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/49248 +Reviewed-by: Andreas Dilger +Reviewed-by: James Simmons +Reviewed-by: Oleg Drokin +Tested-by: jenkins +Tested-by: Maloo +Signed-off-by: Xinliang Liu +(cherry picked from commit 6e66cbdb5c8c08193c36262649667747127b6d90) +--- + lustre/tests/sanity.sh | 11 ++++++----- + 1 file changed, 6 insertions(+), 5 deletions(-) + +diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh +index 5e52f9c868..76f261b2d1 100755 +--- a/lustre/tests/sanity.sh ++++ b/lustre/tests/sanity.sh +@@ -24839,6 +24839,7 @@ test_398m() { # LU-13798 + # Set up failure on OST0, the first stripe: + #define OBD_FAIL_OST_BRW_WRITE_BULK 0x20e + #NB: Fail val is ost # + 1, because we cannot use cfs_fail_val = 0 ++ # OST0 is on ost1, OST1 is on ost2. 
+ # So this fail_val specifies OST0 + do_facet ost1 $LCTL set_param fail_loc=0x20e fail_val=1 + stack_trap "do_facet ost1 $LCTL set_param fail_loc=0" +@@ -24864,13 +24865,13 @@ test_398m() { # LU-13798 + # Clear file contents, maintain striping + echo > $DIR/$tfile + # Set up failure on OST1, second stripe: +- do_facet ost1 $LCTL set_param fail_loc=0x20e fail_val=2 +- stack_trap "do_facet ost1 $LCTL set_param fail_loc=0" ++ do_facet ost2 $LCTL set_param fail_loc=0x20e fail_val=2 ++ stack_trap "do_facet ost2 $LCTL set_param fail_loc=0" + + dd if=/dev/urandom of=$DIR/$tfile bs=8M count=8 oflag=direct && +- error "parallel dio write with failure on first stripe succeeded" ++ error "parallel dio write with failure on second stripe succeeded" + stack_trap "rm -f $DIR/$tfile" +- do_facet ost1 $LCTL set_param fail_loc=0 fail_val=0 ++ do_facet ost2 $LCTL set_param fail_loc=0 fail_val=0 + + # Place data in file for read + dd if=/dev/urandom of=$DIR/$tfile bs=8M count=8 oflag=direct || +@@ -24880,7 +24881,7 @@ test_398m() { # LU-13798 + #define OBD_FAIL_OST_BRW_READ_BULK 0x20f + do_facet ost2 $LCTL set_param fail_loc=0x20f fail_val=2 + dd if=$DIR/$tfile of=$DIR/$tfile.2 bs=8M count=8 iflag=direct && +- error "parallel dio read with error on first stripe succeeded" ++ error "parallel dio read with error on second stripe succeeded" + rm -f $DIR/$tfile.2 + do_facet ost2 $LCTL set_param fail_loc=0 fail_val=0 + } +-- +2.33.0 + diff --git a/0057-LU-16571-utils-fix-parallel-lfs-migrate-b-on-hard-li.patch b/0057-LU-16571-utils-fix-parallel-lfs-migrate-b-on-hard-li.patch new file mode 100644 index 0000000000000000000000000000000000000000..c6a519dc3ede4d4f2ffbc2c72061d4144511604e --- /dev/null +++ b/0057-LU-16571-utils-fix-parallel-lfs-migrate-b-on-hard-li.patch @@ -0,0 +1,154 @@ +From 6b25f2b394f1d1e83963fa9dcba5a40015e672a4 Mon Sep 17 00:00:00 2001 +From: Etienne AUJAMES +Date: Wed, 22 Feb 2023 11:37:49 +0100 +Subject: [PATCH 57/61] LU-16571 utils: fix parallel "lfs migrate -b" on 
hard + links + +Multiple blocking "lfs migrate" on the same file can exhaust "ost" +service threads of an OSS CPT. + +llapi_get_data_version(...,LL_DV_RD_FLUSH) causes the OSS server to +take a server-side extent lock PR to force clients with write lock to +update the data version of the object. + +migrate_block() (lfs.c) checks the file data version with +LL_DV_RD_FLUSH before taking the group lock. +So "ofd_getattr_hdl()" server side lock will conflict with the lfs +instance that has the group lock. +Each attempt to get server-side extent lock will take an "ost" service +thread slot waiting for the group lock to be released. + +If all threads of the "ost" service are exhausted on a CPT, the OSS +can not handle requests from the client and they will get queued inside +the NRS policy. This causes the lfs process with the group lock to +hang (pread needs "ost" service to get sizes of objects). + +This patch checks the file data version inside the group lock without +LL_DV_RD_FLUSH. This flag is not needed, the client already has an +extent group lock on all the OST objects. + +Add the regression test sanity 56xj. 
+ +Lustre-change: https://review.whamcloud.com/50113 +Lustre-commit: 2310f4b8a6b6050cccedd4982ce80aa1cfbd3fe1 + +Test-Parameters: testlist=sanity env=ONLY=56xj,ONLY_REPEAT=20 +Test-Parameters: testlist=sanity env=ONLY=56 +Test-Parameters: testlist=sanity env=ONLY=56 +Test-Parameters: testlist=sanity env=ONLY=56 +Signed-off-by: Etienne AUJAMES +Change-Id: I0bacd372dd6f36a4ac776133dff45dc836c7c7f7 +Reviewed-by: Andreas Dilger +Reviewed-by: Patrick Farrell +--- + lustre/tests/sanity.sh | 38 ++++++++++++++++++++++++++++++++++++++ + lustre/utils/lfs.c | 35 +++++++++++++++++++---------------- + 2 files changed, 57 insertions(+), 16 deletions(-) + +diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh +index 76f261b2d1..70f43a2dcd 100755 +--- a/lustre/tests/sanity.sh ++++ b/lustre/tests/sanity.sh +@@ -7874,6 +7874,44 @@ test_56xg() { + } + run_test 56xg "lfs migrate pool support" + ++test_56xj() { # LU-16571 "lfs migrate -b" can cause thread starvation on OSS ++ (( $OSTCOUNT >= 2 )) || skip "needs >= 2 OSTs" ++ ++ local file=$DIR/$tfile ++ local linkdir=$DIR/$tdir ++ ++ test_mkdir $linkdir || error "fail to create $linkdir" ++ $LFS setstripe -i 0 -c 1 -S1M $file ++ dd if=/dev/urandom of=$file bs=1M count=10 || ++ error "fail to create $file" ++ ++ # Create file links ++ local cpts ++ local thread_max ++ local nlinks ++ ++ thread_max=$(do_facet ost1 "$LCTL get_param -n ost.OSS.ost.threads_max") ++ cpts=$(do_facet ost1 "$LCTL get_param -n cpu_partition_table | wc -l") ++ (( nlinks = thread_max * 3 / 2 / cpts)) ++ ++ echo "create $nlinks hard links of $file" ++ createmany -l $file $linkdir/link $nlinks ++ ++ # Parallel migrates (should not block) ++ local i ++ for ((i = 0; i < nlinks; i++)); do ++ echo $linkdir/link$i ++ done | xargs -n1 -P $nlinks $LFS migrate -c2 ++ ++ local stripe_count ++ stripe_count=$($LFS getstripe -c $file) || ++ error "fail to get stripe count on $file" ++ ++ ((stripe_count == 2)) || ++ error "fail to migrate $file (stripe_count = 
$stripe_count)" ++} ++run_test 56xj "lfs migrate -b should not cause starvation of threads on OSS" ++ + test_56y() { + [ $MDS1_VERSION -lt $(version_code 2.4.53) ] && + skip "No HSM $(lustre_build_version $SINGLEMDS) MDS < 2.4.53" +diff --git a/lustre/utils/lfs.c b/lustre/utils/lfs.c +index 7946d9e050..4ae2a5f098 100644 +--- a/lustre/utils/lfs.c ++++ b/lustre/utils/lfs.c +@@ -900,31 +900,34 @@ static int migrate_block(int fd, int fdv) + int rc; + int rc2; + +- rc = fstat(fd, &st); +- if (rc < 0) { +- error_loc = "cannot stat source file"; +- return -errno; +- } ++ do ++ gid = random(); ++ while (gid == 0); + +- rc = llapi_get_data_version(fd, &dv1, LL_DV_RD_FLUSH); ++ ++ /* The grouplock blocks all concurrent accesses to the file. */ ++ rc = llapi_group_lock(fd, gid); + if (rc < 0) { +- error_loc = "cannot get dataversion"; ++ error_loc = "cannot get group lock"; + return rc; + } + +- do +- gid = random(); +- while (gid == 0); ++ rc = fstat(fd, &st); ++ if (rc < 0) { ++ error_loc = "cannot stat source file"; ++ rc = -errno; ++ goto out_unlock; ++ } + + /* +- * The grouplock blocks all concurrent accesses to the file. +- * It has to be taken after llapi_get_data_version as it would +- * block it too. ++ * LL_DV_RD_FLUSH should not be set, otherwise the servers will try to ++ * get extent locks on the OST objects. This will conflict with our ++ * extent group locks. 
+ */ +- rc = llapi_group_lock(fd, gid); ++ rc = llapi_get_data_version(fd, &dv1, 0); + if (rc < 0) { +- error_loc = "cannot get group lock"; +- return rc; ++ error_loc = "cannot get dataversion"; ++ goto out_unlock; + } + + rc = migrate_copy_data(fd, fdv, NULL); +-- +2.33.0 + diff --git a/0058-LU-14073-ldiskfs-don-t-test-LDISKFS_IOC_FSSETXATTR.patch b/0058-LU-14073-ldiskfs-don-t-test-LDISKFS_IOC_FSSETXATTR.patch new file mode 100644 index 0000000000000000000000000000000000000000..9702f9bf13d762f7305a0967b47f977e515690d9 --- /dev/null +++ b/0058-LU-14073-ldiskfs-don-t-test-LDISKFS_IOC_FSSETXATTR.patch @@ -0,0 +1,45 @@ +From 827664003a452e249ed423cb2113dbbbae28e3a9 Mon Sep 17 00:00:00 2001 +From: Mr NeilBrown +Date: Fri, 9 Dec 2022 16:31:13 +1100 +Subject: [PATCH 58/61] LU-14073 ldiskfs: don't test LDISKFS_IOC_FSSETXATTR + +EXT4_IOC_FSSETXATTR was removed upstream in Linux 5.9, Commit +cb29a02d3a9d ("ext4: use generic names for generic ioctls"). +So we cannot use it to test if project quotas are supported. + +Instead test if EXT4_MAXQUOTAS is 3. This was changed to 3 upstream +in the commit immediately before EXT4_IOC_FSSETXATTR was added, so it +is effectively the same test. 
+ +Test-Parameters: trivial +Signed-off-by: Mr NeilBrown +Change-Id: I88c51c03959ebe98cd5066596f5158fac570a625 +Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/49353 +Tested-by: jenkins +Tested-by: Maloo +Reviewed-by: James Simmons +Reviewed-by: Andreas Dilger +Reviewed-by: Oleg Drokin +Signed-off-by: Xinliang Liu +(cherry picked from commit 40389067f5645fed903304d19dd6c58de72d0b88) +Signed-off-by: Xinliang Liu +--- + lustre/osd-ldiskfs/osd_internal.h | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/lustre/osd-ldiskfs/osd_internal.h b/lustre/osd-ldiskfs/osd_internal.h +index 7300278b80..baf2dc9baa 100644 +--- a/lustre/osd-ldiskfs/osd_internal.h ++++ b/lustre/osd-ldiskfs/osd_internal.h +@@ -101,7 +101,7 @@ extern struct kmem_cache *dynlock_cachep; + #define OSD_DEFAULT_EXTENT_BYTES (1U << 20) + + /* check if ldiskfs support project quota */ +-#ifndef LDISKFS_IOC_FSSETXATTR ++#if LDISKFS_MAXQUOTAS < 3 + #undef HAVE_PROJECT_QUOTA + #endif + +-- +2.33.0 + diff --git a/0059-LU-16019-llite-fully-disable-readahead-in-kernel-I-O.patch b/0059-LU-16019-llite-fully-disable-readahead-in-kernel-I-O.patch new file mode 100644 index 0000000000000000000000000000000000000000..2a429ac8f5092dc5d1ba3c12bba1bc119d1250f8 --- /dev/null +++ b/0059-LU-16019-llite-fully-disable-readahead-in-kernel-I-O.patch @@ -0,0 +1,127 @@ +From 4a3d324f5827d64216c4b7ea62adbfb5183bb795 Mon Sep 17 00:00:00 2001 +From: Qian Yingjin +Date: Mon, 15 Aug 2022 11:15:25 -0700 +Subject: [PATCH 59/61] LU-16019 llite: fully disable readahead in kernel I/O + path + +In the new kernel (rhel9 or ubuntu 2204), the readahead path may +be out of the control of Lustre CLIO engine: + +generic_file_read_iter() + ->filemap_read() + ->filemap_get_pages() + ->page_cache_sync_readahead() + ->page_cache_sync_ra() + +void page_cache_sync_ra() +{ + if (!ractl->ra->ra_pages || blk_cgroup_congested()) { + if (!ractl->file) + return; + req_count = 1; + do_forced_ra = true; + } + + /* be dumb */ + 
if (do_forced_ra) { + force_page_cache_ra(ractl, req_count); + return; + } + ... +} + +From the kernel readahead code, even if read-ahead is disabled +(via @ra_pages == 0), it still issues this request as read-ahead +as we will need it to satisfy the requested range. The forced +read-ahead will do the right thing and limit the read to just +the requested range, which we will set to 1 page for this case. + +Thus it can not totally avoid the read-ahead in the kernel I/O +path only by setting @ra_pages with 0. +To fully disable the read-ahead in the Linux kernel I/O path, we +still need to set @io_pages to 0, it will set I/O range to 0 in +@force_page_cache_ra(): +void force_page_cache_ra() +{ + ... + max_pages = = max_t(unsigned long, bdi->io_pages, + ra->ra_pages); + nr_to_read = min_t(unsigned long, nr_to_read, max_pages); + while (nr_to_read) { + ... + } + ... +} + +After set bdi->io_pages with 0, it can pass the sanity/101j. + +Lustre-change: https://review.whamcloud.com/47993 +Lustre-commit: f0cf7fd3cccb2313fa94a307cf862afba256b8d8 + +Signed-off-by: Qian Yingjin +Change-Id: I859a6404abb9116d9acfa03de91e61d3536d3554 +Reviewed-by: Andreas Dilger +Reviewed-by: Li Xi +Signed-off-by: Xinliang Liu +--- + lustre/autoconf/lustre-core.m4 | 22 ++++++++++++++++++++++ + lustre/llite/llite_lib.c | 3 +++ + 2 files changed, 25 insertions(+) + +diff --git a/lustre/autoconf/lustre-core.m4 b/lustre/autoconf/lustre-core.m4 +index 251c977ea4..1bfc917dde 100644 +--- a/lustre/autoconf/lustre-core.m4 ++++ b/lustre/autoconf/lustre-core.m4 +@@ -1964,6 +1964,27 @@ posix_acl_update_mode, [ + ]) + ]) # LC_POSIX_ACL_UPDATE_MODE + ++# ++# LC_HAVE_BDI_IO_PAGES ++# ++# Kernel version 4.9 commit 9491ae4aade6814afcfa67f4eb3e3342c2b39750 ++# mm: don't cap request size based on read-ahead setting ++# This patch introduces a bdi hint, io_pages. 
++# ++AC_DEFUN([LC_HAVE_BDI_IO_PAGES], [ ++LB_CHECK_COMPILE([if 'struct backing_dev_info' has 'io_pages' field], ++bdi_has_io_pages, [ ++ #include <linux/backing-dev.h> ++],[ ++ struct backing_dev_info info; ++ ++ info.io_pages = 0; ++],[ ++ AC_DEFINE(HAVE_BDI_IO_PAGES, 1, ++ [backing_dev_info has io_pages]) ++]) ++]) # LC_HAVE_BDI_IO_PAGES ++ + # + # LC_IOP_GENERIC_READLINK + # +@@ -2990,6 +3011,7 @@ AC_DEFUN([LC_PROG_LINUX], [ + LC_GROUP_INFO_GID + LC_VFS_SETXATTR + LC_POSIX_ACL_UPDATE_MODE ++ LC_HAVE_BDI_IO_PAGES + + # 4.10 + LC_IOP_GENERIC_READLINK +diff --git a/lustre/llite/llite_lib.c b/lustre/llite/llite_lib.c +index f67ea3f7eb..ab8363e7a3 100644 +--- a/lustre/llite/llite_lib.c ++++ b/lustre/llite/llite_lib.c +@@ -1366,6 +1366,9 @@ int ll_fill_super(struct super_block *sb) + + /* disable kernel readahead */ + sb->s_bdi->ra_pages = 0; ++#ifdef HAVE_BDI_IO_PAGES ++ sb->s_bdi->io_pages = 0; ++#endif + + /* Call ll_debugfs_register_super() before lustre_process_log() + * so that "llite.*.*" params can be processed correctly. +-- +2.33.0 + diff --git a/0060-Update-openEuler-22.03-kernels.patch b/0060-Update-openEuler-22.03-kernels.patch new file mode 100644 index 0000000000000000000000000000000000000000..12360f63573d08700242917a0ea9fe484aaefc9d --- /dev/null +++ b/0060-Update-openEuler-22.03-kernels.patch @@ -0,0 +1,510 @@ +From 16169e4dfb7f3383ff62dc218c74aa8ff00dc0ba Mon Sep 17 00:00:00 2001 +From: Xinliang Liu +Date: Fri, 20 Oct 2023 10:12:18 +0000 +Subject: [PATCH 60/61] Update openEuler 22.03 kernels + +Update the ldiskfs patch ext4-data-in-dirent.patch. +Also fix kernel download URL for 22.03 LTS. 
+ +Signed-off-by: Xinliang Liu +--- + contrib/lbuild/lbuild-openeuler | 4 +- + .../oe2203sp1/ext4-data-in-dirent.patch | 118 +++++++++--------- + .../targets/5.10-oe2203.target.in | 2 +- + .../targets/5.10-oe2203sp1.target.in | 2 +- + .../targets/5.10-oe2203sp2.target.in | 2 +- + 5 files changed, 64 insertions(+), 64 deletions(-) + +diff --git a/contrib/lbuild/lbuild-openeuler b/contrib/lbuild/lbuild-openeuler +index 35845f336b..3d1ba0233b 100644 +--- a/contrib/lbuild/lbuild-openeuler ++++ b/contrib/lbuild/lbuild-openeuler +@@ -17,8 +17,8 @@ kernel_debuginfo_location() { + local distro=${DISTRO^^} + # convert OEYYMM.SPx to openEuler-YY.MM-LTS-SPx + distro=$(echo $distro | sed -E -e 's/OE/openEuler-/' \ +- -e 's/([0-9]{2})([0-9]{2})/\1.\2-/' \ +- -e 's/.(SP[0-9]+)/LTS-\1/') ++ -e 's/([0-9]{2})([0-9]{2})/\1.\2-LTS/' \ ++ -e 's/.(SP[0-9]+)/-\1/') + + echo "${base_url}/${distro}/update/$TARGET_ARCH/Packages" + } +diff --git a/ldiskfs/kernel_patches/patches/oe2203sp1/ext4-data-in-dirent.patch b/ldiskfs/kernel_patches/patches/oe2203sp1/ext4-data-in-dirent.patch +index 9512f94634..5c77ca637c 100644 +--- a/ldiskfs/kernel_patches/patches/oe2203sp1/ext4-data-in-dirent.patch ++++ b/ldiskfs/kernel_patches/patches/oe2203sp1/ext4-data-in-dirent.patch +@@ -1,7 +1,7 @@ +-From ef3b0235735794064352d9b053802b368ecdfcc9 Mon Sep 17 00:00:00 2001 ++From 0ac1aed9c2c4e090bccebcace6913f80098a8d3f Mon Sep 17 00:00:00 2001 + From: Xinliang Liu +-Date: Thu, 11 May 2023 09:57:05 +0000 +-Subject: [PATCH] ext4 data in dirent ++Date: Wed, 8 Nov 2023 02:41:00 +0000 ++Subject: [PATCH] ext4: data in dirent + + this patch implements feature which allows ext4 fs users (e.g. Lustre) + to store data in ext4 dirent. 
+@@ -18,9 +18,9 @@ Signed-off-by: Xinliang Liu + fs/ext4/ext4.h | 100 +++++++++++++++++++-- + fs/ext4/fast_commit.c | 2 +- + fs/ext4/inline.c | 8 +- +- fs/ext4/namei.c | 201 +++++++++++++++++++++++++++++++++--------- ++ fs/ext4/namei.c | 200 +++++++++++++++++++++++++++++++++--------- + fs/ext4/super.c | 4 +- +- 6 files changed, 270 insertions(+), 58 deletions(-) ++ 6 files changed, 269 insertions(+), 58 deletions(-) + + diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c + index 70a0f5e..ff7d8c3 100644 +@@ -73,10 +73,10 @@ index 70a0f5e..ff7d8c3 100644 + while (*p) { + parent = *p; + diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h +-index 143ce00..98786d8 100644 ++index 0a4ecbe..4c1fbe0 100644 + --- a/fs/ext4/ext4.h + +++ b/fs/ext4/ext4.h +-@@ -1167,6 +1167,7 @@ struct ext4_inode_info { ++@@ -1172,6 +1172,7 @@ struct ext4_inode_info { + __u32 i_csum_seed; + + kprojid_t i_projid; +@@ -84,7 +84,7 @@ index 143ce00..98786d8 100644 + + /* Protect concurrent add cluster delayed block and remove block */ + struct mutex i_clu_lock; +-@@ -1191,6 +1192,7 @@ struct ext4_inode_info { ++@@ -1196,6 +1197,7 @@ struct ext4_inode_info { + * Mount flags set via mount options or defaults + */ + #define EXT4_MOUNT_NO_MBCACHE 0x00001 /* Do not use mbcache */ +@@ -92,7 +92,7 @@ index 143ce00..98786d8 100644 + #define EXT4_MOUNT_GRPID 0x00004 /* Create files with directory's group */ + #define EXT4_MOUNT_DEBUG 0x00008 /* Some debugging messages */ + #define EXT4_MOUNT_ERRORS_CONT 0x00010 /* Continue on errors */ +-@@ -2086,6 +2088,7 @@ EXT4_FEATURE_INCOMPAT_FUNCS(casefold, CASEFOLD) ++@@ -2091,6 +2093,7 @@ EXT4_FEATURE_INCOMPAT_FUNCS(casefold, CASEFOLD) + EXT4_FEATURE_INCOMPAT_FLEX_BG| \ + EXT4_FEATURE_INCOMPAT_EA_INODE| \ + EXT4_FEATURE_INCOMPAT_MMP | \ +@@ -100,7 +100,7 @@ index 143ce00..98786d8 100644 + EXT4_FEATURE_INCOMPAT_INLINE_DATA | \ + EXT4_FEATURE_INCOMPAT_ENCRYPT | \ + EXT4_FEATURE_INCOMPAT_CASEFOLD | \ +-@@ -2268,6 +2271,43 @@ struct ext4_dir_entry_tail { ++@@ -2273,6 +2276,43 @@ struct 
ext4_dir_entry_tail { + #define EXT4_FT_SYMLINK 7 + + #define EXT4_FT_MAX 8 +@@ -144,7 +144,7 @@ index 143ce00..98786d8 100644 + + #define EXT4_FT_DIR_CSUM 0xDE + +-@@ -2278,8 +2318,16 @@ struct ext4_dir_entry_tail { ++@@ -2283,8 +2323,16 @@ struct ext4_dir_entry_tail { + */ + #define EXT4_DIR_PAD 4 + #define EXT4_DIR_ROUND (EXT4_DIR_PAD - 1) +@@ -162,7 +162,7 @@ index 143ce00..98786d8 100644 + #define EXT4_MAX_REC_LEN ((1<<16)-1) + + /* +-@@ -2746,11 +2794,11 @@ extern int ext4_find_dest_de(struct inode *dir, struct inode *inode, ++@@ -2751,11 +2799,11 @@ extern int ext4_find_dest_de(struct inode *dir, struct inode *inode, + struct buffer_head *bh, + void *buf, int buf_size, + struct ext4_filename *fname, +@@ -176,7 +176,7 @@ index 143ce00..98786d8 100644 + static inline void ext4_update_dx_flag(struct inode *inode) + { + if (!ext4_has_feature_dir_index(inode->i_sb) && +-@@ -2766,10 +2814,17 @@ static const unsigned char ext4_filetype_table[] = { ++@@ -2771,10 +2819,17 @@ static const unsigned char ext4_filetype_table[] = { + + static inline unsigned char get_dtype(struct super_block *sb, int filetype) + { +@@ -196,7 +196,7 @@ index 143ce00..98786d8 100644 + } + extern int ext4_check_all_de(struct inode *dir, struct buffer_head *bh, + void *buf, int buf_size); +-@@ -2965,7 +3020,8 @@ extern int ext4_ind_migrate(struct inode *inode); ++@@ -2970,7 +3025,8 @@ extern int ext4_ind_migrate(struct inode *inode); + + /* namei.c */ + extern int ext4_init_new_dir(handle_t *handle, struct inode *dir, +@@ -206,7 +206,7 @@ index 143ce00..98786d8 100644 + extern int ext4_dirblock_csum_verify(struct inode *inode, + struct buffer_head *bh); + extern int ext4_orphan_add(handle_t *, struct inode *); +-@@ -2976,6 +3032,8 @@ extern struct inode *ext4_create_inode(handle_t *handle, ++@@ -2981,6 +3037,8 @@ extern struct inode *ext4_create_inode(handle_t *handle, + extern int ext4_delete_entry(handle_t *handle, struct inode * dir, + struct ext4_dir_entry_2 *de_del, + struct buffer_head 
*bh); +@@ -215,7 +215,7 @@ index 143ce00..98786d8 100644 + extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, + __u32 start_minor_hash, __u32 *next_hash); + extern int ext4_search_dir(struct buffer_head *bh, +-@@ -3765,6 +3823,36 @@ static inline int ext4_buffer_uptodate(struct buffer_head *bh) ++@@ -3773,6 +3831,36 @@ static inline int ext4_buffer_uptodate(struct buffer_head *bh) + return buffer_uptodate(bh); + } + +@@ -253,10 +253,10 @@ index 143ce00..98786d8 100644 + + #define EFSBADCRC EBADMSG /* Bad CRC detected */ + diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c +-index 41dcf21..1023ace 100644 ++index be768ef..ff0292e 100644 + --- a/fs/ext4/fast_commit.c + +++ b/fs/ext4/fast_commit.c +-@@ -1547,7 +1547,7 @@ static int ext4_fc_replay_create(struct super_block *sb, struct ext4_fc_tl *tl, ++@@ -1560,7 +1560,7 @@ static int ext4_fc_replay_create(struct super_block *sb, struct ext4_fc_tl *tl, + jbd_debug(1, "Dir %d not found.", darg.ino); + goto out; + } +@@ -266,10 +266,10 @@ index 41dcf21..1023ace 100644 + if (ret) { + ret = 0; + diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c +-index c2c688c..686d14a 100644 ++index baf2878..4667f9a 100644 + --- a/fs/ext4/inline.c + +++ b/fs/ext4/inline.c +-@@ -1033,7 +1033,7 @@ static int ext4_add_dirent_to_inline(handle_t *handle, ++@@ -1032,7 +1032,7 @@ static int ext4_add_dirent_to_inline(handle_t *handle, + struct ext4_dir_entry_2 *de; + + err = ext4_find_dest_de(dir, inode, iloc->bh, inline_start, +@@ -278,7 +278,7 @@ index c2c688c..686d14a 100644 + if (err) + return err; + +-@@ -1041,7 +1041,7 @@ static int ext4_add_dirent_to_inline(handle_t *handle, ++@@ -1040,7 +1040,7 @@ static int ext4_add_dirent_to_inline(handle_t *handle, + err = ext4_journal_get_write_access(handle, iloc->bh); + if (err) + return err; +@@ -287,7 +287,7 @@ index c2c688c..686d14a 100644 + + ext4_show_inline_dir(dir, iloc->bh, inline_start, inline_size); + +-@@ -1398,7 +1398,7 @@ int ext4_inlinedir_to_tree(struct 
file *dir_file, ++@@ -1397,7 +1397,7 @@ int ext4_inlinedir_to_tree(struct file *dir_file, + fake.name_len = 1; + strcpy(fake.name, "."); + fake.rec_len = ext4_rec_len_to_disk( +@@ -296,7 +296,7 @@ index c2c688c..686d14a 100644 + inline_size); + ext4_set_de_type(inode->i_sb, &fake, S_IFDIR); + de = &fake; +-@@ -1408,7 +1408,7 @@ int ext4_inlinedir_to_tree(struct file *dir_file, ++@@ -1407,7 +1407,7 @@ int ext4_inlinedir_to_tree(struct file *dir_file, + fake.name_len = 2; + strcpy(fake.name, ".."); + fake.rec_len = ext4_rec_len_to_disk( +@@ -306,7 +306,7 @@ index c2c688c..686d14a 100644 + ext4_set_de_type(inode->i_sb, &fake, S_IFDIR); + de = &fake; + diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c +-index 1537a76..24e1276 100644 ++index 1ce7f95..846ceb3 100644 + --- a/fs/ext4/namei.c + +++ b/fs/ext4/namei.c + @@ -295,7 +295,8 @@ static unsigned dx_get_count(struct dx_entry *entries); +@@ -319,23 +319,23 @@ index 1537a76..24e1276 100644 + static unsigned dx_node_limit(struct inode *dir); + static struct dx_frame *dx_probe(struct ext4_filename *fname, + struct inode *dir, +-@@ -439,22 +440,23 @@ static struct dx_countlimit *get_dx_countlimit(struct inode *inode, ++@@ -440,23 +441,23 @@ static struct dx_countlimit *get_dx_countlimit(struct inode *inode, + { + struct ext4_dir_entry *dp; + struct dx_root_info *root; + - int count_offset; +-+ int count_offset, dot_rec_len, dotdot_rec_len; +++ int count_offset, dotdot_rec_len; ++ int blocksize = EXT4_BLOCK_SIZE(inode->i_sb); ++ unsigned int rlen = ext4_rec_len_from_disk(dirent->rec_len, blocksize); + +- if (le16_to_cpu(dirent->rec_len) == EXT4_BLOCK_SIZE(inode->i_sb)) ++ if (rlen == blocksize) + count_offset = 8; +-- else if (le16_to_cpu(dirent->rec_len) == 12) { ++- else if (rlen == 12) { + - dp = (struct ext4_dir_entry *)(((void *)dirent) + 12); ++- if (ext4_rec_len_from_disk(dp->rec_len, blocksize) != blocksize - 12) + + else { +-+ dot_rec_len = le16_to_cpu(dirent->rec_len); +-+ dp = (struct ext4_dir_entry *)(((void 
*)dirent) + dot_rec_len); +- if (le16_to_cpu(dp->rec_len) != +-- EXT4_BLOCK_SIZE(inode->i_sb) - 12) +-+ EXT4_BLOCK_SIZE(inode->i_sb) - dot_rec_len) +++ dp = (struct ext4_dir_entry *)(((void *)dirent) + rlen); +++ if (ext4_rec_len_from_disk(dp->rec_len, blocksize) != blocksize - rlen) + return NULL; + - root = (struct dx_root_info *)(((void *)dp + 12)); + + dotdot_rec_len = EXT4_DIR_ENTRY_LEN((struct ext4_dir_entry_2 *)dp); +@@ -346,12 +346,12 @@ index 1537a76..24e1276 100644 + - count_offset = 32; + - } else + - return NULL; +-+ count_offset = 8 + dot_rec_len + dotdot_rec_len; +++ count_offset = 8 + rlen + dotdot_rec_len; + + } + + if (offset) + *offset = count_offset; +-@@ -559,11 +561,12 @@ ext4_next_entry(struct ext4_dir_entry_2 *p, unsigned long blocksize) ++@@ -561,11 +562,12 @@ ext4_next_entry(struct ext4_dir_entry_2 *p, unsigned long blocksize) + */ + struct dx_root_info *dx_get_dx_info(struct ext4_dir_entry_2 *de) + { +@@ -366,7 +366,7 @@ index 1537a76..24e1276 100644 + + return (struct dx_root_info *)de; + } +-@@ -608,10 +611,16 @@ static inline void dx_set_limit(struct dx_entry *entries, unsigned value) ++@@ -610,10 +612,16 @@ static inline void dx_set_limit(struct dx_entry *entries, unsigned value) + ((struct dx_countlimit *) entries)->limit = cpu_to_le16(value); + } + +@@ -386,7 +386,7 @@ index 1537a76..24e1276 100644 + + if (ext4_has_metadata_csum(dir->i_sb)) + entry_space -= sizeof(struct dx_tail); +-@@ -731,7 +740,7 @@ static struct stats dx_show_leaf(struct inode *dir, ++@@ -733,7 +741,7 @@ static struct stats dx_show_leaf(struct inode *dir, + (unsigned) ((char *) de - base)); + #endif + } +@@ -395,7 +395,7 @@ index 1537a76..24e1276 100644 + names++; + } + de = ext4_next_entry(de, size); +-@@ -840,11 +849,14 @@ dx_probe(struct ext4_filename *fname, struct inode *dir, ++@@ -842,11 +850,14 @@ dx_probe(struct ext4_filename *fname, struct inode *dir, + + entries = (struct dx_entry *)(((char *)info) + info->info_length); + +@@ -413,7 +413,7 @@ index 
1537a76..24e1276 100644 + goto fail; + } + +-@@ -1851,7 +1863,7 @@ dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count, ++@@ -1854,7 +1865,7 @@ dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count, + while (count--) { + struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *) + (from + (map->offs<<2)); +@@ -422,7 +422,7 @@ index 1537a76..24e1276 100644 + memcpy (to, de, rec_len); + ((struct ext4_dir_entry_2 *) to)->rec_len = + ext4_rec_len_to_disk(rec_len, blocksize); +-@@ -1882,7 +1894,7 @@ static struct ext4_dir_entry_2* dx_pack_dirents(char *base, unsigned blocksize) ++@@ -1885,7 +1896,7 @@ static struct ext4_dir_entry_2* dx_pack_dirents(char *base, unsigned blocksize) + while ((char*)de < base + blocksize) { + next = ext4_next_entry(de, blocksize); + if (de->inode && de->name_len) { +@@ -431,7 +431,7 @@ index 1537a76..24e1276 100644 + if (de > to) + memmove(to, de, rec_len); + to->rec_len = ext4_rec_len_to_disk(rec_len, blocksize); +-@@ -2023,14 +2035,16 @@ int ext4_find_dest_de(struct inode *dir, struct inode *inode, ++@@ -2026,14 +2037,16 @@ int ext4_find_dest_de(struct inode *dir, struct inode *inode, + struct buffer_head *bh, + void *buf, int buf_size, + struct ext4_filename *fname, +@@ -450,7 +450,7 @@ index 1537a76..24e1276 100644 + de = (struct ext4_dir_entry_2 *)buf; + top = buf + buf_size - reclen; + while ((char *) de <= top) { +-@@ -2039,10 +2053,26 @@ int ext4_find_dest_de(struct inode *dir, struct inode *inode, ++@@ -2042,10 +2055,26 @@ int ext4_find_dest_de(struct inode *dir, struct inode *inode, + return -EFSCORRUPTED; + if (ext4_match(dir, fname, de)) + return -EEXIST; +@@ -478,7 +478,7 @@ index 1537a76..24e1276 100644 + de = (struct ext4_dir_entry_2 *)((char *)de + rlen); + offset += rlen; + } +-@@ -2056,12 +2086,12 @@ int ext4_find_dest_de(struct inode *dir, struct inode *inode, ++@@ -2059,12 +2088,12 @@ int ext4_find_dest_de(struct inode *dir, struct inode *inode, + void ext4_insert_dentry(struct 
inode *inode, + struct ext4_dir_entry_2 *de, + int buf_size, +@@ -493,7 +493,7 @@ index 1537a76..24e1276 100644 + rlen = ext4_rec_len_from_disk(de->rec_len, buf_size); + if (de->inode) { + struct ext4_dir_entry_2 *de1 = +-@@ -2075,6 +2105,11 @@ void ext4_insert_dentry(struct inode *inode, ++@@ -2078,6 +2107,11 @@ void ext4_insert_dentry(struct inode *inode, + ext4_set_de_type(inode->i_sb, de, inode->i_mode); + de->name_len = fname_len(fname); + memcpy(de->name, fname_name(fname), fname_len(fname)); +@@ -505,7 +505,7 @@ index 1537a76..24e1276 100644 + } + + /* +-@@ -2092,14 +2127,19 @@ static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname, ++@@ -2095,14 +2129,19 @@ static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname, + { + unsigned int blocksize = dir->i_sb->s_blocksize; + int csum_size = 0; +@@ -527,7 +527,7 @@ index 1537a76..24e1276 100644 + if (err) + return err; + } +-@@ -2111,7 +2151,10 @@ static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname, ++@@ -2114,7 +2153,10 @@ static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname, + } + + /* By now the buffer is marked for journaling */ +@@ -539,7 +539,7 @@ index 1537a76..24e1276 100644 + + /* + * XXX shouldn't update any times until successful +-@@ -2217,7 +2260,8 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname, ++@@ -2228,7 +2270,8 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname, + + dx_set_block(entries, 1); + dx_set_count(entries, 1); +@@ -549,7 +549,7 @@ index 1537a76..24e1276 100644 + + /* Initialize as for dx_probe */ + fname->hinfo.hash_version = dx_info->hash_version; +-@@ -2267,6 +2311,8 @@ static int ext4_update_dotdot(handle_t *handle, struct dentry *dentry, ++@@ -2278,6 +2321,8 @@ static int ext4_update_dotdot(handle_t *handle, struct dentry *dentry, + struct buffer_head *dir_block; + struct ext4_dir_entry_2 *de; + int len, journal = 0, err = 0; +@@ -558,7 +558,7 @@ 
index 1537a76..24e1276 100644 + + if (IS_ERR(handle)) + return PTR_ERR(handle); +-@@ -2292,11 +2338,16 @@ static int ext4_update_dotdot(handle_t *handle, struct dentry *dentry, ++@@ -2303,11 +2348,16 @@ static int ext4_update_dotdot(handle_t *handle, struct dentry *dentry, + goto out_journal; + + journal = 1; +@@ -578,7 +578,7 @@ index 1537a76..24e1276 100644 + de = (struct ext4_dir_entry_2 *) + ((char *) de + le16_to_cpu(de->rec_len)); + if (!journal) { +-@@ -2313,7 +2364,12 @@ static int ext4_update_dotdot(handle_t *handle, struct dentry *dentry, ++@@ -2324,7 +2374,12 @@ static int ext4_update_dotdot(handle_t *handle, struct dentry *dentry, + assert(le16_to_cpu(de->rec_len) >= EXT4_DIR_REC_LEN(2)); + de->name_len = 2; + strcpy(de->name, ".."); +@@ -592,7 +592,7 @@ index 1537a76..24e1276 100644 + + out_journal: + if (journal) { +-@@ -2351,6 +2407,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, ++@@ -2362,6 +2417,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, + ext4_lblk_t block, blocks; + int csum_size = 0; + +@@ -600,7 +600,7 @@ index 1537a76..24e1276 100644 + if (ext4_has_metadata_csum(inode->i_sb)) + csum_size = sizeof(struct ext4_dir_entry_tail); + +-@@ -2918,37 +2975,70 @@ err_unlock_inode: ++@@ -2929,37 +2985,70 @@ err_unlock_inode: + return err; + } + +@@ -678,7 +678,7 @@ index 1537a76..24e1276 100644 + struct buffer_head *dir_block = NULL; + struct ext4_dir_entry_2 *de; + ext4_lblk_t block = 0; +-@@ -2972,7 +3062,11 @@ int ext4_init_new_dir(handle_t *handle, struct inode *dir, ++@@ -2983,7 +3072,11 @@ int ext4_init_new_dir(handle_t *handle, struct inode *dir, + if (IS_ERR(dir_block)) + return PTR_ERR(dir_block); + de = (struct ext4_dir_entry_2 *)dir_block->b_data; +@@ -691,7 +691,7 @@ index 1537a76..24e1276 100644 + set_nlink(inode, 2); + if (csum_size) + ext4_initialize_dirent_tail(dir_block, blocksize); +-@@ -2987,6 +3081,29 @@ out: ++@@ -2998,6 +3091,29 @@ out: + return err; + } + +@@ -721,7 +721,7 @@ 
index 1537a76..24e1276 100644 + static int ext4_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) + { + handle_t *handle; +-@@ -3013,7 +3130,7 @@ retry: ++@@ -3024,7 +3140,7 @@ retry: + + inode->i_op = &ext4_dir_inode_operations; + inode->i_fop = &ext4_dir_operations; +@@ -731,10 +731,10 @@ index 1537a76..24e1276 100644 + goto out_clear_inode; + err = ext4_mark_inode_dirty(handle, inode); + diff --git a/fs/ext4/super.c b/fs/ext4/super.c +-index 6c33a10..59b87b4 100644 ++index 1392659..e787f88 100644 + --- a/fs/ext4/super.c + +++ b/fs/ext4/super.c +-@@ -1719,7 +1719,7 @@ enum { ++@@ -1724,7 +1724,7 @@ enum { + Opt_inlinecrypt, + Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, + Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota, +@@ -743,7 +743,7 @@ index 6c33a10..59b87b4 100644 + Opt_usrquota, Opt_grpquota, Opt_prjquota, Opt_i_version, + Opt_dax, Opt_dax_always, Opt_dax_inode, Opt_dax_never, + Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_warn_on_error, +-@@ -1803,6 +1803,7 @@ static const match_table_t tokens = { ++@@ -1808,6 +1808,7 @@ static const match_table_t tokens = { + {Opt_nolazytime, "nolazytime"}, + {Opt_debug_want_extra_isize, "debug_want_extra_isize=%u"}, + {Opt_nodelalloc, "nodelalloc"}, +@@ -751,7 +751,7 @@ index 6c33a10..59b87b4 100644 + {Opt_removed, "mblk_io_submit"}, + {Opt_removed, "nomblk_io_submit"}, + {Opt_block_validity, "block_validity"}, +-@@ -2043,6 +2044,7 @@ static const struct mount_opts { ++@@ -2048,6 +2049,7 @@ static const struct mount_opts { + {Opt_usrjquota, 0, MOPT_Q | MOPT_STRING}, + {Opt_grpjquota, 0, MOPT_Q | MOPT_STRING}, + {Opt_offusrjquota, 0, MOPT_Q}, +diff --git a/lustre/kernel_patches/targets/5.10-oe2203.target.in b/lustre/kernel_patches/targets/5.10-oe2203.target.in +index 69c7e332d0..7148e4abd9 100644 +--- a/lustre/kernel_patches/targets/5.10-oe2203.target.in ++++ b/lustre/kernel_patches/targets/5.10-oe2203.target.in +@@ -1,5 +1,5 @@ + lnxmaj="5.10.0" 
+-lnxrel="60.94.0.118.oe2203" ++lnxrel="60.113.0.140.oe2203" + + KERNEL_SRPM=kernel-${lnxmaj}-${lnxrel}.src.rpm + SERIES="" +diff --git a/lustre/kernel_patches/targets/5.10-oe2203sp1.target.in b/lustre/kernel_patches/targets/5.10-oe2203sp1.target.in +index 81112d1897..164ea0ebb0 100644 +--- a/lustre/kernel_patches/targets/5.10-oe2203sp1.target.in ++++ b/lustre/kernel_patches/targets/5.10-oe2203sp1.target.in +@@ -1,5 +1,5 @@ + lnxmaj="5.10.0" +-lnxrel="136.32.0.108.oe2203sp1" ++lnxrel="136.51.0.130.oe2203sp1" + + KERNEL_SRPM=kernel-${lnxmaj}-${lnxrel}.src.rpm + SERIES="" +diff --git a/lustre/kernel_patches/targets/5.10-oe2203sp2.target.in b/lustre/kernel_patches/targets/5.10-oe2203sp2.target.in +index 805f11c315..1e6f568213 100644 +--- a/lustre/kernel_patches/targets/5.10-oe2203sp2.target.in ++++ b/lustre/kernel_patches/targets/5.10-oe2203sp2.target.in +@@ -1,5 +1,5 @@ + lnxmaj="5.10.0" +-lnxrel="153.19.0.95.oe2203sp2" ++lnxrel="153.31.0.108.oe2203sp2" + + KERNEL_SRPM=kernel-${lnxmaj}-${lnxrel}.src.rpm + SERIES="" +-- +2.33.0 + diff --git a/0061-Update-kernel-for-openEuler-20.03-LTS.patch b/0061-Update-kernel-for-openEuler-20.03-LTS.patch new file mode 100644 index 0000000000000000000000000000000000000000..3b3f542077925cc342252e0b0c30131e14f480d5 --- /dev/null +++ b/0061-Update-kernel-for-openEuler-20.03-LTS.patch @@ -0,0 +1,788 @@ +From 9f5b25ec2254a00041aa7f57c2df899c205c137e Mon Sep 17 00:00:00 2001 +From: Xinliang Liu +Date: Thu, 9 Nov 2023 02:25:34 +0000 +Subject: [PATCH 61/61] Update kernel for openEuler 20.03 LTS + +Update ldiskfs patch ext4-data-in-dirent.patch. 
+ +Signed-off-by: Xinliang Liu +--- + .../patches/oe2003/ext4-data-in-dirent.patch | 740 ++++++++++++++++++ + .../series/ldiskfs-4.19.90-oe2003.series | 2 +- + .../targets/4.19-oe2003sp3.target.in | 2 +- + 3 files changed, 742 insertions(+), 2 deletions(-) + create mode 100644 ldiskfs/kernel_patches/patches/oe2003/ext4-data-in-dirent.patch + +diff --git a/ldiskfs/kernel_patches/patches/oe2003/ext4-data-in-dirent.patch b/ldiskfs/kernel_patches/patches/oe2003/ext4-data-in-dirent.patch +new file mode 100644 +index 0000000000..e5c9e95e07 +--- /dev/null ++++ b/ldiskfs/kernel_patches/patches/oe2003/ext4-data-in-dirent.patch +@@ -0,0 +1,740 @@ ++From 5cee5905bac6f9b8f80cbcbf9d814ac02d840393 Mon Sep 17 00:00:00 2001 ++From: Xinliang Liu ++Date: Thu, 9 Nov 2023 02:59:24 +0000 ++Subject: [PATCH] ext4: data in dirent ++ ++this patch implements feature which allows ext4 fs users (e.g. Lustre) ++to store data in ext4 dirent. ++data is stored in ext4 dirent after file-name, this space is accounted ++in de->rec_len. flag EXT4_DIRENT_LUFID added to d_type if extra data ++is present. ++ ++make use of dentry->d_fsdata to pass fid to ext4. so no ++changes in ext4_add_entry() interface required. 
++ ++Signed-off-by: Xinliang Liu ++--- ++ fs/ext4/dir.c | 13 ++- ++ fs/ext4/ext4.h | 97 +++++++++++++++++++++-- ++ fs/ext4/inline.c | 8 +- ++ fs/ext4/namei.c | 200 +++++++++++++++++++++++++++++++++++++---------- ++ fs/ext4/super.c | 4 +- ++ 5 files changed, 266 insertions(+), 56 deletions(-) ++ ++diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c ++index aed33af..fe7149b 100644 ++--- a/fs/ext4/dir.c +++++ b/fs/ext4/dir.c ++@@ -73,7 +73,7 @@ int __ext4_check_dir_entry(const char *function, unsigned int line, ++ error_msg = "rec_len is smaller than minimal"; ++ else if (unlikely(rlen % 4 != 0)) ++ error_msg = "rec_len % 4 != 0"; ++- else if (unlikely(rlen < EXT4_DIR_REC_LEN(de->name_len))) +++ else if (unlikely(rlen < EXT4_DIR_ENTRY_LEN(de))) ++ error_msg = "rec_len is too small for name_len"; ++ else if (unlikely(((char *) de - buf) + rlen > size)) ++ error_msg = "directory entry overrun"; ++@@ -224,7 +224,7 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx) ++ * failure will be detected in the ++ * dirent test below. 
*/ ++ if (ext4_rec_len_from_disk(de->rec_len, ++- sb->s_blocksize) < EXT4_DIR_REC_LEN(1)) +++ sb->s_blocksize) < EXT4_DIR_REC_LEN(1)) ++ break; ++ i += ext4_rec_len_from_disk(de->rec_len, ++ sb->s_blocksize); ++@@ -449,12 +449,17 @@ int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, ++ struct fname *fname, *new_fn; ++ struct dir_private_info *info; ++ int len; +++ int extra_data = 0; ++ ++ info = dir_file->private_data; ++ p = &info->root.rb_node; ++ ++ /* Create and allocate the fname structure */ ++- len = sizeof(struct fname) + ent_name->len + 1; +++ if (dirent->file_type & EXT4_DIRENT_LUFID) +++ extra_data = ext4_get_dirent_data_len(dirent); +++ +++ len = sizeof(struct fname) + ent_name->len + extra_data + 1; +++ ++ new_fn = kzalloc(len, GFP_KERNEL); ++ if (!new_fn) ++ return -ENOMEM; ++@@ -463,7 +468,7 @@ int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, ++ new_fn->inode = le32_to_cpu(dirent->inode); ++ new_fn->name_len = ent_name->len; ++ new_fn->file_type = dirent->file_type; ++- memcpy(new_fn->name, ent_name->name, ent_name->len); +++ memcpy(new_fn->name, ent_name->name, ent_name->len + extra_data); ++ new_fn->name[ent_name->len] = 0; ++ ++ while (*p) { ++diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h ++index b41edfa..7739f43 100644 ++--- a/fs/ext4/ext4.h +++++ b/fs/ext4/ext4.h ++@@ -1088,6 +1088,7 @@ struct ext4_inode_info { ++ __u32 i_csum_seed; ++ ++ kprojid_t i_projid; +++ void *i_dirdata; ++ }; ++ ++ /* ++@@ -1108,6 +1109,7 @@ struct ext4_inode_info { ++ * Mount flags set via mount options or defaults ++ */ ++ #define EXT4_MOUNT_NO_MBCACHE 0x00001 /* Do not use mbcache */ +++#define EXT4_MOUNT_DIRDATA 0x00002 /* Data in directory entries */ ++ #define EXT4_MOUNT_GRPID 0x00004 /* Create files with directory's group */ ++ #define EXT4_MOUNT_DEBUG 0x00008 /* Some debugging messages */ ++ #define EXT4_MOUNT_ERRORS_CONT 0x00010 /* Continue on errors */ ++@@ -1887,6 +1889,7 @@ EXT4_FEATURE_INCOMPAT_FUNCS(encrypt, ENCRYPT) ++ 
EXT4_FEATURE_INCOMPAT_FLEX_BG| \ ++ EXT4_FEATURE_INCOMPAT_EA_INODE| \ ++ EXT4_FEATURE_INCOMPAT_MMP | \ +++ EXT4_FEATURE_INCOMPAT_DIRDATA| \ ++ EXT4_FEATURE_INCOMPAT_INLINE_DATA | \ ++ EXT4_FEATURE_INCOMPAT_ENCRYPT | \ ++ EXT4_FEATURE_INCOMPAT_CSUM_SEED | \ ++@@ -2066,6 +2069,43 @@ struct ext4_dir_entry_tail { ++ #define EXT4_FT_SYMLINK 7 ++ ++ #define EXT4_FT_MAX 8 +++#define EXT4_FT_MASK 0xf +++ +++#if EXT4_FT_MAX > EXT4_FT_MASK +++#error "conflicting EXT4_FT_MAX and EXT4_FT_MASK" +++#endif +++ +++/* +++ * d_type has 4 unused bits, so it can hold four types data. these different +++ * type of data (e.g. lustre data, high 32 bits of 64-bit inode number) can be +++ * stored, in flag order, after file-name in ext4 dirent. +++*/ +++/* +++ * this flag is added to d_type if ext4 dirent has extra data after +++ * filename. this data length is variable and length is stored in first byte +++ * of data. data start after filename NUL byte. +++ * This is used by Lustre FS. +++ */ +++#define EXT4_DIRENT_LUFID 0x10 +++ +++#define EXT4_LUFID_MAGIC 0xAD200907UL +++struct ext4_dentry_param { +++ __u32 edp_magic; /* EXT4_LUFID_MAGIC */ +++ char edp_len; /* size of edp_data in bytes */ +++ char edp_data[0]; /* packed array of data */ +++} __packed; +++ +++static inline unsigned char *ext4_dentry_get_data(struct super_block *sb, +++ struct ext4_dentry_param *p) +++ +++{ +++ if (!ext4_has_feature_dirdata(sb)) +++ return NULL; +++ if (p && p->edp_magic == EXT4_LUFID_MAGIC) +++ return &p->edp_len; +++ else +++ return NULL; +++} ++ ++ #define EXT4_FT_DIR_CSUM 0xDE ++ ++@@ -2076,8 +2116,16 @@ struct ext4_dir_entry_tail { ++ */ ++ #define EXT4_DIR_PAD 4 ++ #define EXT4_DIR_ROUND (EXT4_DIR_PAD - 1) ++-#define EXT4_DIR_REC_LEN(name_len) (((name_len) + 8 + EXT4_DIR_ROUND) & \ +++#define EXT4_DIR_REC_LEN_(name_len) (((name_len) + 8 + EXT4_DIR_ROUND) & \ ++ ~EXT4_DIR_ROUND) +++#define EXT4_DIR_ENTRY_LEN_(de) (EXT4_DIR_REC_LEN_((de)->name_len +\ +++ ext4_get_dirent_data_len(de))) +++/* ldiskfs 
*/ +++#define EXT4_DIR_REC_LEN(name_len) EXT4_DIR_REC_LEN_((name_len)) +++#define EXT4_DIR_ENTRY_LEN(de) EXT4_DIR_ENTRY_LEN_((de)) +++/* lustre osd_handler compat */ +++#define __EXT4_DIR_REC_LEN(name_len) EXT4_DIR_REC_LEN_((name_len)) +++ ++ #define EXT4_MAX_REC_LEN ((1<<16)-1) ++ ++ /* ++@@ -2478,11 +2526,11 @@ extern int ext4_find_dest_de(struct inode *dir, struct inode *inode, ++ struct buffer_head *bh, ++ void *buf, int buf_size, ++ struct ext4_filename *fname, ++- struct ext4_dir_entry_2 **dest_de); +++ struct ext4_dir_entry_2 **dest_de, int *dlen); ++ void ext4_insert_dentry(struct inode *inode, ++ struct ext4_dir_entry_2 *de, ++ int buf_size, ++- struct ext4_filename *fname); +++ struct ext4_filename *fname, void *data); ++ static inline void ext4_update_dx_flag(struct inode *inode) ++ { ++ if (!ext4_has_feature_dir_index(inode->i_sb) && ++@@ -2498,10 +2546,17 @@ static const unsigned char ext4_filetype_table[] = { ++ ++ static inline unsigned char get_dtype(struct super_block *sb, int filetype) ++ { ++- if (!ext4_has_feature_filetype(sb) || filetype >= EXT4_FT_MAX) +++ int fl_index = filetype & EXT4_FT_MASK; +++ +++ if (!ext4_has_feature_filetype(sb) || fl_index >= EXT4_FT_MAX) ++ return DT_UNKNOWN; ++ ++- return ext4_filetype_table[filetype]; +++ if (!test_opt(sb, DIRDATA)) +++ return ext4_filetype_table[fl_index]; +++ +++ return (ext4_filetype_table[fl_index]) | +++ (filetype & EXT4_DIRENT_LUFID); +++ ++ } ++ extern int ext4_check_all_de(struct inode *dir, struct buffer_head *bh, ++ void *buf, int buf_size); ++@@ -2663,6 +2718,8 @@ extern struct inode *ext4_create_inode(handle_t *handle, ++ extern int ext4_delete_entry(handle_t *handle, struct inode * dir, ++ struct ext4_dir_entry_2 *de_del, ++ struct buffer_head *bh); +++extern int ext4_add_dot_dotdot(handle_t *handle, struct inode *dir, +++ struct inode *inode, const void *, const void *); ++ extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, ++ __u32 start_minor_hash, __u32 
*next_hash); ++ extern int ext4_search_dir(struct buffer_head *bh, ++@@ -3454,6 +3511,36 @@ static inline int ext4_buffer_uptodate(struct buffer_head *bh) ++ return buffer_uptodate(bh); ++ } ++ +++/* +++ * Compute the total directory entry data length. +++ * This includes the filename and an implicit NUL terminator (always present), +++ * and optional extensions. Each extension has a bit set in the high 4 bits of +++ * de->file_type, and the extension length is the first byte in each entry. +++ */ +++static inline int ext4_get_dirent_data_len(struct ext4_dir_entry_2 *de) +++{ +++ char *len = de->name + de->name_len + 1 /* NUL terminator */; +++ int dlen = 0; +++ __u8 extra_data_flags = (de->file_type & ~EXT4_FT_MASK) >> 4; +++ struct ext4_dir_entry_tail *t = (struct ext4_dir_entry_tail *)de; +++ +++ if (!t->det_reserved_zero1 && +++ le16_to_cpu(t->det_rec_len) == +++ sizeof(struct ext4_dir_entry_tail) && +++ !t->det_reserved_zero2 && +++ t->det_reserved_ft == EXT4_FT_DIR_CSUM) +++ return 0; +++ +++ while (extra_data_flags) { +++ if (extra_data_flags & 1) { +++ dlen += *len + (dlen == 0); +++ len += *len; +++ } +++ extra_data_flags >>= 1; +++ } +++ return dlen; +++} +++ ++ #endif /* __KERNEL__ */ ++ ++ #define EFSBADCRC EBADMSG /* Bad CRC detected */ ++diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c ++index f147371..31c6d4b 100644 ++--- a/fs/ext4/inline.c +++++ b/fs/ext4/inline.c ++@@ -1033,7 +1033,7 @@ static int ext4_add_dirent_to_inline(handle_t *handle, ++ struct ext4_dir_entry_2 *de; ++ ++ err = ext4_find_dest_de(dir, inode, iloc->bh, inline_start, ++- inline_size, fname, &de); +++ inline_size, fname, &de, NULL); ++ if (err) ++ return err; ++ ++@@ -1041,7 +1041,7 @@ static int ext4_add_dirent_to_inline(handle_t *handle, ++ err = ext4_journal_get_write_access(handle, iloc->bh); ++ if (err) ++ return err; ++- ext4_insert_dentry(inode, de, inline_size, fname); +++ ext4_insert_dentry(inode, de, inline_size, fname, NULL); ++ ++ ext4_show_inline_dir(dir, iloc->bh, 
inline_start, inline_size); ++ ++@@ -1399,7 +1399,7 @@ int htree_inlinedir_to_tree(struct file *dir_file, ++ fake.name_len = 1; ++ strcpy(fake.name, "."); ++ fake.rec_len = ext4_rec_len_to_disk( ++- EXT4_DIR_REC_LEN(fake.name_len), +++ EXT4_DIR_ENTRY_LEN(&fake), ++ inline_size); ++ ext4_set_de_type(inode->i_sb, &fake, S_IFDIR); ++ de = &fake; ++@@ -1409,7 +1409,7 @@ int htree_inlinedir_to_tree(struct file *dir_file, ++ fake.name_len = 2; ++ strcpy(fake.name, ".."); ++ fake.rec_len = ext4_rec_len_to_disk( ++- EXT4_DIR_REC_LEN(fake.name_len), +++ EXT4_DIR_ENTRY_LEN(&fake), ++ inline_size); ++ ext4_set_de_type(inode->i_sb, &fake, S_IFDIR); ++ de = &fake; ++diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c ++index e5f728e..e4920de 100644 ++--- a/fs/ext4/namei.c +++++ b/fs/ext4/namei.c ++@@ -287,7 +287,8 @@ static unsigned dx_get_count(struct dx_entry *entries); ++ static unsigned dx_get_limit(struct dx_entry *entries); ++ static void dx_set_count(struct dx_entry *entries, unsigned value); ++ static void dx_set_limit(struct dx_entry *entries, unsigned value); ++-static unsigned dx_root_limit(struct inode *dir, unsigned infosize); +++static inline unsigned dx_root_limit(struct inode *dir, +++ struct ext4_dir_entry_2 *dot_de, unsigned infosize); ++ static unsigned dx_node_limit(struct inode *dir); ++ static struct dx_frame *dx_probe(struct ext4_filename *fname, ++ struct inode *dir, ++@@ -431,23 +432,23 @@ static struct dx_countlimit *get_dx_countlimit(struct inode *inode, ++ { ++ struct ext4_dir_entry *dp; ++ struct dx_root_info *root; ++- int count_offset; +++ int count_offset, dotdot_rec_len; ++ int blocksize = EXT4_BLOCK_SIZE(inode->i_sb); ++ unsigned int rlen = ext4_rec_len_from_disk(dirent->rec_len, blocksize); ++ ++ if (rlen == blocksize) ++ count_offset = 8; ++- else if (rlen == 12) { ++- dp = (struct ext4_dir_entry *)(((void *)dirent) + 12); ++- if (ext4_rec_len_from_disk(dp->rec_len, blocksize) != blocksize - 12) +++ else { +++ dp = (struct ext4_dir_entry *)(((void 
*)dirent) + rlen); +++ if (ext4_rec_len_from_disk(dp->rec_len, blocksize) != blocksize - rlen) ++ return NULL; ++- root = (struct dx_root_info *)(((void *)dp + 12)); +++ dotdot_rec_len = EXT4_DIR_ENTRY_LEN((struct ext4_dir_entry_2 *)dp); +++ root = (struct dx_root_info *)(((void *)dp + dotdot_rec_len)); ++ if (root->reserved_zero || ++ root->info_length != sizeof(struct dx_root_info)) ++ return NULL; ++- count_offset = 32; ++- } else ++- return NULL; +++ count_offset = 8 + rlen + dotdot_rec_len; +++ } ++ ++ if (offset) ++ *offset = count_offset; ++@@ -552,11 +553,12 @@ ext4_next_entry(struct ext4_dir_entry_2 *p, unsigned long blocksize) ++ */ ++ struct dx_root_info *dx_get_dx_info(struct ext4_dir_entry_2 *de) ++ { +++ BUG_ON(de->name_len != 1); ++ /* get dotdot first */ ++- de = (struct ext4_dir_entry_2 *)((char *)de + EXT4_DIR_REC_LEN(1)); +++ de = (struct ext4_dir_entry_2 *)((char *)de + EXT4_DIR_ENTRY_LEN(de)); ++ ++ /* dx root info is after dotdot entry */ ++- de = (struct ext4_dir_entry_2 *)((char *)de + EXT4_DIR_REC_LEN(2)); +++ de = (struct ext4_dir_entry_2 *)((char *)de + EXT4_DIR_ENTRY_LEN(de)); ++ ++ return (struct dx_root_info *)de; ++ } ++@@ -601,10 +603,16 @@ static inline void dx_set_limit(struct dx_entry *entries, unsigned value) ++ ((struct dx_countlimit *) entries)->limit = cpu_to_le16(value); ++ } ++ ++-static inline unsigned dx_root_limit(struct inode *dir, unsigned infosize) +++static inline unsigned dx_root_limit(struct inode *dir, +++ struct ext4_dir_entry_2 *dot_de, unsigned infosize) ++ { ++- unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(1) - ++- EXT4_DIR_REC_LEN(2) - infosize; +++ struct ext4_dir_entry_2 *dotdot_de; +++ unsigned entry_space; +++ +++ BUG_ON(dot_de->name_len != 1); +++ dotdot_de = ext4_next_entry(dot_de, dir->i_sb->s_blocksize); +++ entry_space = dir->i_sb->s_blocksize - EXT4_DIR_ENTRY_LEN(dot_de) - +++ EXT4_DIR_ENTRY_LEN(dotdot_de) - infosize; ++ ++ if (ext4_has_metadata_csum(dir->i_sb)) ++ entry_space -= 
sizeof(struct dx_tail); ++@@ -725,7 +733,7 @@ static struct stats dx_show_leaf(struct inode *dir, ++ (unsigned) ((char *) de - base)); ++ #endif ++ } ++- space += EXT4_DIR_REC_LEN(de->name_len); +++ space += EXT4_DIR_ENTRY_LEN(de); ++ names++; ++ } ++ de = ext4_next_entry(de, size); ++@@ -834,11 +842,14 @@ dx_probe(struct ext4_filename *fname, struct inode *dir, ++ ++ entries = (struct dx_entry *)(((char *)info) + info->info_length); ++ ++- if (dx_get_limit(entries) != dx_root_limit(dir, ++- info->info_length)) { +++ if (dx_get_limit(entries) != +++ dx_root_limit(dir, (struct ext4_dir_entry_2 *)frame->bh->b_data, +++ info->info_length)) { ++ ext4_warning_inode(dir, "dx entry: limit %u != root limit %u", ++ dx_get_limit(entries), ++- dx_root_limit(dir, info->info_length)); +++ dx_root_limit(dir, +++ (struct ext4_dir_entry_2 *)frame->bh->b_data, +++ info->info_length)); ++ goto fail; ++ } ++ ++@@ -1732,7 +1743,7 @@ dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count, ++ while (count--) { ++ struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *) ++ (from + (map->offs<<2)); ++- rec_len = EXT4_DIR_REC_LEN(de->name_len); +++ rec_len = EXT4_DIR_ENTRY_LEN(de); ++ memcpy (to, de, rec_len); ++ ((struct ext4_dir_entry_2 *) to)->rec_len = ++ ext4_rec_len_to_disk(rec_len, blocksize); ++@@ -1756,7 +1767,7 @@ static struct ext4_dir_entry_2* dx_pack_dirents(char *base, unsigned blocksize) ++ while ((char*)de < base + blocksize) { ++ next = ext4_next_entry(de, blocksize); ++ if (de->inode && de->name_len) { ++- rec_len = EXT4_DIR_REC_LEN(de->name_len); +++ rec_len = EXT4_DIR_ENTRY_LEN(de); ++ if (de > to) ++ memmove(to, de, rec_len); ++ to->rec_len = ext4_rec_len_to_disk(rec_len, blocksize); ++@@ -1901,14 +1912,16 @@ int ext4_find_dest_de(struct inode *dir, struct inode *inode, ++ struct buffer_head *bh, ++ void *buf, int buf_size, ++ struct ext4_filename *fname, ++- struct ext4_dir_entry_2 **dest_de) +++ struct ext4_dir_entry_2 **dest_de, int *dlen) ++ { 
++ struct ext4_dir_entry_2 *de; ++- unsigned short reclen = EXT4_DIR_REC_LEN(fname_len(fname)); +++ unsigned short reclen = EXT4_DIR_REC_LEN(fname_len(fname)) + +++ (dlen ? *dlen : 0); ++ int nlen, rlen; ++ unsigned int offset = 0; ++ char *top; ++ +++ dlen ? *dlen = 0 : 0; /* default set to 0 */ ++ de = (struct ext4_dir_entry_2 *)buf; ++ top = buf + buf_size - reclen; ++ while ((char *) de <= top) { ++@@ -1917,10 +1930,26 @@ int ext4_find_dest_de(struct inode *dir, struct inode *inode, ++ return -EFSCORRUPTED; ++ if (ext4_match(fname, de)) ++ return -EEXIST; ++- nlen = EXT4_DIR_REC_LEN(de->name_len); +++ nlen = EXT4_DIR_ENTRY_LEN(de); ++ rlen = ext4_rec_len_from_disk(de->rec_len, buf_size); ++ if ((de->inode ? rlen - nlen : rlen) >= reclen) ++ break; +++ /* Then for dotdot entries, check for the smaller space +++ * required for just the entry, no FID */ +++ if (fname_len(fname) == 2 && memcmp(fname_name(fname), "..", 2) == 0) { +++ if ((de->inode ? rlen - nlen : rlen) >= +++ EXT4_DIR_REC_LEN(fname_len(fname))) { +++ /* set dlen=1 to indicate not +++ * enough space store fid */ +++ dlen ? *dlen = 1 : 0; +++ break; +++ } +++ /* The new ".." entry must be written over the +++ * previous ".." entry, which is the first +++ * entry traversed by this scan. If it doesn't +++ * fit, something is badly wrong, so -EIO. 
*/ +++ return -EIO; +++ } ++ de = (struct ext4_dir_entry_2 *)((char *)de + rlen); ++ offset += rlen; ++ } ++@@ -1934,12 +1963,12 @@ int ext4_find_dest_de(struct inode *dir, struct inode *inode, ++ void ext4_insert_dentry(struct inode *inode, ++ struct ext4_dir_entry_2 *de, ++ int buf_size, ++- struct ext4_filename *fname) +++ struct ext4_filename *fname, void *data) ++ { ++ ++ int nlen, rlen; ++ ++- nlen = EXT4_DIR_REC_LEN(de->name_len); +++ nlen = EXT4_DIR_ENTRY_LEN(de); ++ rlen = ext4_rec_len_from_disk(de->rec_len, buf_size); ++ if (de->inode) { ++ struct ext4_dir_entry_2 *de1 = ++@@ -1953,6 +1982,11 @@ void ext4_insert_dentry(struct inode *inode, ++ ext4_set_de_type(inode->i_sb, de, inode->i_mode); ++ de->name_len = fname_len(fname); ++ memcpy(de->name, fname_name(fname), fname_len(fname)); +++ if (data) { +++ de->name[fname_len(fname)] = 0; +++ memcpy(&de->name[fname_len(fname) + 1], data, *(char *)data); +++ de->file_type |= EXT4_DIRENT_LUFID; +++ } ++ } ++ ++ /* ++@@ -1970,14 +2004,19 @@ static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname, ++ { ++ unsigned int blocksize = dir->i_sb->s_blocksize; ++ int csum_size = 0; ++- int err; +++ int err, dlen = 0; +++ unsigned char *data; ++ +++ data = ext4_dentry_get_data(inode->i_sb, (struct ext4_dentry_param *) +++ EXT4_I(inode)->i_dirdata); ++ if (ext4_has_metadata_csum(inode->i_sb)) ++ csum_size = sizeof(struct ext4_dir_entry_tail); ++ ++ if (!de) { +++ if (data) +++ dlen = (*data) + 1; ++ err = ext4_find_dest_de(dir, inode, bh, bh->b_data, ++- blocksize - csum_size, fname, &de); +++ blocksize - csum_size, fname, &de, &dlen); ++ if (err) ++ return err; ++ } ++@@ -1989,7 +2028,10 @@ static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname, ++ } ++ ++ /* By now the buffer is marked for journaling */ ++- ext4_insert_dentry(inode, de, blocksize, fname); +++ /* If writing the short form of "dotdot", don't add the data section */ +++ if (dlen == 1) +++ data = NULL; +++ 
ext4_insert_dentry(inode, de, blocksize, fname, data); ++ ++ /* ++ * XXX shouldn't update any times until successful ++@@ -2098,7 +2140,8 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname, ++ ++ dx_set_block(entries, 1); ++ dx_set_count(entries, 1); ++- dx_set_limit(entries, dx_root_limit(dir, sizeof(*dx_info))); +++ dx_set_limit(entries, dx_root_limit(dir, +++ dot_de, sizeof(*dx_info))); ++ ++ /* Initialize as for dx_probe */ ++ fname->hinfo.hash_version = dx_info->hash_version; ++@@ -2148,6 +2191,8 @@ static int ext4_update_dotdot(handle_t *handle, struct dentry *dentry, ++ struct buffer_head *dir_block; ++ struct ext4_dir_entry_2 *de; ++ int len, journal = 0, err = 0; +++ int dlen = 0; +++ char *data; ++ ++ if (IS_ERR(handle)) ++ return PTR_ERR(handle); ++@@ -2173,11 +2218,16 @@ static int ext4_update_dotdot(handle_t *handle, struct dentry *dentry, ++ goto out_journal; ++ ++ journal = 1; ++- de->rec_len = cpu_to_le16(EXT4_DIR_REC_LEN(1)); +++ de->rec_len = cpu_to_le16(EXT4_DIR_ENTRY_LEN(de)); ++ } ++ ++- len -= EXT4_DIR_REC_LEN(1); ++- assert(len == 0 || len >= EXT4_DIR_REC_LEN(2)); +++ len -= EXT4_DIR_ENTRY_LEN(de); +++ data = ext4_dentry_get_data(dir->i_sb, +++ (struct ext4_dentry_param *)dentry->d_fsdata); +++ if (data) +++ dlen = *data + 1; +++ assert(len == 0 || len >= EXT4_DIR_REC_LEN(2 + dlen)); +++ ++ de = (struct ext4_dir_entry_2 *) ++ ((char *) de + le16_to_cpu(de->rec_len)); ++ if (!journal) { ++@@ -2194,7 +2244,12 @@ static int ext4_update_dotdot(handle_t *handle, struct dentry *dentry, ++ assert(le16_to_cpu(de->rec_len) >= EXT4_DIR_REC_LEN(2)); ++ de->name_len = 2; ++ strcpy(de->name, ".."); ++- ext4_set_de_type(dir->i_sb, de, S_IFDIR); +++ if (data != NULL && ext4_get_dirent_data_len(de) >= dlen) { +++ de->name[2] = 0; +++ memcpy(&de->name[2 + 1], data, *data); +++ ext4_set_de_type(dir->i_sb, de, S_IFDIR); +++ de->file_type |= EXT4_DIRENT_LUFID; +++ } ++ ++ out_journal: ++ if (journal) { ++@@ -2233,6 +2288,7 @@ static int 
ext4_add_entry(handle_t *handle, struct dentry *dentry, ++ ext4_lblk_t block, blocks; ++ int csum_size = 0; ++ +++ EXT4_I(inode)->i_dirdata = dentry->d_fsdata; ++ if (ext4_has_metadata_csum(inode->i_sb)) ++ csum_size = sizeof(struct ext4_dir_entry_tail); ++ ++@@ -2777,37 +2833,70 @@ err_unlock_inode: ++ return err; ++ } ++ +++struct tp_block { +++ struct inode *inode; +++ void *data1; +++ void *data2; +++}; +++ ++ struct ext4_dir_entry_2 *ext4_init_dot_dotdot(struct inode *inode, ++ struct ext4_dir_entry_2 *de, ++ int blocksize, int csum_size, ++ unsigned int parent_ino, int dotdot_real_len) ++ { +++ void *data1 = NULL, *data2 = NULL; +++ int dot_reclen = 0; +++ +++ if (dotdot_real_len == 10) { +++ struct tp_block *tpb = (struct tp_block *)inode; +++ data1 = tpb->data1; +++ data2 = tpb->data2; +++ inode = tpb->inode; +++ dotdot_real_len = 0; +++ } ++ de->inode = cpu_to_le32(inode->i_ino); ++ de->name_len = 1; ++- de->rec_len = ext4_rec_len_to_disk(EXT4_DIR_REC_LEN(de->name_len), ++- blocksize); ++ strcpy(de->name, "."); ++ ext4_set_de_type(inode->i_sb, de, S_IFDIR); ++ +++ /* get packed fid data*/ +++ data1 = ext4_dentry_get_data(inode->i_sb, +++ (struct ext4_dentry_param *) data1); +++ if (data1) { +++ de->name[1] = 0; +++ memcpy(&de->name[2], data1, *(char *) data1); +++ de->file_type |= EXT4_DIRENT_LUFID; +++ } +++ de->rec_len = cpu_to_le16(EXT4_DIR_ENTRY_LEN(de)); +++ dot_reclen = cpu_to_le16(de->rec_len); ++ de = ext4_next_entry(de, blocksize); ++ de->inode = cpu_to_le32(parent_ino); ++ de->name_len = 2; +++ strcpy(de->name, ".."); +++ ext4_set_de_type(inode->i_sb, de, S_IFDIR); +++ data2 = ext4_dentry_get_data(inode->i_sb, +++ (struct ext4_dentry_param *) data2); +++ if (data2) { +++ de->name[2] = 0; +++ memcpy(&de->name[3], data2, *(char *) data2); +++ de->file_type |= EXT4_DIRENT_LUFID; +++ } ++ if (!dotdot_real_len) ++ de->rec_len = ext4_rec_len_to_disk(blocksize - ++- (csum_size + EXT4_DIR_REC_LEN(1)), +++ (csum_size + dot_reclen), ++ blocksize); ++ else 
++ de->rec_len = ext4_rec_len_to_disk( ++- EXT4_DIR_REC_LEN(de->name_len), blocksize); ++- strcpy(de->name, ".."); ++- ext4_set_de_type(inode->i_sb, de, S_IFDIR); +++ EXT4_DIR_ENTRY_LEN(de), blocksize); ++ ++ return ext4_next_entry(de, blocksize); ++ } ++ ++ static int ext4_init_new_dir(handle_t *handle, struct inode *dir, ++- struct inode *inode) +++ struct inode *inode, +++ const void *data1, const void *data2) ++ { +++ struct tp_block param; ++ struct buffer_head *dir_block = NULL; ++ struct ext4_dir_entry_2 *de; ++ struct ext4_dir_entry_tail *t; ++@@ -2832,7 +2921,11 @@ static int ext4_init_new_dir(handle_t *handle, struct inode *dir, ++ if (IS_ERR(dir_block)) ++ return PTR_ERR(dir_block); ++ de = (struct ext4_dir_entry_2 *)dir_block->b_data; ++- ext4_init_dot_dotdot(inode, de, blocksize, csum_size, dir->i_ino, 0); +++ param.inode = inode; +++ param.data1 = (void *)data1; +++ param.data2 = (void *)data2; +++ ext4_init_dot_dotdot((struct inode *)(¶m), de, blocksize, +++ csum_size, dir->i_ino, 10); ++ set_nlink(inode, 2); ++ if (csum_size) { ++ t = EXT4_DIRENT_TAIL(dir_block->b_data, blocksize); ++@@ -2849,6 +2942,29 @@ out: ++ return err; ++ } ++ +++/* Initialize @inode as a subdirectory of @dir, and add the +++ * "." and ".." entries into the first directory block. 
*/ +++int ext4_add_dot_dotdot(handle_t *handle, struct inode *dir, +++ struct inode *inode, +++ const void *data1, const void *data2) +++{ +++ int rc; +++ +++ if (IS_ERR(handle)) +++ return PTR_ERR(handle); +++ +++ if (IS_DIRSYNC(dir)) +++ ext4_handle_sync(handle); +++ +++ inode->i_op = &ext4_dir_inode_operations; +++ inode->i_fop = &ext4_dir_operations; +++ rc = ext4_init_new_dir(handle, dir, inode, data1, data2); +++ if (!rc) +++ rc = ext4_mark_inode_dirty(handle, inode); +++ return rc; +++} +++EXPORT_SYMBOL(ext4_add_dot_dotdot); +++ ++ static int ext4_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) ++ { ++ handle_t *handle; ++@@ -2875,7 +2991,7 @@ retry: ++ ++ inode->i_op = &ext4_dir_inode_operations; ++ inode->i_fop = &ext4_dir_operations; ++- err = ext4_init_new_dir(handle, dir, inode); +++ err = ext4_init_new_dir(handle, dir, inode, NULL, NULL); ++ if (err) ++ goto out_clear_inode; ++ err = ext4_mark_inode_dirty(handle, inode); ++diff --git a/fs/ext4/super.c b/fs/ext4/super.c ++index 18dc174..89f39f9 100644 ++--- a/fs/ext4/super.c +++++ b/fs/ext4/super.c ++@@ -1618,7 +1618,7 @@ enum { ++ Opt_data_err_abort, Opt_data_err_ignore, Opt_test_dummy_encryption, ++ Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, ++ Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota, ++- Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err, +++ Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err, Opt_dirdata, ++ Opt_usrquota, Opt_grpquota, Opt_prjquota, Opt_i_version, Opt_dax, ++ Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_warn_on_error, ++ Opt_nowarn_on_error, Opt_mblk_io_submit, ++@@ -1694,6 +1694,7 @@ static const match_table_t tokens = { ++ {Opt_nolazytime, "nolazytime"}, ++ {Opt_debug_want_extra_isize, "debug_want_extra_isize=%u"}, ++ {Opt_nodelalloc, "nodelalloc"}, +++ {Opt_dirdata, "dirdata"}, ++ {Opt_removed, "mblk_io_submit"}, ++ {Opt_removed, "nomblk_io_submit"}, ++ {Opt_block_validity, "block_validity"}, ++@@ -1917,6 +1918,7 @@ static 
const struct mount_opts { ++ {Opt_usrjquota, 0, MOPT_Q | MOPT_STRING}, ++ {Opt_grpjquota, 0, MOPT_Q | MOPT_STRING}, ++ {Opt_offusrjquota, 0, MOPT_Q}, +++ {Opt_dirdata, EXT4_MOUNT_DIRDATA, MOPT_SET}, ++ {Opt_offgrpjquota, 0, MOPT_Q}, ++ {Opt_jqfmt_vfsold, QFMT_VFS_OLD, MOPT_QFMT}, ++ {Opt_jqfmt_vfsv0, QFMT_VFS_V0, MOPT_QFMT}, ++-- ++2.33.0 ++ +diff --git a/ldiskfs/kernel_patches/series/ldiskfs-4.19.90-oe2003.series b/ldiskfs/kernel_patches/series/ldiskfs-4.19.90-oe2003.series +index ec19e1e90f..cb7e413dd8 100644 +--- a/ldiskfs/kernel_patches/series/ldiskfs-4.19.90-oe2003.series ++++ b/ldiskfs/kernel_patches/series/ldiskfs-4.19.90-oe2003.series +@@ -8,7 +8,7 @@ oe2003/ext4-mballoc-extra-checks.patch + ubuntu18/ext4-hash-indexed-dir-dotdot-update.patch + rhel8.1/ext4-kill-dx-root.patch + oe2003/ext4-mballoc-pa-free-mismatch.patch +-linux-5.4/ext4-data-in-dirent.patch ++oe2003/ext4-data-in-dirent.patch + rhel8/ext4-nocmtime.patch + base/ext4-htree-lock.patch + oe2003/ext4-pdirop.patch +diff --git a/lustre/kernel_patches/targets/4.19-oe2003sp3.target.in b/lustre/kernel_patches/targets/4.19-oe2003sp3.target.in +index dd8deef6f2..c60d6d88cd 100644 +--- a/lustre/kernel_patches/targets/4.19-oe2003sp3.target.in ++++ b/lustre/kernel_patches/targets/4.19-oe2003sp3.target.in +@@ -1,5 +1,5 @@ + lnxmaj="4.19.90" +-lnxrel="2308.1.0.0212.oe1" ++lnxrel="2311.1.0.0224.oe1" + + KERNEL_SRPM=kernel-${lnxmaj}-${lnxrel}.src.rpm + SERIES="" +-- +2.33.0 + diff --git a/2.15.57.tar.gz b/2.15.3.tar.gz similarity index 73% rename from 2.15.57.tar.gz rename to 2.15.3.tar.gz index 0d0d1a9b9836fe05d7a3de830bc54f4757d48cc1..68890f571280382a1d7c33d6fcff991bce89cb7a 100644 Binary files a/2.15.57.tar.gz and b/2.15.3.tar.gz differ diff --git a/LU-16802-build-iov_iter_iovec-class_create-get_expir.patch b/LU-16802-build-iov_iter_iovec-class_create-get_expir.patch deleted file mode 100644 index d8dd89071f916b0e6c56c87a69090cbf6f6e0722..0000000000000000000000000000000000000000 --- 
a/LU-16802-build-iov_iter_iovec-class_create-get_expir.patch +++ /dev/null @@ -1,466 +0,0 @@ -From 13bab88098587ad08e5b56450a21f34003eaf6d3 Mon Sep 17 00:00:00 2001 -From: Shaun Tancheff -Date: Tue, 11 Jul 2023 18:41:13 +0700 -Subject: [PATCH] LU-16802 build: iov_iter_iovec, class_create, get_expiry - -linux kernel v6.3-rc4-32-g6eb203e1a868 - iov_iter: remove iov_iter_iovec() - -Provide a replacement iov_iter_iovec() when one is not provided. - -linux kernel v6.3-rc4-34-g747b1f65d39a - iov_iter: overlay struct iovec and ubuf/len - -This renames iov_iter member iov to __iov. -Define __iov as iov when __iov not present. - -linux kernel v6.3-rc1-13-g1aaba11da9aa - driver core: class: remove module * from class_create() - -Provide an ll_class_create() to pass THIS_MODULE, or not, -as needed by class_create(). - -Linux commit v6.2-rc1-20-gf861646a6562 - quota: port to mnt_idmap - -Update osd_dquot_transfer to use mnt_idmap and fallback -to user_ns, if needed, by dquot_transfer. - -Linux commit v6.3-rc7-2433-gcf64b9bce950 - SUNRPC: return proper error from get_expiry() - -Updated get_expiry() requires a time64_t pointer to be passed -to hold the expiry time. A non-zero return value indicates an -error, nominally -EINVAL. Provide a wrapper for kernels that -return a time64_t and return -EINVAL on error. 
- -Test-Parameters: trivial -HPE-bug-id: LUS-11614 -Signed-off-by: Shaun Tancheff -Change-Id: I765d6257eec8b5a9bf1bd5947f03370eb9df1625 ---- - lustre/autoconf/lustre-core.m4 | 117 +++++++++++++++++++++ - lustre/include/lustre_compat.h | 18 ++-- - lustre/mdc/mdc_request.c | 2 +- - lustre/ofd/ofd_access_log.c | 2 +- - lustre/ptlrpc/gss/gss_svc_upcall.c | 159 +++++++++++++++-------------- - 5 files changed, 213 insertions(+), 85 deletions(-) - -diff --git a/lustre/autoconf/lustre-core.m4 b/lustre/autoconf/lustre-core.m4 -index b2b9ebcd86..38c3906abf 100644 ---- a/lustre/autoconf/lustre-core.m4 -+++ b/lustre/autoconf/lustre-core.m4 -@@ -3866,6 +3866,111 @@ AC_DEFUN([LC_HAVE_LOCKS_LOCK_FILE_WAIT_IN_FILELOCK], [ - ]) - ]) # LC_HAVE_LOCKS_LOCK_FILE_WAIT_IN_FILELOCK - -+# -+# LC_HAVE_IOV_ITER_IOVEC -+# -+# linux kernel v6.3-rc4-32-g6eb203e1a868 -+# iov_iter: remove iov_iter_iovec() -+# -+AC_DEFUN([LC_SRC_HAVE_IOV_ITER_IOVEC], [ -+ LB2_LINUX_TEST_SRC([iov_iter_iovec_exists], [ -+ #include -+ ],[ -+ struct iovec iov __attribute__ ((unused)); -+ struct iov_iter i = { }; -+ -+ iov = iov_iter_iovec(&i); -+ ],[-Werror]) -+]) -+AC_DEFUN([LC_HAVE_IOV_ITER_IOVEC], [ -+ AC_MSG_CHECKING([if 'iov_iter_iovec' is available]) -+ LB2_LINUX_TEST_RESULT([iov_iter_iovec_exists], [ -+ AC_DEFINE(HAVE_IOV_ITER_IOVEC, 1, -+ ['iov_iter_iovec' is available]) -+ ]) -+]) # LC_HAVE_IOV_ITER_IOVEC -+ -+# -+# LC_HAVE_IOVEC_WITH_IOV_MEMBER -+# -+# linux kernel v6.3-rc4-34-g747b1f65d39a -+# iov_iter: overlay struct iovec and ubuf/len -+# This renames iov_iter member iov to __iov and now __iov == __ubuf_iovec -+# -+AC_DEFUN([LC_SRC_HAVE_IOVEC_WITH_IOV_MEMBER], [ -+ LB2_LINUX_TEST_SRC([iov_iter_has___iov_member], [ -+ #include -+ ],[ -+ struct iov_iter iter = { }; -+ size_t len __attribute__ ((unused)); -+ -+ len = iter->__iov->iov_len; -+ ],[-Werror]) -+]) -+AC_DEFUN([LC_HAVE_IOVEC_WITH_IOV_MEMBER], [ -+ AC_MSG_CHECKING([if 'iov_iter_iovec' is available]) -+ 
LB2_LINUX_TEST_RESULT([iov_iter_has___iov_member], [ -+ AC_DEFINE(HAVE___IOV_MEMBER, __iov, -+ ['iov_iter' has '__iov' member]) -+ ],[ -+ AC_DEFINE(__iov, iov, -+ ['iov_iter' has 'iov' member]) -+ ]) -+]) # LC_HAVE_IOVEC_WITH_IOV_MEMBER -+ -+# -+# LC_HAVE_CLASS_CREATE_MODULE_ARG -+# -+# linux kernel v6.3-rc1-13-g1aaba11da9aa -+# driver core: class: remove module * from class_create() -+# -+AC_DEFUN([LC_SRC_HAVE_CLASS_CREATE_MODULE_ARG], [ -+ LB2_LINUX_TEST_SRC([class_create_without_module_arg], [ -+ #include -+ ],[ -+ struct class *class __attribute__ ((unused)); -+ -+ class = class_create("empty"); -+ if (IS_ERR(class)) -+ /* checked */; -+ ],[-Werror]) -+]) -+AC_DEFUN([LC_HAVE_CLASS_CREATE_MODULE_ARG], [ -+ AC_MSG_CHECKING([if 'class_create' does not have module arg]) -+ LB2_LINUX_TEST_RESULT([class_create_without_module_arg], [ -+ AC_DEFINE([ll_class_create(name)], -+ [class_create((name))], -+ ['class_create' does not have module arg]) -+ ],[ -+ AC_DEFINE([ll_class_create(name)], -+ [class_create(THIS_MODULE, (name))], -+ ['class_create' expects module arg]) -+ ]) -+]) # LC_HAVE_IOVEC_WITH_IOV_MEMBER -+ -+# -+# LC_HAVE_GET_EXPIRY_TIME64_T -+# -+# linux kernel v6.3-rc7-2433-gcf64b9bce950 -+# SUNRPC: return proper error from get_expiry() -+# -+AC_DEFUN([LC_SRC_HAVE_GET_EXPIRY_TIME64_T], [ -+ LB2_LINUX_TEST_SRC([get_expiry_with_time64_t], [ -+ #include -+ ],[ -+ int err __attribute__ ((unused)); -+ -+ err = get_expiry((char **)NULL, (time64_t *)NULL); -+ ],[-Werror]) -+]) -+AC_DEFUN([LC_HAVE_GET_EXPIRY_TIME64_T], [ -+ AC_MSG_CHECKING([if 'get_expiry' needs a time64_t arg]) -+ LB2_LINUX_TEST_RESULT([get_expiry_with_time64_t], [ -+ AC_DEFINE(HAVE_GET_EXPIRY_2ARGS, 1, -+ ['get_expiry' takes time64_t]) -+ ]) -+]) # LC_HAVE_IOVEC_WITH_IOV_MEMBER -+ - # - # LC_PROG_LINUX - # -@@ -4116,6 +4221,12 @@ AC_DEFUN([LC_PROG_LINUX_SRC], [ - LC_SRC_HAVE_LOCKS_LOCK_FILE_WAIT_IN_FILELOCK - LC_SRC_HAVE_U64_CAPABILITY - -+ # 6.4 -+ LC_SRC_HAVE_IOV_ITER_IOVEC -+ 
LC_SRC_HAVE_IOVEC_WITH_IOV_MEMBER -+ LC_SRC_HAVE_CLASS_CREATE_MODULE_ARG -+ LC_SRC_HAVE_GET_EXPIRY_TIME64_T -+ - # kernel patch to extend integrity interface - LC_SRC_BIO_INTEGRITY_PREP_FN - ]) -@@ -4384,6 +4495,12 @@ AC_DEFUN([LC_PROG_LINUX_RESULTS], [ - LC_HAVE_LOCKS_LOCK_FILE_WAIT_IN_FILELOCK - LC_HAVE_U64_CAPABILITY - -+ # 6.4 -+ LC_HAVE_IOV_ITER_IOVEC -+ LC_HAVE_IOVEC_WITH_IOV_MEMBER -+ LC_HAVE_CLASS_CREATE_MODULE_ARG -+ LC_HAVE_GET_EXPIRY_TIME64_T -+ - # kernel patch to extend integrity interface - LC_BIO_INTEGRITY_PREP_FN - ]) -diff --git a/lustre/include/lustre_compat.h b/lustre/include/lustre_compat.h -index 3040cc22ce..c99c8d3730 100644 ---- a/lustre/include/lustre_compat.h -+++ b/lustre/include/lustre_compat.h -@@ -313,20 +313,22 @@ static inline void iov_iter_truncate(struct iov_iter *i, u64 count) - # define SB_NODIRATIME MS_NODIRATIME - #endif - --#ifndef HAVE_FILE_OPERATIONS_READ_WRITE_ITER --static inline void iov_iter_reexpand(struct iov_iter *i, size_t count) --{ -- i->count = count; --} -- -+#ifndef HAVE_IOV_ITER_IOVEC - static inline struct iovec iov_iter_iovec(const struct iov_iter *iter) - { - return (struct iovec) { -- .iov_base = iter->iov->iov_base + iter->iov_offset, -+ .iov_base = iter->__iov->iov_base + iter->iov_offset, - .iov_len = min(iter->count, -- iter->iov->iov_len - iter->iov_offset), -+ iter->__iov->iov_len - iter->iov_offset), - }; - } -+#endif -+ -+#ifndef HAVE_FILE_OPERATIONS_READ_WRITE_ITER -+static inline void iov_iter_reexpand(struct iov_iter *i, size_t count) -+{ -+ i->count = count; -+} - - #define iov_for_each(iov, iter, start) \ - for (iter = (start); \ -diff --git a/lustre/mdc/mdc_request.c b/lustre/mdc/mdc_request.c -index 086d13e670..2680d941f5 100644 ---- a/lustre/mdc/mdc_request.c -+++ b/lustre/mdc/mdc_request.c -@@ -3037,7 +3037,7 @@ static int __init mdc_init(void) - if (rc) - return rc; - -- mdc_changelog_class = class_create(THIS_MODULE, MDC_CHANGELOG_DEV_NAME); -+ mdc_changelog_class = 
ll_class_create(MDC_CHANGELOG_DEV_NAME); - if (IS_ERR(mdc_changelog_class)) { - rc = PTR_ERR(mdc_changelog_class); - goto out_dev; -diff --git a/lustre/ofd/ofd_access_log.c b/lustre/ofd/ofd_access_log.c -index 088ec66a2a..1024c08ad1 100644 ---- a/lustre/ofd/ofd_access_log.c -+++ b/lustre/ofd/ofd_access_log.c -@@ -681,7 +681,7 @@ int ofd_access_log_module_init(void) - - oal_log_major = MAJOR(dev); - -- oal_log_class = class_create(THIS_MODULE, LUSTRE_ACCESS_LOG_DIR_NAME); -+ oal_log_class = ll_class_create(LUSTRE_ACCESS_LOG_DIR_NAME); - if (IS_ERR(oal_log_class)) { - rc = PTR_ERR(oal_log_class); - goto out_dev; -diff --git a/lustre/ptlrpc/gss/gss_svc_upcall.c b/lustre/ptlrpc/gss/gss_svc_upcall.c -index ea94cb663f..b82e6f7cb9 100644 ---- a/lustre/ptlrpc/gss/gss_svc_upcall.c -+++ b/lustre/ptlrpc/gss/gss_svc_upcall.c -@@ -71,6 +71,15 @@ - #include "gss_api.h" - #include "gss_crypto.h" - -+#ifndef HAVE_GET_EXPIRY_2ARGS -+static inline int __get_expiry2(char **bpp, time64_t *rvp) -+{ -+ *rvp = get_expiry(bpp); -+ return *rvp ? 0 : -EINVAL; -+} -+#define get_expiry(ps, pt) __get_expiry2((ps), (pt)) -+#endif -+ - #define GSS_SVC_UPCALL_TIMEOUT (20) - - static DEFINE_SPINLOCK(__ctx_index_lock); -@@ -339,13 +348,13 @@ static int rsi_parse(struct cache_detail *cd, char *mesg, int mlen) - * Directly return -EINVAL in this case. 
- */ - status = -EINVAL; -- goto out; -+ goto out; - } - - rsii.h.flags = 0; - /* expiry */ -- expiry = get_expiry(&mesg); -- if (expiry == 0) -+ status = get_expiry(&mesg, &expiry); -+ if (status) - goto out; - - len = qword_get(&mesg, buf, mlen); -@@ -582,36 +591,36 @@ static struct cache_head * rsc_alloc(void) - - static int rsc_parse(struct cache_detail *cd, char *mesg, int mlen) - { -- char *buf = mesg; -- int len, rv, tmp_int; -- struct rsc rsci, *rscp = NULL; -+ char *buf = mesg; -+ int len, rv, tmp_int; -+ struct rsc rsci, *rscp = NULL; - time64_t expiry; -- int status = -EINVAL; -- struct gss_api_mech *gm = NULL; -+ int status = -EINVAL; -+ struct gss_api_mech *gm = NULL; - -- memset(&rsci, 0, sizeof(rsci)); -+ memset(&rsci, 0, sizeof(rsci)); - -- /* context handle */ -- len = qword_get(&mesg, buf, mlen); -- if (len < 0) goto out; -- status = -ENOMEM; -- if (rawobj_alloc(&rsci.handle, buf, len)) -- goto out; -- -- rsci.h.flags = 0; -- /* expiry */ -- expiry = get_expiry(&mesg); -- status = -EINVAL; -- if (expiry == 0) -- goto out; -- -- /* remote flag */ -- rv = get_int(&mesg, &tmp_int); -- if (rv) { -- CERROR("fail to get remote flag\n"); -- goto out; -- } -- rsci.ctx.gsc_remote = (tmp_int != 0); -+ /* context handle */ -+ len = qword_get(&mesg, buf, mlen); -+ if (len < 0) goto out; -+ status = -ENOMEM; -+ if (rawobj_alloc(&rsci.handle, buf, len)) -+ goto out; -+ -+ rsci.h.flags = 0; -+ /* expiry */ -+ status = get_expiry(&mesg, &expiry); -+ if (status) -+ goto out; -+ -+ status = -EINVAL; -+ /* remote flag */ -+ rv = get_int(&mesg, &tmp_int); -+ if (rv) { -+ CERROR("fail to get remote flag\n"); -+ goto out; -+ } -+ rsci.ctx.gsc_remote = (tmp_int != 0); - - /* root user flag */ - rv = get_int(&mesg, &tmp_int); -@@ -621,41 +630,41 @@ static int rsc_parse(struct cache_detail *cd, char *mesg, int mlen) - } - rsci.ctx.gsc_usr_root = (tmp_int != 0); - -- /* mds user flag */ -- rv = get_int(&mesg, &tmp_int); -- if (rv) { -- CERROR("fail to get mds user 
flag\n"); -- goto out; -- } -- rsci.ctx.gsc_usr_mds = (tmp_int != 0); -+ /* mds user flag */ -+ rv = get_int(&mesg, &tmp_int); -+ if (rv) { -+ CERROR("fail to get mds user flag\n"); -+ goto out; -+ } -+ rsci.ctx.gsc_usr_mds = (tmp_int != 0); - -- /* oss user flag */ -- rv = get_int(&mesg, &tmp_int); -- if (rv) { -- CERROR("fail to get oss user flag\n"); -- goto out; -- } -- rsci.ctx.gsc_usr_oss = (tmp_int != 0); -+ /* oss user flag */ -+ rv = get_int(&mesg, &tmp_int); -+ if (rv) { -+ CERROR("fail to get oss user flag\n"); -+ goto out; -+ } -+ rsci.ctx.gsc_usr_oss = (tmp_int != 0); - -- /* mapped uid */ -- rv = get_int(&mesg, (int *) &rsci.ctx.gsc_mapped_uid); -- if (rv) { -- CERROR("fail to get mapped uid\n"); -- goto out; -- } -+ /* mapped uid */ -+ rv = get_int(&mesg, (int *) &rsci.ctx.gsc_mapped_uid); -+ if (rv) { -+ CERROR("fail to get mapped uid\n"); -+ goto out; -+ } - -- rscp = rsc_lookup(&rsci); -- if (!rscp) -- goto out; -- -- /* uid, or NEGATIVE */ -- rv = get_int(&mesg, (int *) &rsci.ctx.gsc_uid); -- if (rv == -EINVAL) -- goto out; -- if (rv == -ENOENT) { -- CERROR("NOENT? set rsc entry negative\n"); -+ rscp = rsc_lookup(&rsci); -+ if (!rscp) -+ goto out; -+ -+ /* uid, or NEGATIVE */ -+ rv = get_int(&mesg, (int *) &rsci.ctx.gsc_uid); -+ if (rv == -EINVAL) -+ goto out; -+ if (rv == -ENOENT) { -+ CERROR("NOENT? set rsc entry negative\n"); - set_bit(CACHE_NEGATIVE, &rsci.h.flags); -- } else { -+ } else { - rawobj_t tmp_buf; - time64_t ctx_expiry; - -@@ -699,23 +708,23 @@ static int rsc_parse(struct cache_detail *cd, char *mesg, int mlen) - * We want just the number of seconds into the future. 
- */ - expiry += ctx_expiry - ktime_get_real_seconds(); -- } -+ } - -- rsci.h.expiry_time = expiry; -- rscp = rsc_update(&rsci, rscp); -- status = 0; -+ rsci.h.expiry_time = expiry; -+ rscp = rsc_update(&rsci, rscp); -+ status = 0; - out: -- if (gm) -- lgss_mech_put(gm); -- rsc_free(&rsci); -- if (rscp) -- cache_put(&rscp->h, &rsc_cache); -- else -- status = -ENOMEM; -+ if (gm) -+ lgss_mech_put(gm); -+ rsc_free(&rsci); -+ if (rscp) -+ cache_put(&rscp->h, &rsc_cache); -+ else -+ status = -ENOMEM; - -- if (status) -- CERROR("parse rsc error %d\n", status); -- return status; -+ if (status) -+ CERROR("parse rsc error %d\n", status); -+ return status; - } - - static struct cache_detail rsc_cache = { --- -2.33.0 - diff --git a/kmp-lustre-osd-ldiskfs.files b/kmp-lustre-osd-ldiskfs.files index ffd12c1ebe6f8ecab3c2e0d20c87fc27ec0d6080..47bcee182499932c4c7641f957c4088fea795651 100644 --- a/kmp-lustre-osd-ldiskfs.files +++ b/kmp-lustre-osd-ldiskfs.files @@ -1,5 +1,3 @@ %defattr(-,root,root) -%dir %{modules_fs_path}/%{lustre_name}-osd-ldiskfs -%dir %{modules_fs_path}/%{lustre_name}-osd-ldiskfs/fs %{modules_fs_path}/%{lustre_name}-osd-ldiskfs/fs/ldiskfs.ko %{modules_fs_path}/%{lustre_name}-osd-ldiskfs/fs/osd_ldiskfs.ko diff --git a/kmp-lustre-osd-ldiskfs.preamble b/kmp-lustre-osd-ldiskfs.preamble index 135ea611020828300aec18dc575be0ac719ec842..cdd8a122e85ba6af1adbe00dac7b397e77686eed 100644 --- a/kmp-lustre-osd-ldiskfs.preamble +++ b/kmp-lustre-osd-ldiskfs.preamble @@ -1,7 +1,4 @@ License: GPL-2.0-only -%if 0%{?suse_version} > 1 -Requires: kernel-%1 -%endif Requires: %{requires_kmod_name} = %{requires_kmod_version} Requires: ldiskfsprogs >= 1.44.3.wc1 Requires: %{name}-osd-ldiskfs-mount = %{version} diff --git a/kmp-lustre-osd-zfs.files b/kmp-lustre-osd-zfs.files index 737eeeebf84dca3f853e620bca0eb38e6fbad3c1..0ac2677dd8fb8a3533b03c258b7a104ee738e6bf 100644 --- a/kmp-lustre-osd-zfs.files +++ b/kmp-lustre-osd-zfs.files @@ -1,4 +1,2 @@ %defattr(-,root,root) -%dir 
%{modules_fs_path}/%{lustre_name}-osd-zfs -%dir %{modules_fs_path}/%{lustre_name}-osd-zfs/fs %{modules_fs_path}/%{lustre_name}-osd-zfs/fs/osd_zfs.ko diff --git a/kmp-lustre-osd-zfs.preamble b/kmp-lustre-osd-zfs.preamble index b6581b9e4500152698f1905cc32c69754b3b8fac..339ed9bbe0514a467a30c4396347c588a93746c1 100644 --- a/kmp-lustre-osd-zfs.preamble +++ b/kmp-lustre-osd-zfs.preamble @@ -1,8 +1,5 @@ Summary: Lustre osd-zfs feature support License: GPL-2.0-only -%if 0%{?suse_version} > 1 -Requires: kernel-%1 -%endif Requires: %{name}-osd-zfs-mount = %{version} Provides: %{name}-osd = %{version} Obsoletes: %{name}-osd-zfs < %{version} diff --git a/kmp-lustre-tests.files b/kmp-lustre-tests.files index fd7aed8a68e362295e0f2b07749a6708d3140507..f62b46f963be3f4721e163731ed8f840ee63a36f 100644 --- a/kmp-lustre-tests.files +++ b/kmp-lustre-tests.files @@ -1,3 +1 @@ -%dir %{modules_fs_path}/%{lustre_name}-tests -%dir %{modules_fs_path}/%{lustre_name}-tests/fs %{modules_fs_path}/%{lustre_name}-tests/fs/llog_test.ko diff --git a/kmp-lustre-tests.preamble b/kmp-lustre-tests.preamble index f2b74206eaced070133e97f46f43d9eb07f58bf3..7de5d664f9bd001146ff967b22f7af632571fc1e 100644 --- a/kmp-lustre-tests.preamble +++ b/kmp-lustre-tests.preamble @@ -1,7 +1,5 @@ License: GPL-2.0-only +Requires: %{requires_kmod_name} = %{requires_kmod_version} %if 0%{?suse_version} > 1 Requires: kernel-%1 %endif -%if %{with lustre_modules} -Requires: %{requires_kmod_name} = %{requires_kmod_version} -%endif diff --git a/kmp-lustre.files b/kmp-lustre.files index 3ded0c4bb03f0a0a29e5ea4c48e1752915ef7c0c..62e39d0d0fba71dd9023fc77084d21bd429c49b9 100644 --- a/kmp-lustre.files +++ b/kmp-lustre.files @@ -1,6 +1,4 @@ %defattr(-,root,root) -%dir %{modules_fs_path} -%dir %{modules_fs_path}/%{lustre_name} %{modules_fs_path}/%{lustre_name}/* %doc COPYING %doc ChangeLog-lustre diff --git a/kmp-lustre.preamble b/kmp-lustre.preamble index 
edb68f0d023348e150011c6e1f95bd10e51f7040..9224573471ae63e94ad8c018921b52e947b9fcdd 100644 --- a/kmp-lustre.preamble +++ b/kmp-lustre.preamble @@ -1,5 +1,2 @@ -%if 0%{?suse_version} > 1 -Requires: kernel-%1 -%endif Obsoletes: %{name}-modules < %{version} License: GPL-2.0-only diff --git a/lustre.spec b/lustre.spec index b066313bc524f187b7f4e0c20ccd6597c8f0747e..c7d90ccea03155161c68254d92132af0f3f6df53 100644 --- a/lustre.spec +++ b/lustre.spec @@ -1,15 +1,7 @@ -# SPDX-License-Identifier: GPL-2.0 - -# -# This file is part of Lustre, http://www.lustre.org/ -# -# lustre.spec.in -# -# spec file template for RHEL package builds -# +# lustre.spec # Declare rpmbuild --with/--without parameters -%bcond_with servers +%bcond_without servers %bcond_without ldiskfs %bcond_with zfs %bcond_without lustre_tests @@ -57,7 +49,7 @@ %undefine with_lustre_tests %endif -%{!?version: %global version 2.15.57} +%{!?version: %global version 2.15.3} # if you want a custom kernel version set it variable with $ver.$arch %{!?kver: %global kver %(rpm -q --qf '%%{VERSION}-%%{RELEASE}.%%{ARCH}' `rpm -q kernel-devel | sort -rV|head -n 1`)} # cut epoch for kmodtool @@ -121,22 +113,22 @@ # Set the package name prefix %if %{undefined lustre_name} %if %{with servers} - %global lustre_name lustre + %global lustre_name lustre %else - %global lustre_name lustre-client + %global lustre_name lustre-client %endif %endif %if %{with lustre_modules} %if %{undefined kmoddir} %if %{defined kernel_module_package_moddir} - %global kmoddir %{kernel_module_package_moddir} + %global kmoddir %{kernel_module_package_moddir} %else - %if %{defined suse_kernel_module_package} - %global kmoddir updates - %else - %global kmoddir extra - %endif + %if %{defined suse_kernel_module_package} + %global kmoddir updates + %else + %global kmoddir extra + %endif %endif %endif @@ -144,7 +136,7 @@ # requires want to set a version including epoch %global krequires %(echo %{kver} | sed -e 's/\.x86_64$//' -e 's/\.i[3456]86$//' -e 
's/-smp$//' -e 's/-bigsmp$//' -e 's/[-.]ppc64$//' -e 's/\.aarch64$//' -e 's/-default$//' -e 's/-%{_flavor}//') -%if "%{_vendor}" == "redhat" || "%{_vendor}" == "fedora" || "%{_vendor}" == "openEuler" +%if "%{_vendor}" == "redhat" || "%{_vendor}" == "fedora" || 0%{?openEuler} %global requires_kmod_name kmod-%{lustre_name} %global requires_kmod_osd_zfs_name kmod-%{lustre_name}-osd-zfs %if %{with lustre_tests} @@ -184,7 +176,7 @@ %endif # openEuler comes with systemd -%if "%{_vendor}" == "openEuler" +%if 0%{?openEuler} %define with_systemd 1 %endif @@ -192,13 +184,10 @@ Summary: Lustre File System Name: %{lustre_name} -Version: 2.15.57 -Release: 1 +Version: 2.15.3 +Release: 2 License: GPL-2.0-only AND LGPL-2.1-or-later -%if 0%{?suse_version} >= 1310 -# SUSE needs Group for the kernel_module_package macro -Group: System/Kernel -%endif +Group: System Environment/Kernel Source: https://github.com/lustre/lustre-release/archive/refs/tags/%{version}.tar.gz Source1: kmp-lustre.preamble Source2: kmp-lustre.files @@ -214,7 +203,67 @@ URL: https://wiki.whamcloud.com/ BuildRoot: %{_tmppath}/lustre-%{version}-root # patches -Patch1: LU-16802-build-iov_iter_iovec-class_create-get_expir.patch +Patch01: 0001-Prepare-for-next-pointrelease.patch +Patch02: 0002-LU-15821-ldlm-Prioritize-blocking-callbacks.patch +Patch03: 0003-LU-14377-tests-make-parallel-scale-rr_alloc-less-str.patch +Patch04: 0004-LU-15123-tests-check-quota-reintegration-after-recov.patch +Patch05: 0005-LU-13081-tests-skip-sanity-test_151-test_156.patch +Patch06: 0006-LU-11785-tests-fix-conf-sanity-98-mount-check-on-64K.patch +Patch07: 0007-LU-11388-tests-replay-single-131b-to-refresh-grants.patch +Patch08: 0008-LU-16163-tests-skip-racer_on_nfs-for-NFSv3.patch +Patch09: 0009-LU-14294-tests-fixed-NFS-configuration-issue.patch +Patch10: 0010-LU-16717-mdt-treat-unknown-hash-type-as-sane-type.patch +Patch11: 0011-LU-15481-llog-Add-LLOG_SKIP_PLAIN-to-skip-llog-plain.patch +Patch12: 
0012-LU-6612-utils-strengthen-llog_reader-vs-wrong-format.patch +Patch13: 0013-LU-16052-llog-handle-EBADR-for-catalog-processing.patch +Patch14: 0014-LU-16717-mdt-resume-dir-migration-with-bad_type.patch +Patch15: 0015-LU-14668-lnet-Lock-primary-NID-logic.patch +Patch16: 0016-LU-14668-lnet-Peers-added-via-kernel-API-should-be-p.patch +Patch17: 0017-LU-14668-lnet-don-t-delete-peer-created-by-Lustre.patch +Patch18: 0018-LU-14668-lnet-add-force-option-to-lnetctl-peer-del.patch +Patch19: 0019-LU-14668-lnet-add-lock_prim_nid-lnet-module-paramete.patch +Patch20: 0020-LU-14668-tests-verify-state-of-peer-added-with-lock_.patch +Patch21: 0021-LU-11787-test-Fix-checkfilemap-tests-for-64K-page.patch +Patch22: 0022-LU-15800-ofd-take-a-read-lock-for-fallocate.patch +Patch23: 0023-LU-16873-osd-update-OI_Scrub-file-with-new-magic.patch +Patch24: 0024-LU-15519-quota-fallocate-does-not-increase-projectid.patch +Patch25: 0025-LU-16060-osd-ldiskfs-copy-nul-byte-terminator-in-wri.patch +Patch26: 0026-LU-16934-kernel-update-RHEL-8.8-4.18.0-477.15.1.el8_.patch +Patch27: 0027-LU-15740-tests-scale-fs_log_size-by-OSTCOUNT.patch +Patch28: 0028-LU-16943-tests-fix-replay-single-135-under-hard-fail.patch +Patch29: 0029-LU-16517-build-pass-extra-configure-options-to-make-.patch +Patch30: 0030-LU-15193-quota-expand-QUOTA_MAX_TRANSIDS-to-12.patch +Patch31: 0031-LU-16916-tests-fix-client_evicted-not-to-ignore-EOPN.patch +Patch32: 0032-LU-16626-build-remove-python2-dependencies.patch +Patch33: 0033-LU-16943-tests-use-primary-ost1-server-in-replay-sin.patch +Patch34: 0034-LU-16585-build-remove-python2-dependencies.patch +Patch35: 0035-LU-15660-statahead-statahead-thread-doesn-t-stop.patch +Patch36: 0036-LU-16042-tests-can-not-get-cache-size-on-Arm64.patch +Patch37: 0037-LU-16662-autoconf-fix-configure-test-compile-for-CON.patch +Patch38: 0038-LU-16322-build-Add-client-build-support-for-openEule.patch +Patch39: 0039-LU-16481-build-add-server-support-for-openEuler.patch +Patch40: 
0040-LU-16824-ldiskfs-add-support-for-openEuler-22.03-LTS.patch +Patch41: 0041-LU-16976-ldiskfs-add-support-for-openEuler-22.03-SP2.patch +Patch42: 0042-lustre.spec.in-match-rpm-macro-openEuler-for-openEul.patch +Patch43: 0043-LU-15722-osd-ldiskfs-fix-IO-write-gets-stuck-for-64K.patch +Patch44: 0044-LU-15722-osd-ldiskfs-fix-write-stuck-for-64K-PAGE_SI.patch +Patch45: 0045-LU-15978-osp-fix-striped-directory-deletion-fails-fo.patch +Patch46: 0046-ldiskfs-add-support-for-oe2003.patch +Patch47: 0047-lustre.spec.in-Add-gcc-option-Wno-stringop-overflow.patch +Patch48: 0048-LU-16321-osd-Allow-fiemap-on-kernel-buffers.patch +Patch49: 0049-LU-13135-quota-improve-checks-in-OSDs-to-ignore-quot.patch +Patch50: 0050-LU-16893-libcfs-Remove-force_sig-usage-from-lfsck.patch +Patch51: 0051-LU-16534-build-Prefer-timer_delete-_sync.patch +Patch52: 0052-LU-16541-tests-Improve-test-64f.patch +Patch53: 0053-LU-16788-tests-sanity-should-remove-temp-files.patch +Patch54: 0054-LU-14992-tests-sanity-replay-vbr-mkdir-on-MDT0.patch +Patch55: 0055-LU-14992-tests-add-more-mkdir_on_mdt0-calls.patch +Patch56: 0056-LU-15816-tests-use-correct-ost-host-to-manage-failur.patch +Patch57: 0057-LU-16571-utils-fix-parallel-lfs-migrate-b-on-hard-li.patch +Patch58: 0058-LU-14073-ldiskfs-don-t-test-LDISKFS_IOC_FSSETXATTR.patch +Patch59: 0059-LU-16019-llite-fully-disable-readahead-in-kernel-I-O.patch +Patch60: 0060-Update-openEuler-22.03-kernels.patch +Patch61: 0061-Update-kernel-for-openEuler-20.03-LTS.patch %if %{with lustre_modules} Requires: %{requires_kmod_name} = %{requires_kmod_version} @@ -223,15 +272,12 @@ Requires: %{requires_kmod_name} = %{requires_kmod_version} Requires: python3 >= 3.6.0 BuildRequires: python3-devel >= 3.6.0, swig %endif -BuildRequires: libtool pkgconfig(yaml-0.1) pkgconfig(zlib) pkgconfig(libnl-3.0) flex bison +BuildRequires: libtool libyaml-devel zlib-devel libnl3-devel flex bison %if "%{_vendor}" == "redhat" BuildRequires: redhat-rpm-config BuildRequires: pkgconfig -%if 
0%{?rhel} > 7 || 0%{?fedora} > 33 || 0%{?rhel} < 1 -Suggests: bash-completion -%endif %else -%if "%{_vendor}" == "openEuler" +%if 0%{?openEuler} BuildRequires: openEuler-rpm-config %if %{with ldiskfs} BuildRequires: kernel-debugsource @@ -242,48 +288,33 @@ BuildRequires: pkg-config %if %{with gss} BuildRequires: krb5-devel openssl-devel %endif -%if %{with lustre_modules} -# abuild (auto-build) used by SUSE Open Build Service -# need kernel-source as a build requirement, but the code -# which extracts these requirements don't understand %() -# and treats all such as failures. So the following dance -# Allows the requirements to be seen by abuild, but ignored -# by lbuild. -%if "%(echo $USER)" != "abuild" -%else -BuildRequires: kernel-source -%endif -%endif %if %{with servers} -Requires: %{name}-osd -Requires: %{name}-osd-mount +Requires: lustre-osd +Requires: lustre-osd-mount Obsoletes: lustre-server < %{version} Provides: lustre-server = %{version}-%{release} %endif Obsoletes: lustre-client < %{version} Provides: lustre-client = %{version}-%{release} -%if "%{_vendor}" == "redhat" || "%{_vendor}" == "fedora" || "%{_vendor}" == "openEuler" +%if "%{_vendor}" == "redhat" || "%{_vendor}" == "fedora" || 0%{?openEuler} #suse don't support selinux -BuildRequires: pkgconfig(libselinux) +BuildRequires: libselinux-devel %endif %if %{with lustre_modules} %if %{with mofed} BuildRequires: mlnx-ofa_kernel-devel -%if "%{_vendor}" == "redhat" || "%{_vendor}" == "openEuler" +%if "%{_vendor}" == "redhat" || 0%{?openEuler} Requires: kmod-mlnx-ofa_kernel %else Requires: mlnx-ofa_kernel-kmp %endif %endif -%if 0%{?rhel} >= 8 || "%{_vendor}" == "openEuler" +%if 0%{?rhel} >= 8 || 0%{?openEuler} BuildRequires: kernel-rpm-macros %endif -%if 0%{?suse_version} >= 1530 -BuildRequires: rpm-build >= 4.14.3 -%endif BuildRequires: %kernel_module_package_buildreqs # need to provide a /usr/lib/${uname -r)/build dir -BuildRequires: kernel >= 3.10 +BuildRequires: kernel %if "%{_vendor}" == "redhat" %if 
%{with kabi} BuildRequires: kernel-abi-whitelists @@ -295,10 +326,10 @@ BuildRequires: kernel-abi-whitelists Requires(post): systemd Requires(preun): systemd Requires(postun): systemd -BuildRequires: pkgconfig(systemd) +BuildRequires: systemd %endif -BuildRequires: pkgconfig(mount) +BuildRequires: libmount-devel %description Userspace tools and files for the Lustre file system. @@ -330,17 +361,15 @@ echo $TMPFILE %kernel_module_package -n %{name}-osd-ldiskfs -p %SOURCE3 -f %SOURCE4 %{_flavor} %if %{with lustre_utils} %package osd-ldiskfs-mount -Summary: Lustre mount's ldiskfs-specific helper library +Summary: osd-ldiskfs-mount contains mount's ldiskfs specific dso. BuildRequires: e2fsprogs-devel >= 1.44.3 -Requires: ldiskfsprogs > 1.45.6 +Requires: ldiskfsprogs > 1.45.6 libmount Provides: lustre-osd-mount = %{version} -Provides: %{name}-osd-mount = %{version} -Obsoletes: %{name}-osd-mount < %{version} Obsoletes: lustre-osd-mount < %{version} +Group: System Environment/Kernel %description osd-ldiskfs-mount -Provide a shared library (dso) that can be loaded into various -lustre tools (mount/mkfs) to provide support for ldisfs +LDISKFS hooks for mount/mkfs into a dynamic library. # with lustre_utils %endif @@ -351,16 +380,17 @@ lustre tools (mount/mkfs) to provide support for ldisfs %kernel_module_package -n %{name}-osd-zfs -p %SOURCE5 -f %SOURCE6 %{_flavor} %if %{with lustre_utils} %package osd-zfs-mount -Summary: Lustre mount's zfs-specific helper library -Provides: %{name}-osd-mount = %{version} +Summary: osd-zfs-mount contains mount's zfs specific dso. +Requires: libmount +Provides: lustre-osd-mount = %{version} Obsoletes: lustre-osd-mount < %{version} # Tests also require zpool from zfs package: Requires: zfs Requires: %{requires_kmod_osd_zfs_name} +Group: System Environment/Kernel %description osd-zfs-mount -Provide a shared library (dso) that can be loaded into various -lustre tools (mount/mkfs) to provide support for ZFS. 
+ZFS hooks for mount/mkfs into a dynamic library. # with lustre_utils %endif @@ -372,6 +402,7 @@ lustre tools (mount/mkfs) to provide support for ZFS. %if %{with servers} %package resource-agents Summary: HA Resuable Cluster Resource Scripts for Lustre +Group: System Environment/Base Requires: %{name} Requires: resource-agents @@ -382,6 +413,7 @@ environment for both Pacemaker and rgmanager. %package devel Summary: Lustre include headers +Group: Development/Kernel Provides: lustre-devel = %{version} Requires: %{lustre_name} = %{version} %if %{with lustre_modules} @@ -397,11 +429,10 @@ applications against the Lustre / LNet utilities libraries. %if %{with lustre_tests} %package tests Summary: Lustre testing framework -Provides: %{name}-tests = %{version} +Group: System Environment/Kernel +Provides: lustre-tests = %{version} %if %{with lustre_iokit} -Requires: %{name} = %{version}, lustre-iokit -%else -Requires: %{name} = %{version} +Requires: lustre-iokit %endif Requires: lustre-devel = %{version} %if 0%{?rhel} >= 8 || 0%{?suse_version} >= 1500 || 0%{?openEuler} @@ -414,20 +445,13 @@ Requires: %{requires_kmod_tests_name} = %{requires_kmod_version} %if %{with lustre_tests_lutf} Requires: python3 >= 3.6.0 %endif -Requires: attr, rsync, lsof, /usr/bin/getconf -Requires: /usr/sbin/getenforce, acl, /usr/bin/killall, /usr/bin/ping, bc -# Of the supported targets, only rhel7 doesn't support Recommends. 
-%if 0%{?rhel} > 7 || 0%{?fedora} > 33 || 0%{?rhel} < 1 -Recommends: perl, dbench, iozone -# Either of these is sufficient -Suggests: pdsh, clush -%endif +Requires: attr, rsync, perl, lsof, /usr/bin/getconf %if %{with mpi} %if "%{mpi_name}" == "mpich" BuildRequires: mpich-devel %endif %if "%{mpi_name}" == "openmpi" -%if "%{_vendor}" == "redhat" || "%{_vendor}" == "openEuler" || 0%{?suse_version} < 1500 +%if "%{_vendor}" == "redhat" || 0%{?openEuler} || 0%{?suse_version} < 1500 BuildRequires: openmpi-devel %else BuildRequires: openmpi2-devel @@ -448,8 +472,9 @@ to be used by the Lustre testing framework. %if %{with lustre_iokit} %package -n lustre-iokit -Summary: Collection of benchmark tools for a cluster with the Lustre file system -Requires: perl, sg3_utils +Summary: The Lustre IO-Kit is a collection of benchmark tools for a cluster with the Lustre file system. +Group: Applications/System +Requires: python3, sg3_utils %description -n lustre-iokit This package includes five tools: @@ -472,16 +497,13 @@ This script will collect IO stats on a defined set of nodes. ior-survey: A script to run the IOR benchmark. The latest version can be downloaded from -https://github.com/hpc/ior/ +http://www.llnl.gov/asci/purple/benchmarks/limited/ior/ mds-survey: This survey tests the local metadata performance using the echo_client to drive the MDD layer to perform operations. It is run with multiple threads (to simulate MDT service threads) locally on the MDS node, and does not need Lustre clients in order to run - -lst-survey: -This survey tests LNet performance between a group of clients and servers. %endif %if 0%{?suse_version} @@ -517,7 +539,7 @@ export UTILS_CFLAGS="${UTILS_CFLAGS} -D__SANE_USERSPACE_TYPES__=1" # Disable any hardening or annotation since this doesn't make sense for # kernel code, and reset "optflags" so that the vendor's overzealous flags don't # create build failures. 
-%define optflags -g -O2 -Werror -Wno-stringop-overflow -Wno-format-truncation -Wno-use-after-free +%define optflags -g -O2 -Werror -Wno-stringop-overflow %undefine _annotated_build %undefine _hardened_build @@ -673,8 +695,6 @@ fi # legacy syntax. Install a compatibility symlink to avoid conflicts when # newer-style agents are added. ln -s Lustre.ha_v2 $RPM_BUILD_ROOT%{_sysconfdir}/ha.d/resource.d/Lustre -echo '%dir %{_sysconfdir}/ha.d' >>lustre.files -echo '%dir %{_sysconfdir}/ha.d/resource.d' >>lustre.files echo '%{_sysconfdir}/ha.d/resource.d/Lustre.ha_v2' >>lustre.files echo '%{_sysconfdir}/ha.d/resource.d/Lustre' >>lustre.files %endif @@ -684,7 +704,7 @@ echo '%{_sysconfdir}/ha.d/resource.d/Lustre' >>lustre.files echo '%{_unitdir}/lnet.service' >>lustre.files %endif -%if "%{_vendor}" == "redhat" || "%{_vendor}" == "openEuler" +%if "%{_vendor}" == "redhat" || 0%{?openEuler} # The following scripts are Red Hat specific %if %{with servers} echo '%{_sysconfdir}/init.d/lustre' >>lustre.files @@ -731,7 +751,6 @@ if [ -d $RPM_BUILD_ROOT%{_libdir}/lustre ] ; then fi %endif -echo '%{_prefix}/lib/firewalld/services/*.xml' >>lustre.files %if %{with lustre_modules} # mark modules executable for find-debuginfo.sh find $RPM_BUILD_ROOT/lib/modules -name \*.ko -type f -exec chmod u+x {} \; @@ -750,8 +769,6 @@ rm -f $RPM_BUILD_ROOT%{_libdir}/liblustreapi.la %endif # mpi %endif -echo '%dir %{_libdir}/lustre' >>lustre-tests.files -echo '%dir %{_libdir}/lustre/tests' >>lustre-tests.files echo '%{_libdir}/lustre/tests/*' >>lustre-tests.files echo '%{_bindir}/mcreate' >>lustre-tests.files echo '%{_bindir}/munlink' >>lustre-tests.files @@ -800,16 +817,12 @@ echo '%{_libdir}/lustre/tests/lutf/*' >>lustre-tests.files %endif %if %{with lustre_utils} %if %{with servers} -%dir %{_libexecdir}/lustre %{_libexecdir}/lustre/lc_common %{_libexecdir}/lustre/haconfig %{_bindir}/lustre_req_history -%{_bindir}/remove_changelog -%{_bindir}/remove_updatelog %endif %{_bindir}/llobdstat 
-%{_bindir}/lljobstat %{_bindir}/llstat %{_bindir}/plot-llstat %{_datadir}/lustre @@ -830,9 +843,9 @@ echo '%{_libdir}/lustre/tests/lutf/*' >>lustre-tests.files %if %{with shared} %{_libdir}/liblustreapi.so.* %endif -%{_udevrulesdir}/99-lustre.rules +%{_sysconfdir}/udev/rules.d/99-lustre.rules %if %{with servers} -%{_udevrulesdir}/99-lustre-server.rules +%{_sysconfdir}/udev/rules.d/99-lustre-server.rules %endif %if %{with zfs} %config(noreplace) %{_sysconfdir}/ldev.conf @@ -849,7 +862,6 @@ echo '%{_libdir}/lustre/tests/lutf/*' >>lustre-tests.files %if %{with lustre_utils} %files osd-ldiskfs-mount %defattr(-,root,root) -%dir %{_libdir}/lustre %{_libdir}/lustre/mount_osd_ldiskfs.so %endif %endif @@ -860,7 +872,6 @@ echo '%{_libdir}/lustre/tests/lutf/*' >>lustre-tests.files %if %{with lustre_utils} %files osd-zfs-mount %defattr(-,root,root) -%dir %{_libdir}/lustre %{_libdir}/lustre/mount_osd_zfs.so %{_sysconfdir}/zfs/zed.d/* %endif @@ -873,8 +884,6 @@ echo '%{_libdir}/lustre/tests/lutf/*' >>lustre-tests.files %if %{with servers} %files resource-agents %defattr(0755,root,root) -%dir %{_prefix}/lib/ocf -%dir %{_prefix}/lib/ocf/resource.d %{_prefix}/lib/ocf/resource.d/lustre/ %endif @@ -900,15 +909,12 @@ echo '%{_libdir}/lustre/tests/lutf/*' >>lustre-tests.files %{_bindir}/obdfilter-survey %{_bindir}/ost-survey %{_bindir}/sgpdd-survey -%{_bindir}/lst-survey -%{_bindir}/lst.sh %doc lustre-iokit/ior-survey/README.ior-survey %doc lustre-iokit/mds-survey/README.mds-survey %doc lustre-iokit/obdfilter-survey/README.obdfilter-survey %doc lustre-iokit/ost-survey/README.ost-survey %doc lustre-iokit/sgpdd-survey/README.sgpdd-survey %doc lustre-iokit/stats-collect/README.iokit-lstats -%doc lustre-iokit/lst-survey/README.lst-survey %endif %post @@ -931,12 +937,24 @@ rm -rf $RPM_BUILD_ROOT rm -rf %{_tmppath}/kmp %changelog -* Thu Aug 03 2023 Xinliang Liu - 2.15.57-1 -- Update to 2.15.57 with kernel 6.4 support patch. 
+* Fri Nov 24 2023 Xinliang Liu - 2.15.3-2 +- test new build + +* Fri Nov 10 2023 Xinliang Liu - 2.15.3-1 +- Update to 2.15.3 with openEuler bugfixes patches. + +* Fri Jun 02 2023 Xinliang Liu - 2.15.2-4 +- Fix sanity test 155e,155f,155g,155h -* Fri May 12 2023 Xinliang Liu - 2.15.54-3 +* Thu Jun 01 2023 Xinliang Liu - 2.15.2-3 +- Fix kmod-lustre-client-tests install warnings + +* Fri May 12 2023 Xinliang Liu - 2.15.2-2 - Fix client build requires libmount-devel +* Fri May 05 2023 Xinliang Liu - 2.15.2-1 +- Switch to 2.15 LTS, only client support + * Thu Apr 20 2023 Xinliang Liu - 2.15.54-2 - Fix release number contains double %{dist}, e.g. *.oe1.oe1.aarch64.rpm - Fix sort by version number when finding the latest kernel version.