diff --git a/templates/data.CP2K.X86.config b/templates/data.CP2K.X86.config
new file mode 100644
index 0000000000000000000000000000000000000000..44dc7cd22fb15140c5c73f87ef3e7726d88dd3bf
--- /dev/null
+++ b/templates/data.CP2K.X86.config
@@ -0,0 +1,67 @@
+[SERVER]
+11.11.11.11
+
+[ENV]
+export LD_LIBRARY_PATH=/opt/cp2k/cp2k-8.2.0/tools/toolchain/install/SpFFT-0.9.13/lib/:$LD_LIBRARY_PATH
+export LD_LIBRARY_PATH=/usr/local/gcc/lib64/:$LD_LIBRARY_PATH
+export CPATH=/opt/software/cp2k-8.2.0/tools/toolchain/install/libxsmm-1.16.2/include:$CPATH
+source /opt/intel/oneapi/setvars.sh
+
+[APP]
+app_name = CP2K
+build_dir = /opt/software/cp2k-8.2.0/
+binary_dir = /opt/software/cp2k-8.2.0/exe/Linux-x86-64-intel/
+#case_dir = /opt/software/cp2k-8.2.0/benchmarks/QS/
+case_dir = /opt/software/cp2k-8.2.0/benchmarks/QS_DM_LS/
+
+[BUILD]
+make -j 48 ARCH=local-cpu VERSION=psmp
+
+[CLEAN]
+make -j 48 ARCH=local-cpu VERSION=psmp clean
+
+[RUN]
+run = mpirun -genv CUDA_VISIBLE_DEVICES=0,1 -np 48 -genv OMP_NUM_THREADS=1
+binary = cp2k.psmp H2O-dft-ls.NREP2.inp
+nodes = 1
+
+[BATCH]
+#!/bin/bash
+
+logfile=cp2k.H2O-256.inp.log
+
+nvidia-smi -pm 1
+nvidia-smi -ac 1215,1410
+
+echo 3 > /proc/sys/vm/drop_caches
+echo "===run 32C*GPU===" >> $logfile
+mpirun -np 32 -genv OMP_NUM_THREADS=1 -genv CUDA_VISIBLE_DEVICES=0 exe/local-cuda/cp2k.psmp benchmarks/QS/H2O-256.inp >> $logfile 2>&1
+
+echo 3 > /proc/sys/vm/drop_caches
+echo "===run 32C*2GPU===" >> $logfile
+mpirun -np 32 -genv OMP_NUM_THREADS=1 -genv CUDA_VISIBLE_DEVICES=0,1 exe/local-cuda/cp2k.psmp benchmarks/QS/H2O-256.inp >> $logfile 2>&1
+
+
+echo 3 > /proc/sys/vm/drop_caches
+echo "===run 64C*GPU===" >> $logfile
+mpirun -np 64 -genv OMP_NUM_THREADS=1 -genv CUDA_VISIBLE_DEVICES=0 exe/local-cuda/cp2k.psmp benchmarks/QS/H2O-256.inp >> $logfile 2>&1
+
+echo 3 > /proc/sys/vm/drop_caches
+echo "===run 64C*2GPU===" >> $logfile
+mpirun -np 64 -genv OMP_NUM_THREADS=1 -genv CUDA_VISIBLE_DEVICES=0,1 exe/local-cuda/cp2k.psmp benchmarks/QS/H2O-256.inp >> $logfile 2>&1
+
+
+echo 3 > /proc/sys/vm/drop_caches
+echo "===run 128C*GPU===" >> $logfile
+mpirun -np 128 -genv OMP_NUM_THREADS=1 -genv CUDA_VISIBLE_DEVICES=0 exe/local-cuda/cp2k.psmp benchmarks/QS/H2O-256.inp >> $logfile 2>&1
+
+echo 3 > /proc/sys/vm/drop_caches
+echo "===run 128C*2GPU===" >> $logfile
+mpirun -np 128 -genv OMP_NUM_THREADS=1 -genv CUDA_VISIBLE_DEVICES=0,1 exe/local-cuda/cp2k.psmp benchmarks/QS/H2O-256.inp >> $logfile 2>&1
+
+
+
+
+
+
+
diff --git a/templates/data.amber.config b/templates/data.amber.config
new file mode 100644
index 0000000000000000000000000000000000000000..af5dc2f328ad8bce1acdcd5a6bde059b291f1fd8
--- /dev/null
+++ b/templates/data.amber.config
@@ -0,0 +1,33 @@
+[SERVER]
+11.11.11.11
+
+[ENV]
+# add gcc ompi
+module use /opt/modulefile
+module add gcc-9.3.1 openmpi-4.1.1
+# add cuda
+export CUDA_INCLUDE_DIRS=/usr/local/cuda/include
+export CUDA_CUDART_LIBRARY=/usr/local/cuda/lib64/libcudart.so
+export CUDA_HOME=/usr/local/cuda
+export PATH=$CUDA_HOME/bin:$PATH
+export LD_LIBRARY_PATH=$CUDA_HOME/lib64:$LD_LIBRARY_PATH
+export C_INCLUDE_PATH=$CUDA_HOME/include:$C_INCLUDE_PATH
+export LIBRARY_PATH=$CUDA_HOME/lib64:$LIBRARY_PATH
+
+[APP]
+app_name = Amber
+build_dir = /home/amber20_src/build/
+binary_dir = /home/amber20/bin/
+case_dir = /home/amber_case/
+
+[BUILD]
+./run_cmake
+make -j 96 install
+
+[CLEAN]
+./clean_build
+
+[RUN]
+run = mpirun -mca btl ^vader,tcp,openib,uct -np 2
+binary = pmemd.cuda_SPFP.MPI -O -i mdinOPT.GPU -o mdout -p Cellulose.prmtop -c Cellulose.inpcrd && cat mdout
+nodes = 1
\ No newline at end of file
diff --git a/templates/data.openfoam.config b/templates/data.openfoam.config
new file mode 100644
index 0000000000000000000000000000000000000000..593075506d41927a29a1f2268d0668aaf923a3e2
--- /dev/null
+++ b/templates/data.openfoam.config
@@ -0,0 +1,26 @@
+[SERVER]
+11.11.11.11
+
+[ENV]
+# add gcc/mpi
+module use modules
+module load compiler/kgcc-9.3
+module load gcc/openmpi
+
+[APP]
+app_name = OpenFOAM
+build_dir = /home/HPCRunner/OpenFOAM-v1906/
+binary_dir =
+case_dir = /home/HPCRunner/OpenFOAM-v1906/tutorials/incompressible/pisoFoam/LES/motorBike/motorBike/
+
+[BUILD]
+source /home/HPCRunner/OpenFOAM-v1906/etc/bashrc
+./Allwmake -j 64
+
+[CLEAN]
+rm -rf build
+
+[RUN]
+run =
+binary = ./Allrun
+nodes = 1
\ No newline at end of file
diff --git a/templates/data.qe.gpu.config b/templates/data.qe.gpu.config
new file mode 100644
index 0000000000000000000000000000000000000000..5d00bfe8af6fbecfb104583c9c344d8c8051f4e5
--- /dev/null
+++ b/templates/data.qe.gpu.config
@@ -0,0 +1,38 @@
+[SERVER]
+11.11.11.11
+
+[ENV]
+# add gcc/mpi
+export BLAS_LIBS="-I/home/HPCRunner-master/software/arm/gcc/openblas-0.3.18/include -L/home/HPCRunner-master/software/arm/gcc/openblas-0.3.18/lib/ -lopenblas"
+export LAPACK_LIBS="-L/home/HPCRunner-master/software/arm/gcc/openblas-0.3.18/lib/ -lopenblas"
+#export SCALAPACK_LIBS="-lgfortran -L/home/HPCRunner-master/software/arm/gcc/scalapack-2.1.0/ -lscalapack"
+export FFT_LIBS="-I/home/HPCRunner-master/software/arm/gcc/fftw-3.3.8/FFTW/include/ -L/home/HPCRunner-master/software/arm/gcc/fftw-3.3.8/FFTW/lib/ -lfftw3 -lfftw3_omp"
+export LD_LIBRARY_PATH=/home/HPCRunner-master/software/arm/gcc/openblas-0.3.18/lib/:$LD_LIBRARY_PATH
+export LD_LIBRARY_PATH=/opt/nvidia/hpc_sdk/Linux_aarch64/21.9/comm_libs/openmpi4/openmpi-4.0.5/lib/:$LD_LIBRARY_PATH
+export LD_LIBRARY_PATH=/opt/nvidia/hpc_sdk/Linux_aarch64/21.9/compilers/lib/:$LD_LIBRARY_PATH
+module purge
+module use modules
+module load compiler/kgcc
+
+module use /opt/nvidia/hpc_sdk/modulefiles/
+module load nvhpc/21.9
+
+[APP]
+app_name = QE
+build_dir = /home/HPCRunner-master/q-e-qe-6.8/
+binary_dir = /home/HPCRunner-master/q-e-qe-6.8/bin/
+case_dir = /home/HPCRunner-master/jiancong/
+
+[BUILD]
+./configure --with-cuda=yes --with-cuda-runtime=11.4 --with-cuda-cc=80 --enable-openmp --with-scalapack=no
+echo DFLAGS += -D__GPU_MPI -D__FFTW3 >> make.inc
+echo LD_LIBS += -lcurand >> make.inc
+make pw -j 64
+
+[CLEAN]
+make clean
+
+[RUN]
+run = mpirun --allow-run-as-root -mca btl ^vader,tcp,openib,uct -np 4 -x OMP_NUM_THREADS=1
+binary = pw.x -nk 4 -input ausurf.in
+nodes = 1
\ No newline at end of file
diff --git a/templates/data.vasp.config b/templates/data.vasp.config
new file mode 100644
index 0000000000000000000000000000000000000000..7a86019ca4ceb562cddae2401953f0548ec87e36
--- /dev/null
+++ b/templates/data.vasp.config
@@ -0,0 +1,24 @@
+[SERVER]
+11.11.11.11
+
+[ENV]
+# add gcc/mpi
+module use /opt/modulefile/
+module load openmpi-4.1.1
+
+[APP]
+app_name = VASP
+build_dir = /home/HPCRunner/vasp.5.4.4/
+binary_dir = /home/HPCRunner/vasp.5.4.4/bin/
+case_dir = /home/HPCRunner/vasp_normal
+
+[BUILD]
+make std
+
+[CLEAN]
+make veryclean
+
+[RUN]
+run = mpirun -mca pml ucx -mca btl ^vader,tcp,openib,uct -np 128
+binary = vasp_std
+nodes = 1
\ No newline at end of file
diff --git a/templates/data.vasp6.1.gpu.x86.config b/templates/data.vasp6.1.gpu.x86.config
new file mode 100644
index 0000000000000000000000000000000000000000..d7e6e56a3a2fb60b6972b59ad0f03dedbfa8264e
--- /dev/null
+++ b/templates/data.vasp6.1.gpu.x86.config
@@ -0,0 +1,31 @@
+[SERVER]
+11.11.11.11
+
+[ENV]
+# add gcc/mpi
+export LD_LIBRARY_PATH=/home/HPCRunner/software/arm/gcc/fftw-3.3.8/FFTW/lib:$LD_LIBRARY_PATH
+module use /opt/nvidia/hpc_sdk/modulefiles/
+module load nvhpc/21.9
+source /opt/intel/oneapi/setvars.sh
+
+[APP]
+app_name = VASP
+build_dir = /home/HPCRunner/vasp.6.1.0/
+binary_dir = /home/HPCRunner/vasp.6.1.0/bin/
+case_dir = /home/HPCRunner/VASP-std-GPU/
+
+[BUILD]
+make std
+
+[CLEAN]
+make veryclean
+
+[RUN]
+run = mpirun --allow-run-as-root -n 4
+binary = vasp_std
+nodes = 1
+
+[BATCH]
+mpirun --allow-run-as-root -n 4 /home/HPCRunner/vasp.6.1.0/bin/vasp_std >> vasp.log
+mpirun --allow-run-as-root -n 2 /home/HPCRunner/vasp.6.1.0/bin/vasp_std >> vasp.log
+mpirun --allow-run-as-root -n 1 /home/HPCRunner/vasp.6.1.0/bin/vasp_std >> vasp.log
\ No newline at end of file
diff --git a/tunning/CESM/cesm-aarch64-kunpeng-001.patch b/tunning/CESM/cesm-aarch64-kunpeng-001.patch
new file mode 100644
index 0000000000000000000000000000000000000000..c850ef0234bb80f248f48783373dc2a0d48db011
--- /dev/null
+++ b/tunning/CESM/cesm-aarch64-kunpeng-001.patch
@@ -0,0 +1,188 @@
+diff -uNr components/cam/src/dynamics/fv/mapz_module.F90 components_modify/cam/src/dynamics/fv/mapz_module.F90
+--- components/cam/src/dynamics/fv/mapz_module.F90	2021-09-27 23:43:43.844314000 +0800
++++ components_modify/cam/src/dynamics/fv/mapz_module.F90	2022-09-27 03:09:05.026601000 +0800
+@@ -724,15 +724,18 @@
+       km1 = km - 1
+       it = i2 - i1 + 1
+ 
+-      do k=2,km
++
++
+       do i=i1,i2
+-         delq(i,k-1) = a4(1,i,k) - a4(1,i,k-1)
+-         d4(i,k  )   = delp(i,k-1) + delp(i,k)
+-      enddo
+-      enddo
++         delq(i,2-1) = a4(1,i,2) - a4(1,i,2-1)
++         d4(i,2  )   = delp(i,2-1) + delp(i,2)
++      enddo
+ 
+-      do k=2,km1
++      k=2
+       do i=i1,i2
++
++         delq(i,k-1+1) = a4(1,i,k+1) - a4(1,i,k-1+1)
++         d4(i,k +1 )   = delp(i,k-1+1) + delp(i,k+1)
+          c1 = (delp(i,k-1)+D0_5*delp(i,k))/d4(i,k+1)
+          c2 = (delp(i,k+1)+D0_5*delp(i,k))/d4(i,k)
+          tmp = delp(i,k)*(c1*delq(i,k) + c2*delq(i,k-1)) / &
+                (d4(i,k)+delp(i,k+1))
+          qmax = max(a4(1,i,k-1),a4(1,i,k),a4(1,i,k+1)) - a4(1,i,k)
+          qmin = a4(1,i,k) - min(a4(1,i,k-1),a4(1,i,k),a4(1,i,k+1))
+          dc(i,k) = sign(min(abs(tmp),qmax,qmin), tmp)
+          df2(i,k) = tmp
+       enddo
+-      enddo
++
++      do k=3,km1
++
++      do i=i1,i2
++
++         delq(i,k-1+1) = a4(1,i,k+1) - a4(1,i,k-1+1)
++         d4(i,k +1 )   = delp(i,k-1+1) + delp(i,k+1)
++         c1 = (delp(i,k-1)+D0_5*delp(i,k))/d4(i,k+1)
++         c2 = (delp(i,k+1)+D0_5*delp(i,k))/d4(i,k)
++         tmp = delp(i,k)*(c1*delq(i,k) + c2*delq(i,k-1)) / &
++               (d4(i,k)+delp(i,k+1))
++         qmax = max(a4(1,i,k-1),a4(1,i,k),a4(1,i,k+1)) - a4(1,i,k)
++         qmin = a4(1,i,k) - min(a4(1,i,k-1),a4(1,i,k),a4(1,i,k+1))
++         dc(i,k) = sign(min(abs(tmp),qmax,qmin), tmp)
++         df2(i,k) = tmp
+ 
+ !****6***0*********0*********0*********0*********0*********0**********72
+ ! 4th order interpolation of the provisional cell edge value
+ !****6***0*********0*********0*********0*********0*********0**********72
+ 
+-      do k=3,km1
+-      do i=i1,i2
+-         c1 = delq(i,k-1)*delp(i,k-1) / d4(i,k)
++         c1 = delq(i,k-1)*delp(i,k-1) / d4(i,k)
+          a1 = d4(i,k-1) / (d4(i,k) + delp(i,k-1))
+          a2 = d4(i,k+1) / (d4(i,k) + delp(i,k))
+          a4(2,i,k) = a4(1,i,k-1) + c1 + D2_0/(d4(i,k-1)+d4(i,k+1)) * &
+                      ( delp(i,k)*(c1*(a1 - a2)+a2*dc(i,k-1)) - &
+                        delp(i,k-1)*a1*dc(i,k ) )
+-      enddo
++
++
++      enddo
+       enddo
+ 
++
++
++
+       call steepz(i1, i2, km, a4, df2, dc, delq, delp, d4)
+ 
+ ! Area preserving cubic with 2nd deriv. = 0 at the boundaries
+@@ -1236,29 +1256,46 @@
+ !BOC
+ !
+ ! !LOCAL VARIABLES:
+-      integer i, k
++      integer i, k, inc
+       real(r8) alfa(i1:i2,km)
+       real(r8) f(i1:i2,km)
+       real(r8) rat(i1:i2,km)
+       real(r8) dg2
+ 
+ ! Compute ratio of dq/dp
+-      do k=2,km
++
++      k = 2
+       do i=i1,i2
+          rat(i,k) = dq(i,k-1) / d4(i,k)
+       enddo
+-      enddo
+ 
+-! Compute F
+-      do k=2,km-1
++      inc = 1
++
++
++      k = 3
+       do i=i1,i2
++! Compute ratio of dq/dp
++         rat(i,k + inc) = dq(i,k-1 + inc) / d4(i,k +inc)
++
++! Compute F
+          f(i,k) = (rat(i,k+1) - rat(i,k)) &
+                 / ( dp(i,k-1)+dp(i,k)+dp(i,k+1) )
+       enddo
+-      enddo
+ 
+-      do k=3,km-2
++
++      inc = inc+1
++
++
++      k=3
+       do i=i1,i2
++! Compute ratio of dq/dp
++         rat(i,k + inc) = dq(i,k-1 + inc) / d4(i,k +inc)
++
++! Compute F
++         f(i,k + inc - 1) = (rat(i,k+1+ inc - 1) - rat(i,k+ inc - 1)) &
++                / ( dp(i,k-1+ inc - 1)+dp(i,k+ inc - 1)+dp(i,k+1+ inc - 1) )
++
++
+          if(f(i,k+1)*f(i,k-1)
++# LDFLAGS    linker flags, e.g. -L<library-dir> if you have libraries
++# FPPFLAGS   pre-processing flags
++# AR         assembler
++# ARFLAGS    assembler flags
++# MK         make
++# USER_INC   complete list of include files
++# USER_LIB   complete list of libraries to pass to the linker
++#
++# Note that:
++#  - unix variables "$..." are accpeted and will be evaluated before calling fcm.
++#  - fcm variables are starting with a % (and not a $)
++#
++
++%NCDF_HOME           /path/to/netcdf
++
++%NCDF_INC            -I%NCDF_HOME/include
++%NCDF_LIB            -L%NCDF_HOME/lib
++
++%CPP                 cpp -Dkey_nosignedzero
++%FC                  mpif90 -c -cpp
++%FCFLAGS             -fdefault-real-8 -fno-second-underscore -Dgfortran -ffree-line-length-none
++%FFLAGS              %FCFLAGS
++%LD                  %FC
++%LDFLAGS
++%FPPFLAGS            -x f77-cpp-input
++%AR                  gcc-ar
++%ARFLAGS             rs
++%MK                  gmake
++%USER_INC            %NCDF_INC
++%USER_LIB            %NCDF_LIB -lnetcdf -L/path/to/libnonosc.so -lnonosc-bs
++
+diff -uprN NEMO/source/ARCH/arch-gfortran.fcm NEMO-optimized/source/ARCH/arch-gfortran.fcm
+--- NEMO/source/ARCH/arch-gfortran.fcm	2021-12-10 15:04:45.208988000 +0800
++++ NEMO-optimized/source/ARCH/arch-gfortran.fcm	2021-12-10 15:58:21.364914000 +0800
+@@ -30,19 +30,19 @@
+ # - fcm variables are starting with a % (and not a $)
+ #
+ 
+-%NCDF_HOME           /WORK/home/qhyc1/zhangyu/soft_gcc/netcdf_3_6_3
++%NCDF_HOME           /path/to/netcdf
+ 
+ %NCDF_INC            -I%NCDF_HOME/include
+ %NCDF_LIB            -L%NCDF_HOME/lib
+ 
+-%CPP                 cpp
+-%FC                  mpif90
++%CPP                 cpp -Dkey_nosignedzero
++%FC                  mpif90 -c -cpp
+ %FCFLAGS             -fdefault-real-8 -fno-second-underscore -Dgfortran -ffree-line-length-none
+ %FFLAGS              %FCFLAGS
+ %LD                  %FC
+ %LDFLAGS
+ %FPPFLAGS            -x f77-cpp-input
+-%AR                  ar
++%AR                  gcc-ar
+ %ARFLAGS             rs
+ %MK                  gmake
+ %USER_INC            %NCDF_INC
+diff -uprN NEMO/source/NEMO/OPA_SRC/TRA/traadv_tvd.F90 NEMO-optimized/source/NEMO/OPA_SRC/TRA/traadv_tvd.F90
+--- NEMO/source/NEMO/OPA_SRC/TRA/traadv_tvd.F90	2021-12-10 15:04:38.186792000 +0800
++++ NEMO-optimized/source/NEMO/OPA_SRC/TRA/traadv_tvd.F90	2021-12-10 15:36:49.848020000 +0800
+@@ -599,79 +599,85 @@ CONTAINS
+       !
+       CALL wrk_alloc( jpi, jpj, jpk, zbetup, zbetdo, zbup, zbdo )
+       !
+-      zbig  = 1.e+40_wp
+-      zrtrn = 1.e-15_wp
+-      zbetup(:,:,:) = 0._wp   ;   zbetdo(:,:,:) = 0._wp
+-
+-      ! Search local extrema
+-      ! --------------------
+-      ! max/min of pbef & paft with large negative/positive value (-/+zbig) inside land
+-      zbup = MAX( pbef * tmask - zbig * ( 1._wp - tmask ),   &
+-         &        paft * tmask - zbig * ( 1._wp - tmask ) )
+-      zbdo = MIN( pbef * tmask + zbig * ( 1._wp - tmask ),   &
+-         &        paft * tmask + zbig * ( 1._wp - tmask ) )
+-
+-      DO jk = 1, jpkm1
+-         ikm1 = MAX(jk-1,1)
+-         z2dtt = p2dt(jk)
+-         DO jj = 2, jpjm1
+-            DO ji = fs_2, fs_jpim1   ! vector opt.
+-
+-               ! search maximum in neighbourhood
+-               zup = MAX(  zbup(ji  ,jj  ,jk  ),   &
+-                  &        zbup(ji-1,jj  ,jk  ), zbup(ji+1,jj  ,jk  ),   &
+-                  &        zbup(ji  ,jj-1,jk  ), zbup(ji  ,jj+1,jk  ),   &
+-                  &        zbup(ji  ,jj  ,ikm1), zbup(ji  ,jj  ,jk+1)  )
+-
+-               ! search minimum in neighbourhood
+-               zdo = MIN(  zbdo(ji  ,jj  ,jk  ),   &
+-                  &        zbdo(ji-1,jj  ,jk  ), zbdo(ji+1,jj  ,jk  ),   &
+-                  &        zbdo(ji  ,jj-1,jk  ), zbdo(ji  ,jj+1,jk  ),   &
+-                  &        zbdo(ji  ,jj  ,ikm1), zbdo(ji  ,jj  ,jk+1)  )
+-
+-               ! positive part of the flux
+-               zpos = MAX( 0., paa(ji-1,jj  ,jk  ) ) - MIN( 0., paa(ji  ,jj  ,jk  ) )   &
+-                  & + MAX( 0., pbb(ji  ,jj-1,jk  ) ) - MIN( 0., pbb(ji  ,jj  ,jk  ) )   &
+-                  & + MAX( 0., pcc(ji  ,jj  ,jk+1) ) - MIN( 0., pcc(ji  ,jj  ,jk  ) )
+-
+-               ! negative part of the flux
+-               zneg = MAX( 0., paa(ji  ,jj  ,jk  ) ) - MIN( 0., paa(ji-1,jj  ,jk  ) )   &
+-                  & + MAX( 0., pbb(ji  ,jj  ,jk  ) ) - MIN( 0., pbb(ji  ,jj-1,jk  ) )   &
+-                  & + MAX( 0., pcc(ji  ,jj  ,jk  ) ) - MIN( 0., pcc(ji  ,jj  ,jk+1) )
+-
+-               ! up & down beta terms
+-               zbt = e1t(ji,jj) * e2t(ji,jj) * fse3t(ji,jj,jk) / z2dtt
+-               zbetup(ji,jj,jk) = ( zup            - paft(ji,jj,jk) ) / ( zpos + zrtrn ) * zbt
+-               zbetdo(ji,jj,jk) = ( paft(ji,jj,jk) - zdo            ) / ( zneg + zrtrn ) * zbt
+-            END DO
+-         END DO
+-      END DO
++!      zbig  = 1.e+40_wp
++!      zrtrn = 1.e-15_wp
++!      zbetup(:,:,:) = 0._wp   ;   zbetdo(:,:,:) = 0._wp
++!
++!      ! Search local extrema
++!      ! --------------------
++!      ! max/min of pbef & paft with large negative/positive value (-/+zbig) inside land
++!      zbup = MAX( pbef * tmask - zbig * ( 1._wp - tmask ),   &
++!         &        paft * tmask - zbig * ( 1._wp - tmask ) )
++!      zbdo = MIN( pbef * tmask + zbig * ( 1._wp - tmask ),   &
++!         &        paft * tmask + zbig * ( 1._wp - tmask ) )
++!
++!      DO jk = 1, jpkm1
++!         ikm1 = MAX(jk-1,1)
++!         z2dtt = p2dt(jk)
++!         DO jj = 2, jpjm1
++!            DO ji = fs_2, fs_jpim1   ! vector opt.
++!
++!               ! search maximum in neighbourhood
++!               zup = MAX(  zbup(ji  ,jj  ,jk  ),   &
++!                  &        zbup(ji-1,jj  ,jk  ), zbup(ji+1,jj  ,jk  ),   &
++!                  &        zbup(ji  ,jj-1,jk  ), zbup(ji  ,jj+1,jk  ),   &
++!                  &        zbup(ji  ,jj  ,ikm1), zbup(ji  ,jj  ,jk+1)  )
++!
++!               ! search minimum in neighbourhood
++!               zdo = MIN(  zbdo(ji  ,jj  ,jk  ),   &
++!                  &        zbdo(ji-1,jj  ,jk  ), zbdo(ji+1,jj  ,jk  ),   &
++!                  &        zbdo(ji  ,jj-1,jk  ), zbdo(ji  ,jj+1,jk  ),   &
++!                  &        zbdo(ji  ,jj  ,ikm1), zbdo(ji  ,jj  ,jk+1)  )
++!
++!               ! positive part of the flux
++!               zpos = MAX( 0., paa(ji-1,jj  ,jk  ) ) - MIN( 0., paa(ji  ,jj  ,jk  ) )   &
++!                  & + MAX( 0., pbb(ji  ,jj-1,jk  ) ) - MIN( 0., pbb(ji  ,jj  ,jk  ) )   &
++!                  & + MAX( 0., pcc(ji  ,jj  ,jk+1) ) - MIN( 0., pcc(ji  ,jj  ,jk  ) )
++!
++!               ! negative part of the flux
++!               zneg = MAX( 0., paa(ji  ,jj  ,jk  ) ) - MIN( 0., paa(ji-1,jj  ,jk  ) )   &
++!                  & + MAX( 0., pbb(ji  ,jj  ,jk  ) ) - MIN( 0., pbb(ji  ,jj-1,jk  ) )   &
++!                  & + MAX( 0., pcc(ji  ,jj  ,jk  ) ) - MIN( 0., pcc(ji  ,jj  ,jk+1) )
++!
++!               ! up & down beta terms
++!               zbt = e1t(ji,jj) * e2t(ji,jj) * fse3t(ji,jj,jk) / z2dtt
++!               zbetup(ji,jj,jk) = ( zup            - paft(ji,jj,jk) ) / ( zpos + zrtrn ) * zbt
++!               zbetdo(ji,jj,jk) = ( paft(ji,jj,jk) - zdo            ) / ( zneg + zrtrn ) * zbt
++!            END DO
++!         END DO
++!      END DO
++#if defined key_vvl
++      CALL nonosc_p1(zbetup, zbetdo, zbup, zbdo, pbef, paft, paa, pbb, pcc, e1t, e2t, e3t_n, tmask, p2dt, jpi, jpj, jpk)
++#else
++      CALL nonosc_p1(zbetup, zbetdo, zbup, zbdo, pbef, paft, paa, pbb, pcc, e1t, e2t, e3t_0, tmask, p2dt, jpi, jpj, jpk)
++#endif
+       CALL lbc_lnk( zbetup, 'T', 1. )   ;   CALL lbc_lnk( zbetdo, 'T', 1. )   ! lateral boundary cond. (unchanged sign)
+ 
+       ! 3. monotonic flux in the i & j direction (paa & pbb)
+       ! ----------------------------------------
+-      DO jk = 1, jpkm1
+-         DO jj = 2, jpjm1
+-            DO ji = fs_2, fs_jpim1   ! vector opt.
+-               zau = MIN( 1._wp, zbetdo(ji,jj,jk), zbetup(ji+1,jj,jk) )
+-               zbu = MIN( 1._wp, zbetup(ji,jj,jk), zbetdo(ji+1,jj,jk) )
+-               zcu = ( 0.5 + SIGN( 0.5 , paa(ji,jj,jk) ) )
+-               paa(ji,jj,jk) = paa(ji,jj,jk) * ( zcu * zau + ( 1._wp - zcu) * zbu )
+-
+-               zav = MIN( 1._wp, zbetdo(ji,jj,jk), zbetup(ji,jj+1,jk) )
+-               zbv = MIN( 1._wp, zbetup(ji,jj,jk), zbetdo(ji,jj+1,jk) )
+-               zcv = ( 0.5 + SIGN( 0.5 , pbb(ji,jj,jk) ) )
+-               pbb(ji,jj,jk) = pbb(ji,jj,jk) * ( zcv * zav + ( 1._wp - zcv) * zbv )
+-
+-               ! monotonic flux in the k direction, i.e. pcc
+-               ! -------------------------------------------
+-               za = MIN( 1., zbetdo(ji,jj,jk+1), zbetup(ji,jj,jk) )
+-               zb = MIN( 1., zbetup(ji,jj,jk+1), zbetdo(ji,jj,jk) )
+-               zc = ( 0.5 + SIGN( 0.5 , pcc(ji,jj,jk+1) ) )
+-               pcc(ji,jj,jk+1) = pcc(ji,jj,jk+1) * ( zc * za + ( 1._wp - zc) * zb )
+-            END DO
+-         END DO
+-      END DO
++      CALL nonosc_p2(zbetdo, zbetup, paa, pbb, pcc, jpi, jpj, jpk)
++      !DO jk = 1, jpkm1
++      !   DO jj = 2, jpjm1
++      !      DO ji = fs_2, fs_jpim1   ! vector opt.
++      !         zau = MIN( 1._wp, zbetdo(ji,jj,jk), zbetup(ji+1,jj,jk) )
++      !         zbu = MIN( 1._wp, zbetup(ji,jj,jk), zbetdo(ji+1,jj,jk) )
++      !         zcu = ( 0.5 + SIGN( 0.5 , paa(ji,jj,jk) ) )
++      !         paa(ji,jj,jk) = paa(ji,jj,jk) * ( zcu * zau + ( 1._wp - zcu) * zbu )
++
++      !         zav = MIN( 1._wp, zbetdo(ji,jj,jk), zbetup(ji,jj+1,jk) )
++      !         zbv = MIN( 1._wp, zbetup(ji,jj,jk), zbetdo(ji,jj+1,jk) )
++      !         zcv = ( 0.5 + SIGN( 0.5 , pbb(ji,jj,jk) ) )
++      !         pbb(ji,jj,jk) = pbb(ji,jj,jk) * ( zcv * zav + ( 1._wp - zcv) * zbv )
++
++      !! monotonic flux in the k direction, i.e. pcc
++      !! -------------------------------------------
++      !         za = MIN( 1., zbetdo(ji,jj,jk+1), zbetup(ji,jj,jk) )
++      !         zb = MIN( 1., zbetup(ji,jj,jk+1), zbetdo(ji,jj,jk) )
++      !         zc = ( 0.5 + SIGN( 0.5 , pcc(ji,jj,jk+1) ) )
++      !         pcc(ji,jj,jk+1) = pcc(ji,jj,jk+1) * ( zc * za + ( 1._wp - zc) * zb )
++      !      END DO
++      !   END DO
++      !END DO
+       CALL lbc_lnk( paa, 'U', -1. )   ;   CALL lbc_lnk( pbb, 'V', -1. )   ! lateral boundary condition (changed sign)
+       !
+       CALL wrk_dealloc( jpi, jpj, jpk, zbetup, zbetdo, zbup, zbdo )
+Binary files NEMO/source/optlib/libnonosc-bs.so and NEMO-optimized/source/optlib/libnonosc-bs.so differ
diff --git a/tunning/WRF/wrf-4.2.2-nproca02.patch b/tunning/WRF/wrf-4.2.2-nproca02.patch
new file mode 100644
index 0000000000000000000000000000000000000000..2f2a1a5100980c312610d8316a6457464f2ab70d
--- /dev/null
+++ b/tunning/WRF/wrf-4.2.2-nproca02.patch
@@ -0,0 +1,132 @@
+diff -uprN wrf-4.2.2/external/RSL_LITE/module_dm.F wrf-4.2.2-patch/external/RSL_LITE/module_dm.F
+--- wrf-4.2.2/external/RSL_LITE/module_dm.F	2021-01-16 01:21:58.000000000 +0800
++++ wrf-4.2.2-patch/external/RSL_LITE/module_dm.F	2021-09-26 08:47:56.525060245 +0800
+@@ -109,22 +109,34 @@ CONTAINS
+    SUBROUTINE MPASPECT( P, MINM, MINN, PROCMIN_M, PROCMIN_N )
+       IMPLICIT NONE
+       INTEGER P, M, N, MINI, MINM, MINN, PROCMIN_M, PROCMIN_N
+-      MINI = 2*P
+-      MINM = 1
+-      MINN = P
+-      DO M = 1, P
+-        IF ( MOD( P, M ) .EQ. 0 ) THEN
+-          N = P / M
+-          IF ( ABS(M-N) .LT. MINI          &
+-               .AND. M .GE. PROCMIN_M      &
+-               .AND. N .GE. PROCMIN_N      &
+-             ) THEN
+-            MINI = ABS(M-N)
+-            MINM = M
+-            MINN = N
+-          END IF
+-        END IF
+-      END DO
++      INTEGER i, j, k, size, max_nx, select_number
++      i = sqrt( real(P) )
++      do j = i, 1, -1
++        if( mod(P,j) .eq. 0 ) then
++          max_nx = j
++          exit
++        endif
++      end do
++
++      size = 0
++      do j = 4, max_nx, 1
++        if( mod(P,j) .eq. 0 ) then
++          size = size + 1
++        endif
++      end do
++      select_number = size / 2 + 1
++
++      k = 0
++      do j = 4, max_nx, 1
++        if( mod(P,j) .eq. 0 ) then
++          k = k + 1
++          if ( k .eq. select_number) then
++            MINM = j
++            MINN = P / j
++            exit
++          endif
++        endif
++      end do
+       IF ( MINM .LT. PROCMIN_M .OR. MINN .LT. PROCMIN_N ) THEN
+         WRITE( wrf_err_message , * )'MPASPECT: UNABLE TO GENERATE PROCESSOR MESH. STOPPING.'
+         CALL wrf_message ( TRIM ( wrf_err_message ) )
+diff -uprN wrf-4.2.2/share/module_check_a_mundo.F wrf-4.2.2-patch/share/module_check_a_mundo.F
+--- wrf-4.2.2/share/module_check_a_mundo.F	2021-01-16 01:21:58.000000000 +0800
++++ wrf-4.2.2-patch/share/module_check_a_mundo.F	2021-09-26 08:47:52.275060093 +0800
+@@ -2490,40 +2490,40 @@
+       ENDDO
+ #endif
+ 
+-#if ( ( EM_CORE == 1) && ( defined(DM_PARALLEL) )&& ( ! defined(STUBMPI) ) )
+-!-----------------------------------------------------------------------
+-! Did the user ask for too many MPI tasks, or are those tasks poorly distributed.
+-!-----------------------------------------------------------------------
+-
+-      oops = 0
+-      DO i = 1, model_config_rec % max_dom
+-         IF ( .NOT. model_config_rec % grid_allowed(i) ) CYCLE
+-         IF ( ( model_config_rec % e_we(i) / model_config_rec % nproc_x .LT. 10 ) .OR. &
+-              ( model_config_rec % e_sn(i) / model_config_rec % nproc_y .LT. 10 ) ) THEN
+-            WRITE ( wrf_err_message , * ) 'For domain ',i,', the domain size is too small for this many processors, ', &
+-                                          'or the decomposition aspect ratio is poor.'
+-            CALL wrf_debug ( 0, TRIM( wrf_err_message ) )
+-            WRITE ( wrf_err_message , * ) 'Minimum decomposed computational patch size, either x-dir or y-dir, is 10 grid cells.'
+-            CALL wrf_debug ( 0, TRIM( wrf_err_message ) )
+-            WRITE ( wrf_err_message , fmt='(a,i5,a,i4,a,i4)' ) &
+-               'e_we = ', model_config_rec % e_we(i),', nproc_x = ',model_config_rec % nproc_x, &
+-               ', with cell width in x-direction = ', &
+-               model_config_rec % e_we(i) / model_config_rec % nproc_x
+-            CALL wrf_debug ( 0, TRIM( wrf_err_message ) )
+-            WRITE ( wrf_err_message , fmt='(a,i5,a,i4,a,i4)' ) &
+-               'e_sn = ', model_config_rec % e_sn(i),', nproc_y = ',model_config_rec % nproc_y, &
+-               ', with cell width in y-direction = ', &
+-               model_config_rec % e_sn(i) / model_config_rec % nproc_y
+-            CALL wrf_debug ( 0, TRIM( wrf_err_message ) )
+-            wrf_err_message = '--- ERROR: Reduce the MPI rank count, or redistribute the tasks.'
+-            CALL wrf_debug ( 0, TRIM( wrf_err_message ) )
+-            oops = oops + 1
+-         END IF
+-      ENDDO
+-      IF ( oops .GT. 0 ) THEN
+-         count_fatal_error = count_fatal_error + 1
+-      END IF
+-#endif
++!#if ( ( EM_CORE == 1) && ( defined(DM_PARALLEL) )&& ( ! defined(STUBMPI) ) )
++!!-----------------------------------------------------------------------
++!! Did the user ask for too many MPI tasks, or are those tasks poorly distributed.
++!!-----------------------------------------------------------------------
++!
++!      oops = 0
++!      DO i = 1, model_config_rec % max_dom
++!         IF ( .NOT. model_config_rec % grid_allowed(i) ) CYCLE
++!         IF ( ( model_config_rec % e_we(i) / model_config_rec % nproc_x .LT. 10 ) .OR. &
++!              ( model_config_rec % e_sn(i) / model_config_rec % nproc_y .LT. 10 ) ) THEN
++!            WRITE ( wrf_err_message , * ) 'For domain ',i,', the domain size is too small for this many processors, ', &
++!                                          'or the decomposition aspect ratio is poor.'
++!            CALL wrf_debug ( 0, TRIM( wrf_err_message ) )
++!            WRITE ( wrf_err_message , * ) 'Minimum decomposed computational patch size, either x-dir or y-dir, is 10 grid cells.'
++!            CALL wrf_debug ( 0, TRIM( wrf_err_message ) )
++!            WRITE ( wrf_err_message , fmt='(a,i5,a,i4,a,i4)' ) &
++!               'e_we = ', model_config_rec % e_we(i),', nproc_x = ',model_config_rec % nproc_x, &
++!               ', with cell width in x-direction = ', &
++!               model_config_rec % e_we(i) / model_config_rec % nproc_x
++!            CALL wrf_debug ( 0, TRIM( wrf_err_message ) )
++!            WRITE ( wrf_err_message , fmt='(a,i5,a,i4,a,i4)' ) &
++!               'e_sn = ', model_config_rec % e_sn(i),', nproc_y = ',model_config_rec % nproc_y, &
++!               ', with cell width in y-direction = ', &
++!               model_config_rec % e_sn(i) / model_config_rec % nproc_y
++!            CALL wrf_debug ( 0, TRIM( wrf_err_message ) )
++!            wrf_err_message = '--- ERROR: Reduce the MPI rank count, or redistribute the tasks.'
++!            CALL wrf_debug ( 0, TRIM( wrf_err_message ) )
++!            oops = oops + 1
++!         END IF
++!      ENDDO
++!      IF ( oops .GT. 0 ) THEN
++!         count_fatal_error = count_fatal_error + 1
++!      END IF
++!#endif
+ 
+ 
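
The data.*.config templates added above all share one INI-like layout, and the [RUN] section is split into a launcher prefix ("run") and the application command ("binary"). As a quick illustration of how such a template could be consumed, here is a minimal Python sketch; it only assumes plain [SECTION] blocks and "key = value" lines, and the function names are hypothetical, not HPCRunner's actual API.

    import re

    def load_template(path):
        """Split a data.*.config file into {SECTION: [lines]} blocks."""
        sections, current = {}, None
        with open(path) as fh:
            for raw in fh:
                line = raw.rstrip("\n")
                m = re.fullmatch(r"\[([A-Z]+)\]", line.strip())
                if m:                       # a [SECTION] header opens a new block
                    current = m.group(1)
                    sections[current] = []
                elif current is not None:   # all other lines belong to the open block
                    sections[current].append(line)
        return sections

    def run_command(sections):
        """Compose the launch line from the [RUN] keys (run, binary)."""
        kv = dict((k.strip(), v.strip())
                  for k, v in (l.split("=", 1) for l in sections["RUN"] if "=" in l))
        return f"{kv.get('run', '')} {kv['binary']}".strip()

    # For templates/data.vasp.config this would print:
    # mpirun -mca pml ucx -mca btl ^vader,tcp,openib,uct -np 128 vasp_std
    print(run_command(load_template("templates/data.vasp.config")))

Under this reading, [ENV], [BUILD], [CLEAN], and [BATCH] bodies are executed as shell snippets rather than parsed as key/value pairs, which is why a [BATCH] section may legitimately contain a #!/bin/bash line.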