From 2d82dc47c9f68fea261d0390befb6d0b22ca109b Mon Sep 17 00:00:00 2001
From: Qiyu8
Date: Tue, 13 Dec 2022 11:24:58 +0800
Subject: [PATCH 1/3] update namd config

---
 .../namd/2.14/data.namd.arm.gpu.opt.config    |  17 +-
 templates/namd/2.14/ns_per_day.py             | 154 ++++++++++++++++++
 2 files changed, 165 insertions(+), 6 deletions(-)
 create mode 100644 templates/namd/2.14/ns_per_day.py

diff --git a/templates/namd/2.14/data.namd.arm.gpu.opt.config b/templates/namd/2.14/data.namd.arm.gpu.opt.config
index 07f9f51..b1f5ad4 100644
--- a/templates/namd/2.14/data.namd.arm.gpu.opt.config
+++ b/templates/namd/2.14/data.namd.arm.gpu.opt.config
@@ -2,14 +2,16 @@
 1.1.1.1
 
 [DOWNLOAD]
+#download NAMD_2.14 and charm-6.10.2 by hand
 https://www.ks.uiuc.edu/Development/Download/download.cgi?PackageName=NAMD
+http://charm.cs.illinois.edu/distrib/charm-6.10.2.tar.gz
 
 [DEPENDENCY]
 yum install -y fftw-devel tcl-devel
 module use ./software/modulefiles
 module purge
-./jarvis -install bisheng/2.4.0 com
-module load bisheng/2.4.0
+./jarvis -install bisheng/2.1.0 com
+module load bisheng/2.1.0
 export CC=clang CXX=clang++ FC=flang
 ./jarvis -install hmpi/1.1.1 clang
 tar xzf $JARVIS_DOWNLOAD/NAMD_2.14_Source.tar.gz
@@ -19,14 +21,16 @@ tar xf $JARVIS_DOWNLOAD/charm-6.10.2.tar
 [ENV]
 module use ./software/modulefiles
 module purge
-module load bisheng/2.4.0
+module load bisheng/2.1.0
 module load hmpi/1.1.1
 export CC=clang CXX=clang++ FC=flang
+export PATH=/usr/local/cuda/bin:$PATH
 
 [APP]
 app_name = NAMD
 build_dir = $JARVIS_ROOT/NAMD_2.14_Source/
-binary_dir = $JARVIS_ROOT/NAMD_2.14_Source/Linux-ARM64-g++/
+#binary_dir = $JARVIS_ROOT/NAMD_2.14_Source/Linux-ARM64-g++
+binary_dir =
 case_dir = ${JARVIS_ROOT}/workloads/namd/stmv
 
 [BUILD]
@@ -64,8 +68,9 @@ make -j
 make clean
 
 [RUN]
-run =
-binary = namd2 +p126 +setcpuaffinity +maffinity +isomalloc_sync +devices 0,2 stmv_nve_cuda.namd 2>&1 | tee namd.log && python $JARVIS_ROOT/templates/namd/2.14/ns_per_day.py namd.log
+run = rm -rf namd_* && dsub -s dsub_namd_1n.sh
+binary =
+# binary = namd2 +p126 +setcpuaffinity +maffinity +isomalloc_sync +devices 0,1 stmv_nve_cuda.namd 2>&1 | tee namd.log && python2 $JARVIS_ROOT/templates/namd/2.14/ns_per_day.py namd.log
 nodes = 1
 
 [PERF]
diff --git a/templates/namd/2.14/ns_per_day.py b/templates/namd/2.14/ns_per_day.py
new file mode 100644
index 0000000..e99bf10
--- /dev/null
+++ b/templates/namd/2.14/ns_per_day.py
@@ -0,0 +1,154 @@
+#!/usr/bin/python
+#
+# ns_per_day.py
+#
+# Find nanoseconds per day simulation rate from log file
+# by averaging time per step using "TIMING:" lines.
+# Also need to determine TIMESTEP value.
+#
+# Usage: ns_per_day.py [-s SCALING] [-r] [-l] [-q] [-f] LOGFILE
+# Runs under Python 2.6+ and Python 3.
+#
+from __future__ import print_function
+import sys
+import math
+scaling = 3 # scaling of standard deviation
+do_remove = False # remove outliers?
+do_list = False # list all timings?
+do_quiet = False # print only final ns/day value
+fname = '' # the file name for value
+argcnt = len(sys.argv)
+i = 1
+while i < argcnt:
+    arg = sys.argv[i]
+    if arg=="-s" or arg=="--scaling":
+        if i >= argcnt-1:
+            sys.stderr.write('Missing value for scaling\n')
+            sys.exit(1)
+        scaling = float(sys.argv[i+1])
+        i += 1
+    elif arg=="-f" or arg=="--file":
+        if i >= argcnt-1:
+            sys.stderr.write('Missing name of log file\n')
+            sys.exit(1)
+        fname = sys.argv[i+1]
+        i += 1
+    elif arg=="-r" or arg=="--remove":
+        do_remove = True
+    elif arg=="-l" or arg=="--list":
+        do_list = True
+    elif arg=="-q" or arg=="--quiet":
+        do_quiet = True
+    elif i < argcnt-1:
+        sys.stderr.write('Found extra argument after file name\n')
+        sys.exit(1)
+    else:
+        fname = arg
+    i += 1
+
+if fname=='':
+    sys.stderr.write('No log file specified\n')
+    sys.exit(1)
+
+# for debugging
+#print 'scaling= ', scaling
+#print 'do_remove= ', do_remove
+#print 'do_list= ', do_list
+#print 'fname= ', fname
+
+dt = 1.0 # assume time step of 1.0 until otherwise specified
+tlist = [] # list of timings is initially empty
+with open(fname, 'r') as f:
+    for line in f:
+        s = line.split()
+        if len(s) > 2 and s[1] == 'TIMESTEP':
+            dt = float(s[2])
+        if len(s) > 7 and s[0] == 'TIMING:':
+            # parse the TIMING line for wall clock seconds per step
+            t = float(s[7].split('/')[0])
+            tlist.append(t)
+
+n = len(tlist)
+
+# calculate the mean
+t_avg = 0.0
+for t in tlist:
+    t_avg += t
+if n > 0:
+    t_avg /= n
+
+# calculate the variance
+t_var = 0.0
+for t in tlist:
+    t_var += (t - t_avg)**2
+if n > 0:
+    t_var /= n
+
+# calculate the standard deviation
+t_std = math.sqrt(t_var)
+
+# calculate outlier limit by scaling standard deviation
+t_out = scaling * t_std + t_avg
+
+# list all values?
+if do_list and not do_quiet:
+    print('----------------------------------------')
+    print('Listing %d values:' % n)
+    for t in tlist:
+        if t < t_out:
+            print(t)
+        else:
+            print(t, ' <--- outlier')
+    print('----------------------------------------')
+
+# calculate nanoseconds per day
+ns_per_day = 0.0
+if t_avg != 0.0:
+    ns_per_day = (dt / t_avg) * (60 * 60 * 24 * 1e-6)
+
+if not do_quiet:
+    # print results
+    print('Nanoseconds per day: %g' % ns_per_day)
+    print()
+    print('Mean time per step: %g' % t_avg)
+    #print('Variance: %g' % t_var)
+    print('Standard deviation: %g' % t_std)
+elif not do_remove:
+    print('%g' % ns_per_day)
+
+# remove the outliers?
+if do_remove:
+    nt_avg = 0.0 # calculate new time average
+    nt = 0 # new count
+    for t in tlist:
+        if t < t_out:
+            nt_avg += t
+            nt += 1
+    if nt > 0:
+        nt_avg /= nt
+    # calculate new variance and standard deviation
+    nt_var = 0.0
+    for t in tlist:
+        if t < t_out:
+            nt_var += (t - nt_avg)**2
+    if nt > 0:
+        nt_var /= nt
+    nt_std = math.sqrt(nt_var)
+    # calculate new nanoseconds per day
+    nns_per_day = 0.0
+    if nt_avg != 0.0:
+        nns_per_day = (dt / nt_avg) * (60 * 60 * 24 * 1e-6)
+    if not do_quiet:
+        # print adjusted results
+        print('----------------------------------------')
+        print('Remove outliers beyond (avg + %g sigma) = %g' % (scaling, t_out))
+        print('(removing %d of %d values)' % (n - nt, n))
+        print()
+        print('Adjusted nanoseconds per day: %g' % nns_per_day)
+        print()
+        print('Adjusted mean time per step: %g' % nt_avg)
+        #print('Adjusted variance: %g' % nt_var)
+        print('Adjusted standard deviation: %g' % nt_std)
+    else:
+        print('%g' % nns_per_day)
+
-- 
Gitee

From 6a5527b188f3d3949c8c6b875d67f47245638324 Mon Sep 17 00:00:00 2001
From: Qiyu8
Date: Tue, 13 Dec 2022 11:43:12 +0800
Subject: [PATCH 2/3] Add schedule script

---
 templates/namd/2.14/dsub_namd_1n.sh | 47 +++++++++++++++++++++++++++++
 1 file changed, 47 insertions(+)
 create mode 100644 templates/namd/2.14/dsub_namd_1n.sh

diff --git a/templates/namd/2.14/dsub_namd_1n.sh b/templates/namd/2.14/dsub_namd_1n.sh
new file mode 100644
index 0000000..81f62cd
--- /dev/null
+++ b/templates/namd/2.14/dsub_namd_1n.sh
@@ -0,0 +1,47 @@
+#!/bin/bash
+#DSUB -n namd_1n_2card
+#DSUB --job_type cosched
+#DSUB -N 1
+#DSUB -R "cpu=128;mem=256000;gpu=2"
+#DSUB -A root.default
+#DSUB -q root.default
+#DSUB -o namd_%J.log
+#DSUB -e namd_err_%J.log
+
+# Run the NAMD stmv benchmark on one node / two GPUs and print its ns/day.
+echo ----- print env vars -----
+if [ "${CCSCHEDULER_ALLOC_FILE}" != "" ]; then
+    echo " "
+    ls -la ${CCSCHEDULER_ALLOC_FILE}
+    echo ------ cat ${CCSCHEDULER_ALLOC_FILE}
+    cat ${CCSCHEDULER_ALLOC_FILE}
+fi
+
+export HOSTFILE=/tmp/hostfile.$$
+rm -rf $HOSTFILE
+touch $HOSTFILE
+
+ntask=`cat ${CCSCHEDULER_ALLOC_FILE} | awk -v fff="$HOSTFILE" '{}
+{
+    split($0, a, " ")
+    if (length(a[1]) >0 && length(a[3]) >0) {
+        print a[1]" slots="a[2] >> fff
+        total_task+=a[2]
+    }
+}END{print total_task}'`
+
+echo "openmpi hostfile $HOSTFILE generated:"
+echo "-----------------------"
+cat $HOSTFILE
+echo "-----------------------"
+echo "Total tasks is $ntask"
+echo "mpirun -hostfile $HOSTFILE -n $ntask "
+ulimit -s unlimited
+
+# Without pipefail the pipeline status is tee's, so a namd2 crash would be reported as success.
+set -o pipefail
+$JARVIS_ROOT/NAMD_2.14_Source/Linux-ARM64-g++/namd2 +p126 +setcpuaffinity +maffinity +isomalloc_sync +devices 0,1 stmv_nve_cuda.namd 2>&1 | tee namd.log && python2 $JARVIS_ROOT/templates/namd/2.14/ns_per_day.py namd.log
+ret=$?
+
+rm -rf $HOSTFILE
+exit $ret
-- 
Gitee

From 71286e854ad7a64290986e67cdde45495ed9dffe Mon Sep 17 00:00:00 2001
From: Qiyu8
Date: Fri, 23 Dec 2022 16:31:59 +0800
Subject: [PATCH 3/3] =?UTF-8?q?=E4=BF=AE=E5=A4=8Dhmpi=E5=8D=87=E7=BA=A7BUG?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/installService.py | 26 +++++++++++++++++++++++++-
 1 file changed, 25 insertions(+), 1 deletion(-)

diff --git a/src/installService.py b/src/installService.py
index 2bed271..14be3d9 100644
--- a/src/installService.py
+++ b/src/installService.py
@@ -88,12 +88,36 @@ class InstallService:
     def get_icc_info(self):
         return self.gen_compiler_dict("icc", ('2018', "2018.4"))
 
+    def get_hmpi_version(self):
+        """Return the HMPI version parsed from a libucg.so.* file name.
+
+        The HMPI prefix is taken as two directory levels above ``mpirun``
+        and ``<prefix>/hucx/lib`` is scanned for ``libucg.so.<version>``.
+        Returns None when ``which mpirun`` prints nothing, the directory
+        is missing, or no matching file name yields a version.
+        """
+        mpirun_path = self.get_cmd_output('which mpirun')[0]
+        if not mpirun_path:
+            return None
+        hmpi_path = os.path.dirname(os.path.dirname(mpirun_path))
+        libucg_path = os.path.join(hmpi_path, "hucx/lib")
+        if not os.path.isdir(libucg_path):
+            return None
+        libucg_so_flag = "libucg.so."
+        version = None
+        for file_name in sorted(os.listdir(libucg_path)):
+            if libucg_so_flag in file_name:
+                version = self.get_version_info(file_name)
+                if version:
+                    break
+        return version
+
     def get_hmpi_info(self):
         hmpi_info = self.get_cmd_output('ompi_info | grep "MCA coll: ucx"')[0]
         if hmpi_info == "":
             return None
         name = 'hmpi'
-        version = self.get_version_info(hmpi_info, r'Component v(\d+)\.(\d+)\.(\d+)')
+        version = self.get_hmpi_version()
         return self.gen_mpi_dict(name, version)
 
     def get_openmpi_info(self):
-- 
Gitee