diff --git a/container/stream/openeuler-bisheng2-hmpi2-stream-1.8.def b/container/stream/openeuler-bisheng2-hmpi2-stream-1.8.def new file mode 100644 index 0000000000000000000000000000000000000000..24cafbbaa15eff400570b11e2a3acfee93e91322 --- /dev/null +++ b/container/stream/openeuler-bisheng2-hmpi2-stream-1.8.def @@ -0,0 +1,35 @@ +BootStrap: docker +From: openeuler/openeuler + +%environment + source /etc/profile || true + source /etc/profile.d/modules.sh + cd /hpcrunner + source init.sh + source env.sh + +%post + # Install the necessary development environment + yum install -y environment-modules git wget unzip make flex tar + source /etc/profile || true + git config --global http.sslVerify false + git clone https://gitee.com/openeuler/hpcrunner + cd hpcrunner + source ./init.sh + ./jarvis -i + # Switch config + ./jarvis -use templates/stream/1.8/data.stream.arm.cpu.config + # download dependency + ./jarvis -d + # install dependency + ./jarvis -dp + # build stream + ./jarvis -b + # run test + ./jarvis -r + # clean cache and downloads directory + yum clean all + rm -rf downloads + +%labels + Author shaoyuteng \ No newline at end of file diff --git a/container/stream/openeuler-gcc-10.3.0-oneapi1-stream-1.8.def b/container/stream/openeuler-gcc-10.3.0-oneapi1-stream-1.8.def new file mode 100644 index 0000000000000000000000000000000000000000..4043de60f5c72c51a520557de3549ca51bcd2984 --- /dev/null +++ b/container/stream/openeuler-gcc-10.3.0-oneapi1-stream-1.8.def @@ -0,0 +1,35 @@ +BootStrap: docker +From: openeuler/openeuler + +%environment + source /etc/profile || true + source /etc/profile.d/modules.sh + cd /hpcrunner + source env.sh + +%post + # Install the necessary development environment + yum install -y environment-modules git flex wget vim tar unzip coreutils + # Install base gcc + yum install -y gcc gcc-c++ gcc-gfortran glibc-devel make libgfortran + source /etc/profile || true + git config --global http.sslVerify false + git clone 
https://gitee.com/openeuler/hpcrunner.git + cd hpcrunner + source ./init.sh + ./jarvis -i + # Switch config + ./jarvis -use templates/stream/1.8/data.stream.x86.cpu.config + # downloads stream + ./jarvis -d + # install dependency + ./jarvis -dp + # build stream + ./jarvis -b + # run stream + ./jarvis -r + # clean downloads directory + rm -rf downloads + +%labels + Author shaoyuteng \ No newline at end of file diff --git "a/doc/stream/\343\200\212\345\237\272\344\272\216openEuler\347\232\204stream\350\275\257\344\273\266\346\265\213\350\257\225\346\212\245\345\221\212\343\200\213.md" "b/doc/stream/\343\200\212\345\237\272\344\272\216openEuler\347\232\204stream\350\275\257\344\273\266\346\265\213\350\257\225\346\212\245\345\221\212\343\200\213.md" new file mode 100644 index 0000000000000000000000000000000000000000..b406ff2e9dc8c19fe060daa224f9235aa3e5dd2d --- /dev/null +++ "b/doc/stream/\343\200\212\345\237\272\344\272\216openEuler\347\232\204stream\350\275\257\344\273\266\346\265\213\350\257\225\346\212\245\345\221\212\343\200\213.md" @@ -0,0 +1,1064 @@ +# 《基于openEuler的stream软件测试报告》 + +## 1.规范性自检 + +使用Clang-Format对项目文件进行格式化 + +Clang-Format是一个广泛使用的C代码格式化器。我们在使用编辑器的缩进(TAB)功能时,由于不同编辑器的差别,有的插入的是制表符,有的是2个空格,有的是4个空格。这样如果别人用另一个编辑器来阅读程序时,可能会由于缩进的不同,导致阅读效果一团糟。为了解决这个问题,使用Clang-Format,它可以自动重新缩进,并手动指定空格的数量,自动格式化源文件。它可以通过命令行使用,也可以作为插件,在其他IDE中使用。 + +文件格式化配置参考文件`.clang-format`,文件内容如下 + +```clang-format +--- +Language: Cpp +# BasedOnStyle: LLVM +AccessModifierOffset: -2 +AlignAfterOpenBracket: Align +AlignConsecutiveMacros: false +AlignConsecutiveAssignments: false +AlignConsecutiveDeclarations: false +AlignEscapedNewlines: Right +AlignOperands: true +AlignTrailingComments: true +AllowAllArgumentsOnNextLine: true +AllowAllConstructorInitializersOnNextLine: true +AllowAllParametersOfDeclarationOnNextLine: true +AllowShortBlocksOnASingleLine: Never +AllowShortCaseLabelsOnASingleLine: false +AllowShortFunctionsOnASingleLine: All +AllowShortLambdasOnASingleLine: All 
+AllowShortIfStatementsOnASingleLine: Never +AllowShortLoopsOnASingleLine: false +AlwaysBreakAfterDefinitionReturnType: None +AlwaysBreakAfterReturnType: None +AlwaysBreakBeforeMultilineStrings: false +AlwaysBreakTemplateDeclarations: MultiLine +BinPackArguments: true +BinPackParameters: true +BraceWrapping: + AfterCaseLabel: false + AfterClass: false + AfterControlStatement: false + AfterEnum: false + AfterFunction: false + AfterNamespace: false + AfterObjCDeclaration: false + AfterStruct: false + AfterUnion: false + AfterExternBlock: false + BeforeCatch: false + BeforeElse: false + IndentBraces: false + SplitEmptyFunction: true + SplitEmptyRecord: true + SplitEmptyNamespace: true +BreakBeforeBinaryOperators: None +BreakBeforeBraces: Attach +BreakBeforeInheritanceComma: false +BreakInheritanceList: BeforeColon +BreakBeforeTernaryOperators: true +BreakConstructorInitializersBeforeComma: false +BreakConstructorInitializers: BeforeColon +BreakAfterJavaFieldAnnotations: false +BreakStringLiterals: true +ColumnLimit: 80 +CommentPragmas: '^ IWYU pragma:' +CompactNamespaces: false +ConstructorInitializerAllOnOneLineOrOnePerLine: false +ConstructorInitializerIndentWidth: 4 +ContinuationIndentWidth: 4 +Cpp11BracedListStyle: true +DeriveLineEnding: true +DerivePointerAlignment: false +DisableFormat: false +ExperimentalAutoDetectBinPacking: false +FixNamespaceComments: true +ForEachMacros: + - foreach + - Q_FOREACH + - BOOST_FOREACH +IncludeBlocks: Preserve +IncludeCategories: + - Regex: '^"(llvm|llvm-c|clang|clang-c)/' + Priority: 2 + SortPriority: 0 + - Regex: '^(<|"(gtest|gmock|isl|json)/)' + Priority: 3 + SortPriority: 0 + - Regex: '.*' + Priority: 1 + SortPriority: 0 +IncludeIsMainRegex: '(Test)?$' +IncludeIsMainSourceRegex: '' +IndentCaseLabels: false +IndentGotoLabels: true +IndentPPDirectives: None +IndentWidth: 2 +IndentWrappedFunctionNames: false +JavaScriptQuotes: Leave +JavaScriptWrapImports: true +KeepEmptyLinesAtTheStartOfBlocks: true +MacroBlockBegin: '' 
+MacroBlockEnd: '' +MaxEmptyLinesToKeep: 1 +NamespaceIndentation: None +ObjCBinPackProtocolList: Auto +ObjCBlockIndentWidth: 2 +ObjCSpaceAfterProperty: false +ObjCSpaceBeforeProtocolList: true +PenaltyBreakAssignment: 2 +PenaltyBreakBeforeFirstCallParameter: 19 +PenaltyBreakComment: 300 +PenaltyBreakFirstLessLess: 120 +PenaltyBreakString: 1000 +PenaltyBreakTemplateDeclaration: 10 +PenaltyExcessCharacter: 1000000 +PenaltyReturnTypeOnItsOwnLine: 60 +PointerAlignment: Right +ReflowComments: true +SortIncludes: true +SortUsingDeclarations: true +SpaceAfterCStyleCast: false +SpaceAfterLogicalNot: false +SpaceAfterTemplateKeyword: true +SpaceBeforeAssignmentOperators: true +SpaceBeforeCpp11BracedList: false +SpaceBeforeCtorInitializerColon: true +SpaceBeforeInheritanceColon: true +SpaceBeforeParens: ControlStatements +SpaceBeforeRangeBasedForLoopColon: true +SpaceInEmptyBlock: false +SpaceInEmptyParentheses: false +SpacesBeforeTrailingComments: 1 +SpacesInAngles: false +SpacesInConditionalStatement: false +SpacesInContainerLiterals: true +SpacesInCStyleCastParentheses: false +SpacesInParentheses: false +SpacesInSquareBrackets: false +SpaceBeforeSquareBrackets: false +Standard: Latest +StatementMacros: + - Q_UNUSED + - QT_REQUIRE_VERSION +TabWidth: 8 +UseCRLF: false +UseTab: Never +... 
+``` + +### 1.1.选择统计文件类型 + +统计项目文件类型及其文件数量 + +使用python编写脚本文件 + +```python +# -*- coding: utf-8 -*- + +import os + +print (os.getcwd()) + +def getAllFiles(targetDir): + files = [] + listFiles = os.listdir(targetDir) + for i in range(0, len(listFiles)): + path = os.path.join(targetDir, listFiles[i]) + if os.path.isdir(path): + files.extend(getAllFiles(path)) + elif os.path.isfile(path): + files.append(path) + return files + +all_files=getAllFiles(os.curdir) +type_dict=dict() + +for each_file in all_files: + if os.path.isdir(each_file): + type_dict.setdefault("文件夹",0) + type_dict["文件夹"]+=1 + else: + ext=os.path.splitext(each_file)[1] + type_dict.setdefault(ext,0) + type_dict[ext]+=1 + +for each_type in type_dict.keys(): + print ("当前文件夹下共有[%s]的文件%d个" %(each_type,type_dict[each_type])) +``` + +在stream项目根目录下运行,运行结果如下 + +```bash +[root@192 stream-1.8]# python ../count.py +/root/stream-1.8 +当前文件夹下共有[]的文件3个 +当前文件夹下共有[.c]的文件2个 +当前文件夹下共有[.f]的文件1个 +当前文件夹下共有[.o]的文件2个 +``` +### 1.3.统计不符合要求的总行数 + +对文件后缀名为`c`的所有文件进行格式化,后缀名为`f`的文件的代码是由Fortran77编写,现网支持的fortran格式化工具fprettify只支持f90以及更高的版本,所以无法对其进行格式化。 +通过git与clang-format结合的方式进行统计 + +```bash +[root@host- src]# [root@192 stream]# find . -regex '.*\.\(c\|h\)' | xargs clang-format -style=LLVM -i +[root@host- src]# git commit -m "format" +[master aff8b23] format + 2 files changed, 873 insertions(+), 858 deletions(-) + rewrite stream/mysecond.c (68%) + rewrite stream/stream_mpi.c (63%) +``` + +### 1.4.统计结果 + +综上信息,项目中代码规范性自检检查结果为 + +通过率 : 40.13% 1-858/1433*100% + +不通过率 : 59.87% 858/1433*100% + +## 2.功能性测试 + +### 2.1.所选测试案例 + +stream提供了运行的不同二进制文件,本次选取stream_mpi_c对MPI以及程序的运行情况进行检测。 + +在项目根目录下执行命令来运行Benchmark + +```bash +export OMP_NUM_THREADS=4; +./stream_mpi_c +``` + +### 2.2.运行结果 + +```bash +[root@host-10-208-220-226 Stream-1.8]# ./stream_mpi_c +------------------------------------------------------------- +STREAM version $Revision: 1.8 $ +------------------------------------------------------------- +This system uses 8 bytes per array element. 
+------------------------------------------------------------- +Total Aggregate Array size = 34603008 (elements) +Total Aggregate Memory per array = 264.0 MiB (= 0.3 GiB). +Total Aggregate memory required = 792.0 MiB (= 0.8 GiB). +Data is distributed across 1 MPI ranks + Array size per MPI rank = 34603008 (elements) + Memory per array per MPI rank = 264.0 MiB (= 0.3 GiB). + Total memory per MPI rank = 792.0 MiB (= 0.8 GiB). +------------------------------------------------------------- +Each kernel will be executed 20 times. + The *best* time for each kernel (excluding the first iteration) + will be used to compute the reported bandwidth. +The SCALAR value used for this run is 0.420000 +------------------------------------------------------------- +Your timer granularity/precision appears to be 1 microseconds. +Each test below will take on the order of 35916 microseconds. + (= 35916 timer ticks) +Increase the size of the arrays if this shows that +you are not getting at least 20 timer ticks per test. +------------------------------------------------------------- +WARNING -- The above is only a rough guideline. +For best results, please be sure you know the +precision of your system timer. 
+------------------------------------------------------------- +VERBOSE: total setup time for rank 0 = 0.095054 seconds +------------------------------------------------------------- +Function Best Rate MB/s Avg time Min time Max time +Copy: 15488.3 0.037941 0.035746 0.042000 +Scale: 15173.8 0.037793 0.036487 0.041473 +Add: 12718.6 0.065584 0.065296 0.066783 +Triad: 12637.4 0.066013 0.065715 0.067098 +------------------------------------------------------------- +VERBOSE: rank 0, AvgErrors 1.332268e-15 4.440892e-16 1.776357e-15 +Solution Validates: avg error less than 1.000000e-13 on all three arrays +Results Validation Verbose Results: + Expected a(1), b(1), c(1): 2.769001 1.144215 3.868538 + Observed a(1), b(1), c(1): 2.769001 1.144215 3.868538 + Rel Errors on a, b, c: 4.811366e-16 3.881168e-16 4.591805e-16 +------------------------------------------------------------- +VERBOSE: total shutdown time for rank 0 = 0.281207 seconds +``` + +测试结果 + +运行正常,说明各类型函数和功能都响应正常。测试通过。 + +## 3.性能测试 + +### 3.1.测试平台信息对比 + +| | arm信息 | x86信息 | +| -------- | -------------------------------- | --------------------- | +| 操作系统 | openEuler 22.03 (LTS) | openEuler 22.03 (LTS) | +| 内核版本 | 4.19.90-2012.4.0.0053.oe1.aarch64 | 4.19.90-2109.1.0.0108.oe1.x86_64 | + +### 3.2.测试软件环境信息对比 + +| | arm信息 | x86信息 | +| --- | ------------- | --------- | +| gcc | bisheng 2.1.0 | gcc 10.3.0 | +| mpi | hmpi 1.2.0 | oneapi 2021.1.0 | +| stream | 1.8 | 1.8 | + +### 3.3.测试硬件性能信息对比 + +| | arm信息 | x86信息 | +| ------ | ----------- | -------- | +| cpu | Kunpeng 920 | | +| 核心数 | 16 | 4 | +| 内存 | 32 GB | 8 GB | +| 磁盘io | 1.3 GB/s | 400 MB/s | +| 虚拟化 | KVM | KVM | + +### 3.4.测试选择的案例 + +stream进行内存带宽测试 + +### 3.6.ARM运行结果 + +#### stream_mpi_c + +```shell +------------------------------------------------------------- +STREAM version $Revision: 1.8 $ +------------------------------------------------------------- +This system uses 8 bytes per array element. 
+------------------------------------------------------------- +/*-----------------------------------------------------------------------*/ +/* License: */ +/* 1. You are free to use this program and/or to redistribute */ +/* this program. */ +/* 2. You are free to modify this program for your own use, */ +/* including commercial use, subject to the publication */ +/* restrictions in item 3. */ +/* 3. You are free to publish results obtained from running this */ +/* program, or from works that you derive from this program, */ +/* with the following limitations: */ +/* 3a. In order to be referred to as "STREAM benchmark results", */ +/* published results must be in conformance to the STREAM */ +/* Run Rules, (briefly reviewed below) published at */ +/* http://www.cs.virginia.edu/stream/ref.html */ +/* and incorporated herein by reference. */ +/* As the copyright holder, John McCalpin retains the */ +/* right to determine conformity with the Run Rules. */ +/* 3b. Results based on modified source code or on runs not in */ +/* accordance with the STREAM Run Rules must be clearly */ +/* labelled whenever they are published. Examples of */ +/* proper labelling include: */ +/* "tuned STREAM benchmark results" */ +/* "based on a variant of the STREAM benchmark code" */ +/* Other comparable, clear, and reasonable labelling is */ +/* acceptable. */ +/* 3c. Submission of results to the STREAM benchmark web site */ +/* is encouraged, but not required. */ +/* 4. Use of this program or creation of derived works based on this */ +/* program constitutes acceptance of these licensing restrictions. */ +/* 5. Absolutely no warranty is expressed or implied. 
*/ +/*-----------------------------------------------------------------------*/ + +# define _XOPEN_SOURCE 600 + +# include +# include +# include +# include +# include +# include +# include +# include +# include "mpi.h" + +/*----------------------------------------------------------------------- + * INSTRUCTIONS: + * + * 1) STREAM requires different amounts of memory to run on different + * systems, depending on both the system cache size(s) and the + * granularity of the system timer. + * You should adjust the value of 'STREAM_ARRAY_SIZE' (below) + * to meet *both* of the following criteria: + * (a) Each array must be at least 4 times the size of the + * available cache memory. I don't worry about the difference + * between 10^6 and 2^20, so in practice the minimum array size + * is about 3.8 times the cache size. + * Example 1: One Xeon E3 with 8 MB L3 cache + * STREAM_ARRAY_SIZE should be >= 4 million, giving + * an array size of 30.5 MB and a total memory requirement + * of 91.5 MB. + * Example 2: Two Xeon E5's with 20 MB L3 cache each (using OpenMP) + * STREAM_ARRAY_SIZE should be >= 20 million, giving + * an array size of 153 MB and a total memory requirement + * of 458 MB. + * (b) The size should be large enough so that the 'timing calibration' + * output by the program is at least 20 clock-ticks. + * Example: most versions of Windows have a 10 millisecond timer + * granularity. 20 "ticks" at 10 ms/tic is 200 milliseconds. + * If the chip is capable of 10 GB/s, it moves 2 GB in 200 msec. + * This means the each array must be at least 1 GB, or 128M elements. + * + * Version 5.10 increases the default array size from 2 million + * elements to 10 million elements in response to the increasing + * size of L3 caches. The new default size is large enough for caches + * up to 20 MB. + * Version 5.10 changes the loop index variables from "register int" + * to "ssize_t", which allows array indices >2^32 (4 billion) + * on properly configured 64-bit systems. 
Additional compiler options + * (such as "-mcmodel=medium") may be required for large memory runs. + * + * Array size can be set at compile time without modifying the source + * code for the (many) compilers that support preprocessor definitions + * on the compile line. E.g., + * gcc -O -DSTREAM_ARRAY_SIZE=100000000 stream.c -o stream.100M + * will override the default size of 10M with a new size of 100M elements + * per array. + */ + +// ----------------------- !!! NOTE CHANGE IN DEFINITION !!! ------------------ +// For the MPI version of STREAM, the three arrays with this many elements +// each will be *distributed* across the MPI ranks. +// +// Be careful when computing the array size needed for a particular target +// system to meet the minimum size requirement to ensure overflowing the caches. +// +// Example: +// Assume 4 nodes with two Intel Xeon E5-2680 processors (20 MiB L3) each. +// The *total* L3 cache size is 4*2*20 = 160 MiB, so each array must be +// at least 640 MiB, or at least 80 million 8 Byte elements. +// Note that it does not matter whether you use one MPI rank per node or +// 16 MPI ranks per node -- only the total array size and the total +// cache size matter. +// +#ifndef STREAM_ARRAY_SIZE +# define STREAM_ARRAY_SIZE 10000000 +#endif + +/* 2) STREAM runs each kernel "NTIMES" times and reports the *best* result + * for any iteration after the first, therefore the minimum value + * for NTIMES is 2. + * There are no rules on maximum allowable values for NTIMES, but + * values larger than the default are unlikely to noticeably + * increase the reported performance. + * NTIMES can also be set on the compile line without changing the source + * code using, for example, "-DNTIMES=7". 
+ */ +#ifdef NTIMES +#if NTIMES<=1 +extern void checkSTREAMresults(STREAM_TYPE *AvgErrByRank, int numranks); +extern void computeSTREAMerrors(STREAM_TYPE *aAvgErr, STREAM_TYPE *bAvgErr, STREAM_TYPE *cAvgErr); +#ifdef TUNED +extern void tuned_STREAM_Copy(); +extern void tuned_STREAM_Scale(STREAM_TYPE scalar); +extern void tuned_STREAM_Add(); +extern void tuned_STREAM_Triad(STREAM_TYPE scalar); +#endif +#ifdef _OPENMP +extern int omp_get_num_threads(); +#endif +int +main() + { + int quantum, checktick(); + int BytesPerWord; + int i,k; + ssize_t j; + STREAM_TYPE scalar; + double t, times[4][NTIMES]; + double *TimesByRank; + double t0,t1,tmin; + int rc, numranks, myrank; + STREAM_TYPE AvgError[3] = {0.0,0.0,0.0}; + STREAM_TYPE *AvgErrByRank; + + /* --- SETUP --- call MPI_Init() before anything else! --- */ + + rc = MPI_Init(NULL, NULL); + t0 = MPI_Wtime(); + if (rc != MPI_SUCCESS) { + printf("ERROR: MPI Initialization failed with return code %d\n",rc); + exit(1); + } + // if either of these fail there is something really screwed up! + MPI_Comm_size(MPI_COMM_WORLD, &numranks); + MPI_Comm_rank(MPI_COMM_WORLD, &myrank); + + /* --- NEW FEATURE --- distribute requested storage across MPI ranks --- */ + array_elements = STREAM_ARRAY_SIZE / numranks; // don't worry about rounding vs truncation + array_alignment = 64; // Can be modified -- provides partial support for adjusting relative alignment + + // Dynamically allocate the three arrays using "posix_memalign()" + // NOTE that the OFFSET parameter is not used in this version of the code! 
+ array_bytes = array_elements * sizeof(STREAM_TYPE); + k = posix_memalign((void **)&a, array_alignment, array_bytes); + if (k != 0) { + printf("Rank %d: Allocation of array a failed, return code is %d\n",myrank,k); + MPI_Abort(MPI_COMM_WORLD, 2); + exit(1); + } + k = posix_memalign((void **)&b, array_alignment, array_bytes); + if (k != 0) { + printf("Rank %d: Allocation of array b failed, return code is %d\n",myrank,k); + MPI_Abort(MPI_COMM_WORLD, 2); + exit(1); + } + k = posix_memalign((void **)&c, array_alignment, array_bytes); + if (k != 0) { + printf("Rank %d: Allocation of array c failed, return code is %d\n",myrank,k); + MPI_Abort(MPI_COMM_WORLD, 2); + exit(1); + } + + // Initial informational printouts -- rank 0 handles all the output + if (myrank == 0) { + printf(HLINE); + printf("STREAM version $Revision: 1.8 $\n"); + printf(HLINE); + BytesPerWord = sizeof(STREAM_TYPE); + printf("This system uses %d bytes per array element.\n", + BytesPerWord); + + printf(HLINE); +#ifdef N + printf("***** WARNING: ******\n"); + printf(" It appears that you set the preprocessor variable N when compiling this code.\n"); + printf(" This version of the code uses the preprocesor variable STREAM_ARRAY_SIZE to control the array size\n"); + printf(" Reverting to default value of STREAM_ARRAY_SIZE=%llu\n",(unsigned long long) STREAM_ARRAY_SIZE); + printf("***** WARNING: ******\n"); +#endif + if (OFFSET != 0) { + printf("***** WARNING: ******\n"); + printf(" This version ignores the OFFSET parameter.\n"); + printf("***** WARNING: ******\n"); + } + + printf("Total Aggregate Array size = %llu (elements)\n" , (unsigned long long) STREAM_ARRAY_SIZE); + printf("Total Aggregate Memory per array = %.1f MiB (= %.1f GiB).\n", + BytesPerWord * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024.0), + BytesPerWord * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024.0/1024.0)); + printf("Total Aggregate memory required = %.1f MiB (= %.1f GiB).\n", + (3.0 * BytesPerWord) * ( (double) STREAM_ARRAY_SIZE / 
1024.0/1024.), + (3.0 * BytesPerWord) * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024./1024.)); + printf("Data is distributed across %d MPI ranks\n",numranks); + printf(" Array size per MPI rank = %llu (elements)\n" , (unsigned long long) array_elements); + printf(" Memory per array per MPI rank = %.1f MiB (= %.1f GiB).\n", + BytesPerWord * ( (double) array_elements / 1024.0/1024.0), + BytesPerWord * ( (double) array_elements / 1024.0/1024.0/1024.0)); + printf(" Total memory per MPI rank = %.1f MiB (= %.1f GiB).\n", + (3.0 * BytesPerWord) * ( (double) array_elements / 1024.0/1024.), + (3.0 * BytesPerWord) * ( (double) array_elements / 1024.0/1024./1024.)); + + printf(HLINE); + printf("Each kernel will be executed %d times.\n", NTIMES); + printf(" The *best* time for each kernel (excluding the first iteration)\n"); + printf(" will be used to compute the reported bandwidth.\n"); + printf("The SCALAR value used for this run is %f\n",SCALAR); + +#ifdef _OPENMP + printf(HLINE); +#pragma omp parallel + { +#pragma omp master + { + k = omp_get_num_threads(); + printf ("Number of Threads requested for each MPI rank = %i\n",k); + } + } +#endif + +#ifdef _OPENMP + k = 0; +#pragma omp parallel +#pragma omp atomic + k++; + printf ("Number of Threads counted for rank 0 = %i\n",k); +#endif + + } + + /* --- SETUP --- initialize arrays and estimate precision of timer --- */ + +#pragma omp parallel for + for (j=0; j epsilon) { + err++; + printf ("Failed Validation on array a[], AvgRelAbsErr > epsilon (%e)\n",epsilon); + printf (" Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n",aj,aAvgErr,abs(aAvgErr)/aj); + ierr = 0; + for (j=0; j epsilon) { + ierr++; +#ifdef VERBOSE + if (ierr < 10) { + printf(" array a: index: %ld, expected: %e, observed: %e, relative error: %e\n", + j,aj,a[j],abs((aj-a[j])/aAvgErr)); + } +#endif + } + } + printf(" For array a[], %d errors were found.\n",ierr); + } + if (abs(bAvgErr/bj) > epsilon) { + err++; + printf ("Failed Validation on array b[], 
AvgRelAbsErr > epsilon (%e)\n",epsilon); + printf (" Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n",bj,bAvgErr,abs(bAvgErr)/bj); + printf (" AvgRelAbsErr > Epsilon (%e)\n",epsilon); + ierr = 0; + for (j=0; j epsilon) { + ierr++; +#ifdef VERBOSE + if (ierr < 10) { + printf(" array b: index: %ld, expected: %e, observed: %e, relative error: %e\n", + j,bj,b[j],abs((bj-b[j])/bAvgErr)); + } +#endif + } + } + printf(" For array b[], %d errors were found.\n",ierr); + } + if (abs(cAvgErr/cj) > epsilon) { + err++; + printf ("Failed Validation on array c[], AvgRelAbsErr > epsilon (%e)\n",epsilon); + printf (" Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n",cj,cAvgErr,abs(cAvgErr)/cj); + printf (" AvgRelAbsErr > Epsilon (%e)\n",epsilon); + ierr = 0; + for (j=0; j epsilon) { + ierr++; +#ifdef VERBOSE + if (ierr < 10) { + printf(" array c: index: %ld, expected: %e, observed: %e, relative error: %e\n", + j,cj,c[j],abs((cj-c[j])/cAvgErr)); + } +#endif + } + } + printf(" For array c[], %d errors were found.\n",ierr); + } + if (err == 0) { + printf ("Solution Validates: avg error less than %e on all three arrays\n",epsilon); + } +#ifdef VERBOSE + printf ("Results Validation Verbose Results: \n"); + printf (" Expected a(1), b(1), c(1): %f %f %f \n",aj,bj,cj); + printf (" Observed a(1), b(1), c(1): %f %f %f \n",a[1],b[1],c[1]); + printf (" Rel Errors on a, b, c: %e %e %e \n",abs(aAvgErr/aj),abs(bAvgErr/bj),abs(cAvgErr/cj)); +#endif +} + +#ifdef TUNED +/* stubs for "tuned" versions of the kernels */ +void tuned_STREAM_Copy() +{ + ssize_t j; +#pragma omp parallel for + for (j=0; j + +- GITHUB托管地址: + +## 2.环境要求 + +- 操作系统:OpenEuler arm/x86 (本文档以 x86 架构为例) + +## 3.配置编译环境 + +配置环境指导,手动进行配置依赖环境。 + +### 3.1.环境总览 + +- 编译器: gcc + +- MPI:hmpi + + 具体版本和下载地址如下 + +| 名称 | 版本 | 软件下载地址 | +| -------- | ------ | ---------------------------------------------------------------------------------------------- | +| oneapi | 2021.1.0 | +| gcc | 10.3.0 | | + +### 3.2.创建文件夹 + +```bash +mkdir -p 
$HOME/build +mkdir -p $HOME/install +mkdir -p $HOME/tmp +``` + +### 3.3.安装预设 + +设置环境变量,方便修改自定义安装目录 + +- 编译目录为 $HOME/build , 根据实际情况进行修改 +- 软件安装目录为 $HOME/install , 根据实际情况进行修改 +- 下载目录为 $HOME/tmp , 根据实际情况进行修改 + +```bash +#为了方便自定义软件安装目录 +#环境变量DEP_INSTALL_DIR将在后文中作为软件安装目录的根目录 +export DEP_INSTALL_DIR=$HOME/install +#环境变量DEP_BUILD_DIR将在后文中作为编译的根目录 +export DEP_BUILD_DIR=$HOME/build +#环境变量DEP_DOWNLOAD_DIR将在后文中作为下载文件的保存目录 +export DEP_DOWNLOAD_DIR=$HOME/tmp + +#注: 以上变量只在一次会话中有效。如果中途断开ssh会话,则在后续的安装过程中不会生效,需要重新运行 +``` + +### 3.4.安装环境依赖和gcc编译器 + +```bash +#环境依赖 +yum -y install wget tar libatomic + +#安装gcc编译器 +wget -P $DEP_DOWNLOAD_DIR https://ftp.gnu.org/gnu/gcc/gcc-10.3.0/gcc-10.3.0.tar.gz +tar -xf $DEP_DOWNLOAD_DIR/gcc-10.3.0.tar.gz -C $DEP_INSTALL_DIR +sed -i "35s/ftp/http/g" ./contrib/download_prerequisites +./contrib/download_prerequisites +./configure --disable-multilib --enable-languages="c,c++,fortran" --prefix=$1 --disable-static --enable-shared +make -j && make install +#设置环境变量 +echo "export PATH=$DEP_INSTALL_DIR/gcc-10.3.0/bin:$PATH" >> ~/.bashrc && source ~/.bashrc +export CC=`which gcc` +export CXX=`which g++` +``` + +### 3.5.下载并编译hmpi + +```bash +yum -y install libstdc++ libstdc++-devel +yum -y install unzip make autoconf automake git libtool +#下载解压源码文件 +wget https://github.com/kunpengcompute/hucx/archive/refs/tags/v1.1.1-huawei.zip -O $DEP_DOWNLOAD_DIR/hucx-1.1.1-huawei.zip +wget https://github.com/kunpengcompute/xucg/archive/refs/tags/v1.1.1-huawei.zip -O $DEP_DOWNLOAD_DIR/xucg-1.1.1-huawei.zip +wget https://github.com/kunpengcompute/hmpi/archive/refs/tags/v1.1.1-huawei.zip -O $DEP_DOWNLOAD_DIR/hmpi-1.1.1-huawei.zip + +cd $DEP_BUILD_DIR +unzip -q $DEP_DOWNLOAD_DIR/hucx-1.1.1-huawei.zip +unzip -q $DEP_DOWNLOAD_DIR/xucg-1.1.1-huawei.zip +unzip -q $DEP_DOWNLOAD_DIR/hmpi-1.1.1-huawei.zip +cp -rf xucg-1.1.1-huawei/* hucx-1.1.1-huawei/src/ucg/ + +#编译hucx +cd $DEP_BUILD_DIR/hucx-1.1.1-huawei +./autogen.sh +./contrib/configure-opt 
--prefix=$DEP_INSTALL_DIR/hmpi/hucx CFLAGS="-DHAVE___CLEAR_CACHE=1" --disable-numa --without-java +for file in `find . -name Makefile`;do sed -i "s/-Werror//g" $file;done +for file in `find . -name Makefile`;do sed -i "s/-implicit-function-declaration//g" $file;done +make -j +make install + +yum -y install flex +#编译hmpi +cd $DEP_BUILD_DIR/hmpi-1.1.1-huawei +./autogen.pl +./configure --prefix=$DEP_INSTALL_DIR/hmpi --with-platform=contrib/platform/mellanox/optimized --enable-mpi1-compatibility --with-ucx=$DEP_INSTALL_DIR/hmpi/hucx +make -j +make install + +echo "export PATH=$DEP_INSTALL_DIR/hmpi/bin:$PATH" >> ~/.bashrc && source ~/.bashrc +export CC=mpicc CXX=mpicxx FC=mpifort F77=mpifort +## 4.编译stream + +### 4.1.下载并编译stream + +获取stream软件源码并解压文件 + +```bash + +# 下载源码文件 +wget https://www.cs.virginia.edu/stream/FTP/Code/Versions/stream_mpi.c +wget https://www.cs.virginia.edu/stream/FTP/Code/Versions/stream_mpi.f +wget https://www.cs.virginia.edu/stream/FTP/Code/mysecond.c + +#编译源码 +export CC=mpicc CXX=mpicxx FC=mpifort F77=mpifort +mpiicc -O3 -ffreestanding -qopenmp -qopt-streaming-stores=always \ +-DSTREAM_ARRAY_SIZE=100000000 -DNTIMES=20 -DVERBOSE \ +stream_mpi.c -o stream_mpi_c +icc -c mysecond.c +mpiifort -c stream_mpi.f +mpiifort -O3 -qopenmp -qopt-streaming-stores=always stream_mpi.o mysecond.o -o stream_mpi_f +``` + +### 4.2. 
运行测试文件 + +运行stream项目文件 + +```bash +${JARVIS_LIBS}/stream-1.8/bin/stream_mpi_c && ${JARVIS_LIBS}/stream-1.8/bin/stream_mpi_f +``` + +## 附A:使用hpcrunner进行一键安装 + +推荐使用hpcrunner进行安装stream + +### 1.克隆仓库 + +```bash +git clone https://gitee.com/openeuler/hpcrunner.git +``` + +## 2.初始化hpcrunner 和 安装必要软件包 + +初始化项目助手 + +```bash +cd hpcrunner +source init.sh +``` + +安装必要软件包 + +**arm / x86 需要的软件包不同,根据实际环境进行选择** + +```bash +# arm +yum install -y environment-modules git wget unzip make flex tar +# x86 +yum install -y environment-modules git wget unzip make flex tar +yum install -y gcc gcc-c++ gcc-gfortran glibc-devel libgfortran +yum install -y tcsh tcl lsof tk bc +``` + +### 3.选择平台对应配置文件 + +- arm平台的配置文件为 `templates/stream/1.8/data.stream.arm.cpu.config` + + ```bash + ./jarvis -use templates/stream/1.8/data.stream.arm.cpu.config + ``` + +- x86 平台的配置文件为 `templates/stream/1.8/data.stream.x86.cpu.config` + + ```bash + ./jarvis -use templates/stream/1.8/data.stream.x86.cpu.config + ``` + +### 4.下载stream源码 + +```bash +./jarvis -d +``` + +### 5.一键配置依赖环境 + +```bash +./jarvis -dp +./jarvis -r +``` + +## 附B:使用singularity运行容器 + +### 使用教程 + +### 下载容器镜像 + +通过链接下载: + +[百度云盘](https://pan.baidu.com/s/1b-pDtKfXEnWyqSQSnb27OQ?pwd=hus8) + +或者扫码下载: + +![百度云]() + +#### 1.安装singularity + +安装singularity, + +具体步骤如下 + +```bash +mkdir -p ~/install +mkdir -p ~/build + +#安装编译所需依赖 +yum -y install libatomic libstdc++ libstdc++-devel libseccomp-devel glib2-devel gcc squashfs-tools tar + +#安装bisheng编译器 +cd ~/build +wget https://mirrors.huaweicloud.com/kunpeng/archive/compiler/bisheng_compiler/bisheng-compiler-2.1.0-aarch64-linux.tar.gz +tar -C ~/install -xf bisheng-compiler-2.1.0-aarch64-linux.tar.gz +echo "export PATH=$HOME/install/bisheng-compiler-2.1.0-aarch64-linux/bin:$PATH" >> ~/.bashrc && source ~/.bashrc +export CC=`which clang` +export CXX=`which clang++` + +#安装go编译器 +cd ~/build +wget https://go.dev/dl/go1.19.linux-arm64.tar.gz +tar -C ~/install -xf go1.19.linux-arm64.tar.gz +echo "export 
PATH=$HOME/install/go/bin:$PATH" >> ~/.bashrc && source ~/.bashrc + +#安装singularity +cd ~/build +wget https://github.com/sylabs/singularity/releases/download/v3.10.2/singularity-ce-3.10.2.tar.gz +tar -xf singularity-ce-3.10.2.tar.gz +cd singularity-ce-3.10.2 +./mconfig --prefix=$HOME/install/singularity +make -C ./builddir +make -C ./builddir install +echo "export PATH=$HOME/install/singularity/bin:$PATH" >> ~/.bashrc && source ~/.bashrc +``` + +#### 2.构建镜像 + +```shell +# x86 +singularity build ./name-of-image.sif openeuler-gcc-10.3.0-oneapi1-stream-1.8.def +# arm +singularity build ./name-of-image.sif openeuler-bisheng2-hmpi2-stream-1.8.def +# 转换为沙盒 +singularity build --sandbox image-sandbox name-of-image.sif +``` + +#### 3.在沙盒中运行 + +```shell +#进入沙盒 +singularity shell -w image-sandbox +#在沙盒中运行内置的测试案例 +cd /hpcrunner +./jarvis -r +``` \ No newline at end of file diff --git a/templates/stream/1.8/data.stream.arm.cpu.config b/templates/stream/1.8/data.stream.arm.cpu.config new file mode 100644 index 0000000000000000000000000000000000000000..e6a1bbe2286828cd4d56adb866d34ec81a422ad9 --- /dev/null +++ b/templates/stream/1.8/data.stream.arm.cpu.config @@ -0,0 +1,52 @@ +[SERVER] +11.11.11.11 + +[DOWNLOAD] +stream_mpi.c/2014.10.21 https://www.cs.virginia.edu/stream/FTP/Code/Versions/stream_mpi.c +stream_mpi.f/2014.2.14 https://www.cs.virginia.edu/stream/FTP/Code/Versions/stream_mpi.f +mysecond.c/2009.2.19 https://www.cs.virginia.edu/stream/FTP/Code/mysecond.c + +[DEPENDENCY] +set -e +set -x +module purge +mkdir -p ${JARVIS_TMP}/stream-1.8 +module use ./software/modulefiles +./jarvis -install bisheng/2.1.0 com +module load bisheng/2.1.0 +export CC=clang CXX=clang++ FC=flang +./jarvis -install hmpi/1.2.0 clang +module load hmpi/1.2.0 +cd ${JARVIS_TMP} +mv ${JARVIS_DOWNLOAD}/stream_mpi.c ${JARVIS_TMP}/stream-1.8 +mv ${JARVIS_DOWNLOAD}/stream_mpi.f ${JARVIS_TMP}/stream-1.8 +mv ${JARVIS_DOWNLOAD}/mysecond.c ${JARVIS_TMP}/stream-1.8 + +[ENV] +module purge +module use 
${JARVIS_ROOT}/software/modulefiles +module load bisheng/2.1.0 +export CC=clang CXX=clang++ FC=flang +module load hmpi/1.2.0 +export CC=mpicc CXX=mpicxx FC=mpifort F77=mpifort + +[APP] +app_name = stream +build_dir = ${JARVIS_TMP}/stream-1.8 +binary_dir = ${JARVIS_LIBS}/stream-1.8 +case_dir = + +[BUILD] +mpicc -O3 -ffreestanding -fopenmp \ +-DSTREAM_ARRAY_SIZE=34603008 -DNTIMES=20 -DVERBOSE \ +stream_mpi.c -o stream_mpi_c +mpicc -c mysecond.c +mpifort -c stream_mpi.f +mpifort -O3 -fopenmp mysecond.o stream_mpi.o -o stream_mpi_f +mkdir -p ${JARVIS_LIBS}/stream-1.8 +cp -r stream_mpi_* ${JARVIS_LIBS}/stream-1.8 + +[RUN] +run = ${JARVIS_LIBS}/stream-1.8/stream_mpi_f && ${JARVIS_LIBS}/stream-1.8/stream_mpi_c +binary = +nodes = 1 \ No newline at end of file diff --git a/templates/stream/1.8/data.stream.x86.cpu.config b/templates/stream/1.8/data.stream.x86.cpu.config new file mode 100644 index 0000000000000000000000000000000000000000..1704f4de6276ca2e5f52c7e853aab6173a72349d --- /dev/null +++ b/templates/stream/1.8/data.stream.x86.cpu.config @@ -0,0 +1,51 @@ +[SERVER] +11.11.11.11 + +[DOWNLOAD] +stream_mpi.c/2014.10.21 https://www.cs.virginia.edu/stream/FTP/Code/Versions/stream_mpi.c +stream_mpi.f/2014.2.14 https://www.cs.virginia.edu/stream/FTP/Code/Versions/stream_mpi.f +mysecond.c/2009.2.19 https://www.cs.virginia.edu/stream/FTP/Code/mysecond.c + +[DEPENDENCY] +set -x +set -e +module purge +./jarvis -install gcc/10.3.0 com +module use ./software/modulefiles +module load gcc/10.3.0 +export CC=`which gcc` +export CXX=`which g++` +export FC=`which gfortran` +./jarvis -install oneapi/2021.1.0 gcc +source /opt/intel/oneapi/setvars.sh +mkdir -p ${JARVIS_TMP}/stream-1.8 +cd ${JARVIS_TMP} +mv ${JARVIS_DOWNLOAD}/stream_mpi.c ${JARVIS_TMP}/stream-1.8 +mv ${JARVIS_DOWNLOAD}/stream_mpi.f ${JARVIS_TMP}/stream-1.8 +mv ${JARVIS_DOWNLOAD}/mysecond.c ${JARVIS_TMP}/stream-1.8 + +[ENV] +module purge +source 
/opt/intel/oneapi/setvars.sh +export CC=mpiicc FC=mpiifort F77=mpiifort + +[APP] +app_name = stream +build_dir = ${JARVIS_TMP}/stream-1.8 +binary_dir = ${JARVIS_LIBS}/stream-1.8 +case_dir = + +[BUILD] +mpiicc -O3 -ffreestanding -qopenmp -qopt-streaming-stores=always \ +-DSTREAM_ARRAY_SIZE=8650752 -DNTIMES=20 -DVERBOSE \ +stream_mpi.c -o stream_mpi_c +icc -c mysecond.c +mpiifort -c stream_mpi.f +mpiifort -O3 -qopenmp -qopt-streaming-stores=always stream_mpi.o mysecond.o -o stream_mpi_f +mkdir -p ${JARVIS_LIBS}/stream-1.8 +cp -r stream_mpi_* ${JARVIS_LIBS}/stream-1.8 + +[RUN] +run = ${JARVIS_LIBS}/stream-1.8/stream_mpi_c && ${JARVIS_LIBS}/stream-1.8/stream_mpi_f +binary = +nodes = 1 \ No newline at end of file diff --git a/test/test-stream.sh b/test/test-stream.sh new file mode 100644 index 0000000000000000000000000000000000000000..f175c76e90e5908696c73054a82566f3d0a76632 --- /dev/null +++ b/test/test-stream.sh @@ -0,0 +1,16 @@ +#!/bin/bash +cd "$(dirname "${BASH_SOURCE[0]}")/.." || exit 1 +# remove any stale stream build tree; paths below are relative to the parent of this script's directory +rm -rf tmp/stream-1.8 +# copy the x86 stream template into the current directory +cp -rf templates/stream/1.8/data.stream.x86.cpu.config ./ +# switch jarvis to the copied config +./jarvis -use data.stream.x86.cpu.config +# download stream src code +./jarvis -d +# install dependency +./jarvis -dp +# build +./jarvis -b +# run +./jarvis -r \ No newline at end of file