diff --git a/tools/kal-test/bin/compare/graph/3rd_para_compare.sh b/tools/kal-test/bin/compare/graph/3rd_para_compare.sh index 7fd4ead7a8f35518146124c4518ccc7ade14744e..387faac45a568f1202f65494238f0591fefaf863 100644 --- a/tools/kal-test/bin/compare/graph/3rd_para_compare.sh +++ b/tools/kal-test/bin/compare/graph/3rd_para_compare.sh @@ -3,7 +3,7 @@ set -e function alg_usage() { echo "Usage: " - echo "1st argument: algorithm name: betweenness, bfs, closeness, clusteringcoefficient, cc, cd, degree, kcore, mce, mssp, pr, scc, tc, tpr, tr, wce, wpr" + echo "1st argument: algorithm name: betweenness, bfs, closeness, clusteringcoefficient, cc, cd, degree, kcore, mssp, pr, scc, tc, tpr, tr, wce, wpr" echo "2nd argument: path of baseline result" echo "3rd argument: path of algorithm result" } @@ -32,11 +32,13 @@ elif [ $alg == "clusteringcoefficient" ] || [ $alg == "tc" ]; then class_name=com.bigdata.compare.graph.ClusteringCoefficientTCVerify elif [ $alg == "cc" ]; then class_name=com.bigdata.compare.graph.CCVerify -elif [ $alg == "cd" ] || [ $alg == "degree" ]; then - class_name=com.bigdata.compare.graph.CDDegreeVerify +elif [ $alg == "degree" ]; then + class_name=com.bigdata.compare.graph.DegreeVerify +elif [ $alg == "cd" ]; then + class_name=com.bigdata.compare.graph.CDVerify elif [ $alg == "kcore" ]; then class_name=com.bigdata.compare.graph.KCoreVerify -elif [ $alg == "mce" ] || [ $alg == "wce" ]; then +elif [ $alg == "wce" ]; then class_name=com.bigdata.compare.graph.MceWceVerify elif [ $alg == "mssp" ]; then class_name=com.bigdata.compare.graph.MsspVerify diff --git a/tools/kal-test/bin/compare/graph/sgm_compare.sh b/tools/kal-test/bin/compare/graph/sgm_compare.sh new file mode 100644 index 0000000000000000000000000000000000000000..bab2277d8c70a1bf2ed4350574fe6aa70ade3ed4 --- /dev/null +++ b/tools/kal-test/bin/compare/graph/sgm_compare.sh @@ -0,0 +1,37 @@ +#!/bin/bash +set -e + +case "$1" in +-h | --help | ?) + echo "Usage: " + echo "1st argument: master:local/yarn" + echo "2nd argument: path of algorithm result" + echo "3rd argument: name of graphType" + echo "4th argument: path of dataset" + echo "5th argument: splitGraph" + echo "6th argument: path of outputCheckResult" + echo "7th argument: partition" + exit 0 + ;; +esac + +master=$1 +resultPath=$2 +graphType=$3 +datasetPath=$4 +splitGraph=$5 +outputCheckResult=$6 +partition=$7 + +source conf/graph/graph_datasets.properties +scala_version=scalaVersion +scala_version_val=${!scala_version} + +spark-submit \ +--class com.bigdata.compare.graph.SubgraphMatchingVerify \ +--master yarn \ +--num-executors 29 \ +--executor-memory 35g \ +--executor-cores 8 \ +--driver-memory 50g \ +./lib/kal-test_${scala_version_val}-0.1.jar ${master} ${resultPath} ${graphType} ${datasetPath} ${splitGraph} ${outputCheckResult} ${partition} \ No newline at end of file diff --git a/tools/kal-test/bin/compare/ml/knn_compare.sh b/tools/kal-test/bin/compare/ml/knn_compare.sh index cc02eed2f2d1212b1c1abef8ad9ad7cab606a32d..d500ed15c7214950db1821ab3970ac30c9bb6703 100644 --- a/tools/kal-test/bin/compare/ml/knn_compare.sh +++ b/tools/kal-test/bin/compare/ml/knn_compare.sh @@ -3,7 +3,7 @@ set -e function usage() { echo "Usage: " - echo "1st argument: name of dataset: e.g. glove" + echo "1st argument: name of dataset: e.g. 
glove,gist,deep1b"
}
case "$1" in
@@ -32,16 +32,19 @@ save_resultPath_val=${!save_resultPath}
data_path_val=${!dataset_name}
echo "${dataset_name} : ${data_path_val}"
-numExe=15
-exeCore=10
-exeMem=50
+numExe_real=15
+exeCore_real=10
+exeMem_real=50
groundTruthLocalPath="result/KNN/${dataset_name}_truth"
groundTruthHDFSPath="${save_resultPath_val}/KNN/${dataset_name}_truth"
testNum=50000
testBatchSize=5000
+if [ ${dataset_name} == "gist" ] || [ ${dataset_name} == "deep1b" ]; then
+  testBatchSize=1000
+fi
k=100
-pt=150
+pt=188
rm -rf ${groundTruthLocalPath}
mkdir -p ${groundTruthLocalPath}
@@ -52,12 +55,12 @@ spark-submit \
--driver-class-path "./lib/kal-test_${scala_version_val}-0.1.jar" \
--master yarn \
--deploy-mode client \
---num-executors ${numExe} \
---executor-cores ${exeCore} \
---executor-memory ${exeMem}g \
+--num-executors ${numExe_real} \
+--executor-cores ${exeCore_real} \
+--executor-memory ${exeMem_real}g \
--driver-cores 50 \
--driver-memory 50g \
---conf "spark.executor.extraJavaOptions=-Xms${exeMem}g" \
+--conf "spark.executor.extraJavaOptions=-Xms${exeMem_real}g" \
--conf "spark.driver.maxResultSize=256G" \
--conf "spark.scheduler.mode=FAIR" \
--conf "spark.network.timeout=10000000" \
@@ -79,6 +82,17 @@ hadoop fs -put ${groundTruthLocalPath} ${groundTruthHDFSPath}
echo "--------------- 生成自研算法结果,并与真实解做对比 ---------------"
+source conf/ml/knn/knn_spark.properties
+num_executors="numExectuors_"${dataset_name}_${cpu_name}
+executor_cores="executorCores_"${dataset_name}_${cpu_name}
+executor_memory="executorMemory_"${dataset_name}_${cpu_name}
+extra_java_options="extraJavaOptions_"${dataset_name}_${cpu_name}
+memory_overhead="execMemOverhead_"${dataset_name}_${cpu_name}
+num_executors_val=${!num_executors}
+executor_cores_val=${!executor_cores}
+executor_memory_val=${!executor_memory}
+extra_java_options_val=${!extra_java_options}
+memory_overhead_val=${!memory_overhead}
spark-submit \
--class com.bigdata.compare.ml.KNNVerify \
--jars "lib/fastutil-8.3.1.jar,lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar,lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar,lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \
@@ -86,18 +100,19 @@ spark-submit \
--conf "spark.executor.extraClassPath=/opt/ml_classpath/fastutil-8.3.1.jar:/opt/ml_classpath/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:/opt/ml_classpath/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:/opt/ml_classpath/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \
--master yarn \
--deploy-mode client \
---num-executors ${numExe} \
---executor-cores ${exeCore} \
---executor-memory ${exeMem}g \
---driver-cores 50 \
+--num-executors ${num_executors_val} \
+--executor-cores ${executor_cores_val} \
+--executor-memory ${executor_memory_val} \
+--driver-cores 36 \
--driver-memory 50g \
---conf "spark.executor.extraJavaOptions=-Xms${exeMem}g" \
+--conf "spark.executor.extraJavaOptions=${extra_java_options_val}" \
--conf "spark.driver.maxResultSize=256G" \
--conf "spark.scheduler.mode=FAIR" \
--conf "spark.network.timeout=10000000" \
--conf "spark.executor.heartbeatInterval=1000" \
--conf "spark.scheduler.maxRegisteredResourcesWaitingTime=24h" \
--conf "spark.scheduler.minRegisteredResourcesRatio=1.0" \
+--conf "spark.executor.memoryOverhead=${memory_overhead_val}" \
./lib/kal-test_${scala_version_val}-0.1.jar \
--task "verify" \
--pt ${pt} \
diff --git a/tools/kal-test/bin/compare/ml/major_compare.sh b/tools/kal-test/bin/compare/ml/major_compare.sh
index b79b0f9885e4521d0e19ce86b38b0c1ee58dcf0e..7208c2d415f936f6e0dae5484664fd4457a71f1f 100644
--- a/tools/kal-test/bin/compare/ml/major_compare.sh
+++ b/tools/kal-test/bin/compare/ml/major_compare.sh
@@ -3,8 +3,8 @@ set -e
function usage() {
  echo "Usage: "
- echo "1st argument: algorithm name: als kmeans linr svm dtr(dt.regression) gbdtr(gbdt.regression) rfr(rf.regression) xgbtr(xgbt.regression)
- lda logr svm dtc(dt.classification) gbdtc(gbdt.classification) rfc(rf.classification) xgbtc(xgbt.classification)
+ echo "1st argument: algorithm name: als kmeans linr svm dtr(dt.regression) gbdtr(gbdt.regression) rfr(rf.regression) xgbtr(xgbt.regression) lgbmr(lgbmr.regression) bor(bor.regression) fmr(fmr.regression)
+ lda crf logr svm dtc(dt.classification) gbdtc(gbdt.classification) rfc(rf.classification) xgbtc(xgbt.classification) lgbmc(lgbmc.classification) boc(boc.classification) fmc(fmc.classification)
 cov pca pearson spca spearman lda ps svd dtb"
  echo "2st argument: path of opt result: eg [hdfs:///tmp/ml/result/RF/classification_epsilon_dataframe_fit1]"
  echo "3nd argument: path of raw result: eg [hdfs:///tmp/ml/result/RF/classification_epsilon_dataframe_fit1_raw]"
@@ -27,9 +27,9 @@ alg=$1
path0=$2
path1=$3

-if [ $alg == "als" ] || [ $alg == "kmeans" ]|| [ $alg == "linr" ]|| [ $alg == "svm" ]|| [ $alg == "dtr" ]|| [ $alg == "gbdtr" ]|| [ $alg == "rfr" ]|| [ $alg == "xgbtr" ]; then
+if [ $alg == "als" ] || [ $alg == "kmeans" ]|| [ $alg == "linr" ]|| [ $alg == "svm" ]|| [ $alg == "dtr" ]|| [ $alg == "gbdtr" ]|| [ $alg == "rfr" ]|| [ $alg == "xgbtr" ] || [ $alg == "lgbmr" ] || [ $alg == "bor" ] || [ $alg == "fmr" ]; then
  class_name=com.bigdata.compare.ml.DownEvaluationVerify
-elif [ $alg == "logr" ] || [ $alg == "svm" ] || [ $alg == "dtc" ] || [ $alg == "gbdtc" ] || [ $alg == "rfc" ] || [ $alg == "xgbtc" ] ; then
+elif [ $alg == "logr" ] || [ $alg == "svm" ] || [ $alg == "dtc" ] || [ $alg == "gbdtc" ] || [ $alg == "rfc" ] || [ $alg == "xgbtc" ] || [ $alg == "crf" ] || [ $alg == "lgbmc" ] || [ $alg == "boc" ] || [ $alg == "fmc" ]; then
  class_name=com.bigdata.compare.ml.UpEvaluationVerify
elif [ $alg == "cov" ] || [ $alg == "pca" ] || [ $alg == "pearson" ] || [ $alg == "spca" ] || [ $alg == "spearman" ]; then
  class_name=com.bigdata.compare.ml.MatrixVerify
@@ -41,6 +41,9 @@ elif [ $alg == "svd" ] ; then
  class_name=com.bigdata.compare.ml.SVDVerify
elif [ $alg == "dtb" ] ; then
  class_name=com.bigdata.compare.ml.DTBVerify
+elif [ $alg == "encoder" ] ; then
+ class_name=com.bigdata.compare.ml.EncoderVerify
+
else
  alg_usage
  exit 0
diff --git a/tools/kal-test/bin/compare/ml/te_compare.sh b/tools/kal-test/bin/compare/ml/te_compare.sh
new file mode 100644
index 0000000000000000000000000000000000000000..0d0477d016d0792f6af4da60c0ab125b474b3d8e
--- /dev/null
+++ b/tools/kal-test/bin/compare/ml/te_compare.sh
@@ -0,0 +1,41 @@
+#!/bin/bash
+set -e
+
+function usage() {
+  echo "Usage: "
+  echo "1st argument: path of TargetEncoder opt result: eg [hdfs:///tmp/ml/result/TargetEncoder/taobao]"
+  echo "2nd argument: path of TargetEncoder raw result: eg [hdfs:///tmp/ml/result/TargetEncoder/taobao_raw]"
+}
+
+case "$1" in
+-h | --help | ?)
+ usage + exit 0 + ;; +esac + +if [ $# -ne 2 ]; then + usage + exit 0 +fi + +path0=$1 +path1=$2 + +source conf/ml/ml_datasets.properties +scala_version=scalaVersion +scala_version_val=${!scala_version} + +spark-submit \ +--class com.bigdata.compare.ml.TEVerify \ +--master yarn \ +--num-executors 26 \ +--executor-memory 40g \ +--executor-cores 8 \ +--conf spark.driver.memory=128g \ +--conf spark.driver.maxResultSize=40g \ +--conf spark.network.timeout=60000s \ +--conf spark.rpc.askTimeout=60000s \ +--conf spark.executor.heartbeatInterval=600s \ +--conf spark.eventLog.enabled=false \ +./lib/kal-test_${scala_version_val}-0.1.jar ${path0} ${path1} \ No newline at end of file diff --git a/tools/kal-test/bin/graph/cd_run.sh b/tools/kal-test/bin/graph/cd_run.sh index 6e94cbc4dbc693ee13aa0b9d65bfdb5854abcd94..dcf9d545ddeb3ab65f5f82818923bf8ad9882775 100644 --- a/tools/kal-test/bin/graph/cd_run.sh +++ b/tools/kal-test/bin/graph/cd_run.sh @@ -19,6 +19,7 @@ if [ $# -ne 1 ];then fi dataset_name=$1 +is_raw="no" if [ $dataset_name != 'simulate1' ] && [ $dataset_name != 'simulate2' ] && [ $dataset_name != 'usaRoad' ]; then echo 'invalid dataset' @@ -67,16 +68,12 @@ scala_version=scalaVersion scala_version_val=${!scala_version} input_path=${!dataset_name} -output_path="${output_path_prefix}/cd/${dataset_name}" +output_path="/tmp/graph/result/cd/${is_raw}/${dataset_name}" echo "${dataset_name}: ${input_path},${output_path}" echo "start to clean exist output" hdfs dfs -rm -r -f -skipTrash ${output_path} -scp lib/fastutil-8.3.1.jar lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent1:/opt/graph_classpath/ -scp lib/fastutil-8.3.1.jar lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent2:/opt/graph_classpath/ -scp lib/fastutil-8.3.1.jar lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent3:/opt/graph_classpath/ - echo "start to clean cache and sleep 30s" ssh server1 "echo 3 > /proc/sys/vm/drop_caches" ssh agent1 "echo 3 > /proc/sys/vm/drop_caches" @@ -85,6 +82,11 @@ ssh agent3 "echo 3 > /proc/sys/vm/drop_caches" sleep 30 echo "start to submit spark jobs -- cd-${dataset_name}" + +scp lib/fastutil-8.3.1.jar lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent1:/opt/graph_classpath/ +scp lib/fastutil-8.3.1.jar lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent2:/opt/graph_classpath/ +scp lib/fastutil-8.3.1.jar lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent3:/opt/graph_classpath/ + spark-submit \ --class com.bigdata.graph.CycleDetectionWithConstrainsRunner \ --deploy-mode ${deploy_mode} \ @@ -110,4 +112,4 @@ spark-submit \ --jars "lib/fastutil-8.3.1.jar,lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ --driver-class-path "lib/kal-test_${scala_version_val}-0.1.jar:lib/snakeyaml-1.19.jar:lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ --conf "spark.executor.extraClassPath=/opt/graph_classpath/fastutil-8.3.1.jar:/opt/graph_classpath/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ -./lib/kal-test_${scala_version_val}-0.1.jar ${dataset_name} ${input_path} ${output_path} "run" "no" 
${cpu_name} | tee ./log/log \ No newline at end of file +./lib/kal-test_${scala_version_val}-0.1.jar ${dataset_name} ${input_path} ${output_path} "run" "no" ${cpu_name} | tee ./log/log diff --git a/tools/kal-test/bin/graph/clusteringcoefficient_run.sh b/tools/kal-test/bin/graph/clusteringcoefficient_run.sh index 73c994ed948ff5ec04944c9d90e7fc0c9a8751e8..236006b050de23096154f88a14a43d0d9d9dbdf3 100644 --- a/tools/kal-test/bin/graph/clusteringcoefficient_run.sh +++ b/tools/kal-test/bin/graph/clusteringcoefficient_run.sh @@ -128,7 +128,7 @@ else scp lib/lcc_kaiyuan.jar root@agent3:/opt/graph_classpath/ spark-submit \ - --class com.bigdata.graph.ClusteringCoefficientRunner \ + --class com.bigdata.graph.ClusteringCoefficientRawRunner \ --master yarn \ --deploy-mode ${deploy_mode_val} \ --name "clusteringcoefficient_${dataset_name}_${api_name}" \ @@ -150,8 +150,6 @@ else --conf spark.core.connection.ack.wait.timeout=60000s \ --conf spark.executor.extraJavaOptions="-Xms35g" \ --conf spark.rdd.compress=true \ - --jars "lib/lcc_kaiyuan.jar" \ - --driver-class-path "lib/kal-test_${scala_version_val}-0.1.jar:lib/snakeyaml-1.19.jar:lib/lcc_kaiyuan.jar" \ - --conf "spark.executor.extraClassPath=/opt/graph_classpath/lcc_kaiyuan.jar" \ + --driver-class-path "lib/kal-test_${scala_version_val}-0.1.jar:lib/snakeyaml-1.19.jar" \ ./lib/kal-test_${scala_version_val}-0.1.jar ${dataset_name} ${num_partitions_val} ${weight} ${is_raw} ${data_path_val} ${api_name} ${output_path} | tee ./log/log fi \ No newline at end of file diff --git a/tools/kal-test/bin/graph/sgm_run.sh b/tools/kal-test/bin/graph/sgm_run.sh index d2f5c3689fcc183715088ccb4aa79a8f99849b55..11cb351c2c1ce3784d74bdcfda0a06d34b69a4b8 100644 --- a/tools/kal-test/bin/graph/sgm_run.sh +++ b/tools/kal-test/bin/graph/sgm_run.sh @@ -227,7 +227,7 @@ else --conf spark.shuffle.memoryFraction=${shuffle_memoryFraction_val} \ --conf spark.rdd.compress=${rdd_compress_val} \ --conf spark.memory.useLegacyMode=${memory_useLegacyMode_val} \ - ./lib/pegasus-spark_2.11-0.1.0-SNAPSHOT_openSource.jar yarn ${data_path_val} ${output_path} ${queryGraph_path_val} ${num_colors_val} 232 "," ${graph_split_val} 10000 > sgm_temp.log + ./lib/pegasus-spark_${scala_version_val}-0.1.0-SNAPSHOT.jar yarn ${data_path_val} ${output_path} ${queryGraph_path_val} ${num_colors_val} 232 "," ${graph_split_val} 10000 > sgm_temp.log num_subgraphs=$(cat sgm_temp.log | grep "number of matched subgraphs" | awk -F '[\t]' '{print $2}') costTime=$(cat sgm_temp.log | grep "cost time" | awk -F '[\t]' '{print $2}') currentTime=$(date "+%Y%m%d_H%M%S") diff --git a/tools/kal-test/bin/graph/slpa_run.sh b/tools/kal-test/bin/graph/slpa_run.sh new file mode 100644 index 0000000000000000000000000000000000000000..828df3a4f8b85f78a4970b8908b5c4df76277ccb --- /dev/null +++ b/tools/kal-test/bin/graph/slpa_run.sh @@ -0,0 +1,111 @@ +#!/bin/bash +set -e + +function usage() { + echo "Usage: " + echo "1st argument: name of dataset: comlj, orkut, uk_2002, arabic_2005" + echo "2nd argument: optimization algorithm or raw: no/yes" +} + +case "$1" in +-h | --help | ?) 
+ usage + exit 0 + ;; +esac + +if [ $# -ne 2 ]; then + usage + exit 0 +fi + +source conf/graph/slpa/slpa_spark.properties + +dataset_name=$1 +is_raw=$2 +cpu_name=$(lscpu | grep Architecture | awk '{print $2}') +model_conf=${dataset_name}-${cpu_name}-${is_raw} + +# concatnate strings as a new variable +num_executors="numExectuors_"${dataset_name}_${cpu_name} +executor_cores="executorCores_"${dataset_name}_${cpu_name} +executor_memory="executorMemory_"${dataset_name}_${cpu_name} +extra_java_options="extraJavaOptions_"${dataset_name}_${cpu_name} +deploy_mode="deployMode" + +num_executors_val=${!num_executors} +executor_cores_val=${!executor_cores} +executor_memory_val=${!executor_memory} +extra_java_options_val=${!extra_java_options} +deploy_mode_val=${!deploy_mode} + +echo "${num_executors} : ${num_executors_val}" +echo "${executor_cores}: ${executor_cores_val}" +echo "${executor_memory} : ${executor_memory_val}" +echo "${extra_java_options} : ${extra_java_options_val}" +echo "${deploy_mode} : ${deploy_mode_val}" + +if [ ! ${num_executors_val} ] || + [ ! ${executor_cores_val} ] || + [ ! ${executor_memory_val} ] || + [ ! ${extra_java_options_val} ]; then + echo "Some values are NULL, please confirm with the property files" + exit 0 +fi + +source conf/graph/graph_datasets.properties +spark_version=sparkVersion +spark_version_val=${!spark_version} +kal_version=kalVersion +kal_version_val=${!kal_version} +scala_version=scalaVersion +scala_version_val=${!scala_version} +data_path=${dataset_name} +data_path_val=${!data_path} +echo "${dataset_name} : ${data_path_val}" + +outputPath="/tmp/graph/result/slpa/${dataset_name}/${is_raw}" +hdfs dfs -rm -r -f ${outputPath} + +echo "start to clean cache and sleep 30s" +ssh server1 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent1 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent2 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent3 "echo 3 > /proc/sys/vm/drop_caches" +sleep 30 + +echo "start to submit spark jobs --- SLPA" +if [ ${is_raw} == "no" ]; then + scp lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent1:/opt/graph_classpath/ + scp lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent2:/opt/graph_classpath/ + scp lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent3:/opt/graph_classpath/ + +spark-submit \ + --class com.bigdata.graph.SLPARunner \ + --master yarn \ + --deploy-mode ${deploy_mode_val} \ + --num-executors ${num_executors_val} \ + --executor-memory ${executor_memory_val} \ + --executor-cores ${executor_cores_val} \ + --driver-memory 100g \ + --conf spark.scheduler.maxRegisteredResourcesWaitingTime=3600000 \ + --conf spark.rpc.askTimeout=36000 \ + --conf spark.akka.timeout=3600 \ + --conf spark.worker.timeout=3600 \ + --conf spark.network.timeout=6000s \ + --conf spark.storage.blockManagerSlaveTimeoutMs=600000 \ + --conf spark.shuffle.blockTransferService=nio \ + --conf spark.driver.maxResultSize=200g \ + --conf spark.shuffle.manager=SORT \ + --conf spark.broadcast.blockSize=25g \ + --conf spark.akka.frameSize=2046 \ + --conf spark.core.connection.ack.wait.timeout=60000s \ + --conf spark.storage.memoryFraction=0.2 \ + --conf spark.shuffle.memoryFraction=0.6 \ + --conf spark.rdd.compress=true \ + --conf "spark.executor.extraJavaOptions=${extra_java_options_val}" \ + --jars 
"lib/fastutil-8.3.1.jar,lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + --driver-class-path "lib/kal-test_${scala_version_val}-0.1.jar:lib/snakeyaml-1.19.jar:lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + --conf "spark.executor.extraClassPath=/opt/graph_classpath/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + ./lib/kal-test_${scala_version_val}-0.1.jar ${model_conf} ${data_path_val} ${outputPath} | tee ./log/log +fi \ No newline at end of file diff --git a/tools/kal-test/bin/graph/wce_run.sh b/tools/kal-test/bin/graph/wce_run.sh index 4eb0cfc559bd4f55963ad8e6d333436e179ab7a3..be9915298b5525d7c16ea5b7527571f755e991a0 100644 --- a/tools/kal-test/bin/graph/wce_run.sh +++ b/tools/kal-test/bin/graph/wce_run.sh @@ -19,6 +19,7 @@ if [ $# -ne 1 ];then fi dataset_name=$1 +is_raw=no source conf/graph/wce/wce_spark.properties @@ -57,7 +58,7 @@ scala_version=scalaVersion scala_version_val=${!scala_version} data_path_val=${!dataset_name} -output_path="${output_path_prefix}/wce/${dataset_name}" +output_path="/tmp/graph/result/wce/${is_raw}/${dataset_name}" echo "${dataset_name} : ${data_path_val}" echo "output_path : ${output_path}" hdfs dfs -rm -r -f ${output_path} @@ -69,11 +70,12 @@ ssh agent2 "echo 3 > /proc/sys/vm/drop_caches" ssh agent3 "echo 3 > /proc/sys/vm/drop_caches" sleep 30 +echo "start to submit spark jobs -- wce-${dataset_name}" + scp lib/fastutil-8.3.1.jar lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent1:/opt/graph_classpath/ scp lib/fastutil-8.3.1.jar lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent2:/opt/graph_classpath/ scp lib/fastutil-8.3.1.jar lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent3:/opt/graph_classpath/ -echo "start to submit spark jobs -- wce-${dataset_name}" spark-submit \ --class com.bigdata.graph.WCERunner \ --driver-memory 80g \ diff --git a/tools/kal-test/bin/graph_workflow.sh b/tools/kal-test/bin/graph_workflow.sh index 5b6b1f2b1368c21ae38f22d7cdb0295ec1c6b81e..830cbfc1ca6cf82bc60b11bdaa6a0b49470148e3 100644 --- a/tools/kal-test/bin/graph_workflow.sh +++ b/tools/kal-test/bin/graph_workflow.sh @@ -48,265 +48,265 @@ ssh_mkdir agent2 $graph_classpath ssh_mkdir agent3 $graph_classpath # betweenness -./bin/graph/betweenness_run.sh cit_patents ${is_raw} ${is_check} 2>&1 | tee -a logs/betweenness_cit_patents_${type}.log -./bin/graph/betweenness_run.sh enwiki_2018 ${is_raw} ${is_check} 2>&1 | tee -a logs/betweenness_enwiki_2018_${type}.log -./bin/graph/betweenness_run.sh uk_2002 ${is_raw} ${is_check} 2>&1 | tee -a logs/betweenness_uk_2002_${type}.log +bash bin/graph/betweenness_run.sh cit_patents ${is_raw} ${is_check} 2>&1 | tee -a logs/betweenness_cit_patents_${type}.log +bash bin/graph/betweenness_run.sh enwiki_2018 ${is_raw} ${is_check} 2>&1 | tee -a logs/betweenness_enwiki_2018_${type}.log +bash bin/graph/betweenness_run.sh uk_2002 ${is_raw} ${is_check} 2>&1 | tee -a logs/betweenness_uk_2002_${type}.log # bfs -./bin/graph/bfs_run.sh cit_patents ${is_raw} 2>&1 | tee -a logs/bfs_cit_patents_${type}.log -./bin/graph/bfs_run.sh enwiki_2018 ${is_raw} 2>&1 | tee -a logs/bfs_enwiki_2018_${type}.log -./bin/graph/bfs_run.sh arabic_2005 ${is_raw} 2>&1 | tee -a logs/bfs_arabic_2005_${type}.log 
-./bin/graph/bfs_run.sh graph500_22 ${is_raw} 2>&1 | tee -a logs/bfs_graph500_22_${type}.log -./bin/graph/bfs_run.sh graph500_23 ${is_raw} 2>&1 | tee -a logs/bfs_graph500_23_${type}.log -./bin/graph/bfs_run.sh graph500_25 ${is_raw} 2>&1 | tee -a logs/bfs_graph500_25_${type}.log +bash bin/graph/bfs_run.sh cit_patents ${is_raw} 2>&1 | tee -a logs/bfs_cit_patents_${type}.log +bash bin/graph/bfs_run.sh enwiki_2018 ${is_raw} 2>&1 | tee -a logs/bfs_enwiki_2018_${type}.log +bash bin/graph/bfs_run.sh arabic_2005 ${is_raw} 2>&1 | tee -a logs/bfs_arabic_2005_${type}.log +bash bin/graph/bfs_run.sh graph500_22 ${is_raw} 2>&1 | tee -a logs/bfs_graph500_22_${type}.log +bash bin/graph/bfs_run.sh graph500_23 ${is_raw} 2>&1 | tee -a logs/bfs_graph500_23_${type}.log +bash bin/graph/bfs_run.sh graph500_25 ${is_raw} 2>&1 | tee -a logs/bfs_graph500_25_${type}.log # cc -./bin/graph/cc_run.sh graph500_25 ${is_raw} 2>&1 | tee -a logs/cc_graph500_25_${type}.log -./bin/graph/cc_run.sh graph500_26 ${is_raw} 2>&1 | tee -a logs/cc_graph500_26_${type}.log -./bin/graph/cc_run.sh liveJournal ${is_raw} 2>&1 | tee -a logs/cc_liveJournal_${type}.log +bash bin/graph/cc_run.sh graph500_25 ${is_raw} 2>&1 | tee -a logs/cc_graph500_25_${type}.log +bash bin/graph/cc_run.sh graph500_26 ${is_raw} 2>&1 | tee -a logs/cc_graph500_26_${type}.log +bash bin/graph/cc_run.sh liveJournal ${is_raw} 2>&1 | tee -a logs/cc_liveJournal_${type}.log # cd -./bin/graph/cd_run.sh simulate1 2>&1 | tee -a logs/cd_simulate1.log -./bin/graph/cd_run.sh simulate2 2>&1 | tee -a logs/cd_simulate2.log -./bin/graph/cd_run.sh usaRoad 2>&1 | tee -a logs/cd_usaRoad.log +bash bin/graph/cd_run.sh simulate1 2>&1 | tee -a logs/cd_simulate1.log +bash bin/graph/cd_run.sh simulate2 2>&1 | tee -a logs/cd_simulate2.log +bash bin/graph/cd_run.sh usaRoad 2>&1 | tee -a logs/cd_usaRoad.log # closeness -./bin/graph/closeness_run.sh cit_patents weighted ${is_check} 2>&1 | tee -a logs/closeness_cit_patents_weighted_${type}.log -./bin/graph/closeness_run.sh uk_2002 weighted ${is_check} 2>&1 | tee -a logs/closeness_uk_2002_weighted_${type}.log +bash bin/graph/closeness_run.sh cit_patents weighted ${is_check} 2>&1 | tee -a logs/closeness_cit_patents_weighted_${type}.log +bash bin/graph/closeness_run.sh uk_2002 weighted ${is_check} 2>&1 | tee -a logs/closeness_uk_2002_weighted_${type}.log -./bin/graph/closeness_run.sh cit_patents unweighted ${is_check} 2>&1 | tee -a logs/closeness_cit_patents_unweighted_${type}.log -./bin/graph/closeness_run.sh uk_2002 unweighted ${is_check} 2>&1 | tee -a logs/closeness_uk_2002_unweighted_${type}.log +bash bin/graph/closeness_run.sh cit_patents unweighted ${is_check} 2>&1 | tee -a logs/closeness_cit_patents_unweighted_${type}.log +bash bin/graph/closeness_run.sh uk_2002 unweighted ${is_check} 2>&1 | tee -a logs/closeness_uk_2002_unweighted_${type}.log # clusteringcoefficient -./bin/graph/clusteringcoefficient_run.sh cit_patents lcc weighted no 2>&1 | tee -a logs/clusteringcoefficient_cit_patents_lcc_weighted_${type}.log -./bin/graph/clusteringcoefficient_run.sh uk_2002 lcc weighted no 2>&1 | tee -a logs/clusteringcoefficient_uk_2002_lcc_weighted_${type}.log -./bin/graph/clusteringcoefficient_run.sh arabic_2005 lcc weighted no 2>&1 | tee -a logs/clusteringcoefficient_arabic_2005_lcc_weighted_${type}.log -./bin/graph/clusteringcoefficient_run.sh cit_patents lcc unweighted no 2>&1 | tee -a logs/clusteringcoefficient_cit_patents_lcc_unweighted_${type}.log -./bin/graph/clusteringcoefficient_run.sh uk_2002 lcc unweighted no 2>&1 | tee -a 
logs/clusteringcoefficient_uk_2002_lcc_unweighted_${type}.log -./bin/graph/clusteringcoefficient_run.sh arabic_2005 lcc unweighted no 2>&1 | tee -a logs/clusteringcoefficient_arabic_2005_lcc_unweighted_${type}.log - -./bin/graph/clusteringcoefficient_run.sh graph500_22 lcc weighted no 2>&1 | tee -a logs/clusteringcoefficient_graph500_22_lcc_weighted_${type}.log -./bin/graph/clusteringcoefficient_run.sh graph500_23 lcc weighted no 2>&1 | tee -a logs/clusteringcoefficient_graph500_23_lcc_weighted_${type}.log -./bin/graph/clusteringcoefficient_run.sh graph500_24 lcc weighted no 2>&1 | tee -a logs/clusteringcoefficient_graph500_24_lcc_weighted_${type}.log -./bin/graph/clusteringcoefficient_run.sh graph500_25 lcc weighted no 2>&1 | tee -a logs/clusteringcoefficient_graph500_25_lcc_weighted_${type}.log -./bin/graph/clusteringcoefficient_run.sh graph500_22 lcc unweighted ${is_raw} 2>&1 | tee -a logs/clusteringcoefficient_graph500_22_lcc_unweighted_${type}.log -./bin/graph/clusteringcoefficient_run.sh graph500_23 lcc unweighted ${is_raw} 2>&1 | tee -a logs/clusteringcoefficient_graph500_23_lcc_unweighted_${type}.log -./bin/graph/clusteringcoefficient_run.sh graph500_24 lcc unweighted no 2>&1 | tee -a logs/clusteringcoefficient_graph500_24_lcc_unweighted_${type}.log -./bin/graph/clusteringcoefficient_run.sh graph500_25 lcc unweighted no 2>&1 | tee -a logs/clusteringcoefficient_graph500_25_lcc_unweighted_${type}.log - -./bin/graph/clusteringcoefficient_run.sh graph500_22 avgcc unweighted ${is_raw} 2>&1 | tee -a logs/clusteringcoefficient_graph500_22_avgcc_unweighted_${type}.log -./bin/graph/clusteringcoefficient_run.sh graph500_23 avgcc unweighted ${is_raw} 2>&1 | tee -a logs/clusteringcoefficient_graph500_23_avgcc_unweighted_${type}.log -./bin/graph/clusteringcoefficient_run.sh graph500_24 avgcc unweighted no 2>&1 | tee -a logs/clusteringcoefficient_graph500_24_avgcc_unweighted_${type}.log -./bin/graph/clusteringcoefficient_run.sh graph500_25 avgcc unweighted no 2>&1 | tee -a logs/clusteringcoefficient_graph500_25_avgcc_unweighted_${type}.log - -./bin/graph/clusteringcoefficient_run.sh graph500_22 globalcc unweighted ${is_raw} 2>&1 | tee -a logs/clusteringcoefficient_graph500_22_globalcc_unweighted_${type}.log -./bin/graph/clusteringcoefficient_run.sh graph500_23 globalcc unweighted ${is_raw} 2>&1 | tee -a logs/clusteringcoefficient_graph500_23_globalcc_unweighted_${type}.log -./bin/graph/clusteringcoefficient_run.sh graph500_24 globalcc unweighted no 2>&1 | tee -a logs/clusteringcoefficient_graph500_24_globalcc_unweighted_${type}.log -./bin/graph/clusteringcoefficient_run.sh graph500_25 globalcc unweighted no 2>&1 | tee -a logs/clusteringcoefficient_graph500_25_globalcc_unweighted_${type}.log +bash bin/graph/clusteringcoefficient_run.sh cit_patents lcc weighted no 2>&1 | tee -a logs/clusteringcoefficient_cit_patents_lcc_weighted_${type}.log +bash bin/graph/clusteringcoefficient_run.sh uk_2002 lcc weighted no 2>&1 | tee -a logs/clusteringcoefficient_uk_2002_lcc_weighted_${type}.log +bash bin/graph/clusteringcoefficient_run.sh arabic_2005 lcc weighted no 2>&1 | tee -a logs/clusteringcoefficient_arabic_2005_lcc_weighted_${type}.log +bash bin/graph/clusteringcoefficient_run.sh cit_patents lcc unweighted no 2>&1 | tee -a logs/clusteringcoefficient_cit_patents_lcc_unweighted_${type}.log +bash bin/graph/clusteringcoefficient_run.sh uk_2002 lcc unweighted no 2>&1 | tee -a logs/clusteringcoefficient_uk_2002_lcc_unweighted_${type}.log +bash bin/graph/clusteringcoefficient_run.sh arabic_2005 lcc unweighted 
no 2>&1 | tee -a logs/clusteringcoefficient_arabic_2005_lcc_unweighted_${type}.log + +bash bin/graph/clusteringcoefficient_run.sh graph500_22 lcc weighted no 2>&1 | tee -a logs/clusteringcoefficient_graph500_22_lcc_weighted_${type}.log +bash bin/graph/clusteringcoefficient_run.sh graph500_23 lcc weighted no 2>&1 | tee -a logs/clusteringcoefficient_graph500_23_lcc_weighted_${type}.log +bash bin/graph/clusteringcoefficient_run.sh graph500_24 lcc weighted no 2>&1 | tee -a logs/clusteringcoefficient_graph500_24_lcc_weighted_${type}.log +bash bin/graph/clusteringcoefficient_run.sh graph500_25 lcc weighted no 2>&1 | tee -a logs/clusteringcoefficient_graph500_25_lcc_weighted_${type}.log +bash bin/graph/clusteringcoefficient_run.sh graph500_22 lcc unweighted ${is_raw} 2>&1 | tee -a logs/clusteringcoefficient_graph500_22_lcc_unweighted_${type}.log +bash bin/graph/clusteringcoefficient_run.sh graph500_23 lcc unweighted ${is_raw} 2>&1 | tee -a logs/clusteringcoefficient_graph500_23_lcc_unweighted_${type}.log +bash bin/graph/clusteringcoefficient_run.sh graph500_24 lcc unweighted no 2>&1 | tee -a logs/clusteringcoefficient_graph500_24_lcc_unweighted_${type}.log +bash bin/graph/clusteringcoefficient_run.sh graph500_25 lcc unweighted no 2>&1 | tee -a logs/clusteringcoefficient_graph500_25_lcc_unweighted_${type}.log + +bash bin/graph/clusteringcoefficient_run.sh graph500_22 avgcc unweighted ${is_raw} 2>&1 | tee -a logs/clusteringcoefficient_graph500_22_avgcc_unweighted_${type}.log +bash bin/graph/clusteringcoefficient_run.sh graph500_23 avgcc unweighted ${is_raw} 2>&1 | tee -a logs/clusteringcoefficient_graph500_23_avgcc_unweighted_${type}.log +bash bin/graph/clusteringcoefficient_run.sh graph500_24 avgcc unweighted no 2>&1 | tee -a logs/clusteringcoefficient_graph500_24_avgcc_unweighted_${type}.log +bash bin/graph/clusteringcoefficient_run.sh graph500_25 avgcc unweighted no 2>&1 | tee -a logs/clusteringcoefficient_graph500_25_avgcc_unweighted_${type}.log + +bash bin/graph/clusteringcoefficient_run.sh graph500_22 globalcc unweighted ${is_raw} 2>&1 | tee -a logs/clusteringcoefficient_graph500_22_globalcc_unweighted_${type}.log +bash bin/graph/clusteringcoefficient_run.sh graph500_23 globalcc unweighted ${is_raw} 2>&1 | tee -a logs/clusteringcoefficient_graph500_23_globalcc_unweighted_${type}.log +bash bin/graph/clusteringcoefficient_run.sh graph500_24 globalcc unweighted no 2>&1 | tee -a logs/clusteringcoefficient_graph500_24_globalcc_unweighted_${type}.log +bash bin/graph/clusteringcoefficient_run.sh graph500_25 globalcc unweighted no 2>&1 | tee -a logs/clusteringcoefficient_graph500_25_globalcc_unweighted_${type}.log # degree -./bin/graph/degree_run.sh mycielskian20 degrees ${is_raw} 2>&1 | tee -a logs/degree_mycielskian20_degrees_${type}.log -./bin/graph/degree_run.sh gap_kron degrees ${is_raw} 2>&1 | tee -a logs/degree_gap_kron_degrees_${type}.log -./bin/graph/degree_run.sh com_friendster degrees ${is_raw} 2>&1 | tee -a logs/degree_com_friendster_degrees_${type}.log +bash bin/graph/degree_run.sh mycielskian20 degrees ${is_raw} 2>&1 | tee -a logs/degree_mycielskian20_degrees_${type}.log +bash bin/graph/degree_run.sh gap_kron degrees ${is_raw} 2>&1 | tee -a logs/degree_gap_kron_degrees_${type}.log +bash bin/graph/degree_run.sh com_friendster degrees ${is_raw} 2>&1 | tee -a logs/degree_com_friendster_degrees_${type}.log -./bin/graph/degree_run.sh it_2004 inDegrees ${is_raw} 2>&1 | tee -a logs/degree_it_2004_inDegrees_${type}.log -./bin/graph/degree_run.sh twitter7 inDegrees ${is_raw} 2>&1 | tee -a 
logs/degree_twitter7_inDegrees_${type}.log -./bin/graph/degree_run.sh uk_2007_05 inDegrees ${is_raw} 2>&1 | tee -a logs/degree_uk_2007_05_inDegrees_${type}.log +bash bin/graph/degree_run.sh it_2004 inDegrees ${is_raw} 2>&1 | tee -a logs/degree_it_2004_inDegrees_${type}.log +bash bin/graph/degree_run.sh twitter7 inDegrees ${is_raw} 2>&1 | tee -a logs/degree_twitter7_inDegrees_${type}.log +bash bin/graph/degree_run.sh uk_2007_05 inDegrees ${is_raw} 2>&1 | tee -a logs/degree_uk_2007_05_inDegrees_${type}.log -./bin/graph/degree_run.sh it_2004 outDegrees ${is_raw} 2>&1 | tee -a logs/degree_it_2004_outDegrees_${type}.log -./bin/graph/degree_run.sh twitter7 outDegrees ${is_raw} 2>&1 | tee -a logs/degree_twitter7_outDegrees_${type}.log -./bin/graph/degree_run.sh uk_2007_05 outDegrees ${is_raw} 2>&1 | tee -a logs/degree_uk_2007_05_outDegrees_${type}.log +bash bin/graph/degree_run.sh it_2004 outDegrees ${is_raw} 2>&1 | tee -a logs/degree_it_2004_outDegrees_${type}.log +bash bin/graph/degree_run.sh twitter7 outDegrees ${is_raw} 2>&1 | tee -a logs/degree_twitter7_outDegrees_${type}.log +bash bin/graph/degree_run.sh uk_2007_05 outDegrees ${is_raw} 2>&1 | tee -a logs/degree_uk_2007_05_outDegrees_${type}.log # incpr -./bin/graph/incpr_run.sh twitter_2010 0.001 1 ${is_raw} 2>&1 | tee -a logs/incpr_twitter_2010_0.001_1_${type}.log -./bin/graph/incpr_run.sh twitter_2010 0.001 2 ${is_raw} 2>&1 | tee -a logs/incpr_twitter_2010_0.001_2_${type}.log -./bin/graph/incpr_run.sh twitter_2010 0.001 3 ${is_raw} 2>&1 | tee -a logs/incpr_twitter_2010_0.001_3_${type}.log -./bin/graph/incpr_run.sh twitter_2010 0.001 4 ${is_raw} 2>&1 | tee -a logs/incpr_twitter_2010_0.001_4_${type}.log -./bin/graph/incpr_run.sh twitter_2010 0.001 5 ${is_raw} 2>&1 | tee -a logs/incpr_twitter_2010_0.001_5_${type}.log - -./bin/graph/incpr_run.sh twitter_2010 0.01 1 ${is_raw} 2>&1 | tee -a logs/incpr_twitter_2010_0.01_1_${type}.log -./bin/graph/incpr_run.sh twitter_2010 0.01 2 ${is_raw} 2>&1 | tee -a logs/incpr_twitter_2010_0.01_2_${type}.log -./bin/graph/incpr_run.sh twitter_2010 0.01 3 ${is_raw} 2>&1 | tee -a logs/incpr_twitter_2010_0.01_3_${type}.log -./bin/graph/incpr_run.sh twitter_2010 0.01 4 ${is_raw} 2>&1 | tee -a logs/incpr_twitter_2010_0.01_4_${type}.log -./bin/graph/incpr_run.sh twitter_2010 0.01 5 ${is_raw} 2>&1 | tee -a logs/incpr_twitter_2010_0.01_5_${type}.log - -./bin/graph/incpr_run.sh twitter_2010 0.05 1 ${is_raw} 2>&1 | tee -a logs/incpr_twitter_2010_0.05_1_${type}.log -./bin/graph/incpr_run.sh twitter_2010 0.05 2 ${is_raw} 2>&1 | tee -a logs/incpr_twitter_2010_0.05_2_${type}.log -./bin/graph/incpr_run.sh twitter_2010 0.05 3 ${is_raw} 2>&1 | tee -a logs/incpr_twitter_2010_0.05_3_${type}.log -./bin/graph/incpr_run.sh twitter_2010 0.05 4 ${is_raw} 2>&1 | tee -a logs/incpr_twitter_2010_0.05_4_${type}.log -./bin/graph/incpr_run.sh twitter_2010 0.05 5 ${is_raw} 2>&1 | tee -a logs/incpr_twitter_2010_0.05_5_${type}.log +bash bin/graph/incpr_run.sh twitter_2010 0.001 1 ${is_raw} 2>&1 | tee -a logs/incpr_twitter_2010_0.001_1_${type}.log +bash bin/graph/incpr_run.sh twitter_2010 0.001 2 ${is_raw} 2>&1 | tee -a logs/incpr_twitter_2010_0.001_2_${type}.log +bash bin/graph/incpr_run.sh twitter_2010 0.001 3 ${is_raw} 2>&1 | tee -a logs/incpr_twitter_2010_0.001_3_${type}.log +bash bin/graph/incpr_run.sh twitter_2010 0.001 4 ${is_raw} 2>&1 | tee -a logs/incpr_twitter_2010_0.001_4_${type}.log +bash bin/graph/incpr_run.sh twitter_2010 0.001 5 ${is_raw} 2>&1 | tee -a logs/incpr_twitter_2010_0.001_5_${type}.log + +bash bin/graph/incpr_run.sh 
twitter_2010 0.01 1 ${is_raw} 2>&1 | tee -a logs/incpr_twitter_2010_0.01_1_${type}.log +bash bin/graph/incpr_run.sh twitter_2010 0.01 2 ${is_raw} 2>&1 | tee -a logs/incpr_twitter_2010_0.01_2_${type}.log +bash bin/graph/incpr_run.sh twitter_2010 0.01 3 ${is_raw} 2>&1 | tee -a logs/incpr_twitter_2010_0.01_3_${type}.log +bash bin/graph/incpr_run.sh twitter_2010 0.01 4 ${is_raw} 2>&1 | tee -a logs/incpr_twitter_2010_0.01_4_${type}.log +bash bin/graph/incpr_run.sh twitter_2010 0.01 5 ${is_raw} 2>&1 | tee -a logs/incpr_twitter_2010_0.01_5_${type}.log + +bash bin/graph/incpr_run.sh twitter_2010 0.05 1 ${is_raw} 2>&1 | tee -a logs/incpr_twitter_2010_0.05_1_${type}.log +bash bin/graph/incpr_run.sh twitter_2010 0.05 2 ${is_raw} 2>&1 | tee -a logs/incpr_twitter_2010_0.05_2_${type}.log +bash bin/graph/incpr_run.sh twitter_2010 0.05 3 ${is_raw} 2>&1 | tee -a logs/incpr_twitter_2010_0.05_3_${type}.log +bash bin/graph/incpr_run.sh twitter_2010 0.05 4 ${is_raw} 2>&1 | tee -a logs/incpr_twitter_2010_0.05_4_${type}.log +bash bin/graph/incpr_run.sh twitter_2010 0.05 5 ${is_raw} 2>&1 | tee -a logs/incpr_twitter_2010_0.05_5_${type}.log # kcore -./bin/graph/kcore_run.sh graph500_22 ${is_raw} 2>&1 | tee -a logs/kcore_graph500_22_${type}.log -./bin/graph/kcore_run.sh graph500_23 ${is_raw} 2>&1 | tee -a logs/kcore_graph500_23_${type}.log -./bin/graph/kcore_run.sh graph500_25 ${is_raw} 2>&1 | tee -a logs/kcore_graph500_25_${type}.log -./bin/graph/kcore_run.sh graph500_26 ${is_raw} 2>&1 | tee -a logs/kcore_graph500_26_${type}.log +bash bin/graph/kcore_run.sh graph500_22 ${is_raw} 2>&1 | tee -a logs/kcore_graph500_22_${type}.log +bash bin/graph/kcore_run.sh graph500_23 ${is_raw} 2>&1 | tee -a logs/kcore_graph500_23_${type}.log +bash bin/graph/kcore_run.sh graph500_25 ${is_raw} 2>&1 | tee -a logs/kcore_graph500_25_${type}.log +bash bin/graph/kcore_run.sh graph500_26 ${is_raw} 2>&1 | tee -a logs/kcore_graph500_26_${type}.log # louvain -./bin/graph/louvain_run.sh graph500_22 ${is_raw} 2>&1 | tee -a logs/louvain_graph500_22_${type}.log -./bin/graph/louvain_run.sh graph500_24 ${is_raw} 2>&1 | tee -a logs/louvain_graph500_24_${type}.log -./bin/graph/louvain_run.sh graph500_25 ${is_raw} 2>&1 | tee -a logs/louvain_graph500_25_${type}.log +bash bin/graph/louvain_run.sh graph500_22 ${is_raw} 2>&1 | tee -a logs/louvain_graph500_22_${type}.log +bash bin/graph/louvain_run.sh graph500_24 ${is_raw} 2>&1 | tee -a logs/louvain_graph500_24_${type}.log +bash bin/graph/louvain_run.sh graph500_25 ${is_raw} 2>&1 | tee -a logs/louvain_graph500_25_${type}.log -./bin/graph/louvain_run.sh cit_patents no 2>&1 | tee -a logs/louvain_cit_patents_${type}.log -./bin/graph/louvain_run.sh uk_2002 no 2>&1 | tee -a logs/louvain_uk_2002_${type}.log -./bin/graph/louvain_run.sh arabic_2005 no 2>&1 | tee -a logs/louvain_arabic_2005_${type}.log +bash bin/graph/louvain_run.sh cit_patents no 2>&1 | tee -a logs/louvain_cit_patents_${type}.log +bash bin/graph/louvain_run.sh uk_2002 no 2>&1 | tee -a logs/louvain_uk_2002_${type}.log +bash bin/graph/louvain_run.sh arabic_2005 no 2>&1 | tee -a logs/louvain_arabic_2005_${type}.log # lpa -./bin/graph/lpa_run.sh graph500_22 runConvergence no 2>&1 | tee -a logs/lpa_graph500_22_runConvergence_${type}.log -./bin/graph/lpa_run.sh graph500_24 runConvergence no 2>&1 | tee -a logs/lpa_graph500_24_runConvergence_${type}.log -./bin/graph/lpa_run.sh graph500_25 runConvergence no 2>&1 | tee -a logs/lpa_graph500_25_runConvergence_${type}.log +bash bin/graph/lpa_run.sh graph500_22 runConvergence no 2>&1 | tee -a 
logs/lpa_graph500_22_runConvergence_${type}.log +bash bin/graph/lpa_run.sh graph500_24 runConvergence no 2>&1 | tee -a logs/lpa_graph500_24_runConvergence_${type}.log +bash bin/graph/lpa_run.sh graph500_25 runConvergence no 2>&1 | tee -a logs/lpa_graph500_25_runConvergence_${type}.log -./bin/graph/lpa_run.sh graph500_22 run ${is_raw} 2>&1 | tee -a logs/lpa_graph500_22_run_${type}.log -./bin/graph/lpa_run.sh graph500_24 run ${is_raw} 2>&1 | tee -a logs/lpa_graph500_24_run_${type}.log -./bin/graph/lpa_run.sh graph500_25 run ${is_raw} 2>&1 | tee -a logs/lpa_graph500_25_run_${type}.log +bash bin/graph/lpa_run.sh graph500_22 run ${is_raw} 2>&1 | tee -a logs/lpa_graph500_22_run_${type}.log +bash bin/graph/lpa_run.sh graph500_24 run ${is_raw} 2>&1 | tee -a logs/lpa_graph500_24_run_${type}.log +bash bin/graph/lpa_run.sh graph500_25 run ${is_raw} 2>&1 | tee -a logs/lpa_graph500_25_run_${type}.log # mce -./bin/graph/mce_run.sh graph500_23 2>&1 | tee -a logs/mce_graph500_23_${type}.log -./bin/graph/mce_run.sh graph500_24 2>&1 | tee -a logs/mce_graph500_24_${type}.log -./bin/graph/mce_run.sh graph500_25 2>&1 | tee -a logs/mce_graph500_25_${type}.log +bash bin/graph/mce_run.sh graph500_23 2>&1 | tee -a logs/mce_graph500_23_${type}.log +bash bin/graph/mce_run.sh graph500_24 2>&1 | tee -a logs/mce_graph500_24_${type}.log +bash bin/graph/mce_run.sh graph500_25 2>&1 | tee -a logs/mce_graph500_25_${type}.log # modularity -./bin/graph/modularity_run.sh graph500_23 ${is_raw} 2>&1 | tee -a logs/modularity_graph500_23_${type}.log -./bin/graph/modularity_run.sh graph500_25 ${is_raw} 2>&1 | tee -a logs/modularity_graph500_25_${type}.log -./bin/graph/modularity_run.sh graph500_26 ${is_raw} 2>&1 | tee -a logs/modularity_graph500_26_${type}.log +bash bin/graph/modularity_run.sh graph500_23 ${is_raw} 2>&1 | tee -a logs/modularity_graph500_23_${type}.log +bash bin/graph/modularity_run.sh graph500_25 ${is_raw} 2>&1 | tee -a logs/modularity_graph500_25_${type}.log +bash bin/graph/modularity_run.sh graph500_26 ${is_raw} 2>&1 | tee -a logs/modularity_graph500_26_${type}.log -./bin/graph/modularity_run.sh uk_2002 no 2>&1 | tee -a logs/modularity_uk_${type}.log -./bin/graph/modularity_run.sh arabic_2005 no 2>&1 | tee -a logs/modularity_arabic_${type}.log -./bin/graph/modularity_run.sh twitter no 2>&1 | tee -a logs/modularity_twitter_${type}.log +bash bin/graph/modularity_run.sh uk_2002 no 2>&1 | tee -a logs/modularity_uk_${type}.log +bash bin/graph/modularity_run.sh arabic_2005 no 2>&1 | tee -a logs/modularity_arabic_${type}.log +bash bin/graph/modularity_run.sh twitter no 2>&1 | tee -a logs/modularity_twitter_${type}.log # mssp -./bin/graph/mssp_run.sh soc_liveJournal 5 ${is_raw} 2>&1 | tee -a logs/mssp_liveJournal_5_${type}.log -./bin/graph/mssp_run.sh uk_2002 5 ${is_raw} 2>&1 | tee -a logs/mssp_uk_2002_5_${type}.log -./bin/graph/mssp_run.sh arabic_2005 5 ${is_raw} 2>&1 | tee -a logs/mssp_arabic_2005_5_${type}.log +bash bin/graph/mssp_run.sh soc_liveJournal 5 ${is_raw} 2>&1 | tee -a logs/mssp_liveJournal_5_${type}.log +bash bin/graph/mssp_run.sh uk_2002 5 ${is_raw} 2>&1 | tee -a logs/mssp_uk_2002_5_${type}.log +bash bin/graph/mssp_run.sh arabic_2005 5 ${is_raw} 2>&1 | tee -a logs/mssp_arabic_2005_5_${type}.log -./bin/graph/mssp_run.sh soc_liveJournal 50 ${is_raw} 2>&1 | tee -a logs/mssp_liveJournal_50_${type}.log -./bin/graph/mssp_run.sh uk_2002 50 ${is_raw} 2>&1 | tee -a logs/mssp_uk_2002_50_${type}.log -./bin/graph/mssp_run.sh arabic_2005 50 ${is_raw} 2>&1 | tee -a logs/mssp_arabic_2005_50_${type}.log +bash 
bin/graph/mssp_run.sh soc_liveJournal 50 ${is_raw} 2>&1 | tee -a logs/mssp_liveJournal_50_${type}.log +bash bin/graph/mssp_run.sh uk_2002 50 ${is_raw} 2>&1 | tee -a logs/mssp_uk_2002_50_${type}.log +bash bin/graph/mssp_run.sh arabic_2005 50 ${is_raw} 2>&1 | tee -a logs/mssp_arabic_2005_50_${type}.log # node2vec -./bin/graph/node2vec_run.sh cit_patents ${is_raw} ${is_check} 2>&1 | tee -a logs/node2vec_cit_patents_${type}.log -./bin/graph/node2vec_run.sh soc_liveJournal no ${is_check} 2>&1 | tee -a logs/node2vec_soc_liveJournal_${type}.log -./bin/graph/node2vec_run.sh uk_2002 no ${is_check} 2>&1 | tee -a logs/node2vec_uk_2002_${type}.log +bash bin/graph/node2vec_run.sh cit_patents ${is_raw} ${is_check} 2>&1 | tee -a logs/node2vec_cit_patents_${type}.log +bash bin/graph/node2vec_run.sh soc_liveJournal no ${is_check} 2>&1 | tee -a logs/node2vec_soc_liveJournal_${type}.log +bash bin/graph/node2vec_run.sh uk_2002 no ${is_check} 2>&1 | tee -a logs/node2vec_uk_2002_${type}.log # ppr -./bin/graph/ppr_run.sh cit_patents fixMS ${is_raw} 1 2>&1 | tee -a logs/ppr_cit_patents_fixMS_1_${type}.log -./bin/graph/ppr_run.sh cit_patents fixMS ${is_raw} 5 2>&1 | tee -a logs/ppr_cit_patents_fixMS_5_${type}.log -./bin/graph/ppr_run.sh cit_patents fixMS ${is_raw} 10 2>&1 | tee -a logs/ppr_cit_patents_fixMS_10_${type}.log -./bin/graph/ppr_run.sh cit_patents fixMS ${is_raw} 50 2>&1 | tee -a logs/ppr_cit_patents_fixMS_50_${type}.log -./bin/graph/ppr_run.sh cit_patents fixMS ${is_raw} 100 2>&1 | tee -a logs/ppr_cit_patents_fixMS_100_${type}.log - -./bin/graph/ppr_run.sh uk_2002 fixMS ${is_raw} 1 2>&1 | tee -a logs/ppr_uk_2002_fixMS_1_${type}.log -./bin/graph/ppr_run.sh uk_2002 fixMS ${is_raw} 5 2>&1 | tee -a logs/ppr_uk_2002_fixMS_5_${type}.log -./bin/graph/ppr_run.sh uk_2002 fixMS ${is_raw} 10 2>&1 | tee -a logs/ppr_uk_2002_fixMS_10_${type}.log -./bin/graph/ppr_run.sh uk_2002 fixMS ${is_raw} 50 2>&1 | tee -a logs/ppr_uk_2002_fixMS_50_${type}.log -./bin/graph/ppr_run.sh uk_2002 fixMS ${is_raw} 100 2>&1 | tee -a logs/ppr_uk_2002_fixMS_100_${type}.log - -./bin/graph/ppr_run.sh arabic_2005 fixMS ${is_raw} 1 2>&1 | tee -a logs/ppr_arabic_2005_fixMS_1_${type}.log -./bin/graph/ppr_run.sh arabic_2005 fixMS ${is_raw} 5 2>&1 | tee -a logs/ppr_arabic_2005_fixMS_5_${type}.log -./bin/graph/ppr_run.sh arabic_2005 fixMS ${is_raw} 10 2>&1 | tee -a logs/ppr_arabic_2005_fixMS_10_${type}.log -./bin/graph/ppr_run.sh arabic_2005 fixMS ${is_raw} 50 2>&1 | tee -a logs/ppr_arabic_2005_fixMS_50_${type}.log -./bin/graph/ppr_run.sh arabic_2005 fixMS ${is_raw} 100 2>&1 | tee -a logs/ppr_arabic_2005_fixMS_100_${type}.log - -./bin/graph/ppr_run.sh cit_patents fixSS ${is_raw} 2>&1 | tee -a logs/ppr_cit_patents_fixSS_${type}.log -./bin/graph/ppr_run.sh uk_2002 fixSS ${is_raw} 2>&1 | tee -a logs/ppr_uk_2002_fixSS_${type}.log -./bin/graph/ppr_run.sh arabic_2005 fixSS ${is_raw} 2>&1 | tee -a logs/ppr_arabic_2005_fixSS_${type}.log - -./bin/graph/ppr_run.sh cit_patents conSS ${is_raw} 2>&1 | tee -a logs/ppr_cit_patents_conSS_${type}.log -./bin/graph/ppr_run.sh uk_2002 conSS ${is_raw} 2>&1 | tee -a logs/ppr_uk_2002_conSS_${type}.log -./bin/graph/ppr_run.sh arabic_2005 conSS ${is_raw} 2>&1 | tee -a logs/ppr_arabic_2005_conSS_${type}.log +bash bin/graph/ppr_run.sh cit_patents fixMS ${is_raw} 1 2>&1 | tee -a logs/ppr_cit_patents_fixMS_1_${type}.log +bash bin/graph/ppr_run.sh cit_patents fixMS ${is_raw} 5 2>&1 | tee -a logs/ppr_cit_patents_fixMS_5_${type}.log +bash bin/graph/ppr_run.sh cit_patents fixMS ${is_raw} 10 2>&1 | tee -a 
logs/ppr_cit_patents_fixMS_10_${type}.log +bash bin/graph/ppr_run.sh cit_patents fixMS ${is_raw} 50 2>&1 | tee -a logs/ppr_cit_patents_fixMS_50_${type}.log +bash bin/graph/ppr_run.sh cit_patents fixMS ${is_raw} 100 2>&1 | tee -a logs/ppr_cit_patents_fixMS_100_${type}.log + +bash bin/graph/ppr_run.sh uk_2002 fixMS ${is_raw} 1 2>&1 | tee -a logs/ppr_uk_2002_fixMS_1_${type}.log +bash bin/graph/ppr_run.sh uk_2002 fixMS ${is_raw} 5 2>&1 | tee -a logs/ppr_uk_2002_fixMS_5_${type}.log +bash bin/graph/ppr_run.sh uk_2002 fixMS ${is_raw} 10 2>&1 | tee -a logs/ppr_uk_2002_fixMS_10_${type}.log +bash bin/graph/ppr_run.sh uk_2002 fixMS ${is_raw} 50 2>&1 | tee -a logs/ppr_uk_2002_fixMS_50_${type}.log +bash bin/graph/ppr_run.sh uk_2002 fixMS ${is_raw} 100 2>&1 | tee -a logs/ppr_uk_2002_fixMS_100_${type}.log + +bash bin/graph/ppr_run.sh arabic_2005 fixMS ${is_raw} 1 2>&1 | tee -a logs/ppr_arabic_2005_fixMS_1_${type}.log +bash bin/graph/ppr_run.sh arabic_2005 fixMS ${is_raw} 5 2>&1 | tee -a logs/ppr_arabic_2005_fixMS_5_${type}.log +bash bin/graph/ppr_run.sh arabic_2005 fixMS ${is_raw} 10 2>&1 | tee -a logs/ppr_arabic_2005_fixMS_10_${type}.log +bash bin/graph/ppr_run.sh arabic_2005 fixMS ${is_raw} 50 2>&1 | tee -a logs/ppr_arabic_2005_fixMS_50_${type}.log +bash bin/graph/ppr_run.sh arabic_2005 fixMS ${is_raw} 100 2>&1 | tee -a logs/ppr_arabic_2005_fixMS_100_${type}.log + +bash bin/graph/ppr_run.sh cit_patents fixSS ${is_raw} 2>&1 | tee -a logs/ppr_cit_patents_fixSS_${type}.log +bash bin/graph/ppr_run.sh uk_2002 fixSS ${is_raw} 2>&1 | tee -a logs/ppr_uk_2002_fixSS_${type}.log +bash bin/graph/ppr_run.sh arabic_2005 fixSS ${is_raw} 2>&1 | tee -a logs/ppr_arabic_2005_fixSS_${type}.log + +bash bin/graph/ppr_run.sh cit_patents conSS ${is_raw} 2>&1 | tee -a logs/ppr_cit_patents_conSS_${type}.log +bash bin/graph/ppr_run.sh uk_2002 conSS ${is_raw} 2>&1 | tee -a logs/ppr_uk_2002_conSS_${type}.log +bash bin/graph/ppr_run.sh arabic_2005 conSS ${is_raw} 2>&1 | tee -a logs/ppr_arabic_2005_conSS_${type}.log # pr -./bin/graph/pr_run.sh cit_patents run ${is_raw} 2>&1 | tee -a logs/pr_cit_patents_run_${type}.log -./bin/graph/pr_run.sh uk_2002 run ${is_raw} 2>&1 | tee -a logs/pr_uk_2002_run_${type}.log -./bin/graph/pr_run.sh arabic_2005 run ${is_raw} 2>&1 | tee -a logs/pr_arabic_2005_run_${type}.log +bash bin/graph/pr_run.sh cit_patents run ${is_raw} 2>&1 | tee -a logs/pr_cit_patents_run_${type}.log +bash bin/graph/pr_run.sh uk_2002 run ${is_raw} 2>&1 | tee -a logs/pr_uk_2002_run_${type}.log +bash bin/graph/pr_run.sh arabic_2005 run ${is_raw} 2>&1 | tee -a logs/pr_arabic_2005_run_${type}.log -./bin/graph/pr_run.sh cit_patents runUntilConvergence ${is_raw} 2>&1 | tee -a logs/pr_cit_patents_runUntilConvergence_${type}.log -./bin/graph/pr_run.sh uk_2002 runUntilConvergence ${is_raw} 2>&1 | tee -a logs/pr_uk_2002_runUntilConvergence_${type}.log -./bin/graph/pr_run.sh arabic_2005 runUntilConvergence ${is_raw} 2>&1 | tee -a logs/pr_arabic_2005_runUntilConvergence_${type}.log +bash bin/graph/pr_run.sh cit_patents runUntilConvergence ${is_raw} 2>&1 | tee -a logs/pr_cit_patents_runUntilConvergence_${type}.log +bash bin/graph/pr_run.sh uk_2002 runUntilConvergence ${is_raw} 2>&1 | tee -a logs/pr_uk_2002_runUntilConvergence_${type}.log +bash bin/graph/pr_run.sh arabic_2005 runUntilConvergence ${is_raw} 2>&1 | tee -a logs/pr_arabic_2005_runUntilConvergence_${type}.log # scc -./bin/graph/scc_run.sh cit_patents ${is_raw} 2>&1 | tee -a logs/scc_cit_patents_${type}.log -./bin/graph/scc_run.sh enwiki_2018 ${is_raw} 2>&1 | tee -a 
logs/scc_enwiki_2018_${type}.log -./bin/graph/scc_run.sh arabic_2005 ${is_raw} 2>&1 | tee -a logs/scc_arabic_2005_${type}.log +bash bin/graph/scc_run.sh cit_patents ${is_raw} 2>&1 | tee -a logs/scc_cit_patents_${type}.log +bash bin/graph/scc_run.sh enwiki_2018 ${is_raw} 2>&1 | tee -a logs/scc_enwiki_2018_${type}.log +bash bin/graph/scc_run.sh arabic_2005 ${is_raw} 2>&1 | tee -a logs/scc_arabic_2005_${type}.log # sgm -./bin/graph/sgm_run.sh graph500_19 4dgn Identical no 2>&1 | tee -a logs/sgm_graph500_19_4dgn_Identical_${type}.log -./bin/graph/sgm_run.sh graph500_19 4sqr Identical no 2>&1 | tee -a logs/sgm_graph500_19_4sqr_Identical_${type}.log -./bin/graph/sgm_run.sh graph500_19 5tree Identical no 2>&1 | tee -a logs/sgm_graph500_19_5tree_Identical_${type}.log -./bin/graph/sgm_run.sh graph500_19 6star Identical no 2>&1 | tee -a logs/sgm_graph500_19_6star_Identical_${type}.log - -./bin/graph/sgm_run.sh liveJournal 4dgn Identical no 2>&1 | tee -a logs/sgm_liveJournal_4dgn_Identical_${type}.log -./bin/graph/sgm_run.sh liveJournal 4sqr Identical no 2>&1 | tee -a logs/sgm_liveJournal_4sqr_Identical_${type}.log -./bin/graph/sgm_run.sh liveJournal 5tree Identical no 2>&1 | tee -a logs/sgm_liveJournal_5tree_Identical_${type}.log -./bin/graph/sgm_run.sh liveJournal 6star Identical no 2>&1 | tee -a logs/sgm_liveJournal_6star_Identical_${type}.log - -./bin/graph/sgm_run.sh com_orkut 4dgn Identical no 2>&1 | tee -a logs/sgm_com_orkut_4dgn_Identical_${type}.log -./bin/graph/sgm_run.sh com_orkut 4sqr Identical no 2>&1 | tee -a logs/sgm_com_orkut_4sqr_Identical_${type}.log -./bin/graph/sgm_run.sh com_orkut 5tree Identical no 2>&1 | tee -a logs/sgm_com_orkut_5tree_Identical_${type}.log -./bin/graph/sgm_run.sh com_orkut 6star Identical no 2>&1 | tee -a logs/sgm_com_orkut_6star_Identical_${type}.log - -./bin/graph/sgm_run.sh graph500_19 4dgn unIdentical ${is_raw} 2>&1 | tee -a logs/sgm_graph500_19_unIdentical_4dgn_${type}.log -./bin/graph/sgm_run.sh graph500_19 4clique unIdentical ${is_raw} 2>&1 | tee -a logs/sgm_graph500_19_unIdentical_4clique_${type}.log -./bin/graph/sgm_run.sh graph500_19 5clique unIdentical ${is_raw} 2>&1 | tee -a logs/sgm_graph500_19_unIdentical_5clique_${type}.log -./bin/graph/sgm_run.sh graph500_19 6clique unIdentical ${is_raw} 2>&1 | tee -a logs/sgm_graph500_19_unIdentical_6clique_${type}.log - -./bin/graph/sgm_run.sh liveJournal 4dgn unIdentical ${is_raw} 2>&1 | tee -a logs/sgm_liveJournal_unIdentical_4dgn_${type}.log -./bin/graph/sgm_run.sh liveJournal 4clique unIdentical ${is_raw} 2>&1 | tee -a logs/sgm_liveJournal_unIdentical_4clique_${type}.log -./bin/graph/sgm_run.sh liveJournal 5clique unIdentical ${is_raw} 2>&1 | tee -a logs/sgm_liveJournal_unIdentical_5clique_${type}.log -./bin/graph/sgm_run.sh liveJournal 6clique unIdentical ${is_raw} 2>&1 | tee -a logs/sgm_liveJournal_unIdentical_6clique_${type}.log - -./bin/graph/sgm_run.sh com_orkut 4dgn unIdentical ${is_raw} 2>&1 | tee -a logs/sgm_com_orkut_unIdentical_4dgn_${type}.log -./bin/graph/sgm_run.sh com_orkut 4clique unIdentical ${is_raw} 2>&1 | tee -a logs/sgm_com_orkut_unIdentical_4clique_${type}.log -./bin/graph/sgm_run.sh com_orkut 5clique unIdentical ${is_raw} 2>&1 | tee -a logs/sgm_com_orkut_unIdentical_5clique_${type}.log -./bin/graph/sgm_run.sh com_orkut 6clique unIdentical ${is_raw} 2>&1 | tee -a logs/sgm_com_orkut_unIdentical_6clique_${type}.log +bash bin/graph/sgm_run.sh graph500_19 4dgn Identical no 2>&1 | tee -a logs/sgm_graph500_19_4dgn_Identical_${type}.log +bash bin/graph/sgm_run.sh graph500_19 4sqr Identical 
no 2>&1 | tee -a logs/sgm_graph500_19_4sqr_Identical_${type}.log +bash bin/graph/sgm_run.sh graph500_19 5tree Identical no 2>&1 | tee -a logs/sgm_graph500_19_5tree_Identical_${type}.log +bash bin/graph/sgm_run.sh graph500_19 6star Identical no 2>&1 | tee -a logs/sgm_graph500_19_6star_Identical_${type}.log + +bash bin/graph/sgm_run.sh liveJournal 4dgn Identical no 2>&1 | tee -a logs/sgm_liveJournal_4dgn_Identical_${type}.log +bash bin/graph/sgm_run.sh liveJournal 4sqr Identical no 2>&1 | tee -a logs/sgm_liveJournal_4sqr_Identical_${type}.log +bash bin/graph/sgm_run.sh liveJournal 5tree Identical no 2>&1 | tee -a logs/sgm_liveJournal_5tree_Identical_${type}.log +bash bin/graph/sgm_run.sh liveJournal 6star Identical no 2>&1 | tee -a logs/sgm_liveJournal_6star_Identical_${type}.log + +bash bin/graph/sgm_run.sh com_orkut 4dgn Identical no 2>&1 | tee -a logs/sgm_com_orkut_4dgn_Identical_${type}.log +bash bin/graph/sgm_run.sh com_orkut 4sqr Identical no 2>&1 | tee -a logs/sgm_com_orkut_4sqr_Identical_${type}.log +bash bin/graph/sgm_run.sh com_orkut 5tree Identical no 2>&1 | tee -a logs/sgm_com_orkut_5tree_Identical_${type}.log +bash bin/graph/sgm_run.sh com_orkut 6star Identical no 2>&1 | tee -a logs/sgm_com_orkut_6star_Identical_${type}.log + +bash bin/graph/sgm_run.sh graph500_19 4dgn unIdentical ${is_raw} 2>&1 | tee -a logs/sgm_graph500_19_unIdentical_4dgn_${type}.log +bash bin/graph/sgm_run.sh graph500_19 4clique unIdentical ${is_raw} 2>&1 | tee -a logs/sgm_graph500_19_unIdentical_4clique_${type}.log +bash bin/graph/sgm_run.sh graph500_19 5clique unIdentical ${is_raw} 2>&1 | tee -a logs/sgm_graph500_19_unIdentical_5clique_${type}.log +bash bin/graph/sgm_run.sh graph500_19 6clique unIdentical ${is_raw} 2>&1 | tee -a logs/sgm_graph500_19_unIdentical_6clique_${type}.log + +bash bin/graph/sgm_run.sh liveJournal 4dgn unIdentical ${is_raw} 2>&1 | tee -a logs/sgm_liveJournal_unIdentical_4dgn_${type}.log +bash bin/graph/sgm_run.sh liveJournal 4clique unIdentical ${is_raw} 2>&1 | tee -a logs/sgm_liveJournal_unIdentical_4clique_${type}.log +bash bin/graph/sgm_run.sh liveJournal 5clique unIdentical ${is_raw} 2>&1 | tee -a logs/sgm_liveJournal_unIdentical_5clique_${type}.log +bash bin/graph/sgm_run.sh liveJournal 6clique unIdentical ${is_raw} 2>&1 | tee -a logs/sgm_liveJournal_unIdentical_6clique_${type}.log + +bash bin/graph/sgm_run.sh com_orkut 4dgn unIdentical ${is_raw} 2>&1 | tee -a logs/sgm_com_orkut_unIdentical_4dgn_${type}.log +bash bin/graph/sgm_run.sh com_orkut 4clique unIdentical ${is_raw} 2>&1 | tee -a logs/sgm_com_orkut_unIdentical_4clique_${type}.log +bash bin/graph/sgm_run.sh com_orkut 5clique unIdentical ${is_raw} 2>&1 | tee -a logs/sgm_com_orkut_unIdentical_5clique_${type}.log +bash bin/graph/sgm_run.sh com_orkut 6clique unIdentical ${is_raw} 2>&1 | tee -a logs/sgm_com_orkut_unIdentical_6clique_${type}.log # tc -./bin/graph/tc_run.sh graph500_22 run ${is_raw} 2>&1 | tee -a logs/tc_graph500_22_run_${type}.log -./bin/graph/tc_run.sh graph500_23 run ${is_raw} 2>&1 | tee -a logs/tc_graph500_23_run_${type}.log -./bin/graph/tc_run.sh graph500_24 run ${is_raw} 2>&1 | tee -a logs/tc_graph500_24_run_${type}.log -./bin/graph/tc_run.sh graph500_25 run ${is_raw} 2>&1 | tee -a logs/tc_graph500_25_run_${type}.log -./bin/graph/tc_run.sh graph500_26 run ${is_raw} 2>&1 | tee -a logs/tc_graph500_26_run_${type}.log - -./bin/graph/tc_run.sh graph500_22 preCanonical ${is_raw} 2>&1 | tee -a logs/tc_graph500_22_preCanonical_${type}.log -./bin/graph/tc_run.sh graph500_23 preCanonical ${is_raw} 2>&1 | tee -a 
logs/tc_graph500_23_preCanonical_${type}.log -./bin/graph/tc_run.sh graph500_24 preCanonical ${is_raw} 2>&1 | tee -a logs/tc_graph500_24_preCanonical_${type}.log -./bin/graph/tc_run.sh graph500_25 preCanonical ${is_raw} 2>&1 | tee -a logs/tc_graph500_25_preCanonical_${type}.log -./bin/graph/tc_run.sh graph500_26 preCanonical ${is_raw} 2>&1 | tee -a logs/tc_graph500_26_preCanonical_${type}.log +bash bin/graph/tc_run.sh graph500_22 run ${is_raw} 2>&1 | tee -a logs/tc_graph500_22_run_${type}.log +bash bin/graph/tc_run.sh graph500_23 run ${is_raw} 2>&1 | tee -a logs/tc_graph500_23_run_${type}.log +bash bin/graph/tc_run.sh graph500_24 run ${is_raw} 2>&1 | tee -a logs/tc_graph500_24_run_${type}.log +bash bin/graph/tc_run.sh graph500_25 run ${is_raw} 2>&1 | tee -a logs/tc_graph500_25_run_${type}.log +bash bin/graph/tc_run.sh graph500_26 run ${is_raw} 2>&1 | tee -a logs/tc_graph500_26_run_${type}.log + +bash bin/graph/tc_run.sh graph500_22 preCanonical ${is_raw} 2>&1 | tee -a logs/tc_graph500_22_preCanonical_${type}.log +bash bin/graph/tc_run.sh graph500_23 preCanonical ${is_raw} 2>&1 | tee -a logs/tc_graph500_23_preCanonical_${type}.log +bash bin/graph/tc_run.sh graph500_24 preCanonical ${is_raw} 2>&1 | tee -a logs/tc_graph500_24_preCanonical_${type}.log +bash bin/graph/tc_run.sh graph500_25 preCanonical ${is_raw} 2>&1 | tee -a logs/tc_graph500_25_preCanonical_${type}.log +bash bin/graph/tc_run.sh graph500_26 preCanonical ${is_raw} 2>&1 | tee -a logs/tc_graph500_26_preCanonical_${type}.log # tpr -./bin/graph/tpr_run.sh twitter_tpr ${is_raw} 2>&1 | tee -a logs/tpr_twitter_${type}.log +bash bin/graph/tpr_run.sh twitter_tpr ${is_raw} 2>&1 | tee -a logs/tpr_twitter_${type}.log # tr -./bin/graph/tr_run.sh cit_patents run 100 2>&1 | tee -a logs/tr_cit_patents_run_100_${type}.log -./bin/graph/tr_run.sh cit_patents run 500 2>&1 | tee -a logs/tr_cit_patents_run_500_${type}.log -./bin/graph/tr_run.sh cit_patents run 1000 2>&1 | tee -a logs/tr_cit_patents_run_1000_${type}.log -./bin/graph/tr_run.sh uk_2002 run 100 2>&1 | tee -a logs/tr_uk_2002_run_100_${type}.log -./bin/graph/tr_run.sh uk_2002 run 500 2>&1 | tee -a logs/tr_uk_2002_run_500_${type}.log -./bin/graph/tr_run.sh uk_2002 run 1000 2>&1 | tee -a logs/tr_uk_2002_run_1000_${type}.log -./bin/graph/tr_run.sh arabic_2005 run 100 2>&1 | tee -a logs/tr_arabic_2005_run_100_${type}.log -./bin/graph/tr_run.sh arabic_2005 run 500 2>&1 | tee -a logs/tr_arabic_2005_run_500_${type}.log -./bin/graph/tr_run.sh arabic_2005 run 1000 2>&1 | tee -a logs/tr_arabic_2005_run_1000_${type}.log - -./bin/graph/tr_run.sh cit_patents runUntilConvergence 100 2>&1 | tee -a logs/tr_cit_patents_runUntilConvergence_100_${type}.log -./bin/graph/tr_run.sh cit_patents runUntilConvergence 500 2>&1 | tee -a logs/tr_cit_patents_runUntilConvergence_500_${type}.log -./bin/graph/tr_run.sh cit_patents runUntilConvergence 1000 2>&1 | tee -a logs/tr_cit_patents_runUntilConvergence_1000_${type}.log -./bin/graph/tr_run.sh uk_2002 runUntilConvergence 100 2>&1 | tee -a logs/tr_uk_2002_runUntilConvergence_100_${type}.log -./bin/graph/tr_run.sh uk_2002 runUntilConvergence 500 2>&1 | tee -a logs/tr_uk_2002_runUntilConvergence_500_${type}.log -./bin/graph/tr_run.sh uk_2002 runUntilConvergence 1000 2>&1 | tee -a logs/tr_uk_2002_runUntilConvergence_1000_${type}.log -./bin/graph/tr_run.sh arabic_2005 runUntilConvergence 100 2>&1 | tee -a logs/tr_arabic_2005_runUntilConvergence_100_${type}.log -./bin/graph/tr_run.sh arabic_2005 runUntilConvergence 500 2>&1 | tee -a 
logs/tr_arabic_2005_runUntilConvergence_500_${type}.log -./bin/graph/tr_run.sh arabic_2005 runUntilConvergence 1000 2>&1 | tee -a logs/tr_arabic_2005_runUntilConvergence_1000_${type}.log +bash bin/graph/tr_run.sh cit_patents run 100 2>&1 | tee -a logs/tr_cit_patents_run_100_${type}.log +bash bin/graph/tr_run.sh cit_patents run 500 2>&1 | tee -a logs/tr_cit_patents_run_500_${type}.log +bash bin/graph/tr_run.sh cit_patents run 1000 2>&1 | tee -a logs/tr_cit_patents_run_1000_${type}.log +bash bin/graph/tr_run.sh uk_2002 run 100 2>&1 | tee -a logs/tr_uk_2002_run_100_${type}.log +bash bin/graph/tr_run.sh uk_2002 run 500 2>&1 | tee -a logs/tr_uk_2002_run_500_${type}.log +bash bin/graph/tr_run.sh uk_2002 run 1000 2>&1 | tee -a logs/tr_uk_2002_run_1000_${type}.log +bash bin/graph/tr_run.sh arabic_2005 run 100 2>&1 | tee -a logs/tr_arabic_2005_run_100_${type}.log +bash bin/graph/tr_run.sh arabic_2005 run 500 2>&1 | tee -a logs/tr_arabic_2005_run_500_${type}.log +bash bin/graph/tr_run.sh arabic_2005 run 1000 2>&1 | tee -a logs/tr_arabic_2005_run_1000_${type}.log + +bash bin/graph/tr_run.sh cit_patents runUntilConvergence 100 2>&1 | tee -a logs/tr_cit_patents_runUntilConvergence_100_${type}.log +bash bin/graph/tr_run.sh cit_patents runUntilConvergence 500 2>&1 | tee -a logs/tr_cit_patents_runUntilConvergence_500_${type}.log +bash bin/graph/tr_run.sh cit_patents runUntilConvergence 1000 2>&1 | tee -a logs/tr_cit_patents_runUntilConvergence_1000_${type}.log +bash bin/graph/tr_run.sh uk_2002 runUntilConvergence 100 2>&1 | tee -a logs/tr_uk_2002_runUntilConvergence_100_${type}.log +bash bin/graph/tr_run.sh uk_2002 runUntilConvergence 500 2>&1 | tee -a logs/tr_uk_2002_runUntilConvergence_500_${type}.log +bash bin/graph/tr_run.sh uk_2002 runUntilConvergence 1000 2>&1 | tee -a logs/tr_uk_2002_runUntilConvergence_1000_${type}.log +bash bin/graph/tr_run.sh arabic_2005 runUntilConvergence 100 2>&1 | tee -a logs/tr_arabic_2005_runUntilConvergence_100_${type}.log +bash bin/graph/tr_run.sh arabic_2005 runUntilConvergence 500 2>&1 | tee -a logs/tr_arabic_2005_runUntilConvergence_500_${type}.log +bash bin/graph/tr_run.sh arabic_2005 runUntilConvergence 1000 2>&1 | tee -a logs/tr_arabic_2005_runUntilConvergence_1000_${type}.log # wce -./bin/graph/wce_run.sh graph500_24 2>&1 | tee -a logs/wce_graph500_24_${type}.log -./bin/graph/wce_run.sh graph500_25 2>&1 | tee -a logs/wce_graph500_25_${type}.log -./bin/graph/wce_run.sh graph500_26 2>&1 | tee -a logs/wce_graph500_26_${type}.log +bash bin/graph/wce_run.sh graph500_24 2>&1 | tee -a logs/wce_graph500_24_${type}.log +bash bin/graph/wce_run.sh graph500_25 2>&1 | tee -a logs/wce_graph500_25_${type}.log +bash bin/graph/wce_run.sh graph500_26 2>&1 | tee -a logs/wce_graph500_26_${type}.log # wpr -./bin/graph/wpr_run.sh cage14 static ${is_raw} 2>&1 | tee -a logs/wpr_cage14_static_${type}.log -./bin/graph/wpr_run.sh GAP_road static ${is_raw} 2>&1 | tee -a logs/wpr_GAP_road_static_${type}.log -./bin/graph/wpr_run.sh GAP_twitter static ${is_raw} 2>&1 | tee -a logs/wpr_GAP_twitter_static_${type}.log +bash bin/graph/wpr_run.sh cage14 static ${is_raw} 2>&1 | tee -a logs/wpr_cage14_static_${type}.log +bash bin/graph/wpr_run.sh GAP_road static ${is_raw} 2>&1 | tee -a logs/wpr_GAP_road_static_${type}.log +bash bin/graph/wpr_run.sh GAP_twitter static ${is_raw} 2>&1 | tee -a logs/wpr_GAP_twitter_static_${type}.log -./bin/graph/wpr_run.sh cage14 convergence ${is_raw} 2>&1 | tee -a logs/wpr_cage14_convergence_${type}.log -./bin/graph/wpr_run.sh GAP_road convergence ${is_raw} 2>&1 | tee -a 
logs/wpr_GAP_road_convergence_${type}.log -./bin/graph/wpr_run.sh GAP_twitter convergence ${is_raw} 2>&1 | tee -a logs/wpr_GAP_twitter_convergence_${type}.log +bash bin/graph/wpr_run.sh cage14 convergence ${is_raw} 2>&1 | tee -a logs/wpr_cage14_convergence_${type}.log +bash bin/graph/wpr_run.sh GAP_road convergence ${is_raw} 2>&1 | tee -a logs/wpr_GAP_road_convergence_${type}.log +bash bin/graph/wpr_run.sh GAP_twitter convergence ${is_raw} 2>&1 | tee -a logs/wpr_GAP_twitter_convergence_${type}.log diff --git a/tools/kal-test/bin/ml/crf_run.sh b/tools/kal-test/bin/ml/crf_run.sh new file mode 100644 index 0000000000000000000000000000000000000000..72cd79bdf9d17f2eb951509ba686019dbefb3e1f --- /dev/null +++ b/tools/kal-test/bin/ml/crf_run.sh @@ -0,0 +1,128 @@ +#!/bin/bash +set -e + +function usage() { + echo "Usage: " + echo "1st argument: name of dataset: [cityu/msr/as]" + echo "2nd argument: optimization algorithm or raw: [no/yes]" + echo "3rd argument: Whether to Compare Results [no/yes]" +} + +case "$1" in +-h | --help | ?) + usage + exit 0 + ;; +esac + +if [ $# -ne 3 ]; then + usage + exit 0 +fi + +source conf/ml/crf/crf_spark.properties +dataset_name=$1 +is_raw=$2 +if_check=$3 +cpu_name=$(lscpu | grep Architecture | awk '{print $2}') +model_conf=${dataset_name}-${is_raw}-${if_check} + +# concatenate strings as a new variable +num_executors=${cpu_name}_${dataset_name}"_numExecutors" +executor_cores=${cpu_name}_${dataset_name}"_executorCores" +executor_memory=${cpu_name}_${dataset_name}"_executorMemory" +executor_extra_java_options=${cpu_name}_${dataset_name}"_extraJavaOptions" +driver_cores=${cpu_name}_${dataset_name}"_driverCores" +driver_memory=${cpu_name}_${dataset_name}"_driverMemory" + +num_executors_val=${!num_executors} +executor_cores_val=${!executor_cores} +executor_memory_val=${!executor_memory} +executor_extra_java_options_val=${!executor_extra_java_options} +driver_cores_val=${!driver_cores} +driver_memory_val=${!driver_memory} + +echo "master : ${master}" +echo "deployMode : ${deployMode}" +echo "${driver_cores} : ${driver_cores_val}" +echo "${driver_memory} : ${driver_memory_val}" +echo "${num_executors} : ${num_executors_val}" +echo "${executor_cores} : ${executor_cores_val}" +echo "${executor_memory} : ${executor_memory_val}" +echo "${executor_extra_java_options} : ${executor_extra_java_options_val}" +echo "cpu_name : ${cpu_name}" + +if [ ! ${num_executors_val} ] \ + || [ ! ${executor_cores_val} ] \ + || [ ! ${executor_memory_val} ] \ + || [ ! ${executor_extra_java_options_val} ] \ + || [ ! ${driver_cores_val} ] \ + || [ ! ${driver_memory_val} ] \ + || [ ! ${master} ] \ + || [ ! ${deployMode} ] \ + || [ !
${cpu_name} ]; then + echo "Some values are NULL, please confirm with the property files" + exit 0 +fi + +source conf/ml/ml_datasets.properties +spark_version=sparkVersion +spark_version_val=${!spark_version} +kal_version=kalVersion +kal_version_val=${!kal_version} +scala_version=scalaVersion +scala_version_val=${!scala_version} +save_resultPath=saveResultPath +save_resultPath_val=${!save_resultPath} +data_path=${!dataset_name} +echo "${dataset_name} : ${data_path}" + +echo "start to clean cache and sleep 30s" +ssh server1 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent1 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent2 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent3 "echo 3 > /proc/sys/vm/drop_caches" +sleep 30 + +echo "start to submit spark jobs --- CRF-${model_conf}" +if [ ${is_raw} == "no" ]; then + scp lib/fastutil-8.3.1.jar lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent1:/opt/ml_classpath/ + scp lib/fastutil-8.3.1.jar lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent2:/opt/ml_classpath/ + scp lib/fastutil-8.3.1.jar lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent3:/opt/ml_classpath/ + + spark-submit \ + --class com.bigdata.ml.CRFRunner \ + --deploy-mode ${deployMode} \ + --driver-cores ${driver_cores_val} \ + --driver-memory ${driver_memory_val} \ + --num-executors ${num_executors_val} \ + --executor-cores ${executor_cores_val} \ + --executor-memory ${executor_memory_val} \ + --master ${master} \ + --conf "spark.executor.extraJavaOptions=${executor_extra_java_options_val}" \ + --conf "spark.executor.instances=${num_executors_val}" \ + --conf "spark.driver.maxResultSize=256g" \ + --driver-java-options "-Xms15g" \ + --conf "spark.task.maxFailures=1" \ + --jars "lib/fastutil-8.3.1.jar,lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar,lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar,lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + --driver-class-path "lib/kal-test_${scala_version_val}-0.1.jar:lib/fastutil-8.3.1.jar:lib/snakeyaml-1.19.jar:lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + --conf "spark.executor.extraClassPath=/opt/ml_classpath/fastutil-8.3.1.jar:/opt/ml_classpath/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:/opt/ml_classpath/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:/opt/ml_classpath/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + ./lib/kal-test_${scala_version_val}-0.1.jar ${model_conf} ${data_path} ${cpu_name} 
${save_resultPath_val} | tee ./log/log +else + spark-submit \ + --class com.bigdata.ml.CRFRunner \ + --deploy-mode ${deployMode} \ + --driver-cores ${driver_cores_val} \ + --driver-memory ${driver_memory_val} \ + --num-executors ${num_executors_val} \ + --executor-cores ${executor_cores_val} \ + --executor-memory ${executor_memory_val} \ + --master ${master} \ + --driver-java-options "-Xms15g" \ + --conf "spark.executor.extraJavaOptions=${executor_extra_java_options_val}" \ + --conf "spark.executor.instances=${num_executors_val}" \ + --conf "spark.driver.maxResultSize=256g" \ + --driver-class-path "lib/snakeyaml-1.19.jar:lib/fastutil-8.3.1.jar" \ + --jars "lib/boostkit-ml-kernel-client_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar" \ + ./lib/kal-test_${scala_version_val}-0.1.jar ${model_conf} ${data_path} ${cpu_name} ${save_resultPath_val} | tee ./log/log +fi \ No newline at end of file diff --git a/tools/kal-test/bin/ml/dtb_run.sh b/tools/kal-test/bin/ml/dtb_run.sh index f18aa27202fefda4145fd4936cdd82b134325734..2a18ec5b44fbd8c444435b1ef600649b5a0b1047 100644 --- a/tools/kal-test/bin/ml/dtb_run.sh +++ b/tools/kal-test/bin/ml/dtb_run.sh @@ -142,7 +142,7 @@ else --conf "spark.executor.extraJavaOptions=${extra_java_options_val}" \ --conf "spark.executor.instances=${num_executors_val}" \ --conf "spark.taskmaxFailures=${max_failures_val}" \ - --jars "lib/snakeyaml-1.19.jar,lib/boostkit-ml-kernel-client-${scala_version_val}-${kal_version_val}-${spark_version_val}.jar" \ + --jars "lib/snakeyaml-1.19.jar,lib/boostkit-ml-kernel-client_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar" \ --driver-class-path "lib/kal-test_${scala_version_val}-0.1.jar:lib/fastutil-8.3.1.jar:lib/snakeyaml-1.19.jar" \ ./lib/kal-test_${scala_version_val}-0.1.jar ${model_conf} ${data_path_val} ${cpu_name} ${is_raw} ${spark_conf} | tee ./log/log fi diff --git a/tools/kal-test/bin/ml/encoder_run.sh b/tools/kal-test/bin/ml/encoder_run.sh index 3a99061284bbd6b58597475dcb74c4700f41be2e..6f78353383e85b0e8cec36ef8c777388096a2cb9 100644 --- a/tools/kal-test/bin/ml/encoder_run.sh +++ b/tools/kal-test/bin/ml/encoder_run.sh @@ -108,7 +108,7 @@ if [ ${is_raw} == "no" ]; then --conf spark.eventLog.enabled=true \ --conf spark.driver.maxResultSize=40g \ --conf spark.network.timeout=60s \ - --conf "spark.driver.extraJavaOptions=-Xss5g -Dlog4j.configuration=file:./log4j.properties" \ + --conf "spark.driver.extraJavaOptions=-Xss5g" \ --driver-java-options "-Xms15g" \ --driver-class-path "lib/fastutil-8.3.1.jar:lib/snakeyaml-1.19.jar:lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ --jars "lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar,lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar,lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ diff --git a/tools/kal-test/bin/ml/fm_run.sh b/tools/kal-test/bin/ml/fm_run.sh index 8ae5603ba396f0c803327f535882a2a92d245272..124f2d7e08a7e30e0f7f4719360de9d315aabea4 100644 --- a/tools/kal-test/bin/ml/fm_run.sh +++ b/tools/kal-test/bin/ml/fm_run.sh @@ -4,8 +4,8 @@ set -e function usage() { echo "Usage: " echo "1st argument: type of algorithm: [classification/regression]" - echo "2nd argument: name of dataset: 
[epsilon/higgs/avazu/kdda]" - echo "3rd argument: name of API: [fit/fit1/fit2/fit3]" + echo "2nd argument: name of dataset: [higgs/avazu]" + echo "3rd argument: name of API: [fit]" echo "4th argument: optimization algorithm or raw: [no/yes]" echo "5th argument: Whether to Compare Results [no/yes]" } diff --git a/tools/kal-test/bin/ml/hdb_run.sh b/tools/kal-test/bin/ml/hdb_run.sh index 644c72b6def8bca9f4c24600657af51db74a5e0a..3c2afc5657c9f1f9f0657218a0233926f1117535 100644 --- a/tools/kal-test/bin/ml/hdb_run.sh +++ b/tools/kal-test/bin/ml/hdb_run.sh @@ -105,8 +105,8 @@ if [ ${is_raw} == "no" ]; then --conf "spark.task.maxFailures=100" \ --conf "spark.executor.extraJavaOptions=${executor_extra_java_options_val}" \ --driver-java-options "-Dlog4j.configuration=file:./log4j.properties" \ - --driver-class-path "lib/fastutil-8.3.1.jar:lib/snakeyaml-1.19.jar:lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar:lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ - --conf "spark.executor.extraClassPath=/opt/ml_classpath/fastutil-8.3.1.jar:/opt/ml_classpath/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:/opt/ml_classpath/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:/opt/ml_classpath/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar:/opt/ml_classpath/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + --driver-class-path "lib/kal-test_${scala_version_val}-0.1.jar:lib/fastutil-8.3.1.jar:lib/snakeyaml-1.19.jar:lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar:lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + --conf "spark.executor.extraClassPath=/opt/ml_classpath/fastutil-8.3.1.jar:/opt/ml_classpath/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:/opt/ml_classpath/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:/opt/ml_classpath/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar:/opt/ml_classpath/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ ./lib/kal-test_${scala_version_val}-0.1.jar ${model_conf} ${data_path} ${cpu_name} ${save_resultPath_val} | tee ./log/log else spark-submit \ diff --git a/tools/kal-test/bin/ml/if_run.sh b/tools/kal-test/bin/ml/if_run.sh index 973ae805d705e21a503ea953b1c1576ff0e0e34e..85293094c8d2ce53e73308b1800f68dff7bb39a9 100644 --- a/tools/kal-test/bin/ml/if_run.sh +++ b/tools/kal-test/bin/ml/if_run.sh @@ -3,7 +3,7 @@ set -e function usage() { echo "Usage: " - echo "1rd argument: name of dataset: [if_40M_1k/if_1M_1k]" + echo "1rd argument: name of dataset: [if_40M_1K/if_1M_1K]" echo "2th argument: optimization algorithm or raw: [no/yes]" echo "3th argument: Whether to Compare Results [no/yes]" } @@ -98,7 +98,7 @@ if [ ${is_raw} == "no" ]; then --master ${master} \ --driver-java-options "-Xms15g 
-Dlog4j.configuration=file:./log4j.properties" \ --conf "spark.driver.maxResultSize=2g" \ - --conf "spark.sophon.isolationForest.parLevel=100" \ + --conf "spark.boostkit.isolationForest.parLevel=100" \ --jars "lib/isolation-forest_3.1.1_2.12-2.0.8.jar,lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar,lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar,lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ --driver-class-path "lib/isolation-forest_3.1.1_2.12-2.0.8.jar:lib/fastutil-8.3.1.jar:lib/snakeyaml-1.19.jar:lib/kal-test_${scala_version_val}-0.1.jar:lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ --conf "spark.executor.extraClassPath=/opt/ml_classpath/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:/opt/ml_classpath/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:/opt/ml_classpath/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ @@ -116,6 +116,6 @@ else --driver-java-options "-Xms15g -Dlog4j.configuration=file:./log4j.properties" \ --conf "spark.driver.maxResultSize=2g" \ --driver-class-path "lib/isolation-forest_3.1.1_2.12-2.0.8.jar:lib/fastutil-8.3.1.jar:lib/snakeyaml-1.19.jar" \ - --jars "lib/isolation-forest_3.1.1_2.12-2.0.8.jar,lib/boostkit-ml-kernel-client-${scala_version_val}-${kal_version_val}-${spark_version_val}.jar" \ + --jars "lib/isolation-forest_3.1.1_2.12-2.0.8.jar,lib/boostkit-ml-kernel-client_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar" \ ./lib/kal-test_${scala_version_val}-0.1.jar ${model_conf} ${data_path} ${cpu_name} ${save_resultPath_val} | tee ./log/log fi \ No newline at end of file diff --git a/tools/kal-test/bin/ml/lgbm_run.sh b/tools/kal-test/bin/ml/lgbm_run.sh index b9e86227477a8646415ca619e6eb7b053c733c2e..813c8dfe620709eba01aa083ab451999ce0f04d0 100644 --- a/tools/kal-test/bin/ml/lgbm_run.sh +++ b/tools/kal-test/bin/ml/lgbm_run.sh @@ -4,7 +4,7 @@ set -e function usage() { echo "Usage: " echo "1st argument: type of algorithm: [classification/regression]" - echo "2nd argument: name of dataset:mnist8m, higgs,criteo " + echo "2nd argument: name of dataset:mnist8m, higgs " echo "3rd argument: optimization algorithm or raw: [no/yes]" echo "4th argument: Whether to Compare Results [no/yes]" } @@ -93,9 +93,9 @@ sleep 30 echo "start to submit spark jobs --- lgbm-${model_conf}" if [ ${is_raw} == "no" ]; then - scp lib/lightgbmlib.jar lib/mmlspark_${scala_version_val}_spark3.1.2-0.0.0+79-09152193.jar lib/boostkit-lightgbm-kernel_${scala_version_val}-1.3.0.jar lib/fastutil-8.3.1.jar root@agent1:/opt/ml_classpath/ - scp lib/lightgbmlib.jar lib/mmlspark_${scala_version_val}_spark3.1.2-0.0.0+79-09152193.jar lib/boostkit-lightgbm-kernel_${scala_version_val}-1.3.0.jar lib/fastutil-8.3.1.jar root@agent2:/opt/ml_classpath/ - scp lib/lightgbmlib.jar lib/mmlspark_${scala_version_val}_spark3.1.2-0.0.0+79-09152193.jar lib/boostkit-lightgbm-kernel_${scala_version_val}-1.3.0.jar lib/fastutil-8.3.1.jar root@agent3:/opt/ml_classpath/ + scp lib/boostkit-lightgbmlib-${kal_version_val}.jar lib/boostkit-mmlspark-${scala_version_val}-${spark_version_val}-${kal_version_val}.jar 
lib/boostkit-lightgbm-kernel-${scala_version_val}-${spark_version_val}-${kal_version_val}.jar lib/fastutil-8.3.1.jar root@agent1:/opt/ml_classpath/ + scp lib/boostkit-lightgbmlib-${kal_version_val}.jar lib/boostkit-mmlspark-${scala_version_val}-${spark_version_val}-${kal_version_val}.jar lib/boostkit-lightgbm-kernel-${scala_version_val}-${spark_version_val}-${kal_version_val}.jar lib/fastutil-8.3.1.jar root@agent2:/opt/ml_classpath/ + scp lib/boostkit-lightgbmlib-${kal_version_val}.jar lib/boostkit-mmlspark-${scala_version_val}-${spark_version_val}-${kal_version_val}.jar lib/boostkit-lightgbm-kernel-${scala_version_val}-${spark_version_val}-${kal_version_val}.jar lib/fastutil-8.3.1.jar root@agent3:/opt/ml_classpath/ spark-submit \ --class com.bigdata.ml.LightGBMRunner \ @@ -107,15 +107,19 @@ if [ ${is_raw} == "no" ]; then --executor-memory ${executor_memory_val} \ --conf "spark.executor.memoryOverhead=${executor_memory_overhead_val}" \ --master ${master} \ - --files=lib/lib_lightgbm_close.so \ + --files=lib/libboostkit_lightgbm_close.so \ --conf "spark.executor.extraJavaOptions=${executor_extra_java_options_val}" \ - --jars "lib/lightgbmlib.jar,lib/fastutil-8.3.1.jar,lib/mmlspark_${scala_version_val}_spark3.1.2-0.0.0+79-09152193.jar,lib/boostkit-lightgbm-kernel_${scala_version_val}-1.3.0.jar" \ - --driver-class-path "lib/lightgbmlib.jar:lib/mmlspark_${scala_version_val}_spark3.1.2-0.0.0+79-09152193.jar:lib/boostkit-lightgbm-kernel_${scala_version_val}-1.3.0.jar:lib/kal-test_${scala_version_val}-0.1.jar:lib/fastutil-8.3.1.jar:lib/snakeyaml-1.19.jar" \ - --conf "spark.executor.extraClassPath=/opt/ml_classpath/lightgbmlib.jar:/opt/ml_classpath/mmlspark_${scala_version_val}_spark3.1.2-0.0.0+79-09152193.jar:/opt/ml_classpath/boostkit-lightgbm-kernel_${scala_version_val}-1.3.0.jar:/opt/ml_classpath/fastutil-8.3.1.jar" \ + --jars "lib/boostkit-lightgbmlib-${kal_version_val}.jar,lib/fastutil-8.3.1.jar,lib/boostkit-mmlspark-${scala_version_val}-${spark_version_val}-${kal_version_val}.jar,lib/boostkit-lightgbm-kernel-${scala_version_val}-${spark_version_val}-${kal_version_val}.jar" \ + --driver-class-path "lib/boostkit-lightgbmlib-${kal_version_val}.jar:lib/boostkit-mmlspark-${scala_version_val}-${spark_version_val}-${kal_version_val}.jar:lib/boostkit-lightgbm-kernel-${scala_version_val}-${spark_version_val}-${kal_version_val}.jar:lib/kal-test_${scala_version_val}-0.1.jar:lib/fastutil-8.3.1.jar:lib/snakeyaml-1.19.jar" \ + --conf "spark.executor.extraClassPath=/opt/ml_classpath/boostkit-lightgbmlib-${kal_version_val}.jar:/opt/ml_classpath/boostkit-mmlspark-${scala_version_val}-${spark_version_val}-${kal_version_val}.jar:/opt/ml_classpath/boostkit-lightgbm-kernel-${scala_version_val}-${spark_version_val}-${kal_version_val}.jar:/opt/ml_classpath/fastutil-8.3.1.jar" \ ./lib/kal-test_${scala_version_val}-0.1.jar ${model_conf} ${data_path} ${cpu_name} ${save_resultPath_val} | tee ./log/log else + scp lib/boostkit-lightgbmlib-${kal_version_val}.jar lib/mmlspark_2.12_spark3.1.2-0.0.0+79-09152193.jar lib/fastutil-8.3.1.jar root@agent1:/opt/ml_classpath/ + scp lib/boostkit-lightgbmlib-${kal_version_val}.jar lib/mmlspark_2.12_spark3.1.2-0.0.0+79-09152193.jar lib/fastutil-8.3.1.jar root@agent2:/opt/ml_classpath/ + scp lib/boostkit-lightgbmlib-${kal_version_val}.jar lib/mmlspark_2.12_spark3.1.2-0.0.0+79-09152193.jar lib/fastutil-8.3.1.jar root@agent3:/opt/ml_classpath/ + spark-submit \ - --class com.bigdata.ml.LightGBMRunner \ + --class com.bigdata.ml.LightGBMRawRunner \ --deploy-mode ${deployMode} \ 
--driver-cores ${driver_cores_val} \ --driver-memory ${driver_memory_val} \ @@ -123,9 +127,9 @@ else --executor-cores ${executor_cores_val} \ --executor-memory ${executor_memory_val} \ --master ${master} \ - --jars "lib/lightgbmlib.jar,lib/snakeyaml-1.19.jar,lib/fastutil-8.3.1.jar,lib/mmlspark_${scala_version_val}_spark3.1.2-0.0.0+79-09152193.jar" \ + --jars "lib/lightgbmlib.jar,lib/snakeyaml-1.19.jar,lib/fastutil-8.3.1.jar,lib/mmlspark_2.12_spark3.1.2-0.0.0+79-09152193.jar" \ --conf "spark.executor.extraJavaOptions=${executor_extra_java_options_val}" \ - --driver-class-path "lib/lightgbmlib.jar,lib/snakeyaml-1.19.jar,lib/mmlspark_${scala_version_val}_spark3.1.2-0.0.0+79-09152193.jar" \ - --conf "spark.executor.extraClassPath=/opt/ml_classpath/lightgbmlib.jar:/opt/ml_classpath/mmlspark_${scala_version_val}_spark3.1.2-0.0.0+79-09152193.jar:/opt/ml_classpath/fastutil-8.3.1.jar" \ + --driver-class-path "lib/lightgbmlib.jar,lib/snakeyaml-1.19.jar,lib/mmlspark_2.12_spark3.1.2-0.0.0+79-09152193.jar" \ + --conf "spark.executor.extraClassPath=/opt/ml_classpath/lightgbmlib.jar:/opt/ml_classpath/mmlspark_2.12_spark3.1.2-0.0.0+79-09152193.jar:/opt/ml_classpath/fastutil-8.3.1.jar" \ ./lib/kal-test_${scala_version_val}-0.1.jar ${model_conf} ${data_path} ${cpu_name} ${save_resultPath_val} | tee ./log/log fi \ No newline at end of file diff --git a/tools/kal-test/bin/ml/nmf_run.sh b/tools/kal-test/bin/ml/nmf_run.sh index 6fe737663a9d0b454a01743be0e99adace817a26..84dab959437613cc8bf637c4b47c1cd30350b5ef 100644 --- a/tools/kal-test/bin/ml/nmf_run.sh +++ b/tools/kal-test/bin/ml/nmf_run.sh @@ -123,6 +123,6 @@ else --master ${master} \ --conf "spark.executor.extraJavaOptions=${executor_extra_java_options_val} -Xss512m" \ --driver-class-path "lib/snakeyaml-1.19.jar" \ - --jars "lib/boostkit-ml-kernel-client-${scala_version_val}-${kal_version_val}-${spark_version_val}.jar" \ + --jars "lib/boostkit-ml-kernel-client_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar" \ ./lib/kal-test_${scala_version_val}-0.1.jar ${model_conf} ${data_path} ${cpu_name} ${save_resultPath_val} | tee ./log/log fi \ No newline at end of file diff --git a/tools/kal-test/bin/ml/spca_run.sh b/tools/kal-test/bin/ml/spca_run.sh index c849fd1eb932fbdb3d601e89c80b3af91eb0b5ef..7f3e2fdb0c1fa6b97cc8db15782a1e039b7c8e86 100644 --- a/tools/kal-test/bin/ml/spca_run.sh +++ b/tools/kal-test/bin/ml/spca_run.sh @@ -137,7 +137,7 @@ else --executor-cores ${executor_cores_val} \ --executor-memory ${executor_memory_val} \ --master ${master_val} \ - --jars "lib/snakeyaml-1.19.jar,lib/boostkit-ml-kernel-client-${scala_version_val}-${kal_version_val}-${spark_version_val}.jar" \ + --jars "lib/snakeyaml-1.19.jar,lib/boostkit-ml-kernel-client_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar" \ --conf "spark.executor.extraJavaOptions=${extra_java_options_val}" \ --conf "spark.executor.instances=${num_executors_val}" \ --conf "spark.executor.memory_overhead=${memory_overhead_val}" \ diff --git a/tools/kal-test/bin/ml_workflow.sh b/tools/kal-test/bin/ml_workflow.sh index a5282a70563d2835b8fa3f78ec1d2d8a1b20cd99..bcd62ab1783b410640b36a0266aad63d63fb6310 100644 --- a/tools/kal-test/bin/ml_workflow.sh +++ b/tools/kal-test/bin/ml_workflow.sh @@ -69,6 +69,11 @@ bash bin/ml/cov_run.sh CP10M1K ${is_raw} ${if_check} 2>&1 | tee -a logs/cov_CP10 bash bin/ml/cov_run.sh CP2M5K ${is_raw} ${if_check} 2>&1 | tee -a logs/cov_CP2M5K_${type}.log bash bin/ml/cov_run.sh CP1M10K ${is_raw} ${if_check} 2>&1 | tee -a logs/cov_CP1M10K_${type}.log +#CRF
+bash bin/ml/crf_run.sh cityu ${is_raw} ${if_check} 2>&1 | tee -a logs/crf_cityu_${type}.log +bash bin/ml/crf_run.sh msr ${is_raw} ${if_check} 2>&1 | tee -a logs/crf_msr_${type}.log +bash bin/ml/crf_run.sh as ${is_raw} ${if_check} 2>&1 | tee -a logs/crf_as_${type}.log + #DBSCAN bash bin/ml/dbscan_run.sh bremenSmall ${is_raw} 2>&1 | tee -a logs/dbscan_bremenSmall_${type}.log bash bin/ml/dbscan_run.sh farm ${is_raw} 2>&1 | tee -a logs/dbscan_farm_${type}.log @@ -121,14 +126,10 @@ bash bin/ml/encoder_run.sh encoder_400m ${is_raw} ${if_check} 2>&1 | tee -a logs bash bin/ml/encoder_run.sh encoder_800m ${is_raw} ${if_check} 2>&1 | tee -a logs/encoder_encoder_800m_${type}.log #fm -bash bin/ml/fm_run.sh classification epsilon fit ${is_raw} ${if_check} 2>&1 | tee -a logs/fmc_epsilon_${type}.log -bash bin/ml/fm_run.sh regression epsilon fit ${is_raw} ${if_check} 2>&1 | tee -a logs/fmr_epsilon_${type}.log bash bin/ml/fm_run.sh classification higgs fit ${is_raw} ${if_check} 2>&1 | tee -a logs/fmc_higgs_${type}.log bash bin/ml/fm_run.sh regression higgs fit ${is_raw} ${if_check} 2>&1 | tee -a logs/fmr_higgs_${type}.log bash bin/ml/fm_run.sh classification avazu fit ${is_raw} ${if_check} 2>&1 | tee -a logs/fmc_avazu_${type}.log bash bin/ml/fm_run.sh regression avazu fit ${is_raw} ${if_check} 2>&1 | tee -a logs/fmr_avazu_${type}.log -bash bin/ml/fm_run.sh classification kdda fit ${is_raw} ${if_check} 2>&1 | tee -a logs/fmc_kdda_${type}.log -bash bin/ml/fm_run.sh regression kdda fit ${is_raw} ${if_check} 2>&1 | tee -a logs/fmr_kdda_${type}.log #fpg bash bin/ml/fpg_run.sh Kosarak ${is_raw} ${if_check} 2>&1 | tee -a logs/fpg_Kosarak_${type}.log @@ -175,8 +176,8 @@ bash bin/ml/idf_run.sh D10m200m ${is_raw} ${if_check} 2>&1 | tee -a logs/idf_D10 bash bin/ml/idf_run.sh D2g250m ${is_raw} ${if_check} 2>&1 | tee -a logs/idf_D2g250m_${type}.log #if -bash bin/ml/if_run.sh if_40M_1k ${is_raw} ${if_check} 2>&1 | tee -a logs/if_if_40M_1k_${type}.log -bash bin/ml/if_run.sh if_1M_1k ${is_raw} ${if_check} 2>&1 | tee -a logs/if_if_1M_1k_${type}.log +bash bin/ml/if_run.sh if_40M_1K ${is_raw} ${if_check} 2>&1 | tee -a logs/if_if_40M_1K_${type}.log +bash bin/ml/if_run.sh if_1M_1K ${is_raw} ${if_check} 2>&1 | tee -a logs/if_if_1M_1K_${type}.log # KMEANS bash bin/ml/kmeans_run.sh dataframe D1200M20 fit ${is_raw} ${if_check} 2>&1 | tee -a logs/kmeans_D1200M20_fit_${type}.log diff --git a/tools/kal-test/bin/preprocess/graph/mce_data_process.sh b/tools/kal-test/bin/preprocess/graph/mce_data_process.sh new file mode 100644 index 0000000000000000000000000000000000000000..d53f6774581dcaff69183243eff77f99fc2e3b6d --- /dev/null +++ b/tools/kal-test/bin/preprocess/graph/mce_data_process.sh @@ -0,0 +1,36 @@ +#!/bin/bash +set -e + +case "$1" in +-h | --help | ?) 
+ echo "Usage: " + echo "1st argument: inputPath of dataset" + echo "2nd argument: outPath of dataset" + exit 0 + ;; +esac + +if [ $# -ne 2 ]; then + echo "please input 2 arguments: " + echo "1st argument: inputPath of dataset" + echo "2nd argument: outPath of dataset" + exit 0 +fi + +inputPath=$1 +outPath=$2 +hadoop fs -mkdir -p ${outPath} +hadoop fs -rm -r ${outPath} + +source conf/graph/graph_datasets.properties +scala_version=scalaVersion +scala_version_val=${!scala_version} + +spark-submit \ +--class com.bigdata.preprocess.graph.MceRawDataProcess \ +--master yarn \ +--num-executors 39 \ +--executor-memory 23g \ +--executor-cores 7 \ +--driver-memory 80g \ +./lib/kal-test_${scala_version_val}-0.1.jar ${inputPath} ${outPath} | tee ./log/log \ No newline at end of file diff --git a/tools/kal-test/bin/preprocess/ml/als_data_gen.sh b/tools/kal-test/bin/preprocess/ml/als_data_gen.sh new file mode 100644 index 0000000000000000000000000000000000000000..f59459fe40283925c9249835460e34635f71ca94 --- /dev/null +++ b/tools/kal-test/bin/preprocess/ml/als_data_gen.sh @@ -0,0 +1,50 @@ +saveDataPath=/tmp/ml/dataset/ALS + +source conf/ml/ml_datasets.properties +spark_version=sparkVersion +spark_version_val=${!spark_version} +scala_version=scalaVersion +scala_version_val=${!scala_version} + +ALS_Path=${saveDataPath}/${spark_version_val}"/als" +ALSbs_Path=${saveDataPath}/${spark_version_val}"/als_bs" +ALSh_Path=${saveDataPath}/${spark_version_val}"/als_h" + +spark-submit \ +--class com.bigdata.preprocess.ml.ALSDataGenRun \ +--master yarn \ +--deploy-mode client \ +--driver-cores 36 \ +--driver-memory 50g \ +--num-executors 12 \ +--executor-cores 23 \ +--executor-memory 79g \ +--conf "spark.executor.extraJavaOptions=-Xms20g -Xss5g" \ +./lib/kal-test_${scala_version_val}-0.1.jar \ +${ALS_Path} 50000 60000 0.05 true + +spark-submit \ +--class com.bigdata.preprocess.ml.ALSDataGenRun \ +--master yarn \ +--deploy-mode client \ +--driver-cores 36 \ +--driver-memory 50g \ +--num-executors 12 \ +--executor-cores 23 \ +--executor-memory 79g \ +--conf "spark.executor.extraJavaOptions=-Xms20g -Xss5g" \ +./lib/kal-test_${scala_version_val}-0.1.jar \ +${ALSbs_Path} 100000 120000 0.02 true + +spark-submit \ +--class com.bigdata.preprocess.ml.ALSDataGenRun \ +--master yarn \ +--deploy-mode client \ +--driver-cores 36 \ +--driver-memory 50g \ +--num-executors 12 \ +--executor-cores 23 \ +--executor-memory 79g \ +--conf "spark.executor.extraJavaOptions=-Xms20g -Xss5g" \ +./lib/kal-test_${scala_version_val}-0.1.jar \ +${ALSh_Path} 25000 30000 0.05 true \ No newline at end of file diff --git a/tools/kal-test/conf/graph/bfs/bfs_spark.properties b/tools/kal-test/conf/graph/bfs/bfs_spark.properties index 511ce9a1438c03f02aced4fdb6411baf24df1d19..9de6eda0eb56c318376ce783d060d5cc0c2d21c3 100644 --- a/tools/kal-test/conf/graph/bfs/bfs_spark.properties +++ b/tools/kal-test/conf/graph/bfs/bfs_spark.properties @@ -72,7 +72,7 @@ arabic_2005_split="\t" arabic_2005_q=true graph500_22_split=" " graph500_22_q=false -graph500_23_split="," +graph500_23_split=" " graph500_23_q=false graph500_25_split=" " graph500_25_q=false \ No newline at end of file diff --git a/tools/kal-test/conf/graph/graph_datasets.properties b/tools/kal-test/conf/graph/graph_datasets.properties index ebbbc81834445a7e3c49b419a5bf2541a2c78899..81c1e0e2830ec68f3e5c405feb2ba0216e4f7dec 100644 --- a/tools/kal-test/conf/graph/graph_datasets.properties +++ b/tools/kal-test/conf/graph/graph_datasets.properties @@ -2,7 +2,7 @@ sparkVersion=spark3.1.1 # kalVersion 
-kalVersion=2.2.0 +kalVersion=3.0.0 # scalaVersion scalaVersion=2.12 @@ -15,6 +15,8 @@ graph500_24=hdfs:///tmp/graph/dataset/graph500-24.e graph500_25=hdfs:///tmp/graph/dataset/graph500-25.e graph500_26=hdfs:///tmp/graph/dataset/graph500-26.e liveJournal=hdfs:///tmp/graph/dataset/com-lj.ungraph.txt +comlj=hdfs:///tmp/graph/dataset/com-lj.ungraph.txt +orkut=hdfs:///tmp/graph/dataset/com-orkut.ungraph.txt mycielskian20=hdfs:///tmp/graph/dataset/mycielskian20.mtx gap_kron=hdfs:///tmp/graph/dataset/GAP-kron.mtx com_friendster=hdfs:///tmp/graph/dataset/com-Friendster.mtx diff --git a/tools/kal-test/conf/graph/modularity/modularity_spark.properties b/tools/kal-test/conf/graph/modularity/modularity_spark.properties index bad2d1c24f5ce0e04bc59fddb997972bbc6fb8f8..2782e3cf37403af47a6281fa3e696ee025d9e1a4 100644 --- a/tools/kal-test/conf/graph/modularity/modularity_spark.properties +++ b/tools/kal-test/conf/graph/modularity/modularity_spark.properties @@ -15,10 +15,10 @@ graph500_26_executorCores_aarch64=3 graph500_26_executorMemory_aarch64=11G graph500_26_extraJavaOptions_aarch64=-Xms11g -uk_numExectuors_aarch64=47 -uk_executorCores_aarch64=6 -uk_executorMemory_aarch64=19G -uk_extraJavaOptions_aarch64=-Xms19g +uk_2002_numExectuors_aarch64=47 +uk_2002_executorCores_aarch64=6 +uk_2002_executorMemory_aarch64=19G +uk_2002_extraJavaOptions_aarch64=-Xms19g arabic_2005_numExectuors_aarch64=47 arabic_2005_executorCores_aarch64=6 @@ -30,10 +30,10 @@ twitter_executorCores_aarch64=3 twitter_executorMemory_aarch64=9G twitter_extraJavaOptions_aarch64=-Xms9g -uk_numExectuors_x86_64=33 -uk_executorCores_x86_64=7 -uk_executorMemory_x86_64=28G -uk_extraJavaOptions_x86_64=-Xms28g +uk_2002_numExectuors_x86_64=33 +uk_2002_executorCores_x86_64=7 +uk_2002_executorMemory_x86_64=28G +uk_2002_extraJavaOptions_x86_64=-Xms28g arabic_2005_numExectuors_x86_64=47 arabic_2005_executorCores_x86_64=5 diff --git a/tools/kal-test/conf/graph/slpa/slpa.yml b/tools/kal-test/conf/graph/slpa/slpa.yml new file mode 100644 index 0000000000000000000000000000000000000000..4af010ca15af35a468dcd7fe681305512a7fd46c --- /dev/null +++ b/tools/kal-test/conf/graph/slpa/slpa.yml @@ -0,0 +1,35 @@ +slpa: + opt: + comlj: + splitGraph: "\t" + partitions: 284 + iterNum: 100 + threshold: 0.2 + isWeighted: false + isDirected: false + + orkut: + splitGraph: "\t" + partitions: 284 + iterNum: 100 + threshold: 0.2 + isWeighted: false + isDirected: false + + uk_2002: + splitGraph: "\t" + partitions: 284 + iterNum: 100 + threshold: 0.2 + isWeighted: false + isDirected: true + + arabic_2005: + splitGraph: "\t" + partitions: 284 + iterNum: 100 + threshold: 0.2 + isWeighted: false + isDirected: true + + diff --git a/tools/kal-test/conf/graph/slpa/slpa_spark.properties b/tools/kal-test/conf/graph/slpa/slpa_spark.properties new file mode 100644 index 0000000000000000000000000000000000000000..874d03b3f47ab606fbb298369116ef29b674d303 --- /dev/null +++ b/tools/kal-test/conf/graph/slpa/slpa_spark.properties @@ -0,0 +1,40 @@ +deployMode=client +numExectuors_comlj_aarch64=71 +executorCores_comlj_aarch64=4 +executorMemory_comlj_aarch64=12G +extraJavaOptions_comlj_aarch64=-Xms12G + +numExectuors_comlj_x86_64=71 +executorCores_comlj_x86_64=4 +executorMemory_comlj_x86_64=12G +extraJavaOptions_comlj_x86_64=-Xms12G + +numExectuors_orkut_aarch64=71 +executorCores_orkut_aarch64=4 +executorMemory_orkut_aarch64=12G +extraJavaOptions_orkut_aarch64=-Xms12G + +numExectuors_orkut_x86_64=71 +executorCores_orkut_x86_64=4 +executorMemory_orkut_x86_64=12G 
+extraJavaOptions_orkut_x86_64=-Xms12G + +numExectuors_uk_2002_aarch64=71 +executorCores_uk_2002_aarch64=4 +executorMemory_uk_2002_aarch64=12G +extraJavaOptions_uk_2002_aarch64=-Xms12G + +numExectuors_uk_2002_x86_64=71 +executorCores_uk_2002_x86_64=4 +executorMemory_uk_2002_x86_64=12G +extraJavaOptions_uk_2002_x86_64=-Xms12G + +numExectuors_arabic_2005_aarch64=71 +executorCores_arabic_2005_aarch64=4 +executorMemory_arabic_2005_aarch64=12G +extraJavaOptions_arabic_2005_aarch64=-Xms12G + +numExectuors_arabic_2005_x86_64=71 +executorCores_arabic_2005_x86_64=4 +executorMemory_arabic_2005_x86_64=12G +extraJavaOptions_arabic_2005_x86_64=-Xms12G diff --git a/tools/kal-test/conf/ml/crf/crf.yml b/tools/kal-test/conf/ml/crf/crf.yml new file mode 100644 index 0000000000000000000000000000000000000000..d20dc77d5f6d1894efc7f4d325ca57c65cb861ac --- /dev/null +++ b/tools/kal-test/conf/ml/crf/crf.yml @@ -0,0 +1,67 @@ +#crf model params + +crf: + opt: + cityu: + pt: 30 + maxIter: 300 + regParam: 0.01 + freq: 15 + compLevel: 1 + nThread: 6 + tol: 0.001 + calAcc: true + templatePath: "/tmp/ml/dataset/CRF/template" + msr: + pt: 60 + maxIter: 300 + regParam: 0.01 + freq: 20 + compLevel: 1 + nThread: 3 + tol: 0.001 + calAcc: true + templatePath: "/tmp/ml/dataset/CRF/template" + as: + pt: 3 + maxIter: 300 + regParam: 0.01 + freq: 40 + compLevel: 1 + nThread: 62 + tol: 0.001 + calAcc: true + templatePath: "/tmp/ml/dataset/CRF/template" + + + raw: + cityu: + pt: 35 + maxIter: 300 + regParam: 0.01 + freq: 15 + tol: 0.001 + calAcc: true + templatePath: "/tmp/ml/dataset/CRF/template" + compLevel: + nThread: + msr: + pt: 35 + maxIter: 300 + regParam: 0.01 + freq: 20 + tol: 0.001 + calAcc: true + templatePath: "/tmp/ml/dataset/CRF/template" + compLevel: + nThread: + as: + pt: 70 + maxIter: 300 + regParam: 0.01 + freq: 40 + tol: 0.001 + calAcc: true + templatePath: "/tmp/ml/dataset/CRF/template" + compLevel: + nThread: diff --git a/tools/kal-test/conf/ml/crf/crf_spark.properties b/tools/kal-test/conf/ml/crf/crf_spark.properties new file mode 100644 index 0000000000000000000000000000000000000000..817a91de7f72cd375ec360c319ab064807c98062 --- /dev/null +++ b/tools/kal-test/conf/ml/crf/crf_spark.properties @@ -0,0 +1,43 @@ +master=yarn +deployMode=client +aarch64_cityu_driverCores=36 +aarch64_cityu_driverMemory=50G +aarch64_cityu_numExecutors=30 +aarch64_cityu_executorCores=6 +aarch64_cityu_executorMemory=30G +aarch64_cityu_extraJavaOptions=-Xms30g + +aarch64_msr_driverCores=36 +aarch64_msr_driverMemory=50G +aarch64_msr_numExecutors=60 +aarch64_msr_executorCores=3 +aarch64_msr_executorMemory=15G +aarch64_msr_extraJavaOptions=-Xms15g + +aarch64_as_driverCores=36 +aarch64_as_driverMemory=50G +aarch64_as_numExecutors=3 +aarch64_as_executorCores=62 +aarch64_as_executorMemory=300G +aarch64_as_extraJavaOptions=-Xms300g + +x86_64_cityu_driverCores=36 +x86_64_cityu_driverMemory=50G +x86_64_cityu_numExecutors=35 +x86_64_cityu_executorCores=8 +x86_64_cityu_executorMemory=26G +x86_64_cityu_extraJavaOptions=-Xms26g + +x86_64_msr_driverCores=36 +x86_64_msr_driverMemory=50G +x86_64_msr_numExecutors=35 +x86_64_msr_executorCores=8 +x86_64_msr_executorMemory=26G +x86_64_msr_extraJavaOptions=-Xms26g + +x86_64_as_driverCores=36 +x86_64_as_driverMemory=50G +x86_64_as_numExecutors=35 +x86_64_as_executorCores=8 +x86_64_as_executorMemory=26G +x86_64_as_extraJavaOptions=-Xms26g \ No newline at end of file diff --git a/tools/kal-test/conf/ml/dt/dt.yml b/tools/kal-test/conf/ml/dt/dt.yml index 
acbb91488356d50d0f7325c6003dca61ebb2ff17..684463f877a5bb176ffad2ae9aa0b9df8ba24585 100644 --- a/tools/kal-test/conf/ml/dt/dt.yml +++ b/tools/kal-test/conf/ml/dt/dt.yml @@ -6,8 +6,8 @@ dt: dataframe: higgs: genericPt: 300 - maxMemoryInMB: 5120 - pt: 36 + maxMemoryInMB: 3072 + pt: 22 numCopiesInput: 7 maxDepth: 17 maxBins: 512 @@ -17,11 +17,11 @@ dt: bcVariables: false featuresType: "array" copyStrategy: "normal" - useDFCollPtner: "true" + useDFCollPtner: "false" mnist8m: genericPt: 300 maxMemoryInMB: 4096 - pt: 36 + pt: 22 numCopiesInput: 7 maxDepth: 14 maxBins: 128 @@ -31,11 +31,11 @@ dt: bcVariables: false featuresType: "array" copyStrategy: "normal" - useDFCollPtner: "true" + useDFCollPtner: "false" epsilon: genericPt: 300 maxMemoryInMB: 4096 - pt: 36 + pt: 22 numCopiesInput: 7 maxDepth: 14 maxBins: 128 @@ -45,12 +45,12 @@ dt: bcVariables: false featuresType: "array" copyStrategy: "normal" - useDFCollPtner: "true" + useDFCollPtner: "false" rdd: higgs: genericPt: 300 - maxMemoryInMB: 5120 - pt: 36 + maxMemoryInMB: 3072 + pt: 22 numCopiesInput: 7 maxDepth: 17 maxBins: 512 @@ -60,11 +60,11 @@ dt: bcVariables: false featuresType: "array" copyStrategy: "normal" - useDFCollPtner: "true" + useDFCollPtner: "false" mnist8m: genericPt: 300 maxMemoryInMB: 4096 - pt: 36 + pt: 22 numCopiesInput: 7 maxDepth: 14 maxBins: 128 @@ -74,11 +74,11 @@ dt: bcVariables: false featuresType: "array" copyStrategy: "normal" - useDFCollPtner: "true" + useDFCollPtner: "false" epsilon: genericPt: 300 maxMemoryInMB: 4096 - pt: 36 + pt: 22 numCopiesInput: 7 maxDepth: 14 maxBins: 128 @@ -88,13 +88,13 @@ dt: bcVariables: false featuresType: "array" copyStrategy: "normal" - useDFCollPtner: "true" + useDFCollPtner: "false" regression: dataframe: higgs: genericPt: 300 - maxMemoryInMB: 4096 - pt: 36 + maxMemoryInMB: 3072 + pt: 22 numCopiesInput: 7 maxDepth: 17 maxBins: 512 @@ -104,11 +104,11 @@ dt: bcVariables: false featuresType: "array" copyStrategy: "normal" - useDFCollPtner: "true" + useDFCollPtner: "false" mnist8m: genericPt: 300 maxMemoryInMB: 4096 - pt: 36 + pt: 22 numCopiesInput: 7 maxDepth: 15 maxBins: 128 @@ -118,11 +118,11 @@ dt: bcVariables: false featuresType: "array" copyStrategy: "normal" - useDFCollPtner: "true" + useDFCollPtner: "false" epsilon: genericPt: 300 maxMemoryInMB: 4096 - pt: 36 + pt: 22 numCopiesInput: 7 maxDepth: 13 maxBins: 128 @@ -132,12 +132,12 @@ dt: bcVariables: false featuresType: "array" copyStrategy: "normal" - useDFCollPtner: "true" + useDFCollPtner: "false" rdd: higgs: genericPt: 300 - maxMemoryInMB: 4096 - pt: 36 + maxMemoryInMB: 3072 + pt: 22 numCopiesInput: 7 maxDepth: 17 maxBins: 512 @@ -147,11 +147,11 @@ dt: bcVariables: false featuresType: "array" copyStrategy: "normal" - useDFCollPtner: "true" + useDFCollPtner: "false" mnist8m: genericPt: 300 maxMemoryInMB: 4096 - pt: 36 + pt: 22 numCopiesInput: 7 maxDepth: 15 maxBins: 128 @@ -161,11 +161,11 @@ dt: bcVariables: false featuresType: "array" copyStrategy: "normal" - useDFCollPtner: "true" + useDFCollPtner: "false" epsilon: genericPt: 300 maxMemoryInMB: 4096 - pt: 36 + pt: 22 numCopiesInput: 7 maxDepth: 13 maxBins: 128 @@ -175,7 +175,7 @@ dt: bcVariables: false featuresType: "array" copyStrategy: "normal" - useDFCollPtner: "true" + useDFCollPtner: "false" raw: classification: diff --git a/tools/kal-test/conf/ml/dt/dt_spark.properties b/tools/kal-test/conf/ml/dt/dt_spark.properties index aeb84524c3b13bc3965722d09feaefbe63c9ee5a..bc3e05a0d738ddcf0d6256c63a1c75f4e8bcb2bd 100644 --- a/tools/kal-test/conf/ml/dt/dt_spark.properties +++ 
b/tools/kal-test/conf/ml/dt/dt_spark.properties @@ -9,35 +9,35 @@ maxFailures=1 compress=false # arm -aarch64_classification_higgs_numExectuors=12 -aarch64_classification_higgs_executorCores=23 -aarch64_classification_higgs_executorMemory=79G -aarch64_classification_higgs_extraJavaOptions=-Xms79g +aarch64_classification_higgs_numExectuors=47 +aarch64_classification_higgs_executorCores=4 +aarch64_classification_higgs_executorMemory=19G +aarch64_classification_higgs_extraJavaOptions=-Xms19G -aarch64_regression_higgs_numExectuors=12 -aarch64_regression_higgs_executorCores=23 -aarch64_regression_higgs_executorMemory=79G -aarch64_regression_higgs_extraJavaOptions=-Xms79g +aarch64_regression_higgs_numExectuors=47 +aarch64_regression_higgs_executorCores=4 +aarch64_regression_higgs_executorMemory=19G +aarch64_regression_higgs_extraJavaOptions=-Xms19G -aarch64_classification_mnist8m_numExectuors=35 -aarch64_classification_mnist8m_executorCores=8 -aarch64_classification_mnist8m_executorMemory=26G -aarch64_classification_mnist8m_extraJavaOptions=-Xms26g +aarch64_classification_mnist8m_numExectuors=47 +aarch64_classification_mnist8m_executorCores=4 +aarch64_classification_mnist8m_executorMemory=19G +aarch64_classification_mnist8m_extraJavaOptions=-Xms19G -aarch64_regression_mnist8m_numExectuors=35 -aarch64_regression_mnist8m_executorCores=8 -aarch64_regression_mnist8m_executorMemory=26G -aarch64_regression_mnist8m_extraJavaOptions=-Xms26g +aarch64_regression_mnist8m_numExectuors=47 +aarch64_regression_mnist8m_executorCores=4 +aarch64_regression_mnist8m_executorMemory=19G +aarch64_regression_mnist8m_extraJavaOptions=-Xms19G -aarch64_classification_epsilon_numExectuors=12 -aarch64_classification_epsilon_executorCores=23 -aarch64_classification_epsilon_executorMemory=79G -aarch64_classification_epsilon_extraJavaOptions=-Xms79g +aarch64_classification_epsilon_numExectuors=47 +aarch64_classification_epsilon_executorCores=4 +aarch64_classification_epsilon_executorMemory=19G +aarch64_classification_epsilon_extraJavaOptions=-Xms19G -aarch64_regression_epsilon_numExectuors=12 -aarch64_regression_epsilon_executorCores=23 -aarch64_regression_epsilon_executorMemory=79G -aarch64_regression_epsilon_extraJavaOptions=-Xms79g +aarch64_regression_epsilon_numExectuors=47 +aarch64_regression_epsilon_executorCores=4 +aarch64_regression_epsilon_executorMemory=19G +aarch64_regression_epsilon_extraJavaOptions=-Xms19G # x86_64 x86_64_classification_higgs_numExectuors=12 diff --git a/tools/kal-test/conf/ml/dtb/dtb.yml b/tools/kal-test/conf/ml/dtb/dtb.yml index af22edd9a8406b480c234af4b9c27de38e02d89d..9d91f2dc52658172564a3aae34ed2c15a24fcbe7 100644 --- a/tools/kal-test/conf/ml/dtb/dtb.yml +++ b/tools/kal-test/conf/ml/dtb/dtb.yml @@ -1,8 +1,8 @@ dtb: opt: higgs: - genericPt: 276 - pt: 276 + genericPt: 184 + pt: 184 maxDepth: 5 maxBins: 10000 maxMemoryInMB: 256 @@ -13,8 +13,8 @@ dtb: bcVariables: false mnist8m: - genericPt: 276 - pt: 276 + genericPt: 184 + pt: 184 maxDepth: 5 maxBins: 10000 maxMemoryInMB: 256 @@ -26,8 +26,8 @@ dtb: raw: higgs: - genericPt: 232 - pt: 116 + genericPt: 280 + pt: 280 maxDepth: 5 maxBins: 10000 maxMemoryInMB: 256 @@ -38,8 +38,8 @@ dtb: bcVariables: false mnist8m: - genericPt: 232 - pt: 232 + genericPt: 280 + pt: 280 maxDepth: 5 maxBins: 10000 maxMemoryInMB: 256 diff --git a/tools/kal-test/conf/ml/gbdt/gbdt_spark.properties b/tools/kal-test/conf/ml/gbdt/gbdt_spark.properties index e27362108186ad4d9be2fb2b348a4485539e8338..a5503bb3aec9991a3687a38e67ef013bc6a41610 100644 --- 
a/tools/kal-test/conf/ml/gbdt/gbdt_spark.properties +++ b/tools/kal-test/conf/ml/gbdt/gbdt_spark.properties @@ -1,11 +1,11 @@ # Spark parameters numExectuors=12 -epsilon_executorCores_aarch64=19 +epsilon_executorCores_aarch64=15 epsilon_executorCores_x86_64=19 -rcv_executorCores_aarch64=19 +rcv_executorCores_aarch64=15 rcv_executorCores_x86_64=19 -D10M4096libsvm_executorCores_aarch64=23 +D10M4096libsvm_executorCores_aarch64=15 D10M4096libsvm_executorCores_x86_64=19 executorMemory=77G diff --git a/tools/kal-test/conf/ml/knn/knn.yml b/tools/kal-test/conf/ml/knn/knn.yml index 1a7372c6d36f11e5e219e823044e0225ed38c989..f50441f469374c43d1296a8a72dfcd60734d8f46 100644 --- a/tools/kal-test/conf/ml/knn/knn.yml +++ b/tools/kal-test/conf/ml/knn/knn.yml @@ -3,7 +3,7 @@ knn: opt: glove: - pt: 280 + pt: 188 k: 100 testNum: 50000 testBatchSize: 5000 @@ -12,19 +12,19 @@ knn: subTreeLeafSize: 30 gist: - pt: 280 + pt: 188 k: 100 testNum: 50000 - testBatchSize: 5000 + testBatchSize: 1000 topTreeSizeRate: 10.0 topTreeLeafSize: 10 subTreeLeafSize: 30 deep1b: - pt: 280 + pt: 188 k: 100 testNum: 50000 - testBatchSize: 2000 + testBatchSize: 1000 topTreeSizeRate: 10.0 topTreeLeafSize: 10 subTreeLeafSize: 30 diff --git a/tools/kal-test/conf/ml/knn/knn_spark.properties b/tools/kal-test/conf/ml/knn/knn_spark.properties index ecedc585b140b29e36d9a9b93c69952fc1e28e3e..24c9cb1e98af37a197d9c47f52603858ea013135 100644 --- a/tools/kal-test/conf/ml/knn/knn_spark.properties +++ b/tools/kal-test/conf/ml/knn/knn_spark.properties @@ -5,26 +5,26 @@ compress=false driverCores_glove_aarch64=36 driverMemory_glove_aarch64=50G -numExectuors_glove_aarch64=35 -executorCores_glove_aarch64=8 -executorMemory_glove_aarch64=26G -extraJavaOptions_glove_aarch64=-Xms26g +numExectuors_glove_aarch64=47 +executorCores_glove_aarch64=4 +executorMemory_glove_aarch64=19G +extraJavaOptions_glove_aarch64=-Xms19g execMemOverhead_glove_aarch64=3G driverCores_gist_aarch64=36 driverMemory_gist_aarch64=50G -numExectuors_gist_aarch64=35 -executorCores_gist_aarch64=8 -executorMemory_gist_aarch64=26G -extraJavaOptions_gist_aarch64=-Xms26g +numExectuors_gist_aarch64=47 +executorCores_gist_aarch64=4 +executorMemory_gist_aarch64=19G +extraJavaOptions_gist_aarch64=-Xms19g execMemOverhead_gist_aarch64=3G driverCores_deep1b_aarch64=36 driverMemory_deep1b_aarch64=50G -numExectuors_deep1b_aarch64=35 -executorCores_deep1b_aarch64=8 -executorMemory_deep1b_aarch64=26G -extraJavaOptions_deep1b_aarch64=-Xms26g +numExectuors_deep1b_aarch64=47 +executorCores_deep1b_aarch64=4 +executorMemory_deep1b_aarch64=19G +extraJavaOptions_deep1b_aarch64=-Xms19g execMemOverhead_deep1b_aarch64=3G diff --git a/tools/kal-test/conf/ml/lgbm/lgbm.yml b/tools/kal-test/conf/ml/lgbm/lgbm.yml index 32145ecfbc14a3aaaf90fa3f8ef4f849f14fb2f8..c8f12c07dc0a537b53743923be67be6c4df0420d 100644 --- a/tools/kal-test/conf/ml/lgbm/lgbm.yml +++ b/tools/kal-test/conf/ml/lgbm/lgbm.yml @@ -157,7 +157,7 @@ lgbm: max_depth: 6 max_bin: 16 num_round: 500 - num_tasks: 59 + num_tasks: 177 min_gain_to_split: 1.0 lambda_l2: 1.0 num_leaves: 64 @@ -181,7 +181,7 @@ lgbm: max_depth: 6 max_bin: 16 num_round: 500 - num_tasks: 59 + num_tasks: 177 min_gain_to_split: 1.0 lambda_l2: 1.0 num_leaves: 64 @@ -205,7 +205,7 @@ lgbm: max_depth: 6 max_bin: 16 num_round: 500 - num_tasks: 59 + num_tasks: 177 min_gain_to_split: 1.0 lambda_l2: 1.0 num_leaves: 64 @@ -230,7 +230,7 @@ lgbm: max_depth: 6 max_bin: 16 num_round: 500 - num_tasks: 59 + num_tasks: 177 min_gain_to_split: 1.0 lambda_l2: 1.0 num_leaves: 64 @@ -254,7 +254,7 @@ lgbm: 
max_depth: 6 max_bin: 16 num_round: 500 - num_tasks: 59 + num_tasks: 177 min_gain_to_split: 1.0 lambda_l2: 1.0 num_leaves: 64 @@ -278,7 +278,7 @@ lgbm: max_depth: 6 max_bin: 16 num_round: 500 - num_tasks: 59 + num_tasks: 177 min_gain_to_split: 1.0 lambda_l2: 1.0 num_leaves: 64 diff --git a/tools/kal-test/conf/ml/lgbm/lgbm_spark.properties b/tools/kal-test/conf/ml/lgbm/lgbm_spark.properties index 9534063e174d231ec08753089a8680a3c1e5477d..153a8fab877372b941402e9d67ffd9a32b78999e 100644 --- a/tools/kal-test/conf/ml/lgbm/lgbm_spark.properties +++ b/tools/kal-test/conf/ml/lgbm/lgbm_spark.properties @@ -8,23 +8,23 @@ aarch64_mnist8m_numExecutors=60 aarch64_mnist8m_executorCores=4 aarch64_mnist8m_executorMemory=15G aarch64_mnist8m_executorMemOverhead=2G -aarch64_mnist8m_extraJavaOptions=-Xms12g +aarch64_mnist8m_extraJavaOptions=-Xms15g aarch64_higgs_driverCores=36 aarch64_higgs_driverMemory=50G -aarch64_higgs_numExecutors=71 +aarch64_higgs_numExecutors=60 aarch64_higgs_executorCores=4 -aarch64_higgs_executorMemory=12G +aarch64_higgs_executorMemory=15G aarch64_higgs_executorMemOverhead=2G -aarch64_higgs_extraJavaOptions=-Xms12g +aarch64_higgs_extraJavaOptions=-Xms15g aarch64_criteo_driverCores=36 aarch64_criteo_driverMemory=50G -aarch64_criteo_numExecutors=71 -aarch64_criteo_executorCores=4 -aarch64_criteo_executorMemory=12G +aarch64_criteo_numExecutors=78 +aarch64_criteo_executorCores=3 +aarch64_criteo_executorMemory=11G aarch64_criteo_executorMemOverhead=2G -aarch64_criteo_extraJavaOptions=-Xms12g +aarch64_criteo_extraJavaOptions=-Xms11g x86_64_mnist8m_driverCores=36 x86_64_mnist8m_driverMemory=50G diff --git a/tools/kal-test/conf/ml/ml_datasets.properties b/tools/kal-test/conf/ml/ml_datasets.properties index 97b5b03c68aed0614d5167d8b077c0492ff59829..12e7af91bd47881a196550c62ea7c6c8f1accc5e 100644 --- a/tools/kal-test/conf/ml/ml_datasets.properties +++ b/tools/kal-test/conf/ml/ml_datasets.properties @@ -2,7 +2,7 @@ sparkVersion=spark3.1.1 # kalVersion -kalVersion=2.2.0 +kalVersion=3.0.0 # scalaVersion scalaVersion=2.12 @@ -15,6 +15,7 @@ epsilon=hdfs:///tmp/ml/dataset/epsilon_train,hdfs:///tmp/ml/dataset/epsilon_test rcv=hdfs:///tmp/ml/dataset/rcv1bin_train,hdfs:///tmp/ml/dataset/rcv1bin_test mnist8m=hdfs:///tmp/ml/dataset/mnist8m_train,hdfs:///tmp/ml/dataset/mnist8m_test higgs=hdfs:///tmp/ml/dataset/higgs_train,hdfs:///tmp/ml/dataset/higgs_test +avazu=hdfs:///tmp/ml/dataset/avazu_train,hdfs:///tmp/ml/dataset/avazu_test D10M4096libsvm=hdfs:///tmp/ml/dataset/10M4096libsvm,hdfs:///tmp/ml/dataset/10M4096libsvm ECBDL14=hdfs:///tmp/ml/dataset/ECBDL14_train.orc,hdfs:///tmp/ml/dataset/ECBDL14_test.orc D10M4096=hdfs:///tmp/ml/dataset/svm_10m4096_train,hdfs:///tmp/ml/dataset/svm_10m4096_test @@ -31,9 +32,9 @@ RUCCI=hdfs:///tmp/ml/dataset/RUCCI/Rucci1.mtx D20M200K=hdfs:///tmp/ml/dataset/lda_20m200k,hdfs:///tmp/ml/dataset/lda_20m200k nytimes=hdfs:///tmp/ml/dataset/nytimes,hdfs:///tmp/ml/dataset/nytimes pubmed=hdfs:///tmp/ml/dataset/pubmed,hdfs:///tmp/ml/dataset/pubmed -als=hdfs:///tmp/ml/dataset/ALS -alsbs=hdfs:///tmp/ml/dataset/ALS_bs -alsh=hdfs:///tmp/ml/dataset/ALS_h +als=hdfs:///tmp/ml/dataset/ALS_311 +alsbs=hdfs:///tmp/ml/dataset/ALS_bs_311 +alsh=hdfs:///tmp/ml/dataset/ALS_h_311 glove=hdfs:///tmp/ml/dataset/GloVe gist=hdfs:///tmp/ml/dataset/GIST deep1b=hdfs:///tmp/ml/dataset/DEEP1B @@ -70,8 +71,8 @@ alibaba_node_downstreamTestFile=hdfs:///tmp/ml/dataset/w2v/cases/UBA/downstreamT alibaba_taobao=hdfs:///tmp/ml/dataset/w2v/product/taobao/sentences 
alibaba_taobao_downstreamTrainFile=hdfs:///tmp/ml/dataset/w2v/product/taobao/downstreamTrain alibaba_taobao_downstreamTestFile=hdfs:///tmp/ml/dataset/w2v/product/taobao/downstreamTest -if_40M_1k=hdfs:///tmp/ml/dataset/40M_1k -if_1M_1k=hdfs:///tmp/ml/dataset/1M_1k +if_40M_1K=hdfs:///tmp/ml/dataset/40M_1k +if_1M_1K=hdfs:///tmp/ml/dataset/1M_1k encoder_400m=hdfs:///tmp/ml/dataset/encoder/encoder_400m,./datasets/featureMap_400m.json encoder_800m=hdfs:///tmp/ml/dataset/encoder/encoder_800m,./datasets/featureMap_800m.json movielens=hdfs:///tmp/ml/dataset/movielens/movielens_train_with_folds.csv,hdfs:///tmp/ml/dataset/movielens/movielens_test.csv @@ -84,6 +85,9 @@ MT=hdfs:///tmp/ml/dataset/nmf/Movies_and_TV.csv BostonHousing=hdfs:///tmp/ml/dataset/BostonHousing.csv TitanicRf=hdfs:///tmp/ml/dataset/titanic.csv TitanicGBT=hdfs:///tmp/ml/dataset/titanic.csv +cityu=hdfs:///tmp/ml/dataset/CRF/cityu +msr=hdfs:///tmp/ml/dataset/CRF/msr +as=hdfs:///tmp/ml/dataset/CRF/as Hibench1m_100="./datasets/kmeans_1m_100" Hibench1m_200="./datasets/kmeans_1m_200" diff --git a/tools/kal-test/conf/ml/svm/svm.yml b/tools/kal-test/conf/ml/svm/svm.yml index c8b3e62eeefb78a46fdb216af6cd36fa98c30b10..b64d233e0f25c1948f68d6616df25060a15c130e 100644 --- a/tools/kal-test/conf/ml/svm/svm.yml +++ b/tools/kal-test/conf/ml/svm/svm.yml @@ -2,38 +2,38 @@ svm: opt: ECBDL14: - numPartitions: 180 + numPartitions: 276 regParam: 0.01 maxIter: 1000 tolerance: 1E-6 epsilon: - numPartitions: 180 + numPartitions: 276 regParam: 0.01 maxIter: 1000 tolerance: 1E-6 rcv: - numPartitions: 180 + numPartitions: 276 regParam: 0.01 maxIter: 1000 tolerance: 1E-6 raw: ECBDL14: - numPartitions: 284 + numPartitions: 228 regParam: 0.01 maxIter: 1000 tolerance: 1E-6 epsilon: - numPartitions: 284 + numPartitions: 228 regParam: 0.01 maxIter: 1000 tolerance: 1E-6 rcv: - numPartitions: 284 + numPartitions: 228 regParam: 0.01 maxIter: 1000 tolerance: 1E-6 diff --git a/tools/kal-test/conf/ml/xgbt/xgbt_spark.properties b/tools/kal-test/conf/ml/xgbt/xgbt_spark.properties index bcd3299c548ff4d3ff896cdc7602d0975aba29fc..8c66e40ecadcc62881bd4c3fa0c6ed5b978032c6 100644 --- a/tools/kal-test/conf/ml/xgbt/xgbt_spark.properties +++ b/tools/kal-test/conf/ml/xgbt/xgbt_spark.properties @@ -23,12 +23,12 @@ aarch64_classification_higgs_numPartitions=51 x86_64_classification_mnist8m_driverCores=40 x86_64_classification_mnist8m_driverMemory=300G -x86_64_classification_mnist8m_numExecutors=18 +x86_64_classification_mnist8m_numExecutors=24 x86_64_classification_mnist8m_executorCores=9 x86_64_classification_mnist8m_taskCpus=9 -x86_64_classification_mnist8m_executorMemory=50G -x86_64_classification_mnist8m_extraJavaOptions=-Xms50g -x86_64_classification_mnist8m_numPartitions=18 +x86_64_classification_mnist8m_executorMemory=37G +x86_64_classification_mnist8m_extraJavaOptions=-Xms37g +x86_64_classification_mnist8m_numPartitions=24 aarch64_classification_mnist8m_driverCores=40 aarch64_classification_mnist8m_driverMemory=300G diff --git a/tools/kal-test/pom.xml b/tools/kal-test/pom.xml index f0b4e30c10176f5e6e6d3b6d4fcd8188e903cc2a..f0d483c1edb82b5af0eeac113a541017482cb27f 100644 --- a/tools/kal-test/pom.xml +++ b/tools/kal-test/pom.xml @@ -13,7 +13,7 @@ 1.8 UTF-8 2.12 - 2.2.0 + 3.0.0 3.1.1 diff --git a/tools/kal-test/src/main/scala/com/bigdata/compare/graph/CDVerify.scala b/tools/kal-test/src/main/scala/com/bigdata/compare/graph/CDVerify.scala new file mode 100644 index 0000000000000000000000000000000000000000..c79591e57a7366d1dae6c344397d5a6f3679f097 --- /dev/null +++ 
b/tools/kal-test/src/main/scala/com/bigdata/compare/graph/CDVerify.scala
@@ -0,0 +1,20 @@
+// scalastyle:off println
+package com.bigdata.compare.graph
+
+import org.apache.spark.{SparkConf, SparkContext}
+
+object CDVerify {
+  def main(args: Array[String]): Unit = {
+    val path0 = args(0)
+    val path1 = args(1)
+    val sparkConf = new SparkConf().setAppName("CDVerify")
+    val sc = SparkContext.getOrCreate(sparkConf)
+    val rdd0 = sc.textFile(path0).collect()
+    val rdd1 = sc.textFile(path1).collect()
+
+    val flag = rdd0.length == rdd1.length
+    sc.stop()
+    println(s"Static Nodes Count: ${rdd0.length}, ${rdd1.length}")
+    println(s"The algorithm is correct: ${flag}")
+  }
+}
diff --git a/tools/kal-test/src/main/scala/com/bigdata/compare/graph/DeepWalkVerify.scala b/tools/kal-test/src/main/scala/com/bigdata/compare/graph/DeepWalkVerify.scala
index 3d223fbf8cabaaf503758a2cd3d54d4274a42b8b..e90896c0cf8c79e7e1d75278ef4ffd881192bce3 100644
--- a/tools/kal-test/src/main/scala/com/bigdata/compare/graph/DeepWalkVerify.scala
+++ b/tools/kal-test/src/main/scala/com/bigdata/compare/graph/DeepWalkVerify.scala
@@ -1,4 +1,3 @@
-/*
 // scalastyle:off
 package com.bigdata.compare.graph
@@ -151,6 +150,4 @@ object DeepWalkVerify {
         throw e
     }
   }
-}
-
- */
+}
\ No newline at end of file
diff --git a/tools/kal-test/src/main/scala/com/bigdata/compare/graph/CDDegreeVerify.scala b/tools/kal-test/src/main/scala/com/bigdata/compare/graph/DegreeVerify.scala
similarity index 96%
rename from tools/kal-test/src/main/scala/com/bigdata/compare/graph/CDDegreeVerify.scala
rename to tools/kal-test/src/main/scala/com/bigdata/compare/graph/DegreeVerify.scala
index c765c09ed301498f69695d5678ae9aa0af6d694d..c0c6ae94fc4aa8337308e94d2331afad97e5f87b 100644
--- a/tools/kal-test/src/main/scala/com/bigdata/compare/graph/CDDegreeVerify.scala
+++ b/tools/kal-test/src/main/scala/com/bigdata/compare/graph/DegreeVerify.scala
@@ -3,7 +3,7 @@ package com.bigdata.compare.graph
 
 import org.apache.spark.{SparkConf, SparkContext}
 
-object CDDegreeVerify {
+object DegreeVerify {
   def main(args: Array[String]): Unit = {
     val path0 = args(0)
     val path1 = args(1)
diff --git a/tools/kal-test/src/main/scala/com/bigdata/compare/graph/SubgraphMatchingVerify.scala b/tools/kal-test/src/main/scala/com/bigdata/compare/graph/SubgraphMatchingVerify.scala
new file mode 100644
index 0000000000000000000000000000000000000000..d2e36408f27e7a83c58db5fa9f24d489452d2fc5
--- /dev/null
+++ b/tools/kal-test/src/main/scala/com/bigdata/compare/graph/SubgraphMatchingVerify.scala
@@ -0,0 +1,194 @@
+package com.bigdata.compare.graph
+
+import scala.collection.mutable.ArrayBuffer
+import org.apache.hadoop.conf.Configuration
+import org.apache.hadoop.fs.{FileSystem, Path}
+import org.apache.log4j.{Level, Logger}
+import org.apache.spark.{SparkConf, SparkContext}
+import org.apache.spark.rdd.RDD
+
+object SubgraphMatchingVerify extends Serializable {
+
+  val ARGS_NUM = 7
+  def main(args: Array[String]): Unit = {
+
+    var master = "local"
+    var inputMatches = ""
+    var inputType = ""
+    var inputGraph = ""
+    var splitGraph = ""
+    var outputResult = ""
+    var partitionNum = 1
+
+    if (args.length < ARGS_NUM) {
+      println("Wrong parameters, please check the input.")
+    } else {
+      master = args(0)
+      inputMatches = args(1)
+      inputType = args(2)
+      inputGraph = args(3)
+      splitGraph = args(4)
+      outputResult = args(5)
+      partitionNum = args(6).toInt
+    }
+
+    FileSystem.get(new Configuration()).delete(new Path(outputResult), true)
+    Logger.getLogger("org").setLevel(Level.OFF)
+
Logger.getLogger("akka").setLevel(Level.OFF) + Logger.getRootLogger.setLevel(Level.ERROR) + + val supportedTypes = Array[String]("4sqr", "4dgn", "5tree", "6star") + require(supportedTypes.contains(inputType), "type not support, can only support 4sqr, 4dgn, 5tree and 6star") + + val sparkConf = new SparkConf().setMaster(master) + if (master.contains("local")) { + sparkConf.setAppName("Local test CheckResultSubgraphMatching") + } else { + sparkConf.setAppName("CheckResultSubgraphMatching test on" + inputMatches) + } + val sc = new SparkContext(sparkConf) + sc.setLogLevel("ERROR") + + val identicalMatches = loadExistingResults(sc, inputMatches, inputType) + val edgesRDD = sc.textFile(inputGraph, partitionNum) + .filter(line => line.nonEmpty && line.head != '#') + .map { line => + val tokens: Array[String] = line.split(splitGraph) + (tokens(0).toInt, tokens(1).toInt) + }.flatMap(x => Iterator((x._1, x._2), (x._2, x._1))) + .distinct() + .filter(x => x._1 != x._2) + val mGraphAdjMap = edgesRDD.groupBy(_._1) + .mapValues(f => {f.map(x => x._2).toArray}) + .collectAsMap() + + val mGraphBC = sc.broadcast(mGraphAdjMap) + + val rst = identicalMatches.mapPartitions{ f => { + val localGraph = mGraphBC.value + var res = new ArrayBuffer[Boolean]() + while (f.hasNext) { + val matchedNodes = f.next() + val tmp = checkCorrectness(inputType, matchedNodes, localGraph) + res = res ++ Array(tmp) + } + res.toIterator + }} + + rst.saveAsTextFile(outputResult) + val falseNum = rst.filter(x => !x) + .count() + if (falseNum > 0) { + println("there are %d instances are incorrect".format(falseNum)) + } else { + println("all instances are correct") + } + + } + + def instance2Nodes(edges: Array[String], inputTpye: String): Array[Int] = { + + val cooList = edges.flatMap{ edgeS => + var edge = edgeS.replaceAll("\\(", "") + edge = edge.replaceAll("\\)", "") + val node = edge.split(",").map(x => x.toLong) + Iterator((node(0), node(1)), (node(1), node(0))) + } + val adjListMap = cooList.groupBy(_._1) + .mapValues(f => {f.map(x => x._2)}) + val adjList = adjListMap.toArray + val nodes = adjList.sortBy(-_._2.length).map(x => x._1.toInt) + + // 4sqr nodes: 0 -> 1 -> 2 -> 3 -> 0 + if (inputTpye == "4sqr") { + if(adjListMap(nodes(0)).contains(nodes(3))) { + val id = if (adjListMap(nodes(0)).contains(nodes(1))) 2 else 1 + val tmp = nodes(id) + nodes(id) = nodes(3) + nodes(3) = tmp + } + } + + // 5tree nodes: 0 -> 1 -> 4, 0 -> 2, 0 -> 3 + if (inputTpye == "5tree" ) { + if(adjListMap(nodes(1)).contains(nodes(2)) || adjListMap(nodes(2)).contains(nodes(3))) { + val tmp = nodes(4) + if(adjListMap(nodes(1)).contains(nodes(2))) { + nodes(4) = nodes(2) + nodes(2) = tmp + } else { + nodes(4) = nodes(3) + nodes(3) = tmp + } + } + + } + + nodes + + } + + def loadExistingResults(sc: SparkContext, path: String, inputType: String): RDD[Array[Int]] = { + val existMatches = sc.textFile(path).flatMap(line => { + val x = line.split("\t") + val matchInstance = instance2Nodes(x, inputType) + Iterator(matchInstance) + }) + existMatches + } + + def checkCorrectness(testType: String, nodes: Array[Int], + localGraph: scala.collection.Map[Int, Array[Int]]): Boolean = { + + var tag =false + if (testType == "4sqr") { + val check03 = !localGraph(nodes(0)).contains(nodes(3)) + val check12 = !localGraph(nodes(1)).contains(nodes(2)) + tag = check03 && check12 + } + if (testType == "4dgn") { + tag = !localGraph(nodes(2)).contains(nodes(3)) + } + if (testType =="5tree") { + val isDegreeCorrect = localGraph(nodes(0)).size == 3 + var leafNodeNeigh = new 
ArrayBuffer[Int]() + var i = 2 + while (i < 5) { + val tmpNeigh = localGraph(nodes(i)) + leafNodeNeigh = leafNodeNeigh ++ tmpNeigh + i = i + 1 + } + var isNeighCorrect = false + i = 2 + while (i < 5) { + isNeighCorrect = !leafNodeNeigh.contains(nodes(i)) + i = i + 1 + } + isNeighCorrect = !localGraph(nodes(2)).contains(nodes(1)) && + !localGraph(nodes(3)).contains(nodes(1)) && !localGraph(nodes(4)).contains(nodes(0)) + tag = isDegreeCorrect && isNeighCorrect + } + if (testType == "6star") { + val centerNode = nodes(0) + val isDegreeCorrect = (localGraph(centerNode).size == 5) + var leafNodeNeigh = new ArrayBuffer[Int]() + var i = 1 + while (i < 6) { + val tmpNeigh = localGraph(nodes(i)) + leafNodeNeigh = leafNodeNeigh ++ tmpNeigh + i = i + 1 + } + var isNeighCorrect = false + i = 1 + while (i < 6) { + isNeighCorrect = !leafNodeNeigh.contains(nodes(i)) + i = i + 1 + } + tag = isDegreeCorrect && isNeighCorrect + } + tag + } + +} diff --git a/tools/kal-test/src/main/scala/com/bigdata/compare/ml/EncoderVerify.scala b/tools/kal-test/src/main/scala/com/bigdata/compare/ml/EncoderVerify.scala index 72a82e50ba1f76664b85a30ef06b6100557b2ce3..62d69964ef5497ba016269fc331bf84102cda086 100644 --- a/tools/kal-test/src/main/scala/com/bigdata/compare/ml/EncoderVerify.scala +++ b/tools/kal-test/src/main/scala/com/bigdata/compare/ml/EncoderVerify.scala @@ -46,10 +46,10 @@ object EncoderVerify { val diff = rdd1.subtract(rdd2).count() println(s"Exec Successful: different count: ${diff}") if (diff == 0) { - return "false" + return "true" } else { - return "true" + return "false" } } diff --git a/tools/kal-test/src/main/scala/com/bigdata/compare/ml/SimRankVerify.scala b/tools/kal-test/src/main/scala/com/bigdata/compare/ml/SimRankVerify.scala index ea440588e66b18e316846a2b2648fd662fc96edf..34d73503f50f143d3469d80581403526a92fdb54 100644 --- a/tools/kal-test/src/main/scala/com/bigdata/compare/ml/SimRankVerify.scala +++ b/tools/kal-test/src/main/scala/com/bigdata/compare/ml/SimRankVerify.scala @@ -7,7 +7,7 @@ import org.apache.spark.{SparkConf, SparkContext} import java.io.FileWriter -object SimRankVerify { +object SimRankVerify extends Serializable { val EPS = 1e-7 def main(args: Array[String]): Unit = { diff --git a/tools/kal-test/src/main/scala/com/bigdata/graph/ClusteringCoefficientRawRunner.scala b/tools/kal-test/src/main/scala/com/bigdata/graph/ClusteringCoefficientRawRunner.scala new file mode 100644 index 0000000000000000000000000000000000000000..4fb3490993667a58911ab6cd9c6d750b0109343e --- /dev/null +++ b/tools/kal-test/src/main/scala/com/bigdata/graph/ClusteringCoefficientRawRunner.scala @@ -0,0 +1,145 @@ +package com.bigdata.graph + +import java.io.{FileWriter, InputStreamReader} +import java.util + +import scala.beans.BeanProperty + +import com.bigdata.utils.Utils +import org.yaml.snakeyaml.constructor.Constructor +import org.yaml.snakeyaml.nodes.Tag +import org.yaml.snakeyaml.representer.Representer +import org.yaml.snakeyaml.{DumperOptions, TypeDescription, Yaml} + +import org.apache.spark.graphx.Graph +import org.apache.spark.graphx.lib.LocalClusteringCoefficient +import org.apache.spark.{SparkConf, SparkContext} +class ClusteringCoefficientRawConfig extends Serializable { + @BeanProperty var clusteringCoefficient: util.HashMap[String, util.HashMap[String, Object]] = _ +} +class ClusteringCoefficientRawParms extends Serializable { + @BeanProperty var inputPath: String = _ + @BeanProperty var outputPath: String = _ + @BeanProperty var isDirect: Boolean = _ + @BeanProperty var isWeight: String = _ + 
@BeanProperty var splitGraph: String = _ + @BeanProperty var costTime: Double = _ + @BeanProperty var datasetName: String = _ + @BeanProperty var computePartitions: Int = _ + @BeanProperty var isRaw: String = _ + @BeanProperty var apiName: String = _ + @BeanProperty var LocalClusteringCoefficient: AnyVal = _ + @BeanProperty var AverageClusteringCoefficient: AnyVal = _ + @BeanProperty var algorithmName: String = _ + @BeanProperty var testcaseType: String = _ +} +object ClusteringCoefficientRawRunner { + + def main(args: Array[String]): Unit = { + try { + val datsetName = args(0) + val computePartitions = args(1).toInt + val isWeight = args(2) + val isRaw = args(3) + val inputPath = args(4) + val api = args(5) + val outputPath = args(6) + + val weightedBool = isWeight match { + case "weighted" => true + case "unweighted" => false + case _ => throw new Exception("illegal weighted value") + } + + val stream: InputStreamReader = Utils.getStream("conf/graph/clusteringcoefficient/clusteringcoefficient.yml") + val representer = new Representer + representer.addClassTag(classOf[ClusteringCoefficientRawParms], Tag.MAP) + val options = new DumperOptions + options.setDefaultFlowStyle(DumperOptions.FlowStyle.BLOCK) + val yaml = new Yaml(new Constructor(classOf[ClusteringCoefficientRawConfig]), representer, options) + val description = new TypeDescription(classOf[ClusteringCoefficientRawParms]) + yaml.addTypeDescription(description) + val config: ClusteringCoefficientRawConfig = yaml.load(stream).asInstanceOf[ClusteringCoefficientRawConfig] + val paramsMap: util.HashMap[String, Object] = config.clusteringCoefficient + .get(isRaw match { + case "no" => "opt" + case _ => "raw" + }) + .get(datsetName) + .asInstanceOf[util.HashMap[String, Object]] + + val params = new ClusteringCoefficientRawParms + + val splitGraph: String = paramsMap.get("splitGraph").toString + val isDirect: Boolean = paramsMap.get("isDirect").toString.toBoolean + + params.setDatasetName(datsetName) + params.setComputePartitions(computePartitions) + params.setIsWeight(isWeight) + params.setInputPath(inputPath) + params.setIsRaw(isRaw) + params.setApiName(api) + params.setOutputPath(outputPath) + params.setSplitGraph(splitGraph) + params.setIsDirect(isDirect) + params.setAlgorithmName("ClusteringCoefficient") + params.setTestcaseType(s"ClusteringCoefficient_${datsetName}_${api}_${isWeight}_Raw") + + println("inputPath:" + inputPath) + println("outputPath:" + outputPath) + + val appName = s"${params.testcaseType}" + + val sparkConf = new SparkConf() + .setAppName(appName) + .setMaster("yarn") + val sc = new SparkContext(sparkConf) + + val startTime: Long = System.currentTimeMillis() + + val inputRDD = Util.readUndirectDataFromHDFS(sc, inputPath, splitGraph, params.getComputePartitions) + .map(f => (f._1.toLong, f._2.toLong)) + + val graph = if (weightedBool) { + Graph.fromEdgeTuples(inputRDD, 0.0).mapEdges(f => 1.0) + } else { + Graph.fromEdgeTuples(inputRDD, 0.0) + } + + val result = api match { + case "lcc" => + val result = LocalClusteringCoefficient.run(graph).vertices + Util.saveDataToHDFS(result, ",", params.outputPath) + case "avgcc" => + val result = LocalClusteringCoefficient.run(graph) + var res = 0.0 + val gNum = graph.vertices.count() + if(gNum != 0) { + res = result.vertices.map(_._2).sum() / gNum + } + params.setAverageClusteringCoefficient(res) + println(res) + case "globalcc" => + val result: Double = LocalClusteringCoefficient.runGlobalClusteringCoefficient(graph) + params.setLocalClusteringCoefficient(result) + 
println(result) + case _ => throw new Exception("illegal api") + } + + val costTime = (System.currentTimeMillis() - startTime) / 1000.0 + println(s"Exec Successful: costTime: ${costTime}s") + params.setCostTime(costTime) + + Utils.checkDirs("report") + val writer = new FileWriter( + s"report/${params.testcaseType}_${Utils.getDateStrFromUTC("yyyyMMdd_HHmmss", System.currentTimeMillis())}.yml") + yaml.dump(params, writer) + + sc.stop() + } catch { + case e: Throwable => + println(s"Exec Failure: ${e.getMessage}") + throw e + } + } +} diff --git a/tools/kal-test/src/main/scala/com/bigdata/graph/ClusteringCoefficientRunner.scala b/tools/kal-test/src/main/scala/com/bigdata/graph/ClusteringCoefficientRunner.scala index 898b239d240aa67c2be844feeb489d6cd8cd25d6..5f7fae5c63e8adba1e5fdb9b74e368bee8d33c77 100644 --- a/tools/kal-test/src/main/scala/com/bigdata/graph/ClusteringCoefficientRunner.scala +++ b/tools/kal-test/src/main/scala/com/bigdata/graph/ClusteringCoefficientRunner.scala @@ -88,7 +88,7 @@ object ClusteringCoefficientRunner { println("inputPath:" + inputPath) println("outputPath:" + outputPath) - val appName = s"ClusteringCoefficient_${api}_${isWeight}_${datsetName}" + val appName = s"${params.testcaseType}" val sparkConf = new SparkConf() .setAppName(appName) @@ -127,7 +127,7 @@ object ClusteringCoefficientRunner { Utils.checkDirs("report") val writer = new FileWriter( - s"report/ClusteringCoefficient_${Utils.getDateStrFromUTC("yyyyMMdd_HHmmss", System.currentTimeMillis())}.yml") + s"report/${params.testcaseType}_${Utils.getDateStrFromUTC("yyyyMMdd_HHmmss", System.currentTimeMillis())}.yml") yaml.dump(params, writer) sc.stop() diff --git a/tools/kal-test/src/main/scala/com/bigdata/graph/FraudarRunner.scala b/tools/kal-test/src/main/scala/com/bigdata/graph/FraudarRunner.scala index 938628ffc5590c8c570885051e50a7642bd61dc3..d430981a3048a6f6924b10e4955205dcacae3435 100644 --- a/tools/kal-test/src/main/scala/com/bigdata/graph/FraudarRunner.scala +++ b/tools/kal-test/src/main/scala/com/bigdata/graph/FraudarRunner.scala @@ -66,7 +66,8 @@ object FraudarRunner { params.setPartitions(paramsMap.get("partitions").toString.toInt) params.setDatasetName(datasetName) params.setDataPath(dataPath) - params.setOutputPath(outputPath) + params.setJSetOutPath(jSetOutPath) + params.setISetOutPath(iSetOutPath) params.setIsRaw(isRaw) params.setAlgorithmName("Fraudar") params.setTestcaseType(s"${params.algorithmName}_${datasetName}_${isRaw}") @@ -121,6 +122,7 @@ class FraudarKernel { sc.setLogLevel("WARN") val startTime = System.currentTimeMillis() val bipartGraph = Util.readUndirectDataFromHDFS(sc, params.dataPath, params.splitGraph, params.partitions) + .map(f => (f._1.toLong, f._2.toLong)) .persist(StorageLevel.MEMORY_AND_DISK_SER) bipartGraph.foreachPartition(f => {}) diff --git a/tools/kal-test/src/main/scala/com/bigdata/graph/SLPARunner.scala b/tools/kal-test/src/main/scala/com/bigdata/graph/SLPARunner.scala new file mode 100644 index 0000000000000000000000000000000000000000..49f7a0e4db98fd42488f3913883e3aa09bdb4a89 --- /dev/null +++ b/tools/kal-test/src/main/scala/com/bigdata/graph/SLPARunner.scala @@ -0,0 +1,141 @@ +// scalastyle:off + +package com.bigdata.graph +import com.bigdata.utils.Utils + +import org.apache.spark.graphx.lib.{SpearkListenerLabelPropagation, Parameters} +import org.apache.spark.sql.SparkSession +import org.apache.spark.{SparkConf, SparkContext} +import org.yaml.snakeyaml.{DumperOptions, TypeDescription, Yaml} +import org.yaml.snakeyaml.constructor.Constructor +import 
org.yaml.snakeyaml.nodes.Tag +import org.yaml.snakeyaml.representer.Representer + +import java.io.{File, FileWriter, InputStreamReader} +import java.util +import scala.beans.BeanProperty + +class SLPAConfig extends Serializable { + @BeanProperty var slpa: util.HashMap[String, util.HashMap[String, util.HashMap[String, Object]]] = _ +} + +class SLPAParams extends Serializable { + @BeanProperty var splitGraph: String = _ + @BeanProperty var partitions: Int = _ + @BeanProperty var isWeight: Boolean = _ + @BeanProperty var isDirected: Boolean = _ + @BeanProperty var iterNum: Int = _ + @BeanProperty var threshold: Double = _ + + @BeanProperty var outputPath: String = _ + @BeanProperty var dataPath: String = _ + @BeanProperty var costTime: Double = _ + @BeanProperty var datasetName: String = _ + @BeanProperty var isRaw: String = _ + @BeanProperty var algorithmName: String = _ + @BeanProperty var testcaseType: String = _ +} + + +object SLPARunner { + def main(args: Array[String]): Unit = { + try { + val modelConfSplit = args(0).split("-") + val (datasetName, platformName, isRaw) = (modelConfSplit(0), modelConfSplit(1), modelConfSplit(2)) + val dataPath = args(1) + val outputPath = args(2) + val representer = new Representer + representer.addClassTag(classOf[SLPAParams], Tag.MAP) + val options = new DumperOptions + options.setDefaultFlowStyle(DumperOptions.FlowStyle.BLOCK) + val stream: InputStreamReader = Utils.getStream("conf/graph/slpa/slpa.yml") + val yaml = new Yaml(new Constructor(classOf[SLPAConfig]), representer, options) + val description = new TypeDescription(classOf[SLPAParams]) + yaml.addTypeDescription(description) + val config: SLPAConfig = yaml.load(stream).asInstanceOf[SLPAConfig] + + val params = new SLPAParams() + val paramsMap = + config.slpa.get(isRaw match { + case "no" => "opt" + case _ => "raw" + }).get(datasetName).asInstanceOf[util.HashMap[String, Object]] + params.setSplitGraph(paramsMap.get("splitGraph").asInstanceOf[String]) + params.setPartitions(paramsMap.get("partitions").asInstanceOf[Int]) + params.setIsWeight(paramsMap.get("isWeight").asInstanceOf[Boolean]) + params.setIsDirected(paramsMap.get("isDirected").asInstanceOf[Boolean]) + params.setIterNum(paramsMap.get("iterNum").asInstanceOf[Int]) + params.setThreshold(paramsMap.get("threshold").asInstanceOf[Double]) + params.setDatasetName(datasetName) + params.setDataPath(dataPath) + params.setOutputPath(outputPath) + params.setIsRaw(isRaw) + params.setAlgorithmName("SLPA") + params.setTestcaseType(s"${params.algorithmName}_${datasetName}_${isRaw}") + + val conf = new SparkConf().setAppName(params.testcaseType) + val spark = SparkSession.builder.config(conf).getOrCreate() + val sc = spark.sparkContext + + val startTime = System.currentTimeMillis() + val edges = Util.readGraphFromHDFS(sc, params.dataPath, params.splitGraph, params.isWeight, params.partitions) + edges.foreachPartition{f => {}} + + val slpaGraph = SpearkListenerLabelPropagation.buildGraph(edges, params.isDirected) + val slpaComm = SpearkListenerLabelPropagation.run(slpaGraph, params.iterNum, params.threshold) + slpaComm.edges.foreachPartition{f => {}} + + val outPutComm = s"${outputPath}/res" + val outPutComp = s"${outputPath}/resForComparsion" + + val vertex2Comm = slpaComm.vertices.map(x => (x._1,"[" + x._2.mkString(",") + "]")) + vertex2Comm.foreachPartition{f => {}} + vertex2Comm.saveAsTextFile(outPutComm) + + val finishTime = System.currentTimeMillis() + val costTime = (finishTime - startTime) / 1000 + + obtainCommunities(sc, outPutComm, outPutComp) 
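+      // obtainCommunities (defined below) regroups the saved (vertexId, [labels]) pairs into one
+      // line of member ids per label for comparison; it runs after costTime is taken, so this
+      // post-processing is not counted in the reported time.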
+ + params.setCostTime(costTime) + println(s"Exec Successful: costTime: ${costTime}s") + + val folder = new File("report") + if (!folder.exists()) { + val mkdir = folder.mkdirs() + println(s"Create dir report ${mkdir}") + } + val writer = new FileWriter( + s"report/${params.testcaseType}_${Utils.getDateStrFromUTC("yyyyMMdd_HHmmss", System.currentTimeMillis())}.yml") + yaml.dump(params, writer) + } catch { + case e: Throwable => + println(s"Exec Failure: ${e.getMessage}") + throw e + } + } + + def obtainCommunities(sc: SparkContext, commPath: String, outPath: String): Unit = { + val labelIds = sc.textFile(commPath) + .flatMap(line => { + val x = line.split(",") + var tmp = x(0).replace("(","") + val id = tmp.toLong + tmp = x(1).replace(")","") + tmp = tmp.replace("[","") + tmp = tmp.replace("]","") + val labels = tmp.split(",") + val attr = new Array[(Long, Long)](labels.length) + var i = 0 + while (i < labels.length) { + attr(i) = (labels(i).toLong, id) + i = i + 1 + } + attr.toIterator + }) + labelIds.groupByKey() + .map(x => x._2.toArray.mkString(" ")) + .repartition(1) + .saveAsTextFile(outPath) + } +} diff --git a/tools/kal-test/src/main/scala/com/bigdata/graph/WeightedLablePropagationRunner.scala b/tools/kal-test/src/main/scala/com/bigdata/graph/WeightedLablePropagationRunner.scala index 191d6dd8a9e843f9dbd374f51af3bb964628a21a..ccae82c36379ff6f9447d0a42952e556b64588c5 100644 --- a/tools/kal-test/src/main/scala/com/bigdata/graph/WeightedLablePropagationRunner.scala +++ b/tools/kal-test/src/main/scala/com/bigdata/graph/WeightedLablePropagationRunner.scala @@ -25,6 +25,7 @@ class WLPAParams extends Serializable { @BeanProperty var splitGraph: String = _ @BeanProperty var commputePartition: Int = _ @BeanProperty var maxIter: Int = _ + @BeanProperty var partitions: Int = _ @BeanProperty var outputPath: String = _ @BeanProperty var dataPath: String = _ diff --git a/tools/kal-test/src/main/scala/com/bigdata/ml/BORunner.scala b/tools/kal-test/src/main/scala/com/bigdata/ml/BORunner.scala index dcfd7a417832070dfd063d7c9e9658d7ab7ca2cf..21b6b6b8a645c6d683ea2bb81aacd08629af616c 100644 --- a/tools/kal-test/src/main/scala/com/bigdata/ml/BORunner.scala +++ b/tools/kal-test/src/main/scala/com/bigdata/ml/BORunner.scala @@ -2,6 +2,7 @@ package com.bigdata.ml import com.bigdata.utils.Utils import com.bigdata.compare.ml.UpEvaluationVerify +import com.bigdata.compare.ml.DownEvaluationVerify import org.apache.spark.ml.Pipeline import org.apache.spark.ml.classification.{GBTClassifier, RandomForestClassifier} @@ -117,7 +118,12 @@ object BORunner { Utils.checkDirs("report") if (ifCheck.equals("yes")) { - params.setIsCorrect(UpEvaluationVerify.compareRes(params.saveDataPath, params.verifiedDataPath, spark)) + val isCorrect = params.datasetName match { + case "BostonHousing" => DownEvaluationVerify.compareRes(params.saveDataPath, params.verifiedDataPath, spark) + case "TitanicRf" => UpEvaluationVerify.compareRes(params.saveDataPath, params.verifiedDataPath, spark) + case "TitanicGBT" => UpEvaluationVerify.compareRes(params.saveDataPath, params.verifiedDataPath, spark) + } + params.setIsCorrect(isCorrect) val writerIsCorrect = new FileWriter(s"report/ml_isCorrect.txt", true) writerIsCorrect.write(s"${params.testcaseType} ${params.isCorrect} \n") writerIsCorrect.close() diff --git a/tools/kal-test/src/main/scala/com/bigdata/ml/CRFRunner.scala b/tools/kal-test/src/main/scala/com/bigdata/ml/CRFRunner.scala new file mode 100644 index 0000000000000000000000000000000000000000..adf9b7cf6a7d337338194c1c72267c68f382f4a8 
--- /dev/null +++ b/tools/kal-test/src/main/scala/com/bigdata/ml/CRFRunner.scala @@ -0,0 +1,316 @@ +package com.bigdata.ml + +import com.bigdata.utils.Utils +import com.bigdata.compare.ml.UpEvaluationVerify + +import com.intel.ssg.bdt.nlp.{CRF, L2, Sequence, Token, Regularization} +import org.apache.spark.nlp.{CRF => CRFRaw, L2 => L2Raw, Sequence => Seq, Token => Tok} +import org.yaml.snakeyaml.{DumperOptions, TypeDescription, Yaml} +import org.yaml.snakeyaml.constructor.Constructor +import org.yaml.snakeyaml.nodes.Tag +import org.yaml.snakeyaml.representer.Representer +import org.apache.spark.SparkConf +import org.apache.spark.sql.SparkSession +import org.apache.hadoop.fs.{FileSystem, Path} + +import java.io.{File, FileWriter} +import java.util +import scala.beans.BeanProperty + +class CRFConfig extends Serializable { + @BeanProperty var crf: util.HashMap[String, util.HashMap[String, util.HashMap[String, Object]]] = _ +} + +class CRFParams extends Serializable { + @BeanProperty var pt: Int = _ + @BeanProperty var maxIter: Int = _ + @BeanProperty var regParam: Double = _ + @BeanProperty var freq: Int = _ + @BeanProperty var compLevel: Int = _ + @BeanProperty var nThread: Int = _ + @BeanProperty var tol: Double = _ + @BeanProperty var calAcc: Boolean = _ + @BeanProperty var templatePath: String = _ + + + @BeanProperty var dataPath: String = _ + @BeanProperty var datasetName: String = _ + @BeanProperty var costTime: Double = _ + @BeanProperty var cpuName: String = _ + @BeanProperty var isRaw: String = _ + @BeanProperty var algorithmName: String = _ + @BeanProperty var testcaseType: String = _ + @BeanProperty var saveDataPath: String = _ + @BeanProperty var verifiedDataPath: String = _ + @BeanProperty var ifCheck: String = _ + @BeanProperty var isCorrect: String = _ +} + +object CRFRunner { + def main(args: Array[String]): Unit = { + try { + val modelConfSplit = args(0).split("-") + val (datasetName, isRaw, ifCheck) = + (modelConfSplit(0), modelConfSplit(1), modelConfSplit(2)) + val dataPath = args(1) + val cpuName = args(2) + val saveResultPath = args(3) + + val stream = Utils.getStream("conf/ml/crf/crf.yml") + val representer = new Representer + representer.addClassTag(classOf[CRFParams], Tag.MAP) + val options = new DumperOptions + options.setDefaultFlowStyle(DumperOptions.FlowStyle.BLOCK) + val yaml = new Yaml(new Constructor(classOf[CRFConfig]), representer, options) + val description = new TypeDescription(classOf[CRFParams]) + yaml.addTypeDescription(description) + val configs: CRFConfig = yaml.load(stream).asInstanceOf[CRFConfig] + val params = new CRFParams() + val paramsMap: util.HashMap[String, Object] = configs.crf.get(isRaw match { + case "no" => "opt" + case "yes" => "raw" + }).get(datasetName) + params.setPt(paramsMap.getOrDefault("pt", "276").asInstanceOf[Int]) + params.setMaxIter(paramsMap.getOrDefault("maxIter", "500").asInstanceOf[Int]) + params.setRegParam(paramsMap.getOrDefault("regParam", "0.01").asInstanceOf[Double]) + params.setFreq(paramsMap.getOrDefault("freq", "0.20").asInstanceOf[Int]) + params.setCompLevel(paramsMap.getOrDefault("compLevel", "0").asInstanceOf[Int]) + params.setNThread(paramsMap.getOrDefault("nThread", "1").asInstanceOf[Int]) + params.setTol(paramsMap.getOrDefault("tol", "0.001").asInstanceOf[Double]) + params.setCalAcc(paramsMap.getOrDefault("calAcc", "false").asInstanceOf[Boolean]) + params.setTemplatePath(paramsMap.getOrDefault("templatePath", "/").asInstanceOf[String]) + + params.setDataPath(dataPath) + params.setDatasetName(datasetName) + 
params.setCpuName(cpuName) + params.setIsRaw(isRaw) + params.setIfCheck(ifCheck) + params.setAlgorithmName("CRF") + params.setSaveDataPath(s"${saveResultPath}/${params.algorithmName}/${datasetName}") + params.setVerifiedDataPath(s"${params.saveDataPath}_raw") + var appName = s"${params.algorithmName}_${datasetName}" + if (isRaw.equals("yes")){ + appName = s"${params.algorithmName}_${datasetName}_raw" + params.setVerifiedDataPath(params.saveDataPath) + params.setSaveDataPath(s"${params.saveDataPath}_raw") + } + params.setTestcaseType(appName) + + val conf = new SparkConf().setAppName(appName) + + val spark = SparkSession.builder.config(conf).getOrCreate() + val costTime = new CRFKernel().runJob(spark, params) + params.setCostTime(costTime) + + Utils.checkDirs("report") + if(ifCheck.equals("yes")){ + params.setIsCorrect(UpEvaluationVerify.compareRes(params.saveDataPath, params.verifiedDataPath, spark)) + val writerIsCorrect = new FileWriter(s"report/ml_isCorrect.txt", true) + writerIsCorrect.write(s"${params.testcaseType} ${params.isCorrect} \n") + writerIsCorrect.close() + } + + val writer = new FileWriter(s"report/${params.testcaseType}_${ + Utils.getDateStrFromUTC("yyyyMMdd_HHmmss", + System.currentTimeMillis()) + }.yml") + yaml.dump(params, writer) + println(s"Exec Successful: costTime: ${costTime}s;isCorrect: ${params.isCorrect}") + } catch { + case e: Throwable => + println(s"Exec Failure: ${e.getMessage}") + throw e + } + } +} + +class CRFKernel { + def runJob(spark: SparkSession, params: CRFParams): Double = { + import spark.implicits._ + val sc = spark.sparkContext + val fs = FileSystem.get(sc.hadoopConfiguration) + val dataPath = params.dataPath + val pt = params.pt + val maxIteration = params.maxIter + val regParam = params.regParam + val frequency = params.freq + val compLevel = params.compLevel + val nThread = params.nThread + val tol = params.tol + val calAcc = params.calAcc + val templatePath = params.templatePath + + val t1 = System.currentTimeMillis() + println("\n--------start--------\n") + println("* start: " + t1) + + // Load and parse the data + val templates: Array[String] = sc.textFile(templatePath).toLocalIterator.filter(_.nonEmpty).toArray + val testData = sc.textFile(dataPath + ".t").toLocalIterator.filter(_.nonEmpty).toArray + + val t2 = System.currentTimeMillis() + println("* loading & pre-preprocess time: ", (t2 - t1) / 1000.0 ) + + var predictTime = 0.0 + if (params.isRaw == "no") { + println("================== Running Opt CRF ==================") + + val trainDataOpt = sc.textFile(dataPath + ".tr").filter(_.nonEmpty).map { sentence => + val tokens = sentence.split("\t") + Sequence(tokens.map { token => + val tags: Array[String] = token.split('|') + Token.put(tags.last, tags.dropRight(1)) + }) + } + + var testArrayWithoutLabelOpt: Array[Sequence] = Array[Sequence]() + var testArrayWithLabelOpt: Array[Sequence] = Array[Sequence]() + + val trainRDDOpt = sc.parallelize(trainDataOpt.collect(), pt).repartition(pt).cache() + //println("number of training sequences: " + trainRDDOpt.count()) + + testArrayWithoutLabelOpt = testData.map(sentence => { + val tokens = sentence.split("\t") + Sequence(tokens.map(token => { + val tags = token.split('|') + Token.put(tags.dropRight(1)) + })) + }) + + testArrayWithLabelOpt = testData.map(sentence => { + val tokens = sentence.split("\t") + Sequence(tokens.map(token => { + val tags = token.split('|') + Token.put(tags.last, tags.dropRight(1)) + })) + }) + + val model = new CRF().setRegParam(regParam) + .setFreq(frequency) + 
.setMaxIterations(maxIteration) + .setTolerance(tol) + .setRegularization(L2) + .setCompLevel(compLevel) + .setCalcAccuracy(calAcc) + .setNumThread(nThread) + .runCRF(templates, trainRDDOpt, testArrayWithLabelOpt, testArrayWithoutLabelOpt) + + val time1 = System.currentTimeMillis() + val testDataWithLabelOpt = sc.textFile(dataPath + ".t").filter(_.nonEmpty).map { sentence => + val tokens = sentence.split("\t") + Sequence(tokens.map { token => + val tags: Array[String] = token.split('|') + Token.put(tags.dropRight(1)) + }) + } + + val testDataWithoutLabelOpt = sc.textFile(dataPath + ".t").filter(_.nonEmpty).map { sentence => + val tokens = sentence.split("\t") + Sequence(tokens.map { token => + val tags: Array[String] = token.split('|') + Token.put(tags.last, tags.dropRight(1)) + }) + } + + val results = model.predict(testDataWithoutLabelOpt) + val score = results + .zipWithIndex() + .map(_.swap) + .join(testDataWithLabelOpt.zipWithIndex().map(_.swap)) + .map(_._2) + .map(x => x._1.compare(x._2)) + .reduce(_ + _) + val total = testDataWithoutLabelOpt.map(_.toArray.length).reduce(_ + _) + + val acc = score / total.toDouble + Utils.saveEvaluation(acc, params.saveDataPath, sc) + val time2 = System.currentTimeMillis() + predictTime += (time2 - time1).toDouble / 1000 + } else { + println("================== Running Raw CRF ==================") + + val trainDataRaw = sc.textFile(dataPath + ".tr").filter(_.nonEmpty).map { sentence => + val tokens = sentence.split("\t") + Seq(tokens.map { token => + val tags: Array[String] = token.split('|') + Tok.put(tags.last, tags.dropRight(1)) + }) + } + + var testArrayWithoutLabelRaw: Array[Seq] = Array[Seq]() + var testArrayWithLabelRaw: Array[Seq] = Array[Seq]() + + val trainRDDRaw = sc.parallelize(trainDataRaw.collect(), pt).repartition(pt).cache() + //println("number of training sequences: " + trainRDDRaw.count()) + + testArrayWithoutLabelRaw = testData.map(sentence => { + val tokens = sentence.split("\t") + Seq(tokens.map(token => { + val tags = token.split('|') + Tok.put(tags.dropRight(1)) + })) + }) + + testArrayWithLabelRaw = testData.map(sentence => { + val tokens = sentence.split("\t") + Seq(tokens.map(token => { + val tags = token.split('|') + Tok.put(tags.last, tags.dropRight(1)) + })) + }) + + val model = new CRFRaw().setRegParam(regParam) + .setFreq(frequency) + .setMaxIterations(maxIteration) + .setEta(tol) + .setRegularization(L2Raw) + .setCalcAccuracy(calAcc) + .runCRF(templates, trainRDDRaw, testArrayWithLabelRaw, testArrayWithoutLabelRaw) + + val time1 = System.currentTimeMillis() + val testDataWithLabelRaw = sc.textFile(dataPath + ".t").filter(_.nonEmpty).map { sentence => + val tokens = sentence.split("\t") + Seq(tokens.map { token => + val tags: Array[String] = token.split('|') + Tok.put(tags.dropRight(1)) + }) + } + + val testDataWithoutLabelRaw = sc.textFile(dataPath + ".t").filter(_.nonEmpty).map { sentence => + val tokens = sentence.split("\t") + Seq(tokens.map { token => + val tags: Array[String] = token.split('|') + Tok.put(tags.last, tags.dropRight(1)) + }) + } + + val results = model.predict(testDataWithoutLabelRaw) + val score = results + .zipWithIndex() + .map(_.swap) + .join(testDataWithLabelRaw.zipWithIndex().map(_.swap)) + .map(_._2) + .map(x => x._1.compare(x._2)) + .reduce(_ + _) + + val total = testDataWithoutLabelRaw.map(_.toArray.length).reduce(_ + _) + + val acc = score / total.toDouble + Utils.saveEvaluation(acc, params.saveDataPath, sc) + val time2 = System.currentTimeMillis() + predictTime += (time2 - time1).toDouble 
/ 1000 + } + val t3 = System.currentTimeMillis() + println("* train time = %f[s]", (t3 - t2) / 1000.0) + println("\n--------success--------\n") + + val totalTaining = (t3 - t1).toDouble / 1000 - predictTime + val coreTraining = (t3 - t2).toDouble / 1000 - predictTime + val dataLoading = (t2 - t1).toDouble / 1000 + println("[s]end2end train: " + totalTaining) + println("[s]data preprocess: " + dataLoading) + println("[s]core train: " + coreTraining) + + totalTaining + } +} \ No newline at end of file diff --git a/tools/kal-test/src/main/scala/com/bigdata/ml/IDFRunner.scala b/tools/kal-test/src/main/scala/com/bigdata/ml/IDFRunner.scala index cc70db20ce0996313e6ff80f4d009e4c3f9c0104..aac9bda7000e891661da0179428e21223d5a42d9 100644 --- a/tools/kal-test/src/main/scala/com/bigdata/ml/IDFRunner.scala +++ b/tools/kal-test/src/main/scala/com/bigdata/ml/IDFRunner.scala @@ -79,7 +79,7 @@ object IDFRunner{ params.setVerifiedDataPath(s"${params.saveDataPath}_raw") var appName = s"${params.algorithmName}_${datasetName}" if (isRaw.equals("yes")){ - var appName = s"${params.algorithmName}_${datasetName}_raw" + appName = s"${params.algorithmName}_${datasetName}_raw" params.setVerifiedDataPath(params.saveDataPath) params.setSaveDataPath(s"${params.saveDataPath}_raw") } @@ -88,9 +88,9 @@ object IDFRunner{ val conf = new SparkConf().setAppName(appName) conf.set("spark.driver.maxResultSize", "256G") if (isRaw.equals("no")){ - conf.set("spark.sophon.ml.idf.combineStrategy", + conf.set("spark.boostkit.ml.idf.combineStrategy", paramsMap.get("combineStrategy").asInstanceOf[String]) - conf.set("spark.sophon.ml.idf.fetchMethod", + conf.set("spark.boostkit.ml.idf.fetchMethod", paramsMap.get("fetchMethod").asInstanceOf[String]) } val spark = SparkSession.builder().config(conf).getOrCreate() diff --git a/tools/kal-test/src/main/scala/com/bigdata/ml/LightGBMRawRunner.scala b/tools/kal-test/src/main/scala/com/bigdata/ml/LightGBMRawRunner.scala new file mode 100644 index 0000000000000000000000000000000000000000..96ca6b15e90bea23226728b62be9e9d52ace2a29 --- /dev/null +++ b/tools/kal-test/src/main/scala/com/bigdata/ml/LightGBMRawRunner.scala @@ -0,0 +1,276 @@ +package com.bigdata.ml + +import com.bigdata.utils.Utils +import com.bigdata.compare.ml.UpEvaluationVerify +import com.bigdata.compare.ml.DownEvaluationVerify + +import com.microsoft.ml.spark.core.metrics.MetricConstants +import com.microsoft.ml.spark.train.ComputeModelStatistics +import com.microsoft.ml.spark.lightgbm.{LightGBMClassifier, LightGBMRegressor} +import com.typesafe.config.{Config, ConfigFactory} +import org.apache.hadoop.fs.{FileSystem, Path} +import org.apache.spark.sql.{Dataset, Row, SparkSession} +import org.apache.spark.SparkConf +import org.apache.spark.storage.StorageLevel +import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator +import org.yaml.snakeyaml.{DumperOptions, TypeDescription, Yaml} +import org.yaml.snakeyaml.constructor.Constructor +import org.yaml.snakeyaml.nodes.Tag +import org.yaml.snakeyaml.representer.Representer + +import java.lang.System.nanoTime +import java.io.{File, FileWriter, PrintWriter} +import java.nio.file.{Paths, Files} +import java.util +import scala.beans.BeanProperty +import scala.util.Random + +class LightGBMRawConfig extends Serializable { + @BeanProperty var lgbm: util.HashMap[String, util.HashMap[String, util.HashMap[String, util.HashMap[String, Object]]]] = _ +} + +class LightGBMRawParams extends Serializable { + @BeanProperty var objective: String = _ + @BeanProperty var labelCol: String = _ + 
@BeanProperty var featuresCol: String = _ + @BeanProperty var verbosity: Int = _ + @BeanProperty var learningRate: Double = _ + @BeanProperty var maxDepth: Int = _ + @BeanProperty var maxBin: Int = _ + @BeanProperty var numIterations: Int = _ + @BeanProperty var numTasks: Int = _ + @BeanProperty var minGainToSplit: Double = _ + @BeanProperty var lambdaL2: Double = _ + @BeanProperty var numLeaves: Int = _ + @BeanProperty var minSumHessianInLeaf: Double = _ + @BeanProperty var minDataInLeaf: Int = _ + @BeanProperty var baggingFraction: Double = _ + @BeanProperty var baggingFreq: Int = _ + @BeanProperty var numThreads: Int = _ + @BeanProperty var networkCompression: Int = _ + @BeanProperty var histSynchAlgo: Int = _ + @BeanProperty var loglossApx: Int = _ + @BeanProperty var loglossApxEps: Double = _ + @BeanProperty var loadingBalance: String = _ + + @BeanProperty var trainingDataPath: String = _ + @BeanProperty var testDataPath: String = _ + @BeanProperty var algorithmType: String = _ + @BeanProperty var datasetName: String = _ + @BeanProperty var evaluation: Double = _ + @BeanProperty var costTime: Double = _ + @BeanProperty var cpuName: String = _ + @BeanProperty var isRaw: String = _ + @BeanProperty var algorithmName: String = _ + @BeanProperty var testcaseType: String = _ + @BeanProperty var saveDataPath: String = _ + @BeanProperty var verifiedDataPath: String = _ + @BeanProperty var ifCheck: String = _ + @BeanProperty var isCorrect: String = _ +} + +object LightGBMRawRunner { + def main(args: Array[String]): Unit = { + try { + val modelConfSplit = args(0).split("-") + val (algorithmType, datasetName, isRaw, ifCheck) = + (modelConfSplit(0), modelConfSplit(1), modelConfSplit(2), modelConfSplit(3)) + val dataPath = args(1) + val dataPathSplit = dataPath.split(",") + val (trainingDataPath, testDataPath) = (dataPathSplit(0), dataPathSplit(1)) + val cpuName = args(2) + val saveResultPath = args(3) + + val stream = Utils.getStream("conf/ml/lgbm/lgbm.yml") + val representer = new Representer + representer.addClassTag(classOf[LightGBMRawParams], Tag.MAP) + val options = new DumperOptions + options.setDefaultFlowStyle(DumperOptions.FlowStyle.BLOCK) + val yaml = new Yaml(new Constructor(classOf[LightGBMRawConfig]), representer, options) + val description = new TypeDescription(classOf[LightGBMRawParams]) + yaml.addTypeDescription(description) + val configs: LightGBMRawConfig = yaml.load(stream).asInstanceOf[LightGBMRawConfig] + val params = new LightGBMRawParams() + val paramsMap: util.HashMap[String, Object] = configs.lgbm.get(isRaw match { + case "no" => "opt" + case "yes" => "raw" + }).get(algorithmType).get(datasetName) + params.setObjective(paramsMap.get("objective").asInstanceOf[String]) + params.setLabelCol(paramsMap.get("labelCol").asInstanceOf[String]) + params.setFeaturesCol(paramsMap.get("featuresCol").asInstanceOf[String]) + params.setVerbosity(paramsMap.get("verbosity").asInstanceOf[Int]) + params.setLearningRate(paramsMap.get("eta").asInstanceOf[Double]) + params.setMaxDepth(paramsMap.get("max_depth").asInstanceOf[Int]) + params.setMaxBin(paramsMap.get("max_bin").asInstanceOf[Int]) + params.setNumIterations(paramsMap.get("num_round").asInstanceOf[Int]) + params.setNumTasks(paramsMap.get("num_tasks").asInstanceOf[Int]) + params.setMinGainToSplit(paramsMap.get("min_gain_to_split").asInstanceOf[Double]) + params.setLambdaL2(paramsMap.get("lambda_l2").asInstanceOf[Double]) + params.setNumLeaves(paramsMap.get("num_leaves").asInstanceOf[Int]) + 
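+      // Note: several YAML keys differ from the setter names used here: eta -> learningRate,
+      // num_round -> numIterations, min_child_weight -> minSumHessianInLeaf, bagging -> baggingFraction.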
params.setMinSumHessianInLeaf(paramsMap.get("min_child_weight").asInstanceOf[Double]) + params.setMinDataInLeaf(paramsMap.get("min_data_in_leaf").asInstanceOf[Int]) + params.setBaggingFraction(paramsMap.get("bagging").asInstanceOf[Double]) + params.setBaggingFreq(paramsMap.get("bagging_freq").asInstanceOf[Int]) + params.setNumThreads(paramsMap.get("num_threads").asInstanceOf[Int]) + params.setNetworkCompression(paramsMap.get("network_compression").asInstanceOf[Int]) + params.setHistSynchAlgo(paramsMap.get("hist_synch_algo").asInstanceOf[Int]) + params.setLoglossApx(paramsMap.get("logloss_apx").asInstanceOf[Int]) + params.setLoglossApxEps(paramsMap.get("logloss_apx_eps").asInstanceOf[Double]) + params.setLoadingBalance(paramsMap.get("loading_balance").asInstanceOf[String]) + params.setTrainingDataPath(trainingDataPath) + params.setTestDataPath(testDataPath) + params.setAlgorithmType(algorithmType) + params.setDatasetName(datasetName) + params.setCpuName(cpuName) + params.setIsRaw(isRaw) + params.setIfCheck(ifCheck) + params.setAlgorithmName("LightGBM") + params.setSaveDataPath(s"${saveResultPath}/${params.algorithmName}/${algorithmType}_${datasetName}") + params.setVerifiedDataPath(s"${params.saveDataPath}_raw") + var appName = s"${params.algorithmName}_${algorithmType}_${datasetName}" + if (isRaw.equals("yes")){ + appName = s"${params.algorithmName}_${algorithmType}_${datasetName}_raw" + params.setVerifiedDataPath(params.saveDataPath) + params.setSaveDataPath(s"${params.saveDataPath}_raw") + } + params.setTestcaseType(appName) + + val conf = new SparkConf().setAppName(appName) + val spark = SparkSession.builder.config(conf).getOrCreate() + + val (res, costTime) = new LightGBMRawKernel().runJob(spark, params) + params.setEvaluation(res) + params.setCostTime(costTime) + + Utils.checkDirs("report") + if(ifCheck.equals("yes")){ + val isCorrect = params.algorithmType match { + case "classification" => UpEvaluationVerify.compareRes(params.saveDataPath, params.verifiedDataPath, spark) + case "regression" => DownEvaluationVerify.compareRes(params.saveDataPath, params.verifiedDataPath, spark) + } + params.setIsCorrect(isCorrect) + val writerIsCorrect = new FileWriter(s"report/ml_isCorrect.txt", true) + writerIsCorrect.write(s"${params.testcaseType} ${params.isCorrect} \n") + writerIsCorrect.close() + } + + val writer = new FileWriter(s"report/${params.testcaseType}_${ + Utils.getDateStrFromUTC("yyyyMMdd_HHmmss", + System.currentTimeMillis()) + }.yml") + yaml.dump(params, writer) + println(s"Exec Successful: costTime: ${costTime}s; evaluation: ${res};isCorrect: ${params.isCorrect}") + } catch { + case e: Throwable => + println(s"Exec Failure: ${e.getMessage}") + throw e + } + } +} + +class LightGBMRawKernel{ + def runJob(spark: SparkSession, params: LightGBMRawParams): (Double, Double) = { + val sc = spark.sparkContext + sc.setLogLevel("INFO") + println(s"Initialized spark session.") + val t1 = System.currentTimeMillis() + + import spark.implicits._ + val trainData = spark.read.format("libsvm").option("vectorType", "sparse") + .load(params.trainingDataPath) + .repartition(params.numTasks) + .persist(StorageLevel.MEMORY_AND_DISK_SER) + val t2 = System.currentTimeMillis() + println("* after preprocess: " + t2) + + val lgbm = params.algorithmType match { + case "classification" =>{ + val classifier = new LightGBMClassifier() + .setObjective(params.objective) + .setLabelCol(params.labelCol) + .setFeaturesCol(params.featuresCol) + .setVerbosity(params.verbosity) + .setNumIterations(params.numIterations) 
+ .setMaxDepth(params.maxDepth) + .setLearningRate(params.learningRate) + .setNumTasks(params.numTasks) + .setMaxBin(params.maxBin) + .setMinGainToSplit(params.minGainToSplit) + .setLambdaL2(params.lambdaL2) + .setNumLeaves(params.numLeaves) + .setMinDataInLeaf(params.minDataInLeaf) + .setMinSumHessianInLeaf(params.minSumHessianInLeaf) + .setBaggingFraction(params.baggingFraction) + .setBaggingFreq(params.baggingFreq) + classifier + } + case "regression" =>{ + val regressor = new LightGBMRegressor() + .setObjective(params.objective) + .setLabelCol(params.labelCol) + .setFeaturesCol(params.featuresCol) + .setVerbosity(params.verbosity) + .setNumIterations(params.numIterations) + .setMaxDepth(params.maxDepth) + .setLearningRate(params.learningRate) + .setNumTasks(params.numTasks) + .setMaxBin(params.maxBin) + .setMinGainToSplit(params.minGainToSplit) + .setLambdaL2(params.lambdaL2) + .setNumLeaves(params.numLeaves) + .setMinDataInLeaf(params.minDataInLeaf) + .setMinSumHessianInLeaf(params.minSumHessianInLeaf) + .setBaggingFraction(params.baggingFraction) + .setBaggingFreq(params.baggingFreq) + regressor + } + } + val model = lgbm.fit(trainData) + val t3 = System.currentTimeMillis() + println("* after train: " + t3) + + val testData = spark.read.format("libsvm").option("vectorType", "sparse") + .load(params.testDataPath) + .persist(StorageLevel.MEMORY_AND_DISK_SER) + println(s"Test data read successful. Number of partitions - ${testData.rdd.getNumPartitions}") + val predictions = model.transform(testData) + val (res, t4) = params.algorithmType match { + case "classification" =>{ + val metrics = new ComputeModelStatistics() + .setLabelCol("label") + .setScoresCol("probability") + .setScoredLabelsCol("prediction") + .setEvaluationMetric(MetricConstants.AccuracySparkMetric) + .transform(predictions) + val ecc = metrics.collect().apply(0).apply(1).asInstanceOf[Double] + val t4 = System.currentTimeMillis() + (ecc, t4) + } + case "regression" =>{ + // compute model metrics + val metrics = new ComputeModelStatistics() + .setEvaluationMetric("regression") + .setLabelCol("label") + .setScoresCol("prediction") + .transform(predictions) + // print metrics + val mse = metrics.collect().apply(0).apply(0).asInstanceOf[Double] + val t4 = System.currentTimeMillis() + (mse, t4) + } + } + println("Model predictions:") + predictions.select("prediction", "label", "features").show(5) + val trainingProcess = (t3 - t1).toDouble / 1000 + val trainingStep = (t3 - t2).toDouble / 1000 + val dataProcess = (t2 - t1).toDouble / 1000 + val predict = (t4 - t3).toDouble / 1000 + println("[s]train total: " + trainingProcess) + println("[s]data preprocess: " + dataProcess) + println("[s]train: " + trainingStep) + println("[s]predict: " + predict) + + Utils.saveEvaluation(res, params.saveDataPath, sc) + (res, trainingProcess) + } +} \ No newline at end of file diff --git a/tools/kal-test/src/main/scala/com/bigdata/preprocess/graph/MceRawDataProcess.scala b/tools/kal-test/src/main/scala/com/bigdata/preprocess/graph/MceRawDataProcess.scala new file mode 100644 index 0000000000000000000000000000000000000000..bbd65bf73ad0e12b1499d858d752c3dcfd851a8b --- /dev/null +++ b/tools/kal-test/src/main/scala/com/bigdata/preprocess/graph/MceRawDataProcess.scala @@ -0,0 +1,82 @@ +package com.bigdata.preprocess.graph + +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.SparkSession +import org.apache.spark.{SparkConf, SparkContext} +import org.apache.spark.storage.StorageLevel + +import scala.reflect.ClassTag + +object 
MceRawDataProcess extends Serializable { + private val DEFAULT_STORAGE_LEVEL = StorageLevel.MEMORY_AND_DISK_SER + val maxDegree = 2000 + + def main(args: Array[String]): Unit = { + val input = args(0) + val output = args(1) + val sparkConf = new SparkConf().setAppName("MceRawDataProcess") + val spark = SparkSession.builder.config(sparkConf).getOrCreate() + val split = " " + val partition = 284 + val sc = spark.sparkContext + val startTime = System.currentTimeMillis() + val edgeList = readUndirectDataFromHDFS(sc, input, split, partition) + val (newNodeID, resRDD) = reIdNodes(edgeList, maxDegree) + val numNodes = newNodeID.map(_._2).max() + 1 + val numEdge = resRDD.count() + resRDD.map(f => f._1 + "," + f._2).repartition(1).saveAsTextFile(output) + val costTime = (System.currentTimeMillis() - startTime) / 1000.0 + println(s"Exec Successful:numNodes:${numNodes},numEdge:${numEdge},MceRawDataProcess costTime:${costTime}s") + } + + private def reIdNodes[T: ClassTag](graph: RDD[(T, T)], maxDegree: Int): (RDD[(Int, T)], RDD[(Int, Int)]) = { + val tempInput = graph.flatMap(f => { + val ret = f._1.toString.compareTo(f._2.toString) + if (ret == 0) { + Iterator.empty + } else if (ret < 0) { + Iterator((f._1, f._2)) + } else { + Iterator((f._2, f._1)) + } + }).distinct().persist(DEFAULT_STORAGE_LEVEL) + + tempInput.foreachPartition(_ => {}) + + val newNodeId = tempInput.flatMap(f => Iterator((f._1, 1), (f._2, 1))) + .reduceByKey((x, y) => x + y) + .filter(f => f._2 != 1&& f._2 < maxDegree) + .sortBy(f => f._2) + .zipWithIndex().map(f => (f._1._1, f._2.toInt)) + .persist(DEFAULT_STORAGE_LEVEL) + + newNodeId.foreachPartition(_ => {}) + + val resEdgeRDD = tempInput.join(newNodeId).map(f => f._2).join(newNodeId).map(f => f._2) + val resRevRdgeRDD = resEdgeRDD.map(f => (f._2, f._1)) + val resRDD = resEdgeRDD.union(resRevRdgeRDD) + + (newNodeId.map(f => (f._2, f._1)), resRDD) + } + + def readUndirectDataFromHDFS(sc: SparkContext, + filePath: String, + split: String, + partition: Int): RDD[(Long, Long)] = { + sc.textFile(filePath, partition) + .flatMap(line => { + if (line.startsWith("#")) { + Iterator.empty + } else { + val x = line.split(split) + if (x.length < 2) { + Iterator.empty + } else { + val node1 = x(0).toLong + val node2 = x(1).toLong + Iterator((node1, node2)) + } + } + }) + } +} \ No newline at end of file diff --git a/tools/kal-test/src/main/scala/com/bigdata/preprocess/ml/ALSDataGenRun.scala b/tools/kal-test/src/main/scala/com/bigdata/preprocess/ml/ALSDataGenRun.scala new file mode 100644 index 0000000000000000000000000000000000000000..d62fd913afbc74f7170c127f06eeaeb8b0862b81 --- /dev/null +++ b/tools/kal-test/src/main/scala/com/bigdata/preprocess/ml/ALSDataGenRun.scala @@ -0,0 +1,57 @@ +package com.bigdata.preprocess.ml + +import org.apache.spark.rdd.{RDD, PairRDDFunctions} +import org.apache.spark.mllib.recommendation.Rating +import org.apache.spark.mllib.random._ +import org.apache.spark.{SparkConf, SparkContext} +import org.apache.spark.mllib.linalg.{Vectors,Vector} + +object ALSDataGenRun extends Serializable { + + def main(args: Array[String]): Unit = { + val conf = new SparkConf().setAppName("ALSDataGenRun") + val sc = new SparkContext(conf) + + var outputPath = "" + var numUsers: Int = 100 + var numProducts: Int = 100 + var sparsity: Double = 0.05 + var implicitPrefs: Boolean = false + val parallel = sc.getConf.getInt("spark.default.parallelism", sc.defaultParallelism) + val numPartitions = 200 + + if (args.length == 5) { + outputPath = args(0) + numUsers = args(1).toInt + numProducts 
= args(2).toInt
+        sparsity = args(3).toDouble
+        implicitPrefs = args(4).toBoolean
+
+        println(s"Output Path: $outputPath")
+        println(s"Num of Users: $numUsers")
+        println(s"Num of Products: $numProducts")
+        println(s"sparsity: $sparsity")
+        println(s"Implicit Prefs: $implicitPrefs")
+      } else {
+        System.err.println(
+          "Usage: ALSDataGenRun <outputPath> <numUsers> <numProducts> <sparsity> <implicitPrefs>"
+        )
+        System.exit(1)
+      }
+
+      val rawData: RDD[Vector] = RandomRDDs.normalVectorRDD(sc, numUsers, numProducts, numPartitions)
+      val rng = new java.util.Random()
+      val data = rawData.map{v =>
+        val a = Array.fill[Double](v.size)(0.0)
+        v.foreachActive{(i, vi) =>
+          if(rng.nextDouble <= sparsity) {
+            a(i) = vi
+          }
+        }
+        Vectors.dense(a).toSparse
+      }
+      data.saveAsObjectFile(outputPath)
+
+      sc.stop()
+  }
+}
\ No newline at end of file
diff --git a/tools/kal-test/src/main/scala/org/apache/spark/graphx/lib/LocalClusteringCoefficient.scala b/tools/kal-test/src/main/scala/org/apache/spark/graphx/lib/LocalClusteringCoefficient.scala
new file mode 100644
index 0000000000000000000000000000000000000000..3f6704d837434ada8492125f3542bc18caba0308
--- /dev/null
+++ b/tools/kal-test/src/main/scala/org/apache/spark/graphx/lib/LocalClusteringCoefficient.scala
@@ -0,0 +1,135 @@
+
+
+package org.apache.spark.graphx.lib
+
+import org.apache.spark.graphx._
+
+import scala.reflect.ClassTag
+
+import scala.collection.mutable.ListBuffer
+
+/**
+ * Local clustering coefficient algorithm
+ *
+ * In a directed graph G=(V, E), we define the neighbourhood N_i of a vertex v_i as
+ * N_i={v_j: e_ij \in E or e_ji \in E}
+ *
+ * The local clustering coefficient C_i of a vertex v_i is then defined as
+ * C_i = |{e_jk: v_j, v_k \in N_i, e_jk \in E}| / (K_i * (K_i - 1))
+ * where K_i=|N_i| is the number of neighbors of v_i
+ *
+ * Note that the input graph must have been partitioned using
+ * [[org.apache.spark.graphx.Graph#partitionBy]].
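+ *
+ * For example, a vertex with K_i = 3 neighbours and two directed edges among
+ * those neighbours has C_i = 2 / (3 * 2) = 1/3.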
+ */ +object LocalClusteringCoefficient { + /** + * Compute the local clustering coefficient for each vertex and + * return a graph with vertex value representing the local clustering coefficient of that vertex + * + * @param graph the graph for which to compute the connected components + * + * @return a graph with vertex attributes containing + * the local clustering coefficient of that vertex + * + */ + def run[VD: ClassTag, ED: ClassTag](graph: Graph[VD, ED]): Graph[Double, ED] = { + // Remove redundant edges + val g = graph.groupEdges((a, b) => a).cache() + + // Construct set representations of the neighborhoods + val nbrSets: VertexRDD[VertexSet] = + g.collectNeighborIds(EdgeDirection.Either).mapValues { (vid, nbrs) => + val set = new VertexSet(nbrs.length) + var i = 0 + while (i < nbrs.size) { + // prevent self cycle + if(nbrs(i) != vid) { + set.add(nbrs(i)) + } + i += 1 + } + set + } + val counters = lccCompute(g, nbrSets) + + // count number of neighbors for each vertex + var nbNumMap = Map[VertexId, Int]() + nbrSets.collect().foreach { case (vid, nbVal) => + nbNumMap += (vid -> nbVal.size) + } + + // Merge counters with the graph + g.outerJoinVertices(counters) { + (vid, _, optCounter: Option[Double]) => + val dblCount: Double = optCounter.getOrElse(0) + val nbNum = nbNumMap(vid) + if (nbNum > 1) { + dblCount / (nbNum * (nbNum - 1)) + } + else { + 0 + } + } + } + + def runGlobalClusteringCoefficient[VD: ClassTag, ED: ClassTag](graph: Graph[VD, ED]): Double = { + // Remove redundant edges + val g = graph.groupEdges((a, b) => a).cache() + + // Construct set representations of the neighborhoods + val nbrSets: VertexRDD[VertexSet] = + g.collectNeighborIds(EdgeDirection.Either).mapValues{(vid, nbrs) => + val set = new VertexSet(nbrs.length) + var i = 0 + while (i < nbrs.size) { + // prevent self cycle + if(nbrs(i) != vid) { + set.add(nbrs(i)) + } + i += 1 + } + set + } + val counters = lccCompute(g, nbrSets) + val openTc = nbrSets.map(f => { + val degree = f._2.size + degree * (degree - 1) / 2 + }).sum + val closeTc = counters.map(_._2).sum() / 2 + println(closeTc) + println(openTc) + closeTc / openTc + } + + + def lccCompute[VD: ClassTag, ED: ClassTag](g: Graph[VD, ED], nbrSets: VertexRDD[VertexSet]): VertexRDD[Double] = { + // join the sets with the graph + val setGraph: Graph[VertexSet, ED] = g.outerJoinVertices(nbrSets) { + (vid, _, optSet) => optSet.getOrElse(null) + } + + // Edge function computes intersection of smaller vertex with larger vertex + def edgeFunc(et: EdgeContext[VertexSet, ED, Double]){ + assert(et.srcAttr != null) + assert(et.dstAttr != null) + val (smallSet, largeSet) = if (et.srcAttr.size < et.dstAttr.size) { + (et.srcAttr, et.dstAttr) + } else { + (et.dstAttr, et.srcAttr) + } + val iter = smallSet.iterator + var buf = 0.0 + while (iter.hasNext) { + val vid = iter.next() + if (vid != et.srcId && vid != et.dstId && largeSet.contains(vid)) { + buf += 1.0 + } + } + et.sendToDst(buf) + et.sendToSrc(buf) + } + + // compute the intersection along edges + setGraph.aggregateMessages(edgeFunc, _ + _) + } +} \ No newline at end of file diff --git a/tools/kal-test/src/main/scala/org/apache/spark/nlp/CRF.scala b/tools/kal-test/src/main/scala/org/apache/spark/nlp/CRF.scala new file mode 100644 index 0000000000000000000000000000000000000000..199b1e1adfdedbe1289c4090db42eeaf66c904a3 --- /dev/null +++ b/tools/kal-test/src/main/scala/org/apache/spark/nlp/CRF.scala @@ -0,0 +1,260 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license 
agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +// scalastyle:off +package org.apache.spark.nlp + +import org.apache.spark.broadcast.Broadcast +import org.apache.spark.rdd.RDD +import org.apache.spark.internal.Logging + +trait Regularization + +case object L1 extends Regularization + +case object L2 extends Regularization + +/** + * CRF with support for multiple parallel runs + * L2 regParam = 1/(2.0 * sigma**2) + */ +class CRF private ( + private var freq: Int, + private var regParam: Double, + private var maxIterations: Int, + private var tolerance: Double, + private var regularization: Regularization) extends Serializable with Logging { + + private var calcAccuracy: Boolean = false + def this() = this(freq = 1, regParam = 0.5, maxIterations = 1000, tolerance = 1E-3, regularization = L2) + + def setRegParam(regParam: Double) = { + this.regParam = regParam + this + } + + def setFreq(freq: Int) = { + this.freq = freq + this + } + + def setMaxIterations(maxIterations: Int) = { + this.maxIterations = maxIterations + this + } + + def setEta(eta: Double) = { + this.tolerance = eta + this + } + + def setRegularization(regula: Regularization) = { + this.regularization = regula + this + } + + //set if need to calculate model's accuracy + //this requires testArrayWithLabel and testArrayWithoutLabel are given + def setCalcAccuracy(ca: Boolean): this.type = { + this.calcAccuracy = ca + this + } + + /** + * Internal method to train the CRF model + * + * @param template the template to train the model + * @param trains the source for the training + * @return the model of the source + */ + def runCRF( + template: Array[String], + trains: RDD[Sequence], + testArrayWithLabel: Array[Sequence] = Array[Sequence](), + testArrayWithoutLabel: Array[Sequence] = Array[Sequence]()): CRFModel = { + val featureIdx = new FeatureIndex() + featureIdx.openTemplate(template) + featureIdx.openTagSetDist(trains) + + val bcFeatureIdxI: Broadcast[FeatureIndex] = trains.context.broadcast(featureIdx) + val taggers = trains.map(train => { + val tagger: Tagger = new Tagger(bcFeatureIdxI.value.labels.size, LearnMode) + tagger.read(train, bcFeatureIdxI.value) + tagger + }) + + featureIdx.buildDictionaryDist(taggers, bcFeatureIdxI, freq) + + val bcFeatureIdxII = trains.context.broadcast(featureIdx) + val taggerList: RDD[Tagger] = taggers.map(bcFeatureIdxII.value.buildFeatures).cache() + + val model = runAlgorithm(taggerList, featureIdx, testArrayWithLabel, testArrayWithoutLabel) + taggerList.unpersist() + + model + } + + /** + * + * @param taggers the tagger in the template + * @param featureIdx the index of the feature + */ + def runAlgorithm( + taggers: RDD[Tagger], + featureIdx: FeatureIndex, + testArrayWithLabel: Array[Sequence] = Array[Sequence](), + testArrayWithoutLabel: Array[Sequence] = Array[Sequence]()): CRFModel = { + + logInfo("Starting CRF Iterations ( sentences: 
%d, features: %d, labels: %d )" + .format(taggers.count(), featureIdx.maxID, featureIdx.labels.length)) + + var updater: UpdaterCRF = null + regularization match { + case L1 => + updater = new L1Updater + case L2 => + updater = new L2Updater + case _ => + throw new Exception("only support L1-CRF and L2-CRF now") + } + + featureIdx.alpha = new CRFWithLBFGS(new CRFGradient, updater) + .setRegParam(regParam) + .setConvergenceTol(tolerance) + .setNumIterations(maxIterations) + .optimizer(taggers, featureIdx.initAlpha()) + + + // calculate the accuracy faster + if (calcAccuracy && testArrayWithLabel.length == testArrayWithoutLabel.length) { + if (testArrayWithLabel.length != 0) { + Accuracy.calc(featureIdx, testArrayWithLabel, testArrayWithoutLabel) + } else { + logInfo(s"test dataset not given.") + } + } + featureIdx.saveModel + } +} + +/** + * Top-level methods for calling CRF. + */ +object CRF { + + /** + * Train CRF Model + * + * @param templates Source templates for training the model + * @param train Source files for training the model + * @return Model + */ + + def train( + templates: Array[String], + train: RDD[Sequence], + regParam: Double, + freq: Int, + maxIteration: Int, + eta: Double, + regularization: Regularization): CRFModel = { + new CRF().setRegParam(regParam) + .setFreq(freq) + .setMaxIterations(maxIteration) + .setEta(eta) + .setRegularization(regularization) + .runCRF(templates, train) + } + + def train( + templates: Array[String], + train: RDD[Sequence], + regParam: Double, + freq: Int, + maxIteration: Int, + eta: Double): CRFModel = { + new CRF().setRegParam(regParam) + .setFreq(freq) + .setMaxIterations(maxIteration) + .setEta(eta) + .runCRF(templates, train) + } + + def train( + templates: Array[String], + train: RDD[Sequence], + regParam: Double, + freq: Int): CRFModel = { + new CRF().setRegParam(regParam) + .setFreq(freq) + .runCRF(templates, train) + } + + def train( + templates: Array[String], + train: RDD[Sequence], + regParam: Double, + regularization: Regularization): CRFModel = { + new CRF().setRegParam(regParam) + .setRegularization(regularization) + .runCRF(templates, train) + } + + def train( + templates: Array[String], + train: RDD[Sequence], + regularization: Regularization): CRFModel = { + new CRF().setRegularization(regularization) + .runCRF(templates, train) + } + + def train( + templates: Array[String], + train: RDD[Sequence]): CRFModel = { + new CRF().runCRF(templates, train) + } +} + +object Accuracy extends Logging { + def calc( + featureIdx: FeatureIndex, + testArrayWithLabel: Array[Sequence], + testArrayWithoutLabel: Array[Sequence]): Double = { + val results = testArrayWithoutLabel.map(testCRF(_, featureIdx)) + var score = 0 + var i = 0 + for (r <- results) { + score += r.compare(testArrayWithLabel(i)) + i += 1 + } + val total = testArrayWithoutLabel.map(_.toArray.length).sum + + logInfo(f"Prediction Accuracy: $score / $total = ${score / total.toDouble}") + + score / total.toDouble + } + + private def testCRF(test: Sequence, featureIdx: FeatureIndex): Sequence = { + val tagger = new Tagger(featureIdx.labels.size, TestMode) + tagger.read(test, featureIdx) + featureIdx.buildFeatures(tagger) + tagger.parse(featureIdx.alpha, None) + + Sequence(test.toArray.map { x => + Token.put(featureIdx.labels(tagger.result(test.toArray.indexOf(x))), x.tags) + }) + } +} \ No newline at end of file diff --git a/tools/kal-test/src/main/scala/org/apache/spark/nlp/CRFModel.scala b/tools/kal-test/src/main/scala/org/apache/spark/nlp/CRFModel.scala new file mode 100644 
index 0000000000000000000000000000000000000000..6e90f79b4c768dc2cb903d91ef0692dc0c412aca --- /dev/null +++ b/tools/kal-test/src/main/scala/org/apache/spark/nlp/CRFModel.scala @@ -0,0 +1,221 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +// scalastyle:off +package org.apache.spark.nlp + +import java.io._ +import java.nio.file.{StandardOpenOption, Paths, Files} + +import scala.collection.mutable.ArrayBuffer +import org.apache.spark.rdd.RDD + +trait VerboseMode + +case object VerboseLevel1 extends VerboseMode + +case object VerboseLevel2 extends VerboseMode + +case class CRFModel ( + head: Array[String], + dic: Array[(String, Int)], + alpha: Array[Double]) extends Serializable { + + protected def formatVersion = "1.0" + + private var verboseMode: Option[VerboseMode] = None + + private var nBest = 0 + private var costFactor = 1.0 + + def setNBest(nBest: Int): CRFModel = { + this.nBest = nBest + this + } + + def setVerboseMode(mode: VerboseMode) = { + this.verboseMode = Some(mode) + this + } + + def setcostFact(cf: Double) = { + this.costFactor = cf + this + } + + override def toString: String = { + val dicString = dic.map{case(k, v) => k + "|-|" + v.toString} + s"${head.mkString("\t")}|--|${dicString.mkString("\t")}|--|${alpha.map(_.toFloat).mkString("\t")}" + } + + def toStringHead: String = { + val dicString: Array[String] = dic.map{case(k, v) => k + "|-|" + v.toString} + s"${head.mkString("\t")}|--|${dicString.mkString("\t")}" + } + + def toArrayString: Array[String] = { + val dicString: Array[String] = dic.map{case(k, v) => k + "|-|" + v.toString} + val alphaString: Array[String] = alpha.map(_.toString) + val emptyLine: Array[String] = Array("|--|") + head ++ emptyLine ++ dicString ++ emptyLine ++ alphaString + } + + /** + * Verify CRF model + * + * @param tests Source files to be verified + * @return Source files with the predictive labels + */ + def predict(tests: RDD[Sequence]): RDD[Sequence] = { + val bcModel = tests.context.broadcast(this) + tests.map { test => + bcModel.value.testCRF(test, costFactor, verboseMode) + } + } + + def predict(tests: Array[Sequence]): Array[Sequence] = { + tests.map(this.testCRF(_, costFactor, verboseMode)) + } + /** + * Internal method to test the CRF model + * + * @param test the sequence to be tested + * @return the sequence along with predictive labels + */ + def testCRF(test: Sequence, + costFactor: Double, vMode: Option[VerboseMode]): Sequence = { + val deFeatureIdx = new FeatureIndex() + deFeatureIdx.readModel(this) + val tagger = new Tagger(deFeatureIdx.labels.size, TestMode) + tagger.setCostFactor(costFactor) + tagger.setNBest(nBest) + tagger.read(test, deFeatureIdx) + deFeatureIdx.buildFeatures(tagger) + tagger.parse(deFeatureIdx.alpha, vMode) + var Seq: Sequence = null + if (vMode.isDefined) 
{ + val tokens = new ArrayBuffer[Token]() + val labels = deFeatureIdx.labels + val tmp = test.toArray + for (i <- tmp.indices) { + val probMat = new ArrayBuffer[(String, Double)]() + vMode match { + case Some(VerboseLevel1) => + probMat.append((labels(tagger.result(i)), tagger.probMatrix(i * labels.length + tagger.result(i)))) + case Some(VerboseLevel2) => + for (j <- labels.indices) + probMat.append((labels(j), tagger.probMatrix(i * labels.length + j))) + case _ => + } + tokens.append(Token.put(labels(tagger.result(i)), tmp(i).tags).setProb(probMat.toArray)) + } + Seq = Sequence(tokens.toArray).setSeqProb(tagger.seqProb) + } + else { + Seq = Sequence(test.toArray.map(x => + Token.put(deFeatureIdx.labels(tagger.result(test.toArray.indexOf(x))), x.tags) + )) + } + if(nBest > 0) + Seq.setCandidates(tagger.topN, tagger.probN, deFeatureIdx.labels ) + + Seq + } +} + +object CRFModel { + def load(source: String): CRFModel = { + val components = source.split("""\|--\|""") + require(components.length == 3, "Incompatible formats in Model file") + val head = components(0).split("\t") + val dic = components(1).split("\t").map(x => { + val xx = x.split("""\|-\|""") + require(xx.length == 2, "Incompatible formats in Model file") + (xx(0), xx(1).toInt) + }) + val alpha = components(2).split("\t").map(_.toDouble) + CRFModel(head, dic, alpha) + } + + def loadBinaryFile(path: String): CRFModel = { + val source = scala.io.Source.fromFile(path + "/head").getLines().toArray.head + val components = source.split("""\|--\|""") + require(components.length == 2, "Incompatible formats in Model file") + val head = components(0).split("\t") + val dic = components(1).split("\t").map(x => { + val xx = x.split("""\|-\|""") + require(xx.length == 2, "Incompatible formats in Model file") + (xx(0), xx(1).toInt) + }) + val alpha = Array.fill(head(1).toInt)(0.0) + val infile = new FileInputStream(path + "/alpha") + val in: DataInputStream = new DataInputStream(infile) + for(i <- alpha.indices) + alpha(i) = in.readFloat() + in.close() + CRFModel(head, dic, alpha) + } + + def loadArray(source: Array[String]): CRFModel = { + val head = new ArrayBuffer[String]() + val dic = new ArrayBuffer[String]() + val alpha = new ArrayBuffer[String]() + var sentinel: Int = 0 + for(line <- source) { + if(line == "|--|") { + sentinel += 1 + } + else { + sentinel match { + case 0 => head.append(line) + case 1 => dic.append(line) + case 2 => alpha.append(line) + case _ => throw new RuntimeException("Incompatible formats in Model") + } + } + } + CRFModel(head.toArray, dic.toArray.map(x => { + val xx = x.split("""\|-\|""") + require(xx.length == 2, "Incompatible formats in Model file") + (xx(0), xx(1).toInt) + }), alpha.toArray.map(_.toDouble)) + } + + def save(model: CRFModel): String = { + model.toString + } + + def saveBinaryFile(model: CRFModel, path: String) = { + val head = model.toStringHead + new java.io.PrintWriter(path + "/head") { write(head); close() } + val outfile = new FileOutputStream(path + "/alpha") + val out: DataOutputStream = new DataOutputStream( + new BufferedOutputStream( + Files.newOutputStream( + Paths.get(path + "/alpha"), StandardOpenOption.APPEND + ) + ) + ) + val alpha = model.alpha.map(_.toFloat) + for(i <- alpha.indices) + out.writeFloat(alpha(i)) + out.close() + } + + def saveArray(model: CRFModel): Array[String] = { + model.toArrayString + } +} diff --git a/tools/kal-test/src/main/scala/org/apache/spark/nlp/CRFWithLBFGS.scala b/tools/kal-test/src/main/scala/org/apache/spark/nlp/CRFWithLBFGS.scala new file mode 
100644 index 0000000000000000000000000000000000000000..2e8acdc42201ac5fab3f78c84563736fb5de2185 --- /dev/null +++ b/tools/kal-test/src/main/scala/org/apache/spark/nlp/CRFWithLBFGS.scala @@ -0,0 +1,177 @@ +// scalastyle:off +package org.apache.spark.nlp + +import scala.collection.mutable +import breeze.optimize.{CachedDiffFunction, DiffFunction, OWLQN => BreezeOWLQN, LBFGS => BreezeLBFGS} +import breeze.linalg.{DenseVector => BDV, sum => Bsum} +import org.apache.spark.rdd.RDD +import org.apache.spark.internal.Logging +import org.apache.spark.mllib.optimization._ +import org.apache.spark.mllib.linalg.{Vector => SparkVector} + + +class CRFWithLBFGS(private var gradient: CRFGradient, private var updater: Updater) + extends LBFGS(gradient: Gradient, updater: Updater) { + + private val numCorrections = 5 + private var maxNumIterations = 100 + private var convergenceTol = 1E-4 + private var regParam = 0.5 + + /** + * Set the regularization parameter. Default 0.5. + */ + override def setRegParam(regParam: Double): this.type = { + this.regParam = regParam + this + } + + /** + * Set the convergence tolerance of iterations for L-BFGS. Default 1E-4. + * Smaller value will lead to higher accuracy with the cost of more iterations. + * This value must be nonnegative. Lower convergence values are less tolerant + * and therefore generally cause more iterations to be run. + */ + override def setConvergenceTol(tolerance: Double): this.type = { + this.convergenceTol = tolerance + this + } + + /** + * Set the maximal number of iterations for L-BFGS. Default 100. + */ + override def setNumIterations(iters: Int): this.type = { + this.maxNumIterations = iters + this + } + + def optimizer(data: RDD[Tagger], initialWeights: BDV[Double]): BDV[Double] = { + CRFWithLBFGS.runLBFGS(data, + gradient, + updater, + numCorrections, + convergenceTol, + maxNumIterations, + regParam, + initialWeights) + } +} + +object CRFWithLBFGS extends Logging { + def runLBFGS( + data: RDD[Tagger], + gradient: CRFGradient, + updater: Updater, + numCorrections: Int, + convergenceTol: Double, + maxNumIterations: Int, + regParam: Double, + initialWeights: BDV[Double]): BDV[Double] = { + + val costFun = new CostFun(data, gradient, updater, regParam) + + var lbfgs: BreezeLBFGS[BDV[Double]] = null + + updater match { + case updater: L1Updater => + lbfgs = new BreezeOWLQN[Int, BDV[Double]](maxNumIterations, numCorrections, regParam, convergenceTol) + case updater: L2Updater => + lbfgs = new BreezeLBFGS[BDV[Double]](maxNumIterations, numCorrections, convergenceTol) + } + + val states = lbfgs.iterations(new CachedDiffFunction[BDV[Double]](costFun), initialWeights) + + val lossHistory = mutable.ArrayBuilder.make[Double] + var state = states.next() + while (states.hasNext) { + lossHistory += state.value + state = states.next() + } + + logInfo("LBFGS.runLBFGS finished after %s iterations. 
last 10 losses: %s".format( + state.iter, lossHistory.result().takeRight(10).mkString(" -> "))) + state.x + } +} + +class CRFGradient extends Gradient { + def compute( + data: SparkVector, + label: Double, + weights: SparkVector, + cumGradient: SparkVector): Double = { + throw new Exception("The original compute() method is not supported") + } + + def computeCRF(sentences: Iterator[Tagger], weights: BDV[Double]): (BDV[Double], Double) = { + + val expected = BDV.zeros[Double](weights.length) + var obj: Double = 0.0 + while (sentences.hasNext) + obj += sentences.next().gradient(expected, weights) + + (expected, obj) + } +} + +trait UpdaterCRF extends Updater { + def compute( + weightsOld: SparkVector, + gradient: SparkVector, + stepSize: Double, + iter: Int, + regParam: Double) = { + throw new Exception("The original compute() method is not supported") + } + def computeCRF(weightsOld: BDV[Double], gradient: BDV[Double], regParam: Double): (BDV[Double], Double) +} + +class L2Updater extends UpdaterCRF { + def computeCRF( + weightsOld: BDV[Double], + gradient: BDV[Double], + regParam: Double): (BDV[Double], Double) = { + val loss = Bsum(weightsOld *:* weightsOld *:* regParam) + gradient :+= weightsOld *:* (regParam * 2.0) + (gradient, loss) + } +} + +class L1Updater extends UpdaterCRF { + def computeCRF( + weightsOld: BDV[Double], + gradient: BDV[Double], + regParam: Double): (BDV[Double], Double) = { + (gradient, 0.0) + } +} + +private class CostFun( + taggers: RDD[Tagger], + gradient: CRFGradient, + updater: Updater, + regParam: Double) extends DiffFunction[BDV[Double]] with Logging with Serializable { + + var iter = 0 + + override def calculate(weigths: BDV[Double]): (Double, BDV[Double]) = { + val start = System.currentTimeMillis + val bcWeights = taggers.context.broadcast(weigths) + lazy val treeDepth = math.ceil(math.log(taggers.partitions.length) / (math.log(2) * 2)).toInt + + val (expected, obj) = taggers.mapPartitions(sentences => + Iterator(gradient.computeCRF(sentences, bcWeights.value)) + ).treeReduce((p1, p2) => (p1, p2) match { + case ((expected1, obj1), (expected2, obj2)) => + (expected1 + expected2, obj1 + obj2) + }, treeDepth) + + val (grad, loss) = updater.asInstanceOf[UpdaterCRF].computeCRF(weigths, expected, regParam) + val end = System.currentTimeMillis + + logInfo(s"iter_$iter, Run Time = ${(end - start) / 1000.0}[s]\n") + iter += 1 + (obj + loss, grad) + } +} + diff --git a/tools/kal-test/src/main/scala/org/apache/spark/nlp/Data.scala b/tools/kal-test/src/main/scala/org/apache/spark/nlp/Data.scala new file mode 100644 index 0000000000000000000000000000000000000000..c00f2fa13af5ceee2ae2fb2ffd56fff74f7e1790 --- /dev/null +++ b/tools/kal-test/src/main/scala/org/apache/spark/nlp/Data.scala @@ -0,0 +1,170 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +// scalastyle:off +package org.apache.spark.nlp + +import scala.collection.mutable.ArrayBuffer + +/** + * Class that represents the columns of a token. + * + * @param label The last column for this token. + * @param tags List of tags for this token, expect for the last label. + */ +class Token( + val label: String, + val tags: Array[String]) extends Serializable { + var prob : Array[(String, Double)] = null + + def setProb(probMat: Array[(String, Double)]): Token ={ + this.prob = probMat + this + } + + def probPrinter(): String = { + val strRes = new StringBuffer() + strRes.append( tags.mkString("\t") ) + strRes.append( "\t" + label + "\t") + strRes.append(prob.map{ + case (str, p) => str + "/" + p.toString + }.mkString("\t") ) + strRes.toString + } + + override def toString: String = { + s"$label|--|${tags.mkString("|-|")}" + } + + def compare(other: Token): Int = { + if(this.label == other.label) 1 else 0 + } +} + +object Token { + /** + * Parses a string resulted from `LabeledToken#toString` into + * an [[com.intel.ssg.bdt.nlp.Token]]. + * + */ + def deSerializer(s: String): Token = { + val parts = s.split("""\|--\|""") + val label = parts(0) + val tags = parts(1).split("""\|-\|""") + Token.put(label, tags) + } + + def serializer(token: Token): String = { + token.toString + } + + def put(label: String, tags: Array[String]) = { + new Token(label, tags) + } + + def put(tags: Array[String]) = { + new Token(null, tags) + } +} + +/** + * Class that represents the tokens of a sentence. + * + * @param sequence List of tokens + */ +case class Sequence (sequence: Array[Token]) extends Serializable { + var seqProb = 0.0 + lazy val candidates = ArrayBuffer.empty[Sequence] + + def setSeqProb(seqProb: Double): Sequence ={ + this.seqProb = seqProb + this + } + + def setCandidates(nBest: ArrayBuffer[Array[Int]], + probN: ArrayBuffer[Double], + labels: ArrayBuffer[String]) = { + for(i <- nBest.indices) { + val tokens = new ArrayBuffer[Token]() + for(j <- sequence.indices) { + tokens += Token.put(labels(nBest(i)(j)), sequence(j).tags) + } + candidates += Sequence(tokens.toArray).setSeqProb(probN(i)) + } + this + } + + def Print(): String = { + val strRes = new ArrayBuffer[String]() + strRes.append("#" + "\t" + seqProb.toString) + val pairs = this.toArray + for(i <- pairs.indices) { + strRes.append(pairs(i).tags.mkString("\t") + "\t" + pairs(i).label) + } + strRes.mkString("\n") + } + + def nthPrint(k: Int): String = { + val strRes = new ArrayBuffer[String]() + strRes.append("#" + k + "\t" +candidates(k).seqProb.toString) + val pairs = this.candidates(k).toArray + for(i <- pairs.indices) { + strRes.append(pairs(i).tags.mkString("\t") + "\t" + pairs(i).label) + } + strRes.mkString("\n") + } + + def nBestPrint(): String = { + val idx = candidates.indices + idx.map(t =>nthPrint(t)) + .mkString("\n") + } + + override def toString: String = { + seqProb match { + case 0.0 => s"${sequence.mkString("\t")}" + case _ => "#" + seqProb.toString + "\t" + s"${sequence.mkString("\t")}" + } + } + + def toArray: Array[Token] = sequence + + def compare(other: Sequence): Int = { + this.toArray.zip(other.toArray).map{case(one, two) => one.compare(two)}.sum + } + + def probPrinter(): String = { + val strRes = new ArrayBuffer[String]() + strRes.append("|-#-|" + seqProb.toString) + strRes ++= this.toArray.map(_.probPrinter()) + strRes.mkString("\n") + } + +} + +object Sequence { + def deSerializer(s: String): 
Sequence = { + val tokens = s.split("\t") + tokens.head.substring(0, 5) match { + case """"\|-#-\|"""" => val seqProb = tokens.head.substring(5).toDouble + Sequence(tokens.tail.map(Token.deSerializer)).setSeqProb(seqProb) + case _ => Sequence(tokens.map(Token.deSerializer)) + } + } + def serializer(sequence: Sequence): String = { + sequence.toString + } +} diff --git a/tools/kal-test/src/main/scala/org/apache/spark/nlp/FeatureIndex.scala b/tools/kal-test/src/main/scala/org/apache/spark/nlp/FeatureIndex.scala new file mode 100644 index 0000000000000000000000000000000000000000..f79fe72f713252338a92c8db8e14bb5d2d77bc4c --- /dev/null +++ b/tools/kal-test/src/main/scala/org/apache/spark/nlp/FeatureIndex.scala @@ -0,0 +1,243 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +// scalastyle:off +package org.apache.spark.nlp + +import scala.collection.mutable +import scala.collection.mutable.ArrayBuffer + +import breeze.linalg.{DenseVector => BDV} + +import org.apache.spark.rdd.RDD +import org.apache.spark.broadcast.Broadcast + +private[nlp] class FeatureIndex extends Serializable { + + var maxID = 0 + var alpha :BDV[Double] = _ + var tokensSize = 0 + val unigramTempls = new ArrayBuffer[String]() + val bigramTempls = new ArrayBuffer[String]() + var labels = new ArrayBuffer[String]() + val dic = mutable.HashMap[String, (Int, Int)]() + val kMaxContextSize = 4 + val BOS = Array("_B-1", "_B-2", "_B-3", "_B-4") + val EOS = Array("_B+1", "_B+2", "_B+3", "_B+4") + + def initAlpha() = { + alpha = BDV.zeros[Double](maxID) + alpha + } + + def openTagSet(sentence: Sequence): FeatureIndex = { + val tokenNum = sentence.toArray.map(_.tags.length).distinct + require(tokenNum.length == 1, + "The number of columns should be fixed in each token!") + + labels.appendAll(sentence.toArray.map(_.label)) + tokensSize = tokenNum.head + this + } + + /** + * Build feature index + */ + def buildFeatures(tagger: Tagger): Tagger = { + List(unigramTempls, bigramTempls).foreach{ templs => + tagger.x.foreach { token => + if (tagger.x.head != token || templs.head.head.equals('U')) { + tagger.featureCacheIndex.append(tagger.featureCache.length) + templs.foreach { templ => + val os = applyRule(templ, tagger.x.indexOf(token), tagger) + val id = dic.getOrElse(os, (-1, 0))._1 + if (id != -1) tagger.featureCache.append(id) + } + tagger.featureCache.append(-1) + } + } + } + tagger + } + + def buildDictionary(tagger: Tagger) = { + val dicLocal = mutable.HashMap[String, Int]() + List(unigramTempls, bigramTempls).foreach{ templs => + tagger.x.foreach{ token => + if(tagger.x.head != token || templs.head.head.equals('U')) + templs.foreach{ templ => + val os = applyRule(templ, tagger.x.indexOf(token), tagger) + if (dicLocal.get(os).isEmpty) + dicLocal.update(os, 1) + else { + val idx = 
dicLocal.get(os).get + 1 + dicLocal.update(os, idx) + } + } + } + } + dicLocal + } + + def applyRule(src: String, idx: Int, tagger: Tagger): String = { + val templ = src.split(":") + if (templ.size == 2) { + val cols = templ(1).split("/").map(_.substring(2)) + templ(0) + ":" + cols.map(getIndex(_, idx, tagger)).reduce(_ + "/" + _) + } else if (templ.size == 1) { + templ(0) + } else + throw new RuntimeException("Incompatible formats in Template") + } + + def getIndex(src: String, pos: Int, tagger: Tagger): String = { + val coor = src.drop(1).dropRight(1).split(",") + require(coor.size == 2, "Incompatible formats in Template") + val row = coor(0).toInt + val col = coor(1).toInt + if (row < -kMaxContextSize || row > kMaxContextSize || + col < 0 || col >= tokensSize) { + throw new RuntimeException("Incompatible formats in Template") + } + val idx = pos + row + if (idx < 0) { + BOS(- idx - 1) + } else if (idx >= tagger.x.size) { + EOS(idx - tagger.x.size) + } else { + tagger.x(idx)(col) + } + } + + /** + * Read one template file + * + * @param lines the template file + */ + def openTemplate(lines: Array[String]): Unit = { + var i: Int = 0 + lines.foreach { t => + t.head match{ + case 'U' => unigramTempls += t + case 'B' => bigramTempls += t + case '#' => + case _ => throw new RuntimeException("Incompatible formats in Templates") + }} + } + + def saveModel: CRFModel = { + val head = new ArrayBuffer[String]() + + head.append("maxid:") + head.append(maxID.toString) + head.append("cost-factor:") + head.append(1.0.toString) + head.append("xsize:") + head.append(tokensSize.toString) + head.append("Labels:") + labels.foreach(head.append(_)) + head.append("UGrams:") + unigramTempls.foreach(head.append(_)) + head.append("BGrams:") + bigramTempls.foreach(head.append(_)) + + CRFModel(head.toArray, dic.map { case (k, v) => (k, v._1) }.toArray, alpha.toArray) + } + + def readModel(models: CRFModel) = { + val contents: Array[String] = models.head + models.dic.foreach{case(k, v) => dic.update(k, (v, 1))} + alpha = new BDV(models.alpha) + + var i: Int = 0 + var readMaxId: Boolean = false + var readCostFactor: Boolean = false + var readXSize: Boolean = false + var readLabels: Boolean = false + var readUGrams: Boolean = false + var readBGrams: Boolean = false + val alpha_tmp = new ArrayBuffer[Double]() + while (i < contents.length) { + contents(i) match { + case "maxid:" => + readMaxId = true + case "cost-factor:" => + readMaxId = false + readCostFactor = true + case "xsize:" => + readCostFactor = false + readXSize = true + case "Labels:" => + readXSize = false + readLabels = true + case "UGrams:" => + readLabels = false + readUGrams = true + case "BGrams:" => + readUGrams = false + readBGrams = true + case _ => + i -= 1 + } + i += 1 + if (readMaxId) { + maxID = contents(i).toInt + } else if (readXSize) { + tokensSize = contents(i).toInt + } else if (readLabels) { + labels.append(contents(i)) + } else if (readUGrams) { + unigramTempls.append(contents(i)) + } else if (readBGrams) { + bigramTempls.append(contents(i)) + } + i += 1 + } + this + } + + def openTagSetDist(trains: RDD[Sequence]) { + val features: RDD[FeatureIndex] = trains.map(new FeatureIndex().openTagSet) + val tokensSizeCollect = features.map(_.tokensSize).distinct().collect() + require(tokensSizeCollect.length == 1, + "The number of columns should be fixed in each token!") + tokensSize = tokensSizeCollect.head + labels = features.flatMap(_.labels).distinct().collect().to[ArrayBuffer] + } + + def buildDictionaryDist(taggers: RDD[Tagger], 
bcFeatureIdxI: Broadcast[FeatureIndex], freq: Int) { + //filter : use features that occur no less than freq(default 1) + val dictionary = taggers.flatMap(tagger => { + bcFeatureIdxI.value.buildDictionary(tagger) + }).reduceByKey(_ + _) + .filter(_._2 >= freq) + val dictionaryUni: RDD[(String, (Int, Int))] = dictionary.filter(_._1.head == 'U').zipWithIndex() + .map{ case((feature, frequency), featureID) => + (feature, (featureID.toInt * bcFeatureIdxI.value.labels.size, frequency)) + } + val bcOffSet = taggers.context.broadcast(dictionaryUni.count().toInt * labels.size) + val dictionaryBi: RDD[(String, (Int, Int))] = dictionary.filter(_._1.head == 'B').zipWithIndex() + .map{ case((feature, frequency), featureID) => + (feature, (featureID.toInt * bcFeatureIdxI.value.labels.size * bcFeatureIdxI.value.labels.size + bcOffSet.value, frequency)) + } + + val dictionaryGram = dictionaryUni.union(dictionaryBi).collect() + + dictionaryGram.foreach{case(k, v) => dic.update(k, v)} + maxID = dictionaryGram.map(_._2._1).max + labels.size * labels.size + + } +} diff --git a/tools/kal-test/src/main/scala/org/apache/spark/nlp/Graph.scala b/tools/kal-test/src/main/scala/org/apache/spark/nlp/Graph.scala new file mode 100644 index 0000000000000000000000000000000000000000..812bea2ad393588c43e7da3cca30e4c746ae13a9 --- /dev/null +++ b/tools/kal-test/src/main/scala/org/apache/spark/nlp/Graph.scala @@ -0,0 +1,111 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +// scalastyle:off +package org.apache.spark.nlp + +import scala.collection.mutable.ArrayBuffer + +import breeze.linalg.{Vector => BV} + +private[nlp] class Node extends Serializable { + var x = 0 + var y = 0 + var alpha = 0.0 + var beta = 0.0 + var cost = 0.0 + var bestCost = 0.0 + var prev: Option[Node] = None + var fVector = 0 + val lPath = new ArrayBuffer[Path]() + val rPath = new ArrayBuffer[Path]() + + + /** + * simplify the log likelihood. 
+ */ + def logSumExp(x: Double, y: Double, flg: Boolean): Double = { + val MINUS_LOG_EPSILON = 50.0 + if (flg) y + else { + val vMin: Double = math.min(x, y) + val vMax: Double = math.max(x, y) + if (vMax > vMin + MINUS_LOG_EPSILON) vMax else vMax + math.log(math.exp(vMin - vMax) + 1.0) + } + } + + def calcAlpha(nodes: ArrayBuffer[Node]): Unit = { + alpha = 0.0 + for(i <- lPath.indices) + alpha = logSumExp(alpha, lPath(i).cost + nodes(lPath(i).lNode).alpha, i == 0) + alpha += cost + } + + def calcBeta(nodes: ArrayBuffer[Node]): Unit = { + beta = 0.0 + for(i <- rPath.indices) + beta = logSumExp(beta, rPath(i).cost + nodes(rPath(i).rNode).beta, i == 0) + beta += cost + } + + def calExpectation( + expected: BV[Double], + Z: Double, + size: Int, + featureCache: ArrayBuffer[Int], + nodes: ArrayBuffer[Node]): Unit = { + val c: Double = math.exp(alpha + beta -cost - Z) + + var idx: Int = fVector + while (featureCache(idx) != -1) { + expected(featureCache(idx) + y) += c + idx += 1 + } + + for(i <- lPath.indices) + lPath(i).calExpectation(expected, Z, size, featureCache, nodes) + + } +} + +private[nlp] class Path extends Serializable { + var rNode = 0 + var lNode = 0 + var cost = 0.0 + var fVector = 0 + + def calExpectation( + expected: BV[Double], + Z: Double, + size: Int, + featureCache: ArrayBuffer[Int], + nodes: ArrayBuffer[Node]): Unit = { + val c: Double = math.exp(nodes(lNode).alpha + cost + nodes(rNode).beta - Z) + var idx: Int = fVector + + while (featureCache(idx) != -1) { + expected(featureCache(idx) + nodes(lNode).y * size + nodes(rNode).y) += c + idx += 1 + } + } + + def add(lnd: Int, rnd: Int, nodes: ArrayBuffer[Node]): Unit = { + lNode = lnd + rNode = rnd + nodes(lNode).rPath.append(this) + nodes(rNode).lPath.append(this) + } +} diff --git a/tools/kal-test/src/main/scala/org/apache/spark/nlp/Tagger.scala b/tools/kal-test/src/main/scala/org/apache/spark/nlp/Tagger.scala new file mode 100644 index 0000000000000000000000000000000000000000..b341abf8b14b0ead8bc638f1adfbaed6229b3668 --- /dev/null +++ b/tools/kal-test/src/main/scala/org/apache/spark/nlp/Tagger.scala @@ -0,0 +1,292 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +// scalastyle:off +package org.apache.spark.nlp + +import scala.collection.mutable +import scala.collection.mutable.ArrayBuffer + +import breeze.linalg.{DenseVector => BDV, Vector => BV} + +private[nlp] trait Mode + +private[nlp] case object LearnMode extends Mode + +private[nlp] case object TestMode extends Mode + +private[nlp] case class QueueElement(node : Node, fx : Double, gx : Double, next : QueueElement) + +private[nlp] class Tagger ( + ySize: Int, + mode: Mode) extends Serializable { + var nBest = 0 + var cost = 0.0 + var Z = 0.0 + var obj = 0.0 + var costFactor = 1.0 + val x = new ArrayBuffer[Array[String]]() + val nodes = new ArrayBuffer[Node]() + val answer = new ArrayBuffer[Int]() + val result = new ArrayBuffer[Int]() + val featureCache = new ArrayBuffer[Int]() + val featureCacheIndex = new ArrayBuffer[Int]() + val probMatrix = new ArrayBuffer[Double]() + var seqProb = 0.0 + lazy val topN = ArrayBuffer.empty[Array[Int]] + lazy val topNResult = ArrayBuffer.empty[Int] + lazy val probN = ArrayBuffer.empty[Double] + lazy val agenda = mutable.PriorityQueue.empty[QueueElement] ( + Ordering.by((_:QueueElement).fx).reverse + ) + + def setCostFactor(costFactor: Double) = { + this.costFactor = costFactor + this + } + + def setNBest(nBest: Int) = { + this.nBest = nBest + this + } + + def read(lines: Sequence, feature_idx: FeatureIndex): Unit = { + lines.toArray.foreach{ t => + mode match { + case LearnMode => + for (y <- feature_idx.labels if y.equals(t.label)) + answer.append(feature_idx.labels.indexOf(y)) + x.append(t.tags) + case TestMode => + x.append(t.tags) + answer.append(0) + } + result.append(0) + } + } + + /** + * Set node relationship and its feature index. + * Node represents a token. + */ + def rebuildFeatures(): Unit = { + + nodes ++= Array.fill(x.length * ySize)(new Node) + nodes.zipWithIndex.foreach{ case(n, index) => + n.x = index / ySize + n.y = index - n.x * ySize + n.fVector = featureCacheIndex(n.x) + } + + nodes.filter(_.x > 0).foreach{ n => + val paths = Array.fill(ySize)(new Path) + paths.zipWithIndex.foreach { case(p, indexP) => + p.fVector = featureCacheIndex(n.x + x.length - 1) + p.add((n.x - 1) * ySize + n.y, n.x * ySize + indexP, nodes) + } + } + } + + /** + * Calculate the expectation of each node + */ + def forwardBackward(): Unit = { + nodes.foreach(_.calcAlpha(nodes)) + nodes.reverse.foreach(_.calcBeta(nodes)) + Z = 0.0 + nodes.filter(_.x == 0).foreach(n => Z = n.logSumExp(Z, n.beta, n.y == 0)) + } + + /** + * Get the max expectation in the nodes and predicts the most likely label + */ + def viterbi(): Unit = { + var bestCost = Double.MinValue + var best: Option[Node] = None + + nodes.foreach { n => + bestCost = Double.MinValue + best = None + n.lPath.foreach { p => + val cost = nodes(p.lNode).bestCost + p.cost + n.cost + if (cost > bestCost) { + bestCost = cost + best = Some(nodes(p.lNode)) + } + } + n.prev = best + best match { + case None => + n.bestCost = n.cost + case _ => + n.bestCost = bestCost + } + } + + var nd: Option[Node] = Some(nodes.filter(_.x == x.length - 1).max(Ordering.by((_:Node).bestCost))) + + while (nd.isDefined) { + result.update(nd.get.x, nd.get.y) + nd = nd.get.prev + } + + cost = - nodes((x.length - 1)*ySize + result.last).bestCost + } + + def gradient(expected: BV[Double], alpha: BDV[Double]): Double = { + + buildLattice(alpha) + forwardBackward() + + nodes.foreach(_.calExpectation(expected, Z, ySize, featureCache, nodes)) + + var s: Double = 0.0 + for (i <- x.indices) { + var rIdx = nodes(i * ySize + answer(i)).fVector + 
while (featureCache(rIdx) != -1) { + expected(featureCache(rIdx) + answer(i)) -= 1.0 + rIdx += 1 + } + s += nodes(i * ySize + answer(i)).cost + var j = 0 + while (j < nodes(i * ySize + answer(i)).lPath.length) { + val lNode = nodes(nodes(i * ySize + answer(i)).lPath(j).lNode) + val rNode = nodes(nodes(i * ySize + answer(i)).lPath(j).rNode) + val lPath = nodes(i * ySize + answer(i)).lPath(j) + if (lNode.y == answer(lNode.x)) { + rIdx = lPath.fVector + while (featureCache(rIdx) != -1) { + expected(featureCache(rIdx) + lNode.y * ySize + rNode.y) -= 1.0 + rIdx += 1 + } + s += lPath.cost + } + j += 1 + } + } + + viterbi() + clear() + Z - s + } + + def probCalculate(): Unit ={ + probMatrix ++= Array.fill(x.length * ySize)(0.0) + var idx :Int = 0 + nodes.foreach{ n => + idx = n.x * ySize + n.y + probMatrix(idx) = Math.exp(n.alpha + n.beta - n.cost - Z) + } + this.seqProb = Math.exp(- cost - Z) + + } + + def clear(): Unit = { + nodes foreach{ n => + n.lPath.clear() + n.rPath.clear() + } + nodes.clear() + } + + def parse(alpha: BDV[Double], mode: Option[VerboseMode]): Unit = { + buildLattice(alpha) + if (nBest > 0 || mode.isDefined) { + forwardBackward() + viterbi() + probCalculate() + } + else + viterbi() + if(nBest > 0) { + //initialize nBest + if(agenda.nonEmpty) agenda.clear() + nodes.slice((x.size - 1) * ySize, x.size * ySize - 1) + .foreach(n => agenda += QueueElement(n, - n.bestCost, - n.cost, null)) + //find nBest + for(i <- 0 until this.nBest) { + topNResult.clear() + if(!nextNode) + return + probN.append(Math.exp(- cost - Z)) + topN.append(topNResult.toArray) + } + } + } + + def buildLattice(alpha: BDV[Double]): Unit = { + + rebuildFeatures() + nodes.foreach { n => + val nn = calcCost(n, alpha) + nn.lPath.foreach(calcCost(_, alpha)) + nn + } + } + + def calcCost(n: Node, alpha: BDV[Double]): Node = { + var cd: Double = 0.0 + var idx: Int = n.fVector + n.cost = 0.0 + + while (featureCache(idx) != -1) { + cd += alpha(featureCache(idx) + n.y) + n.cost = cd * costFactor + idx += 1 + } + + n + } + + def calcCost(p: Path, alpha: BDV[Double]): Path = { + var cd: Double = 0.0 + var idx: Int = p.fVector + p.cost = 0.0 + + while (featureCache(idx) != -1) { + cd += alpha(featureCache(idx) + + nodes(p.lNode).y * ySize + nodes(p.rNode).y) + p.cost = cd * costFactor + idx += 1 + } + + p + } + + def nextNode: Boolean = { + var top: QueueElement = null + var rNode: Node = null + while(agenda.nonEmpty) { + top = agenda.dequeue() + rNode = top.node + if(rNode.x == 0) { + var n: QueueElement = top + for(i <- x.indices) { + topNResult.append(n.node.y) + n = n.next + } + cost = top.gx + return true + } + rNode.lPath.foreach { p => + val gx = -nodes(p.lNode).cost - p.cost + top.gx + val fx = - nodes(p.lNode).bestCost - p.cost + top.gx + agenda += QueueElement(nodes(p.lNode), fx, gx, top) + } + } + false + } +}
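Illustrative sketch (not part of the patch): the LocalClusteringCoefficient scaladoc above defines C_i = |{e_jk : v_j, v_k in N_i, e_jk in E}| / (K_i * (K_i - 1)) and notes that the input graph must first be partitioned with partitionBy. A minimal, untested usage sketch on a toy graph, assuming an existing SparkContext named sc and the classes from this patch on the classpath:

import org.apache.spark.graphx.{Graph, PartitionStrategy}
import org.apache.spark.graphx.lib.LocalClusteringCoefficient

// Toy graph: a triangle 1-2-3 plus a pendant vertex 4 attached to 3.
val edges = sc.parallelize(Seq((1L, 2L), (2L, 3L), (3L, 1L), (3L, 4L)))

// partitionBy is required before running the algorithm (see the scaladoc note above).
val graph = Graph.fromEdgeTuples(edges, defaultValue = 1)
  .partitionBy(PartitionStrategy.RandomVertexCut)

// Each vertex attribute of the result is that vertex's local clustering coefficient,
// e.g. 1.0 for the triangle vertices 1 and 2, and 0.0 for the degree-1 vertex 4.
val lcc = LocalClusteringCoefficient.run(graph)
lcc.vertices.collect().sortBy(_._1).foreach { case (vid, c) =>
  println(s"vertex $vid -> local clustering coefficient $c")
}

// The companion method computes the global coefficient (closed / open triplets).
val gcc = LocalClusteringCoefficient.runGlobalClusteringCoefficient(graph)
println(s"global clustering coefficient: $gcc")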
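Illustrative sketch (not part of the patch): Node.logSumExp above is the standard log-sum-exp trick used by the forward-backward pass, computing log(e^x + e^y) as max(x, y) + log(1 + e^(min - max)) so that neither exponential can overflow; the MINUS_LOG_EPSILON constant simply drops the correction term once the two operands differ by more than 50. A stand-alone illustration of the identity:

object LogSumExpDemo {
  // log(exp(x) + exp(y)) evaluated without overflowing either exponential.
  def logSumExp(x: Double, y: Double): Double = {
    val vMin = math.min(x, y)
    val vMax = math.max(x, y)
    vMax + math.log(math.exp(vMin - vMax) + 1.0)
  }

  def main(args: Array[String]): Unit = {
    // The naive form overflows to Infinity for large operands ...
    println(math.log(math.exp(1000.0) + math.exp(999.0)))
    // ... while the stable form prints roughly 1000.3133.
    println(logSumExp(1000.0, 999.0))
  }
}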
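Illustrative sketch (not part of the patch): CRFModel.toString and CRFModel.load above define a plain-text model format in which head fields, dictionary entries ("feature|-|id") and alpha weights are three tab-separated sections joined by "|--|". A small round trip on an assumed toy model:

import org.apache.spark.nlp.CRFModel

val model = CRFModel(
  head = Array("maxid:", "2", "cost-factor:", "1.0", "xsize:", "1"),
  dic = Array(("U00:word1", 0)),
  alpha = Array(0.25, -0.5))

// save is just toString; load splits the three "|--|"-separated sections back out.
val serialized = CRFModel.save(model)
val restored = CRFModel.load(serialized)
assert(restored.dic.sameElements(model.dic))
assert(restored.alpha.sameElements(model.alpha))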