From 323718f3b8eed23540b97a012ade407207dadf66 Mon Sep 17 00:00:00 2001 From: yongchao1 <297389370@qq.com> Date: Tue, 16 Jul 2024 12:29:22 +0000 Subject: [PATCH 1/3] =?UTF-8?q?add=20TensorFlow/built-in/recommendation/Wi?= =?UTF-8?q?deDeep=5FID2712=5Ffor=5FTensorFlow/train=5Fprofiling.py.=20?= =?UTF-8?q?=E6=96=B0=E5=A2=9Etrain=5Fprofiling.py?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: yongchao1 <297389370@qq.com> --- .../train_profiling.py | 549 ++++++++++++++++++ 1 file changed, 549 insertions(+) create mode 100644 TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/train_profiling.py diff --git a/TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/train_profiling.py b/TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/train_profiling.py new file mode 100644 index 000000000..7b052b595 --- /dev/null +++ b/TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/train_profiling.py @@ -0,0 +1,549 @@ +# coding=utf-8 +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +""" +main function for DL training. +""" +from __future__ import print_function + +import datetime +import os +import sys +import time +import math +import threading +from multiprocessing import Process + +import numpy as np +from sklearn.metrics import roc_auc_score, log_loss +import tensorflow as tf + +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +print(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +import configs.config as config +from widedeep.data_utils import input_fn_tfrecord +from npu_bridge.estimator.npu import util +from widedeep.WideDeep_fp16_huifeng import WideDeep +from tensorflow.python.client import timeline +from tensorflow.python.platform import gfile + +from npu_bridge.estimator import npu_ops +from npu_bridge.estimator.npu.npu_config import NPURunConfig +from npu_bridge.estimator.npu.npu_estimator import NPUEstimator +from npu_bridge.estimator.npu.npu_optimizer import allreduce + +###add by daihongtao +from npu_bridge.estimator.npu.npu_optimizer import NPUDistributedOptimizer +#from hccl.manage.api import get_rank_size +#from hccl.manage.api import get_rank_id +###end +from tensorflow.core.protobuf.rewriter_config_pb2 import RewriterConfig +from npu_bridge.npu_init import * +from hccl.split.api import set_split_strategy_by_idx +set_split_strategy_by_idx([2,64,67,88]) + +rank_size = os.getenv('RANK_SIZE') +rank_size = 1 if not rank_size else int(rank_size) +##single p时,需要采设置修改 +rank_id = os.getenv('DEVICE_INDEX') +rank_id = 0 if not rank_id else int(rank_id) +#rank_id = 0 if rank_size == 1 else get_rank_id() +#num_gpu = config.num_gpu +num_gpu = rank_size +mode = 'train' +algo='WideDeep' + + +###dump +from npu_bridge.estimator.npu.npu_config import DumpConfig + + + +data_para = { + 'batch_size': int(config.batch_size),#config.batch_size, + 'eval_batch_size': int(config.eval_batch_size),#config.batch_size, + 'num_gpu': num_gpu +} +train_para = { + 'pos_weight': 1.0, + 'n_epoch': config.n_epoches, + 'train_per_epoch': config.train_size, + 'test_per_epoch': config.test_size, + 'batch_size': data_para['batch_size'], + 'eval_batch_size': data_para['eval_batch_size'], + 'early_stop_epochs': 50, + # 'iterations_per_loop': config.iterations_per_loop +} +#if rank_size > 1: +# train_para = { +# 'pos_weight': 1.0, +# 'n_epoch': config.n_epoches_8p, +# 'train_per_epoch': config.train_size, +# 'test_per_epoch': config.test_size, +# 'batch_size': data_para['batch_size'], +# 'eval_batch_size': data_para['eval_batch_size'], +# 'early_stop_epochs': 50, + # 'iterations_per_loop': config.iterations_per_loop +# } + +def write_log(log_path, _line, echo=False): + with open(log_path, 'a') as log_in: + log_in.write(_line + '\n') + if echo: + print(_line) + + +def metric(log_path, batch_auc, y, p, name='ctr', cal_prob=None): + y = np.array(y) + p = np.array(p) + + if cal_prob: + if cal_prob <= 0 or cal_prob > 1: + raise ValueError('please ensure cal_prob is in (0,1]!') + p /= (p + (1 - p) / cal_prob) + auc = roc_auc_score(y, p) + orilen = len(p) + + ind = np.where((p > 0) & (p < 1))[0] + # print(len(ind)) + y = y[ind] + p = p[ind] + afterlen = len(p) + # print('train auc: %g\tavg ctr: %g' % (batch_auc, y.mean())) + + ll = log_loss(y, p) * afterlen / orilen; + q = y.mean() + ne = ll / (-1 * q * np.log(q) - (1 - q) * np.log(1 - q)) + rig = 1 - ne + + if log_path: + log = '%s\t%g\t%g\t%g\t%g' % (name, batch_auc, auc, ll, ne) + write_log(log_path, log) + print('avg %s on p: %g\teval auc: %g\tlog loss: %g\tne: %g\trig: %g' % + (name, q, auc, ll, ne, rig)) + return auc + +####global_step +def get_optimizer(optimizer_array, global_step): + opt = optimizer_array[0].lower() + #if algo == 'DCN_T': + lr = tf.train.exponential_decay(learning_rate=optimizer_array[1], global_step=global_step, decay_rate=optimizer_array[3], decay_steps=optimizer_array[4], staircase=True) + #else: + #lr = optimizer_array[1] + if opt == 'sgd' or opt == 'gd': + return tf.train.GradientDescentOptimizer(learning_rate=lr) + elif opt == 'adam': + eps = optimizer_array[2] + return tf.train.AdamOptimizer(learning_rate=lr, epsilon=eps) + elif opt == 'adagrad': + init_val = optimizer_array[2] + return tf.train.AdagradOptimizer(learning_rate=lr, initial_accumulator_value=init_val) + elif opt == 'ftrl': + return tf.train.FtrlOptimizer(learning_rate=lr, initial_accumulator_value=optimizer_array[2],l1_regularization_strength=optimizer_array[3],l2_regularization_strength=optimizer_array[4]) + +def evaluate(sess, model): # id_hldr, wt_hldr, eval_preds): + preds = [] + labels = [] + start_time = time.time() + number_of_batches = ((train_para['test_per_epoch'] + train_para['eval_batch_size'] - 1) / + train_para['eval_batch_size']) + print("%d batches in test set." % number_of_batches) + #for _batch in range(1, number_of_batches + 1): + iter = 0 + print("evaluate wile start time: %f sec" % (time.time() - start_time)) + while iter < number_of_batches: + iter = iter + 1 + try: + _preds, _labels = sess.run(fetches=[model.eval_preds, model.eval_labels]) + batch_preds = [_preds.flatten()] + preds.append(np.squeeze(batch_preds)) + labels.append(_labels) + except tf.errors.OutOfRangeError: + print("evaluate end of test trainset time: %f sec" % (time.time() - start_time)) + print("end of test trainset") + break + print("evaluate labels time: %f sec" % (time.time() - start_time)) + labels = np.hstack(labels) + print("evaluate preds time: %f sec" % (time.time() - start_time)) + preds = np.hstack(preds) + print("evaluate time: %f sec" % (time.time() - start_time)) + return labels, preds + + +def build_model(graph, _input_d, _eval_d): + seeds = [0x0123, 0x4567, 0x3210, 0x7654, 0x89AB, 0xCDEF, 0xBA98, 0xFEDC, + 0x0123, 0x4567, 0x3210, 0x7654, 0x89AB, 0xCDEF, 0xBA98, 0xFEDC] + + #drop1 0.80768 drop0.8 50epoch 0.808996 + + model = WideDeep(graph, + [[1024, 1024, 1024, 1024, 1024], 'relu'], + [['adam', 1e-4, 9e-8, 0.8, 5], ['ftrl', 0.06, 1, 0, 0]], + [1.0, 9e-6], + _input_d, _eval_d + ) + + print('mode:%s, batch size: %d, buf size: %d, eval size: %d' % ( + mode, batch_size, buf_size, eval_size)) + + write_log(log_file, model.log, True) + return model + +def average_gradients(gpu_grads): + #if(len(gpu_grads) == 1): + # return gpu_grads[0] + + avg_grads = [] + for grad_and_vars in zip(*gpu_grads): + grads = [] + + for g, _ in grad_and_vars: + expanded_g = tf.expand_dims(g, 0) + grads.append(expanded_g) + all_grad = tf.concat(grads, 0) + avg_grad = tf.reduce_mean(all_grad, 0, keep_dims=False) + + v = grad_and_vars[0][1] + grad_and_var = (avg_grad, v) + avg_grads.append(grad_and_var) + + return avg_grads + +def build_graph(graph, input_data, eval_data,rank_id): + # tf.reset_default_graph() + with tf.device(0): + ####### for hccl ######## + # with tf.device(rank_id): + ####### for hccl ######## + with tf.variable_scope(tf.get_variable_scope()): + global_step = tf.get_variable(name='global_step', dtype=tf.int32, shape=[], + initializer=tf.constant_initializer(0), trainable=False) + model = build_model(graph, input_data, eval_data) + return model, model.train_op + +def train_batch(sess, num_gpu, _model, train_op): + if num_gpu >= 1: + model = _model + fetche = [train_op] + if True: + fetche = fetche + [model.deep_loss, model.log_loss, model.l2_loss, model.train_preds, model.labels] + _, _deeploss_, _log_loss_, _l2_loss_, _preds_, _train_labels= sess.run(fetches=fetche) + _loss_ = _deeploss_ + else: + pass + return _loss_, _log_loss_, _l2_loss_, _preds_, _train_labels + +def create_dirs(dir): + """create dir recursively as needed.""" + if not os.path.exists(dir): + os.makedirs(dir) + +import argparse +def parse_args(): + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument("--over_dump", default=False, + help="whether to enable overflow") + parser.add_argument("--over_dump_path", default="./", + help="path to save overflow dump files") + parser.add_argument("--data_path", default="./data", + help="path of dataset") + parser.add_argument("--ckpt_path", default="./model", + help="path of ckpt") + parser.add_argument("--train_size", default= config.train_size, + help="size of train data") + parser.add_argument("--display_step", default= config.display_step, + help="display step") + parser.add_argument('--precision_mode', default='allow_mix_precision', + help='allow_fp32_to_fp16/force_fp16/ ' + 'must_keep_origin_dtype/allow_mix_precision.') + args = parser.parse_args() + '''args, unknown_args = parser.parse_known_args() + if len(unknown_args) > 0: + for bad_arg in unknown_args: + print("ERROR: Unknown command line arg: %s" % bad_arg) + raise ValueError("Invalid command line arg(s)")''' + return args + + +if __name__ == '__main__': + display_step = config.display_step + #os.environ["TF_ENABLE_AUTO_MIXED_PRECISION_GRAPH_REWRITE"] = "1" + + tag = algo + Base_path = config.BASE_DIR + log_path = os.path.join(Base_path, 'log/') + create_dirs(log_path) + log_file = os.path.join(log_path, tag) + pickle_model_path = os.path.join(Base_path, + 'model/pickle_model/') + create_dirs(pickle_model_path) + print("log file: ", log_file) + + batch_size = data_para['batch_size'] + buf_size = train_para['train_per_epoch'] + eval_size = data_para['batch_size'] + early_stop_epochs = train_para['early_stop_epochs'] + + metric_best = 0 + metric_best_epoch = -1 + + sess_config = tf.ConfigProto() + sess_config.gpu_options.allow_growth = True + + args = parse_args() + + #""" + # for npu + custom_op = sess_config.graph_options.rewrite_options.custom_optimizers.add() + #modify enable autotune + if "autotune" in os.environ: + + + print ("autotune value is " + os.environ["autotune"]) + + if os.environ["autotune"] == "True": + + print ("autotune is set !") + custom_op.parameter_map["auto_tune_mode"].s = tf.compat.as_bytes("RL,GA") + #autotune end + + custom_op.name = "NpuOptimizer" + custom_op.parameter_map["op_select_implmode"].s = tf.compat.as_bytes("high_performance") + custom_op.parameter_map["optypelist_for_implmode"].s = tf.compat.as_bytes("UnsortedSegmentSum,GatherV2") + custom_op.parameter_map["enable_data_pre_proc"].b = True ##True getNext false在host侧 + #custom_op.parameter_map["mix_compile_mode"].b = True #开启混合计算,根据实际情况配置 + custom_op.parameter_map["use_off_line"].b = True + custom_op.parameter_map["min_group_size"].b = 1 + custom_op.parameter_map["precision_mode"].s = tf.compat.as_bytes(args.precision_mode) + custom_op.parameter_map["profiling_mode"].b = True + custom_op.parameter_map["profiling_options"].s = tf.compat.as_bytes('{"output":"./profiling","training_trace":"on","task_trace":"on","fp_point":"","bp_point":"","aic_metrics":"PipeUtilization"}') + custom_op.parameter_map["hcom_parallel"].b = True + custom_op.parameter_map["iterations_per_loop"].i = config.iterations_per_loop + custom_op.parameter_map["op_select_implmode"].s = tf.compat.as_bytes("high_performance") + custom_op.parameter_map["optypelist_for_implmode"].s = tf.compat.as_bytes("UnsortedSegmentSum, GatherV2") + if args.precision_mode == "allow_mix_precision": + custom_op.parameter_map["modify_mixlist"].s = tf.compat.as_bytes("ops_info.json") + custom_op.parameter_map["fusion_switch_file"].s = tf.compat.as_bytes("fusion_switch.cfg") + #aic err debug + # custom_op.parameter_map["enable_exception_dump"].i = 1 + # custom_op.parameter_map["op_debug_level"].i = 2 + + if args.over_dump is True: + print("NPU overflow dump is enabled") + custom_op.parameter_map["dump_path"].s = tf.compat.as_bytes(args.over_dump_path) + custom_op.parameter_map["enable_dump_debug"].b = True + custom_op.parameter_map["dump_debug_mode"].s = tf.compat.as_bytes("all") + else: + print("NPU overflow dump is disabled") + + + sess_config.graph_options.rewrite_options.remapping = RewriterConfig.OFF + + + #""" + + # for op perf + run_metadata = tf.RunMetadata() + #options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE) + + global_start_time = time.time() + graph = tf.Graph() + with graph.as_default(): + train_dataset = input_fn_tfrecord(config.train_tag, int(batch_size), int(config.line_per_sample), + num_epochs=config.n_epoches, perform_shuffle=True) + eval_dataset = input_fn_tfrecord(config.eval_tag, int(config.eval_batch_size), int(config.line_per_sample), + num_epochs=None, perform_shuffle=True) + + print('train batch size:', int(batch_size) * num_gpu / config.line_per_sample) + if mode == 'train' and rank_size > 1: + rank_size = int(os.getenv('RANK_SIZE')) + #rank_id = int(os.getenv('DEVICE_INDEX')) + rank_id = int(os.getenv('ASCEND_DEVICE_ID')) + + ############## for hccl ################### + #rank_id = get_rank_id() + ############# for hccl ################### + + print("ranksize = %d, rankid = %d" % (rank_size, rank_id)) + train_dataset = train_dataset.shard(rank_size, rank_id) + + iterator = train_dataset.make_initializable_iterator() + next_element = iterator.get_next() + eval_iterator = eval_dataset.make_initializable_iterator() + eval_next_element = eval_iterator.get_next() + + if num_gpu >= 1: + model, opt = build_graph(graph, next_element, eval_next_element,rank_id) + + with tf.Session(graph=graph, config=sess_config) as sess: + #writer = tf.summary.FileWriter(config.writer_path, sess.graph) + ##### + sess.run(iterator.initializer) + sess.run(eval_iterator.initializer) + sess.run(tf.global_variables_initializer()) + + ##################### for hccl ################# + if mode == 'train' and rank_size > 1: + npu_int = npu_ops.initialize_system() + npu_shutdown = npu_ops.shutdown_system() + sess.run(npu_int) + input = tf.trainable_variables() + bcast_global_variables_op = hccl_ops.broadcast(input, 0) + sess.run(bcast_global_variables_op) + + #################### for hccl ################# + + + + saver = tf.train.Saver() + ####大小循环下沉方式 + train_op_sess = util.set_iteration_per_loop(sess, opt, config.iterations_per_loop) + + print('model initialized') + + if mode == 'train': + start_time = time.time() + #est_epoch_batches = int((train_para['train_per_epoch'] + train_para['batch_size']*num_gpu - 1) / (train_para['batch_size']*num_gpu)) + est_epoch_batches = int((train_para['train_per_epoch']) / (train_para['batch_size']*num_gpu)) + est_tot_batches = train_para['n_epoch'] * est_epoch_batches + + ###if use per_loop please use next + est_epoch_batches = int((est_epoch_batches / config.iterations_per_loop)) + est_tot_batches = train_para['n_epoch'] * est_epoch_batches * config.iterations_per_loop + + print("est_epoch_batches====", est_epoch_batches) + print("est_tot_batches====", est_tot_batches) + + _epoch = 1 + train_finished = False + while _epoch < train_para['n_epoch'] + 1 and not train_finished: + _epoch_start_time = time.time() + epoch_loss = [] + epoch_labels = [] + epoch_preds = [] + epoch_auc = -1 + epoch_finished = False + epoch_sample_num = 0 + epoch_finished_batches = 0 + cnt = 0 + #saver = tf.train.Saver() + #saver.save(sess, Base_path + 'model/%s' % tag, + # global_step=cnt, latest_filename='%s-checkpoint' % tag) + while not epoch_finished and not train_finished: + try: + epoch_finished_batches += 1 + start_time_test = time.time() + _loss, _log_loss, _l2_loss, p, _labels = train_batch(sess, num_gpu, model, train_op_sess) + end_time_test = time.time() + #print("time =================", (end_time_test - start_time_test)) + + epoch_loss.append(_loss) + epoch_labels.extend(_labels) + epoch_preds.extend(p) + epoch_sample_num += _labels.shape[0] + + dt = end_time_test - start_time_test + fps=train_para['batch_size'] * rank_size * config.iterations_per_loop/dt + + if epoch_finished_batches % (display_step / num_gpu) == 0: # print step + if _epoch: + print("================epoch_finished_batches", epoch_finished_batches, display_step, num_gpu) + avg_loss = np.array(epoch_loss).mean() + epoch_auc = roc_auc_score(epoch_labels, epoch_preds) + elapsed = int(time.time() - start_time) + finished_batches = (_epoch-1) * est_epoch_batches + epoch_finished_batches + eta = int(1.0 * (est_tot_batches - finished_batches) / + finished_batches * elapsed) + epoch_labels = [] + epoch_preds = [] + epoch_loss = [] + print('epoch %3d/%3d - batch %5d: loss = %f, auc = %f, device_id = %d | elapsed : %s, ETA : %s, fps : %f' % ( + _epoch, train_para['n_epoch'], epoch_finished_batches, avg_loss, epoch_auc, rank_id, + str(datetime.timedelta(seconds=elapsed)), str(datetime.timedelta(seconds=eta)), fps)) + print("est_epoch_batches =====", est_epoch_batches) + avg_loss = 0 + else: + elapsed = int(time.time() - start_time) + finished_batches = (_epoch-1) * est_epoch_batches + epoch_finished_batches + eta = int(1.0 * (est_tot_batches - finished_batches) / + finished_batches * elapsed) + print('epoch %3d/%3d - batch %5d: | elapsed : %s, ETA : %s' % ( + _epoch, train_para['n_epoch'], epoch_finished_batches, + str(datetime.timedelta(seconds=elapsed)), str(datetime.timedelta(seconds=eta)))) + + #print("=============================", epoch_finished_batches %(20* est_epoch_batches)) + if (epoch_finished_batches % (1 * est_epoch_batches) == 0) or epoch_finished: + epoch_finished = True + print('epoch %d train time = %.3f sec, #train sample = %d' % + (_epoch, time.time() - _epoch_start_time, epoch_sample_num*config.iterations_per_loop )) + + # ======== comment this block if no testset available ======== + print("== starting evaluate == ") + evaluate_time_start = time.time() + eval_labels, eval_preds = evaluate(sess, model) + evaluate_time_end = time.time() + print("evaluate time =====", (evaluate_time_end - evaluate_time_start)) + eval_auc = metric(log_file, epoch_auc, eval_labels, eval_preds) + print("== finished evaluate == ") + # ============================================================ + print('epoch %d total time = %s' % + (_epoch, str(time.time() - _epoch_start_time))) + + if eval_auc >= metric_best: + metric_best = eval_auc + metric_best_epoch = _epoch + print("current best auc: ", metric_best, " best_epoch: ", metric_best_epoch) + else: + if _epoch - metric_best_epoch >= early_stop_epochs: + print("the model will be early stopped: current epoch:", _epoch) + log_data = "best epoch: %d\t best performance:%g\n" % (metric_best_epoch, metric_best) + log_data += "model_saved to %s\n" % (Base_path + 'model/%s' % tag) + write_log(log_file, log_data, echo=True) + + print("save complete for epoch %d" % _epoch) + train_finished = True + break + _epoch += 1 + except tf.errors.OutOfRangeError as e: + print("end of training dataset") + print("epoch %3d finished ..." % _epoch) + train_finished = True + saver.save(sess, Base_path + 'model/%s' % tag, + global_step=epoch_finished_batches, latest_filename='%s-checkpoint' % tag) + #writer.close() + + elif mode == 'test': + pass + tf.train.write_graph(sess.graph, config.graph_path, 'widedeep16_graph.pbtxt', as_text=True) + ############### for hccl $^?################ + if mode == 'train' and rank_size > 1: + sess.run(npu_shutdown) + ############### for hccl $^?################ + sess.close() \ No newline at end of file -- Gitee From 6d467d0ebb1d94e7827a2bce0f40fbd9d33e2ee9 Mon Sep 17 00:00:00 2001 From: yongchao1 <297389370@qq.com> Date: Tue, 16 Jul 2024 12:32:49 +0000 Subject: [PATCH 2/3] =?UTF-8?q?add=20WideDeep=5FID2712=5Ffor=5FTensorFlow/?= =?UTF-8?q?test/train=5Fperformance=5F1p=5Fprofiling2.sh.=20=E6=96=B0?= =?UTF-8?q?=E5=A2=9E=20=E5=9F=BA=E4=BA=8EEstimator=E6=A8=A1=E5=BC=8F?= =?UTF-8?q?=E4=BD=BF=E8=83=BD=20profiling=E6=89=A7=E8=A1=8C=E8=84=9A?= =?UTF-8?q?=E6=9C=AC?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: yongchao1 <297389370@qq.com> --- .../test/train_performance_1p_profiling2.sh | 162 ++++++++++++++++++ 1 file changed, 162 insertions(+) create mode 100644 TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/test/train_performance_1p_profiling2.sh diff --git a/TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/test/train_performance_1p_profiling2.sh b/TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/test/train_performance_1p_profiling2.sh new file mode 100644 index 000000000..3b6a491eb --- /dev/null +++ b/TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/test/train_performance_1p_profiling2.sh @@ -0,0 +1,162 @@ +#!/bin/bash + +#当前路径,不需要修改 +cur_path=`pwd` +#export ASCEND_SLOG_PRINT_TO_STDOUT=1 +#export GE_USE_STATIC_MEMORY=1 + +#集合通信参数,不需要修改 + +export RANK_SIZE=1 +export JOB_ID=10087 + + +RANK_ID_START=0 + +#基础参数,需要模型审视修改 +#Batch Size +batch_size=131072 +#网络名称,同目录名称 +Network="WideDeep_ID2712_for_TensorFlow" +#Device数量,单卡默认为1 +RankSize=1 + +#参数配置 +data_path="/npu/traindata/ID2940_CarPeting_TF_WideDeep_TF" +train_size=13107200 +display_step=10 + +#维持参数,以下不需要修改 +over_dump=False +precision_mode="allow_mix_precision" +# 帮助信息,不需要修改 +if [[ $1 == --help || $1 == -h ]];then + echo"usage:./train_performance_1p.sh " + echo " " + echo "parameter explain: + --over_dump if or not over detection, default is False + --data_path source data of training + --train_epochs train epochs + -h/--help show help message + " + exit 1 +fi + +#参数校验,不需要修改 +for para in $* +do + if [[ $para == --precision_mode* ]];then + precision_mode=`echo ${para#*=}` +elif [[ $para == --over_dump* ]];then + over_dump=`echo ${para#*=}` + over_dump_path=${cur_path}/output/overflow_dump + mkdir -p ${over_dump_path} + elif [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + fi +done + +#校验是否传入data_path,不需要修改 +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be confing" + exit 1 +fi + +##############执行训练########## +if [ -d $cur_path/output ];then + rm -rf $cur_path/output/* + mkdir -p $cur_path/output/$ASCEND_DEVICE_ID/ckpt +else + mkdir -p $cur_path/output/$ASCEND_DEVICE_ID/ckpt +fi + +#if [ -d $cur_path/../config/1p_$ASCEND_DEVICE.json ];then +# export RANK_TABLE_FILE=$cur_path/../config/1p_$ASCEND_DEVICE.json +# export RANK_ID=$ASCEND_DEVICE_ID +#else +# export RANK_TABLE_FILE=$cur_path/../config/1p.json +# export RANK_ID=0 +#fi +wait + +#配置文件备份和修改 +cd $cur_path/../ +if [ -f configs/config.py.bak ];then + cp configs/config.py.bak configs/config.py + rm -f configs/config.py.run +else + cp configs/config.py configs/config.py.bak + rm -f configs/config.py.run +fi +sed -i "s%/npu/traindata/ID2940_CarPeting_TF_WideDeep_TF%${data_path}%p" configs/config.py +sed -i "s%./model%$cur_path/output/$ASCEND_DEVICE_ID/ckpt%p" configs/config.py +sed -i "s%59761827%${train_size}%p" configs/config.py +sed -i "s%display_step = 100%display_step = $display_step%p" configs/config.py +#echo `cat configs/config.py |uniq > configs/config.py; cp -f configs/config.py configs/config.py.run` +cp configs/config.py configs/config.py.run +#训练执行 +start=$(date +%s) +nohup python3 train_profiling.py --data_path=$data_path \ + --ckpt_path=$cur_path/output/$ASCEND_DEVICE_ID/ckpt \ + --train_size=$train_size \ + --precision_mode=$precision_mode \ + --display_step=$display_step > $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log 2>&1 & +wait +end=$(date +%s) +e2e_time=$(( $end - $start )) + +#配置文件恢复 +mv -f configs/config.py.bak configs/config.py + +#结果打印,不需要修改 +echo "------------------ Final result ------------------" +#输出性能FPS,需要模型审视修改 + +#FPS=`grep 'fps :' $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log| awk -F' ' '{print $25}' | tail -n 1` +time=`grep -rn 'epoch 2 total time =' $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log| awk -F '=' '{print $2}'|sed s/[[:space:]]//g` +FPS=`awk 'BEGIN{printf "%.2f\n",'100'*'${batch_size}'/'${time}'}'` +#打印,不需要修改 +echo "Final Performance images/sec : $FPS" + +#输出训练精度,需要模型审视修改 +train_accuracy=`grep 'eval auc' $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log| awk -F' ' '{print $8}' |tail -n 1` +#打印,不需要修改 +echo "Final Train Accuracy : ${train_accuracy}" +echo "E2E Training Duration sec : $e2e_time" + +#稳定性精度看护结果汇总 +#训练用例信息,不需要修改 +BatchSize=${batch_size} +DeviceType=`uname -m` +if [[ $precision_mode == "must_keep_origin_dtype" ]];then + CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'fp32'_'profiling2'_'perf' +else + CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'profiling2'_'perf' +fi +echo "CaseName : $CaseName" + +##获取性能数据 +#吞吐量,不需要修改 +ActualFPS=${FPS} +#单迭代训练时长,不需要修改 +TrainingTime=$time +echo "TrainingTime(ms/step) : $TrainingTime" + +#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 +loss=`grep 'loss =' $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log | tr -d '\b\r' | awk -F' ' '{print $9}'|sed 's/,$//'` +echo "${loss}"> $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt + +#最后一个迭代loss值,不需要修改 +ActualLoss=`cat $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt | tail -n 1` + +#关键信息打印到${CaseName}.log中,不需要修改 +echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainAccuracy = ${train_accuracy}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log \ No newline at end of file -- Gitee From c8011aeb04de4448e13ed522a1d9cca7aa7049c3 Mon Sep 17 00:00:00 2001 From: yongchao1 <297389370@qq.com> Date: Tue, 16 Jul 2024 12:34:59 +0000 Subject: [PATCH 3/3] =?UTF-8?q?update=20recommendation/WideDeep=5FID2712?= =?UTF-8?q?=5Ffor=5FTensorFlow/test/train=5Fperformance=5F1p=5Fprofiling2.?= =?UTF-8?q?sh.=20=E6=96=B0=E5=A2=9E=E8=A7=A3=E6=9E=90profiling=20=E6=89=A7?= =?UTF-8?q?=E8=A1=8C=E5=91=BD=E4=BB=A4=E8=A1=8C?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: yongchao1 <297389370@qq.com> --- .../test/train_performance_1p_profiling2.sh | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/test/train_performance_1p_profiling2.sh b/TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/test/train_performance_1p_profiling2.sh index 3b6a491eb..4beaccc1c 100644 --- a/TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/test/train_performance_1p_profiling2.sh +++ b/TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/test/train_performance_1p_profiling2.sh @@ -105,6 +105,12 @@ wait end=$(date +%s) e2e_time=$(( $end - $start )) +a=`find /usr/local/Ascend/ -name acp | awk -F 'acp' '{print $1}'` +cd $a +./msprof --parse=on --output=$cur_path/../profiling +./msprof --export=on --output=$cur_path/../profiling + + #配置文件恢复 mv -f configs/config.py.bak configs/config.py -- Gitee