diff --git a/OWNERS b/OWNERS index d609a18f9691010f41d4181d507204605d96f20d..6ad393ee68f77d0647ee159b27e3a2a6893fedb8 100644 --- a/OWNERS +++ b/OWNERS @@ -11,6 +11,7 @@ approvers: - ly-qianxiao - blian - kun_8 +- binghamhuang reviewers: - leo920320 - wo-wenjie @@ -39,4 +40,5 @@ reviewers: - machj - zhengweifeng6 - gong-siwei -- uniteone \ No newline at end of file +- uniteone +- binghamhuang \ No newline at end of file diff --git a/README.md b/README.md index cb203544c7478cc8b6e92567952a537f5f4e0ffd..87a1a03725d6778b24ab322b7ee8a3725c303e4a 100644 --- a/README.md +++ b/README.md @@ -21,6 +21,9 @@ Ascend Training Tools,昇腾训练工具链。针对训练&大模型场景, 脚本迁移工具提供后端命令行用于将GPU上训练的PyTorch脚本迁移至NPU上,得到新的训练脚本用于训练。 +4. [训推一体权重转换工具](https://gitee.com/Ascend/att/wikis/%E5%B7%A5%E5%85%B7%E4%BB%8B%E7%BB%8D/%E5%88%86%E6%9E%90%E8%BF%81%E7%A7%BB%E5%B7%A5%E5%85%B7/%E8%AE%AD%E6%8E%A8%E4%B8%80%E4%BD%93%E6%9D%83%E9%87%8D%E8%BD%AC%E6%8D%A2%E5%B7%A5%E5%85%B7%E4%BD%BF%E7%94%A8%E6%8C%87%E5%AF%BC) + + 训推一体权重转换工具,支持在GPU和NPU上训练好的模型转成加速推理支持的格式。 ### [精度工具](https://gitee.com/ascend/att/tree/master/debug/accuracy_tools) diff --git a/debug/accuracy_tools/api_accuracy_checker/compare/algorithm.py b/debug/accuracy_tools/api_accuracy_checker/compare/algorithm.py index 9dd204f5bc9a625170f6f5957cd7d9c3179ee694..c92eff25a701c5f0c228d3225fbbb22959d5f929 100644 --- a/debug/accuracy_tools/api_accuracy_checker/compare/algorithm.py +++ b/debug/accuracy_tools/api_accuracy_checker/compare/algorithm.py @@ -91,12 +91,13 @@ def get_max_abs_err(abs_err): #相对误差最大值 def get_max_rel_err(rel_err): - return np.max(rel_err) + return np.max(rel_err) if np.max(rel_err) >= 0 else 0 #相对误差均值 def get_mean_rel_err(rel_err): - return np.mean(rel_err) + non_negative_rel_err = rel_err[rel_err >= 0] + return np.mean(non_negative_rel_err) if non_negative_rel_err.size > 0 else 0 def get_rel_err_ratio(rel_err, thresholding): diff --git a/debug/accuracy_tools/api_accuracy_checker/compare/api_precision_compare.py b/debug/accuracy_tools/api_accuracy_checker/compare/api_precision_compare.py index 8c98130ab61d27479817bc9c8c7eed0cc4f38361..ad38cb9561e3e14dc332790b3c06c9cb78716c4e 100644 --- a/debug/accuracy_tools/api_accuracy_checker/compare/api_precision_compare.py +++ b/debug/accuracy_tools/api_accuracy_checker/compare/api_precision_compare.py @@ -11,7 +11,8 @@ from api_accuracy_checker.common.utils import print_info_log, print_warn_log, pr from api_accuracy_checker.common.config import msCheckerConfig from api_accuracy_checker.compare.compare_utils import CompareConst, API_PRECISION_COMPARE_RESULT_FILE_NAME, \ API_PRECISION_COMPARE_DETAILS_FILE_NAME, BENCHMARK_COMPARE_SUPPORT_LIST, API_PRECISION_COMPARE_UNSUPPORT_LIST, \ - ApiPrecisionCompareColumn, AbsoluteStandardApi, BinaryStandardApi, BINARY_COMPARE_UNSUPPORT_LIST, convert_str_to_float + ApiPrecisionCompareColumn, AbsoluteStandardApi, BinaryStandardApi, BINARY_COMPARE_UNSUPPORT_LIST, \ + convert_str_to_float, CompareMessage from api_accuracy_checker.compare.compare_column import ApiPrecisionOutputColumn from api_accuracy_checker.run_ut.run_ut import get_validated_result_csv_path from ptdbg_ascend.src.python.ptdbg_ascend.common.file_check_util import FileCheckConst, FileChecker, change_mode @@ -106,15 +107,15 @@ class BenchmarkStandard: def _compare_ratio(self): self.small_value_err_ratio = self._calc_ratio( self.npu_precision.get(ApiPrecisionCompareColumn.SMALL_VALUE_ERROR_RATE), - self.gpu_precision.get(ApiPrecisionCompareColumn.SMALL_VALUE_ERROR_RATE)) + 
self.gpu_precision.get(ApiPrecisionCompareColumn.SMALL_VALUE_ERROR_RATE), 10000.0) self.rmse_ratio = self._calc_ratio(self.npu_precision.get(ApiPrecisionCompareColumn.RMSE), self.gpu_precision.get(ApiPrecisionCompareColumn.RMSE), 10000.0) self.max_rel_err_ratio = self._calc_ratio(self.npu_precision.get(ApiPrecisionCompareColumn.MAX_REL_ERR), self.gpu_precision.get(ApiPrecisionCompareColumn.MAX_REL_ERR), 10000.0) self.mean_rel_err_ratio = self._calc_ratio(self.npu_precision.get(ApiPrecisionCompareColumn.MEAN_REL_ERR), - self.gpu_precision.get(ApiPrecisionCompareColumn.MEAN_REL_ERR)) + self.gpu_precision.get(ApiPrecisionCompareColumn.MEAN_REL_ERR), 10000.0) self.eb_ratio = self._calc_ratio(self.npu_precision.get(ApiPrecisionCompareColumn.EB), - self.gpu_precision.get(ApiPrecisionCompareColumn.EB)) + self.gpu_precision.get(ApiPrecisionCompareColumn.EB), 10000.0) def to_column_value(self): return [self.small_value_err_ratio, self.small_value_err_status, self.rmse_ratio, @@ -176,7 +177,7 @@ def api_precision_compare(config): def analyse_csv(npu_data, gpu_data, config): forward_status, backward_status = [], [] - last_api_name, last_api_dtype = None, None + full_last_api_name, last_api_dtype = None, None for _, row_npu in npu_data.iterrows(): message = '' compare_column = ApiPrecisionOutputColumn() @@ -197,7 +198,7 @@ def analyse_csv(npu_data, gpu_data, config): new_status = CompareConst.SPACE compare_column.api_name = full_api_name_with_direction_status if row_npu[ApiPrecisionCompareColumn.DEVICE_DTYPE] not in BINARY_COMPARE_UNSUPPORT_LIST or api_name in BinaryStandardApi: - new_status = record_binary_consistency_result(compare_column, row_npu) + new_status = record_binary_consistency_result(api_name, compare_column, row_npu) elif api_name in AbsoluteStandardApi: new_status = record_absolute_threshold_result(compare_column, row_npu) elif row_npu[ApiPrecisionCompareColumn.DEVICE_DTYPE] in BENCHMARK_COMPARE_SUPPORT_LIST: @@ -205,21 +206,23 @@ def analyse_csv(npu_data, gpu_data, config): new_status = record_benchmark_compare_result(compare_column, bs) write_detail_csv(compare_column.to_column_value(), config.details_csv_path) - if last_api_name is not None and full_api_name != last_api_name: + if full_last_api_name is not None and full_api_name != full_last_api_name: if last_api_dtype in API_PRECISION_COMPARE_UNSUPPORT_LIST: message = unsupported_message - write_csv([[last_api_name, "skip", "skip", message]], config.result_csv_path) + write_csv([[full_last_api_name, "skip", "skip", message]], config.result_csv_path) forward_status, backward_status = [], [] message = '' else: forward_result = get_api_checker_result(forward_status) backward_result = get_api_checker_result(backward_status) - write_csv([[last_api_name, forward_result, backward_result, message]], config.result_csv_path) + _, last_api_name, _ = full_last_api_name.split("*") + message += CompareMessage.get(last_api_name, "") if forward_result == CompareConst.ERROR else "" + write_csv([[full_last_api_name, forward_result, backward_result, message]], config.result_csv_path) forward_status, backward_status = [], [] message = '' is_supported = row_npu[ApiPrecisionCompareColumn.DEVICE_DTYPE] not in API_PRECISION_COMPARE_UNSUPPORT_LIST - last_api_name = full_api_name + full_last_api_name = full_api_name last_api_dtype = row_npu[ApiPrecisionCompareColumn.DEVICE_DTYPE] if not is_supported: @@ -232,14 +235,16 @@ def analyse_csv(npu_data, gpu_data, config): else: print_error_log(f"Invalid direction status: {direction_status}") - if last_api_name is 
not None: + if full_last_api_name is not None: if last_api_dtype in API_PRECISION_COMPARE_UNSUPPORT_LIST: message = unsupported_message - write_csv([[last_api_name, "skip", "skip", message]], config.result_csv_path) + write_csv([[full_last_api_name, "skip", "skip", message]], config.result_csv_path) else: forward_result = get_api_checker_result(forward_status) backward_result = get_api_checker_result(backward_status) - write_csv([[last_api_name, forward_result, backward_result, message]], config.result_csv_path) + _, last_api_name, _ = full_last_api_name.split("*") + message += CompareMessage.get(last_api_name, "") if forward_result == CompareConst.ERROR else "" + write_csv([[full_last_api_name, forward_result, backward_result, message]], config.result_csv_path) def check_error_rate(npu_error_rate): @@ -288,7 +293,7 @@ def check_csv_columns(columns, csv_type): raise CompareException(CompareException.INVALID_DATA_ERROR, msg) -def record_binary_consistency_result(compare_column, row_npu): +def record_binary_consistency_result(api_name, compare_column, row_npu): new_status = check_error_rate(row_npu[ApiPrecisionCompareColumn.ERROR_RATE]) compare_column.error_rate = row_npu[ApiPrecisionCompareColumn.ERROR_RATE] compare_column.error_rate_status = new_status @@ -296,7 +301,8 @@ compare_column.compare_algorithm = "二进制一致法" message = '' if compare_column.error_rate_status == CompareConst.ERROR: - message += "ERROR: 二进制一致错误率超过阈值" + message += "ERROR: 二进制一致错误率超过阈值\n" + message += CompareMessage.get(api_name, "") compare_column.compare_message = message return new_status diff --git a/debug/accuracy_tools/api_accuracy_checker/compare/api_precision_standard.yaml b/debug/accuracy_tools/api_accuracy_checker/compare/api_precision_standard.yaml index ceccf65a46bdb757db4079c3e5e5bff71a5625b5..4033538b73e9b6a094bbf2c05e0c02bbed607c24 100644 --- a/debug/accuracy_tools/api_accuracy_checker/compare/api_precision_standard.yaml +++ b/debug/accuracy_tools/api_accuracy_checker/compare/api_precision_standard.yaml @@ -99,6 +99,7 @@ BinaryCompareStandard: - sign_ - sort - tile + - topk - transpose - transpose_ - tril diff --git a/debug/accuracy_tools/api_accuracy_checker/compare/compare.py b/debug/accuracy_tools/api_accuracy_checker/compare/compare.py index 15bfb1904c810f756600daef1a90ab1d4176ac0d..bd10f77976642331fa8e7bce28a703f0922c1411 100644 --- a/debug/accuracy_tools/api_accuracy_checker/compare/compare.py +++ b/debug/accuracy_tools/api_accuracy_checker/compare/compare.py @@ -5,7 +5,7 @@ import torch import numpy as np from rich.table import Table from rich.console import Console -from api_accuracy_checker.common.utils import get_json_contents, write_csv +from api_accuracy_checker.common.utils import get_json_contents, write_csv, print_warn_log from api_accuracy_checker.compare.compare_utils import CompareConst, check_dtype_comparable, DETAIL_TEST_ROWS, \ precision_configs, BENCHMARK_COMPARE_SUPPORT_LIST, AbsoluteStandardApi, BinaryStandardApi, apis_threshold from api_accuracy_checker.compare.compare_column import CompareColumn @@ -47,6 +47,8 @@ class Comparator: else: passing_rate = "0%" + print_warn_log("The following tables will be deprecated in the future. "
+ "The following results are for reference only.") console = Console() table_total = Table( show_header=True, title="Overall Statistics", show_lines=True, width=75 @@ -160,9 +162,16 @@ class Comparator: _, api_name, _ = full_api_name.split("*") compare_func = self._compare_dropout if "dropout" in full_api_name else self._compare_core_wrapper fwd_success_status, fwd_compare_alg_results = compare_func(api_name, bench_output, device_output) - bwd_success_status, bwd_compare_alg_results = (CompareConst.PASS, []) if not (bench_grad and npu_grad) else compare_func(api_name, bench_grad[0], npu_grad[0]) if "dropout" in full_api_name else compare_func(api_name, bench_grad, npu_grad) + if not (bench_grad and npu_grad): + bwd_success_status, bwd_compare_alg_results = (CompareConst.SPACE, []) + else: + if "dropout" in full_api_name: + bwd_success_status, bwd_compare_alg_results = compare_func(api_name, bench_grad[0], npu_grad[0]) + else: + bwd_success_status, bwd_compare_alg_results = compare_func(api_name, bench_grad, npu_grad) self.record_results(full_api_name, fwd_success_status, bwd_success_status if bwd_compare_alg_results is not None else CompareConst.SPACE, fwd_compare_alg_results, bwd_compare_alg_results) - return fwd_success_status == CompareConst.PASS, bwd_success_status == CompareConst.PASS + return fwd_success_status == CompareConst.PASS, bwd_success_status == CompareConst.PASS \ + or bwd_success_status == CompareConst.SPACE def _compare_core_wrapper(self, api_name, bench_output, device_output): detailed_result_total = [] diff --git a/debug/accuracy_tools/api_accuracy_checker/compare/compare_utils.py b/debug/accuracy_tools/api_accuracy_checker/compare/compare_utils.py index d711265cc781c634caa189692b5e0141eee2f83e..2ca701aaec3f6a03882e600dcdfe217b702b6c3d 100644 --- a/debug/accuracy_tools/api_accuracy_checker/compare/compare_utils.py +++ b/debug/accuracy_tools/api_accuracy_checker/compare/compare_utils.py @@ -152,6 +152,11 @@ class ApiPrecisionCompareColumn: ApiPrecisionCompareColumn.BACKWARD_STATUS, ApiPrecisionCompareColumn.MESSAGE] +CompareMessage = { + "topk" : "在npu上,topk的入参sorted=False时不生效,会返回有序tensor,而cpu上会返回无序tensor。 如果topk精度不达标,请检查是否是该原因导致的。" +} + + def check_dtype_comparable(x, y): if x.dtype in Const.FLOAT_TYPE: if y.dtype in Const.FLOAT_TYPE: diff --git a/debug/accuracy_tools/api_accuracy_checker/dump/api_info.py b/debug/accuracy_tools/api_accuracy_checker/dump/api_info.py index e14405fe143e0a39030b6cca18a615c48e802682..50ad39166fb03ae210af92217f424cfdbe6e1eb4 100644 --- a/debug/accuracy_tools/api_accuracy_checker/dump/api_info.py +++ b/debug/accuracy_tools/api_accuracy_checker/dump/api_info.py @@ -12,16 +12,39 @@ from ptdbg_ascend.src.python.ptdbg_ascend.common.utils import check_path_before_ def get_tensor_extremum(data, operator): if data.dtype is torch.bool: if data.numel() == 0: - return False + return False, False if operator == 'max': - return True in data + return True in data, True in data elif operator == 'min': - return False not in data - data_clone = data.clone().detach() + return False not in data, False not in data + data_clone = data.float().clone().detach() if operator == 'max': - return torch._C._VariableFunctionsClass.max(data_clone.float()).item() + max_result = torch._C._VariableFunctionsClass.max(data_clone).item() + if np.isinf(max_result) or np.isnan(max_result): + return handle_tensor_extremum_nan_inf(data_clone, operator), max_result + else: + return max_result, max_result + else: + min_result = torch._C._VariableFunctionsClass.min(data_clone).item() + 
if np.isinf(min_result) or np.isnan(min_result): + return handle_tensor_extremum_nan_inf(data_clone, operator), min_result + else: + return min_result, min_result + + +def handle_tensor_extremum_nan_inf(data_clone, operator): + data_nan = torch._C._VariableFunctionsClass.isnan(data_clone) + if int(torch._C._VariableFunctionsClass.sum(data_nan)) == data_clone.numel(): + return float('nan') + finite_mask = torch._C._VariableFunctionsClass.isfinite(data_clone) + if int(torch._C._VariableFunctionsClass.sum(finite_mask)) > 0: + finite_values = data_clone[finite_mask] + return torch._C._VariableFunctionsClass.max(finite_values).item() if operator == 'max' else \ + torch._C._VariableFunctionsClass.min(finite_values).item() else: - return torch._C._VariableFunctionsClass.min(data_clone.float()).item() + data_no_nan = data_clone[~data_nan] + return torch._C._VariableFunctionsClass.max(data_no_nan).item() if operator == 'max' else \ + torch._C._VariableFunctionsClass.min(data_no_nan).item() def get_type_name(name): @@ -118,8 +141,12 @@ class APIInfo: single_arg.update({'type': 'torch.Tensor'}) single_arg.update({'dtype': str(arg.dtype)}) single_arg.update({'shape': arg.shape}) - single_arg.update({'Max': transfer_types(get_tensor_extremum(arg, 'max'), str(arg.dtype))}) - single_arg.update({'Min': transfer_types(get_tensor_extremum(arg, 'min'), str(arg.dtype))}) + max_handle, max_origin = get_tensor_extremum(arg, 'max') + single_arg.update({'Max': transfer_types(max_handle, str(arg.dtype))}) + single_arg.update({'Max_origin': transfer_types(max_origin, str(arg.dtype))}) + min_handle, min_origin = get_tensor_extremum(arg, 'min') + single_arg.update({'Min': transfer_types(min_handle, str(arg.dtype))}) + single_arg.update({'Min_origin': transfer_types(min_origin, str(arg.dtype))}) single_arg.update({'requires_grad': arg.requires_grad}) else: api_args = self.api_name + '.' 
+ str(self.args_num) diff --git a/debug/accuracy_tools/api_accuracy_checker/dump/dump_scope.py b/debug/accuracy_tools/api_accuracy_checker/dump/dump_scope.py index a297e7235f7b7a196112d8fa857513c4d5027f03..1f65dbc9c8a7e482d8ac85e3d06cffc3b11b406a 100644 --- a/debug/accuracy_tools/api_accuracy_checker/dump/dump_scope.py +++ b/debug/accuracy_tools/api_accuracy_checker/dump/dump_scope.py @@ -5,14 +5,18 @@ from api_accuracy_checker.dump.dump import DumpUtil from api_accuracy_checker.common.config import msCheckerConfig -def iter_tracer(func): +def iter_tracer(original_next): def func_wrapper(*args, **kwargs): - DumpUtil.dump_switch = "OFF" - result = func(*args, **kwargs) - DumpUtil.incr_iter_num_maybe_exit() - DumpUtil.call_num += 1 - return result + if msCheckerConfig.enable_dataloader: + DumpUtil.dump_switch = "OFF" + result = original_next(*args, **kwargs) + DumpUtil.incr_iter_num_maybe_exit() + DumpUtil.call_num += 1 + return result + else: + return original_next(*args, **kwargs) return func_wrapper -if msCheckerConfig.enable_dataloader: - _BaseDataLoaderIter.__next__ = iter_tracer(torch.utils.data.dataloader._BaseDataLoaderIter.__next__) \ No newline at end of file +original_next_method = _BaseDataLoaderIter.__next__ + +_BaseDataLoaderIter.__next__ = iter_tracer(original_next_method) \ No newline at end of file diff --git a/debug/accuracy_tools/api_accuracy_checker/hook_module/wrap_tensor.py b/debug/accuracy_tools/api_accuracy_checker/hook_module/wrap_tensor.py index b105ae8353fbe31ac3f1ffe2916beb2c8be2724e..d60cac74baf15872854d71089df7cdd81925746e 100644 --- a/debug/accuracy_tools/api_accuracy_checker/hook_module/wrap_tensor.py +++ b/debug/accuracy_tools/api_accuracy_checker/hook_module/wrap_tensor.py @@ -25,6 +25,7 @@ from api_accuracy_checker.common.utils import torch_device_guard from api_accuracy_checker.common.config import msCheckerConfig from api_accuracy_checker.hook_module.utils import WrapTensorOps from ptdbg_ascend.src.python.ptdbg_ascend.common.file_check_util import FileOpen +from ptdbg_ascend.src.python.ptdbg_ascend.common.utils import parameter_adapter def get_tensor_ops(): @@ -49,6 +50,7 @@ class TensorOPTemplate(HOOKModule): super().__init__(hook) @torch_device_guard + @parameter_adapter def forward(self, *args, **kwargs): return getattr(torch._C._TensorBase, str(self.op_name_))(*args, **kwargs) diff --git a/debug/accuracy_tools/api_accuracy_checker/run_ut/data_generate.py b/debug/accuracy_tools/api_accuracy_checker/run_ut/data_generate.py index ec3c539f7f86149b9a8c1f26864bbcb5751748bf..51fcfedefbb8a6ef079a2d26cdc7d9ca841092bf 100644 --- a/debug/accuracy_tools/api_accuracy_checker/run_ut/data_generate.py +++ b/debug/accuracy_tools/api_accuracy_checker/run_ut/data_generate.py @@ -16,6 +16,7 @@ """ import os +import math import torch import numpy @@ -105,6 +106,9 @@ def gen_random_tensor(info, convert_type): """ check_object_type(info, dict) low, high = info.get('Min'), info.get('Max') + low_origin, high_origin = info.get('Min_origin'), info.get('Max_origin') + low_info = [low, low_origin] + high_info = [high, high_origin] data_dtype = info.get('dtype') shape = tuple(info.get('shape')) if not isinstance(low, (int, float)) or not isinstance(high, (int, float)): @@ -113,17 +117,19 @@ def gen_random_tensor(info, convert_type): if data_dtype == "torch.bool": data = gen_bool_tensor(low, high, shape) else: - data = gen_common_tensor(low, high, shape, data_dtype, convert_type) + data = gen_common_tensor(low_info, high_info, shape, data_dtype, convert_type) return data -def 
gen_common_tensor(low, high, shape, data_dtype, convert_type): +def gen_common_tensor(low_info, high_info, shape, data_dtype, convert_type): """ Function Description: Based on API basic information, generate int or float tensor Parameter: - low: The minimum value in Tensor - high: The max value in Tensor + low_info: [low, low_origin], low is the minimum value in the tensor removed inf and nan, + low_origin is the original minimum value in the tensor + high_info: [high, high_origin], high is the maximum value in the tensor removed inf and nan, + high_origin is the original maximum value in the tensor shape:The shape of Tensor data_dtype: The data type of Tensor convert_type: convert ori_type to dist_type flag. @@ -132,13 +138,32 @@ def gen_common_tensor(low, high, shape, data_dtype, convert_type): ori_dtype = Const.CONVERT.get(convert_type)[0] if ori_dtype == data_dtype: data_dtype = Const.CONVERT.get(convert_type)[1] + low, low_origin = low_info[0], low_info[1] + high, high_origin = high_info[0], high_info[1] if data_dtype in FLOAT_TYPE: - if high in [float('inf'), float('-inf')] or low in [float('inf'), float('-inf')]: - error_info = 'Parameter contains inf, skip comparison.' - raise CompareException(CompareException.INVALID_PARAM_ERROR, error_info) - scale = high - low + if math.isnan(high): + tensor = torch._C._VariableFunctionsClass.full(shape, float('nan'), dtype=eval(data_dtype)) + return tensor + #high_origin为新版json中的属性,只有当high_origin不为None,且high为inf或-inf时,原tensor全为inf或-inf + if high_origin and high in [float('inf'), float('-inf')]: + tensor = torch._C._VariableFunctionsClass.full(shape, high, dtype=eval(data_dtype)) + tensor[-1] = low + return tensor + low_scale, high_scale = low, high + dtype_finfo = torch.finfo(eval(data_dtype)) + #适配老版json high和low为inf或-inf的情况,取dtype的最大值或最小值进行放缩 + if high == float('inf'): + high_scale = dtype_finfo.max + elif high == float('-inf'): + high_scale = dtype_finfo.min + if low == float('inf'): + low_scale = dtype_finfo.max + elif low == float('-inf'): + low_scale = dtype_finfo.min + + scale = high_scale - low_scale rand01 = torch.rand(shape, dtype=eval(data_dtype)) - tensor = rand01 * scale + low + tensor = rand01 * scale + low_scale elif 'int' in data_dtype or 'long' in data_dtype: low, high = int(low), int(high) tensor = torch.randint(low, high + 1, shape, dtype=eval(data_dtype)) @@ -148,8 +173,21 @@ def gen_common_tensor(low, high, shape, data_dtype, convert_type): if tensor.nelement() == 0: return tensor tmp_tensor = tensor.reshape(-1) - tmp_tensor[0] = low - tmp_tensor[-1] = high + if high_origin and math.isnan(high_origin): + if tmp_tensor.numel() <= 2: + tmp_tensor[0] = float('nan') + tmp_tensor[-1] = high + else: + tmp_tensor[0] = low + tmp_tensor[1] = float('nan') + tmp_tensor[-1] = high + else: + tmp_tensor[0] = low + tmp_tensor[-1] = high + if high_origin in [float('inf'), float('-inf')]: + tmp_tensor[-1] = high_origin + if low_origin in [float('inf'), float('-inf')]: + tmp_tensor[0] = low_origin data = tmp_tensor.reshape(shape) return data diff --git a/debug/accuracy_tools/api_accuracy_checker/test/resources/forward.json b/debug/accuracy_tools/api_accuracy_checker/test/resources/forward.json index 5f54e077bfd0425be200f89e14a8ba131d3a3a8b..f938f352460a87222bdb5346873904cb420996cc 100644 --- a/debug/accuracy_tools/api_accuracy_checker/test/resources/forward.json +++ b/debug/accuracy_tools/api_accuracy_checker/test/resources/forward.json @@ -1,3 +1,3 @@ { - "Functional*silu*0": {"args": [{"type": "torch.Tensor", "dtype": "torch.float32", 
"shape": [2, 2560, 24, 24], "Max": 5.7421875, "Min": -5.125, "requires_grad": true}], "kwargs" :{"inplace": {"type": "bool", "value": false}}} + "Functional*silu*0": {"args": [{"type": "torch.Tensor", "dtype": "torch.float32", "shape": [2, 2560, 24, 24], "Max": 5.7421875, "Max_origin": 5.7421875, "Min": -5.125, "Min_origin": -5.125, "requires_grad": true}], "kwargs" :{"inplace": {"type": "bool", "value": false}}} } \ No newline at end of file diff --git a/debug/accuracy_tools/api_accuracy_checker/test/ut/dump/test_api_info.py b/debug/accuracy_tools/api_accuracy_checker/test/ut/dump/test_api_info.py index 8951d5523ae2db277bf41e0417ae71456a27af4a..2c03d56e722decc424052367dfe9700ba3df94ce 100644 --- a/debug/accuracy_tools/api_accuracy_checker/test/ut/dump/test_api_info.py +++ b/debug/accuracy_tools/api_accuracy_checker/test/ut/dump/test_api_info.py @@ -2,6 +2,7 @@ import os import shutil import unittest import torch +import numpy as np from api_accuracy_checker.dump.api_info import APIInfo, ForwardAPIInfo, BackwardAPIInfo, transfer_types, \ get_tensor_extremum, get_type_name, is_builtin_class, analyze_device_in_kwargs, analyze_dtype_in_kwargs from api_accuracy_checker.common.config import msCheckerConfig @@ -55,10 +56,52 @@ class TestAPIInfo(unittest.TestCase): def test_get_tensor_extremum(self): data = torch.tensor([1, 2, 3]) - result_max = get_tensor_extremum(data, 'max') - result_min = get_tensor_extremum(data, 'min') + result_max, result_max_origin = get_tensor_extremum(data, 'max') + result_min, result_min_origin = get_tensor_extremum(data, 'min') self.assertEqual(result_max, 3) self.assertEqual(result_min, 1) + self.assertEqual(result_max_origin, 3) + self.assertEqual(result_min_origin, 1) + + data = torch.tensor([1, float("inf"), 2, 3]) + result_max, result_max_origin = get_tensor_extremum(data, 'max') + result_min, result_min_origin = get_tensor_extremum(data, 'min') + self.assertEqual(result_max, 3) + self.assertEqual(result_min, 1) + self.assertEqual(result_max_origin, float("inf")) + self.assertEqual(result_min_origin, 1) + + data = torch.tensor([1, float("-inf"), 2, 3]) + result_max, result_max_origin = get_tensor_extremum(data, 'max') + result_min, result_min_origin = get_tensor_extremum(data, 'min') + self.assertEqual(result_max, 3) + self.assertEqual(result_min, 1) + self.assertEqual(result_max_origin, 3) + self.assertEqual(result_min_origin, float("-inf")) + + data = torch.tensor([1, float("inf"), float("nan"), 3]) + result_max, result_max_origin = get_tensor_extremum(data, 'max') + result_min, result_min_origin = get_tensor_extremum(data, 'min') + self.assertEqual(result_max, 3) + self.assertEqual(result_min, 1) + self.assertTrue(np.isnan(result_max_origin)) + self.assertTrue(np.isnan(result_min_origin)) + + data = torch.tensor([float("inf"), float("nan")]) + result_max, result_max_origin = get_tensor_extremum(data, 'max') + result_min, result_min_origin = get_tensor_extremum(data, 'min') + self.assertEqual(result_max, float("inf")) + self.assertEqual(result_min, float("inf")) + self.assertTrue(np.isnan(result_max_origin)) + self.assertTrue(np.isnan(result_min_origin)) + + data = torch.tensor([float("nan"), float("nan")]) + result_max, result_max_origin = get_tensor_extremum(data, 'max') + result_min, result_min_origin = get_tensor_extremum(data, 'min') + self.assertTrue(np.isnan(result_max)) + self.assertTrue(np.isnan(result_min)) + self.assertTrue(np.isnan(result_max_origin)) + self.assertTrue(np.isnan(result_min_origin)) def test_get_type_name(self): name = "" diff --git 
a/debug/accuracy_tools/api_accuracy_checker/test/ut/run_ut/test_data_generate.py b/debug/accuracy_tools/api_accuracy_checker/test/ut/run_ut/test_data_generate.py index 3f79ecf17b2583474a8de08e5c4b3862e5924590..b98f84d516404665b5c3284f1e03f14eedddac55 100644 --- a/debug/accuracy_tools/api_accuracy_checker/test/ut/run_ut/test_data_generate.py +++ b/debug/accuracy_tools/api_accuracy_checker/test/ut/run_ut/test_data_generate.py @@ -73,9 +73,12 @@ class TestDataGenerateMethods(unittest.TestCase): def test_gen_common_tensor(self): info = api_info_dict.get('args')[0] low, high = info.get('Min'), info.get('Max') + low_origin, high_origin = info.get('Min_origin'), info.get('Max_origin') + low_info = [low, low_origin] + high_info = [high, high_origin] data_dtype = info.get('dtype') shape = tuple(info.get('shape')) - data = gen_common_tensor(low, high, shape, data_dtype, None) + data = gen_common_tensor(low_info, high_info, shape, data_dtype, None) max_diff = abs(data.max() - max_value) min_diff = abs(data.min() - min_value) self.assertEqual(data.dtype, torch.float32) diff --git a/debug/accuracy_tools/grad_tool/level_adapter.py b/debug/accuracy_tools/grad_tool/level_adapter.py index 51e6717d941f3d64fc085f0da39eb9927693e54f..64bb1e92a9272f2b071a9cc61094713ac7d84db2 100644 --- a/debug/accuracy_tools/grad_tool/level_adapter.py +++ b/debug/accuracy_tools/grad_tool/level_adapter.py @@ -34,7 +34,7 @@ class LevelOps: def save_grad_direction(param_name, grad, save_path): if not os.path.exists(save_path): os.makedirs(save_path) - param_grad = torch.Tensor(grad.clone().cpu()) + param_grad = grad.clone().detach() is_positive = param_grad > 0 torch.save(is_positive, f'{save_path}/{param_name}.pt') print_info_log(f'Save {param_name} bool tensor, it has {is_positive.sum()}/{is_positive.numel()} positive elements') diff --git a/debug/accuracy_tools/ptdbg_ascend/CMakeLists.txt b/debug/accuracy_tools/ptdbg_ascend/CMakeLists.txt index 582dd0c8e5a34409cf33e5563738f5292eeeafea..a709b268361645d68a2561a3e4616b6275681adf 100644 --- a/debug/accuracy_tools/ptdbg_ascend/CMakeLists.txt +++ b/debug/accuracy_tools/ptdbg_ascend/CMakeLists.txt @@ -16,4 +16,4 @@ add_custom_target(ptdbg_ascend ALL VERBATIM ) -install(CODE "execute_process(COMMAND ${PYTHON_BIN_PATH} -m pip install ${CMAKE_BINARY_DIR}/ptdbg_ascend/dist/ptdbg_ascend-5.0.T4-py3-none-any.whl --upgrade)") +install(CODE "execute_process(COMMAND ${PYTHON_BIN_PATH} -m pip install ${CMAKE_BINARY_DIR}/ptdbg_ascend/dist/ptdbg_ascend-5.0-py3-none-any.whl --upgrade)") diff --git a/debug/accuracy_tools/ptdbg_ascend/RELEASE.md b/debug/accuracy_tools/ptdbg_ascend/RELEASE.md index f4725220f1e418343f4f3424446a3b1b732714d4..fd7d9e93c1f87e93bd5022b848cca640eaeb0bce 100644 --- a/debug/accuracy_tools/ptdbg_ascend/RELEASE.md +++ b/debug/accuracy_tools/ptdbg_ascend/RELEASE.md @@ -1,4 +1,4 @@ -# Release 5.0.T4 +# Release 5.0 This is the initial release of Pytorch precision compare tools which was designed by the researchers and engineers in Huawei Technologies Co.,Ltd. 
\ No newline at end of file diff --git "a/debug/accuracy_tools/ptdbg_ascend/doc/ptdbg_ascend\347\262\276\345\272\246\345\267\245\345\205\267\345\212\237\350\203\275\350\257\264\346\230\216_v2.0.md" "b/debug/accuracy_tools/ptdbg_ascend/doc/ptdbg_ascend\347\262\276\345\272\246\345\267\245\345\205\267\345\212\237\350\203\275\350\257\264\346\230\216_v2.0.md" index 9b3227f47a622368d8c384b538197caae06298ec..ce102dd882faf8a7a273d40a998f1053a0b30013 100644 --- "a/debug/accuracy_tools/ptdbg_ascend/doc/ptdbg_ascend\347\262\276\345\272\246\345\267\245\345\205\267\345\212\237\350\203\275\350\257\264\346\230\216_v2.0.md" +++ "b/debug/accuracy_tools/ptdbg_ascend/doc/ptdbg_ascend\347\262\276\345\272\246\345\267\245\345\205\267\345\212\237\350\203\275\350\257\264\346\230\216_v2.0.md" @@ -41,10 +41,17 @@ ptdbg_ascend工具的原理及安装请参见《[PyTorch精度工具](https://gi PyTorch训练场景的精度问题分析建议参考以下思路进行精度比对和比对结果分析: 1. 整网比对:dump整网数据并进行精度比对,初步定位异常范围。 + + 不推荐使用整网dump比对,若模型数据庞大(比如达到T级别),整网dump可能导致磁盘不足,需要预留足够的存储空间,或者分多次dump。 + 2. 缩小范围:根据Accuracy Reached or Not找出不符合精度标准的API。 + 3. 范围比对:对不符合精度标准的API重新dump。 + 4. 分析原因并优化:分析API精度不符合标准的原因并进行优化调整。 + 5. 整网比对:重新进行整网比对,判断优化后的API是否已符合精度标准以及是否出现新的精度问题。 + 6. 重复1~5步,直到不存在精度问题为止。 **精度分析示例** diff --git "a/debug/accuracy_tools/ptdbg_ascend/doc/ptdbg_ascend\347\262\276\345\272\246\345\267\245\345\205\267\345\212\237\350\203\275\350\257\264\346\230\216_v3.0.md" "b/debug/accuracy_tools/ptdbg_ascend/doc/ptdbg_ascend\347\262\276\345\272\246\345\267\245\345\205\267\345\212\237\350\203\275\350\257\264\346\230\216_v3.0.md" index 03ed6bb61156a65803abdcfb2d75956961b0ab59..f2059fd96e943dea2af85deabc94b0d7c57308f7 100644 --- "a/debug/accuracy_tools/ptdbg_ascend/doc/ptdbg_ascend\347\262\276\345\272\246\345\267\245\345\205\267\345\212\237\350\203\275\350\257\264\346\230\216_v3.0.md" +++ "b/debug/accuracy_tools/ptdbg_ascend/doc/ptdbg_ascend\347\262\276\345\272\246\345\267\245\345\205\267\345\212\237\350\203\275\350\257\264\346\230\216_v3.0.md" @@ -41,10 +41,17 @@ ptdbg_ascend工具的原理及安装请参见《[PyTorch精度工具](https://gi PyTorch训练场景的精度问题分析建议参考以下思路进行精度比对和比对结果分析: 1. 整网比对:dump整网数据并进行精度比对,初步定位异常范围。 + + 不推荐使用整网dump比对,若模型数据庞大(比如达到T级别),整网dump可能导致磁盘不足,需要预留足够的存储空间,或者分多次dump。 + 2. 缩小范围:根据Accuracy Reached or Not找出不符合精度标准的API。 + 3. 范围比对:对不符合精度标准的API重新dump。 + 4. 分析原因并优化:分析API精度不符合标准的原因并进行优化调整。 + 5. 整网比对:重新进行整网比对,判断优化后的API是否已符合精度标准以及是否出现新的精度问题。 + 6. 重复1~5步,直到不存在精度问题为止。 **精度分析示例** diff --git "a/debug/accuracy_tools/ptdbg_ascend/doc/ptdbg_ascend\347\262\276\345\272\246\345\267\245\345\205\267\345\212\237\350\203\275\350\257\264\346\230\216_v4.0.md" "b/debug/accuracy_tools/ptdbg_ascend/doc/ptdbg_ascend\347\262\276\345\272\246\345\267\245\345\205\267\345\212\237\350\203\275\350\257\264\346\230\216_v4.0.md" index 9dee9341fcf83307fe9aa7cc6b0a80df30e4710f..962d903dde030a34b6c64a297e9a395097cc5ec3 100644 --- "a/debug/accuracy_tools/ptdbg_ascend/doc/ptdbg_ascend\347\262\276\345\272\246\345\267\245\345\205\267\345\212\237\350\203\275\350\257\264\346\230\216_v4.0.md" +++ "b/debug/accuracy_tools/ptdbg_ascend/doc/ptdbg_ascend\347\262\276\345\272\246\345\267\245\345\205\267\345\212\237\350\203\275\350\257\264\346\230\216_v4.0.md" @@ -43,10 +43,17 @@ ptdbg_ascend工具主要支持PyTorch API精度数据dump、溢出检测、精 PyTorch训练场景的精度问题分析建议参考以下思路进行精度比对和比对结果分析: 1. 整网比对:dump整网数据并进行精度比对,初步定位异常范围。 + + 不推荐使用整网dump比对,若模型数据庞大(比如达到T级别),整网dump可能导致磁盘不足,需要预留足够的存储空间,或者分多次dump。 + 2. 缩小范围:根据Accuracy Reached or Not找出不符合精度标准的API。 + 3. 范围比对:对不符合精度标准的API重新dump详细信息。 + 4. 分析原因并优化:分析API精度不符合标准的原因并进行优化调整。 + 5. 整网比对:重新进行整网比对,判断优化后的API是否已符合精度标准以及是否出现新的精度问题。 + 6. 
重复1~5步,直到不存在精度问题为止。 **精度分析示例** diff --git "a/debug/accuracy_tools/ptdbg_ascend/doc/ptdbg_ascend\347\262\276\345\272\246\345\267\245\345\205\267\345\212\237\350\203\275\350\257\264\346\230\216_v5.0.T4.md" "b/debug/accuracy_tools/ptdbg_ascend/doc/ptdbg_ascend\347\262\276\345\272\246\345\267\245\345\205\267\345\212\237\350\203\275\350\257\264\346\230\216_v5.0.T4.md" index aa88541c3ed65ad8f31a333f4a759b46d465ad82..d7ab37749d776e828d34b3cbcd0f74258ea681cb 100644 --- "a/debug/accuracy_tools/ptdbg_ascend/doc/ptdbg_ascend\347\262\276\345\272\246\345\267\245\345\205\267\345\212\237\350\203\275\350\257\264\346\230\216_v5.0.T4.md" +++ "b/debug/accuracy_tools/ptdbg_ascend/doc/ptdbg_ascend\347\262\276\345\272\246\345\267\245\345\205\267\345\212\237\350\203\275\350\257\264\346\230\216_v5.0.T4.md" @@ -43,10 +43,17 @@ ptdbg_ascend工具主要支持PyTorch API精度数据dump、溢出检测、精 PyTorch训练场景的精度问题分析建议参考以下思路进行精度比对和比对结果分析: 1. 整网比对:dump整网数据并进行精度比对,初步定位异常范围。 + + 不推荐使用整网dump比对,若模型数据庞大(比如达到T级别),整网dump可能导致磁盘不足,需要预留足够的存储空间,或者分多次dump。 + 2. 缩小范围:根据Accuracy Reached or Not找出不符合精度标准的API。 + 3. 范围比对:对不符合精度标准的API重新dump详细信息。 + 4. 分析原因并优化:分析API精度不符合标准的原因并进行优化调整。 + 5. 整网比对:重新进行整网比对,判断优化后的API是否已符合精度标准以及是否出现新的精度问题。 + 6. 重复1~5步,直到不存在精度问题为止。 **精度分析示例** @@ -784,12 +791,12 @@ PrecisionDebugger(dump_path=None, hook_name=None, rank=None, step=[], enable_dat | 参数名 | 说明 | 是否必选 | | ----------------- | ------------------------------------------------------------ | -------- | -| dump_path | 设置dump数据目录路径,参数示例:"./dump_path"。
默认在dump_path目录下生成`ptdbg_dump_{version}`目录,并在该目录下生成`dump.pkl`文件以及`dump`数据文件保存目录。
当**configure_hook**函数配置了mode参数时,`dump.pkl`文件以及`dump`数据文件保存目录名称添加mode参数值为前缀,详情请参见“**dump数据存盘说明**”。
未配置dump_path时,也可以通过环境变量ASCEND_WORK_PATH配置dump路径,此时dump数据将落盘在${ASCEND_WORK_PATH}/dump_data下,自定义配置dump_path优先级高于环境变量,dump_path和环境变量需要二选一。 | 否 | -| hook_name | dump模式,可取值dump和overflow_check,表示dump和溢出检测功能,二选一。 | 是 | -| rank | 指定对某张卡上的数据进行dump或溢出检测,默认未配置(表示dump所有卡的数据),须根据实际卡的Rank ID配置。应配置为大于0的正整数,且须根据实际卡的Rank ID配置,若所配置的值大于实际训练所运行的卡的Rank ID,则dump数据为空,比如当前环境Rank ID为0~7,实际训练运行0~3卡,此时若配置Rank ID为4或不存在的10等其他值,此时dump数据为空。 | 否 | -| step | 指定dump某个step的数据,默认未配置,表示dump所有step数据。dump特定step时,须指定为训练脚本中存在的step。step为list格式,可配置逐个step,例如:step=[0,1,2];也可以配置step范围,例如:step=list(range(0,9)),表示dump第0到第8个step。 | 否 | -| enable_dataloader | 自动控制开关,可取值True(开启)或False(关闭),默认为False。配置为True后自动识别dump step参数指定的迭代,并在该迭代执行完成后退出训练,此时start和stop函数可不配置,开启该开关要求训练脚本是通过torch.utils.data.dataloader方式加载数据;配置为False则需要配置start和stop函数,并在最后一个stop函数后或一个step结束的位置添加debugger.step()。 | 否 | -| model | 开启init dump模式,传入网络模型实例化的对象,配置该参数后,dump操作仅dump网络中init方法里调用的方法(nn.Module类),不会对所有API进行dump。参数示例: model=net,net为网络模型实例化的对象名称。默认未配置。
配置该参数时,PrecisionDebugger模块请在模型实例化之后调用。
该模式不支持“溢出检测”、”ACL级别数据dump“和“模块级精度数据dump”。此模式下dump文件名前缀为网络中定义的模块名或层名。 | 否 | +| dump_path | 设置dump数据目录路径,参数示例:"./dump_path"。数据类型:str。
默认在dump_path目录下生成`ptdbg_dump_{version}`目录,并在该目录下生成`dump.pkl`文件以及`dump`数据文件保存目录。
当**configure_hook**函数配置了mode参数时,`dump.pkl`文件以及`dump`数据文件保存目录名称添加mode参数值为前缀,详情请参见“**dump数据存盘说明**”。
未配置dump_path时,也可以通过环境变量ASCEND_WORK_PATH配置dump路径,此时dump数据将落盘在${ASCEND_WORK_PATH}/dump_data下,自定义配置dump_path优先级高于环境变量,dump_path和环境变量需要二选一。 | 否 | +| hook_name | dump模式,可取值"dump"和"overflow_check",表示dump和溢出检测功能,二选一。参数示例:hook_name="dump"。数据类型:str。 | 是 | +| rank | 指定对某张卡上的数据进行dump或溢出检测,默认未配置(表示dump所有卡的数据),须根据实际卡的Rank ID配置。应配置为大于0的正整数,且须根据实际卡的Rank ID配置,若所配置的值大于实际训练所运行的卡的Rank ID,则dump数据为空,比如当前环境Rank ID为0到7,实际训练运行0到3卡,此时若配置Rank ID为4或不存在的10等其他值,此时dump数据为空。数据类型:int。 | 否 | +| step | 指定dump某个step的数据,默认未配置,表示dump所有step数据。dump特定step时,须指定为训练脚本中存在的step。step为list格式,可配置逐个step,例如:step=[0,1,2];也可以配置step范围,例如:step=list(range(0,9)),表示dump第0到第8个step。数据类型:List[int]。 | 否 | +| enable_dataloader | 自动控制开关,可取值True(开启)或False(关闭),默认为False。配置为True后自动识别dump step参数指定的迭代,并在该迭代执行完成后退出训练,此时start和stop函数可不配置,开启该开关要求训练脚本是通过torch.utils.data.dataloader方式加载数据;配置为False则需要配置start和stop函数,并在最后一个stop函数后或一个step结束的位置添加debugger.step()。数据类型:bool。 | 否 | +| model | 开启init dump模式,传入网络模型实例化的对象,配置该参数后,dump操作仅dump网络中init方法里调用的方法(nn.Module类),不会对所有API进行dump。参数示例: model=net,net为网络模型实例化的对象名称。默认未配置。
配置该参数时,PrecisionDebugger模块请在模型实例化之后调用。数据类型:torch.nn.Module。
该模式不支持“溢出检测”、”ACL级别数据dump“和“模块级精度数据dump”。此模式下dump文件名前缀为网络中定义的模块名或层名。 | 否 | #### init dump模式示例代码和数据落盘说明 @@ -858,7 +865,7 @@ bn1_BatchNorm2d_0_backward_output.2.npy dump: ```python -debugger.configure_hook(mode="api_stack", scope=[], api_list=[], filter_switch="OFF", acl_config=None, backward_input=[], input_output_mode=["all"], summary_only=False) +debugger.configure_hook(mode="api_stack", scope=[], api_list=[], filter_switch="OFF", acl_config=None, backward_input=[], input_output_mode=["all"], summary_only=False, summary_mode="all") ``` 溢出检测: @@ -871,16 +878,16 @@ debugger.configure_hook(mode=None, acl_config=None, overflow_nums=1, need_replic | 参数名 | 说明 | 是否必选 | | ----------------- | ------------------------------------------------------------ | -------- | -| mode | dump模式。可取值"all"、"list"、"range"、"stack"、"acl"、"api_list"、"api_stack",各参数含义请参见本节的“**函数示例**”。参数示例:mode="list"。默认为api_stack。该参数配置值将作为dump数据文件名的前缀,详情请参见“**dump数据存盘说明**”。 | 否 | -| scope或api_list | dump范围。根据model配置的模式选择dump的API范围,mode="api_list"时,需要配置api_list=[],其他模式有需要时配置scope=[]。参数示例:scope=["Tensor_permute_1_forward", "Tensor_transpose_2_forward"]、api_list=["relu"]。默认为空。 | 否 | -| filter_switch | dump bool和整型的tensor以及浮点、bool和整型的标量的过滤开关。可取值"ON"(表示开启过滤,即不dump)或"OFF"(表示关闭过滤)。参数示例:filter_switch="ON"。默认不配置,即filter_switch="OFF",表示dump上述数据。 | 否 | -| acl_config | acl dump的配置文件。mode="acl"时,该参数必选;mode为其他值时,该参数不选。参数示例:acl_config='./dump.json'。dump.json配置文件详细介绍请参见“**dump.json配置文件说明**”。 | 否 | -| backward_input | 该输入文件为首次运行训练dump得到反向API输入的.npy文件。例如若需要dump Functional_conv2d_1 API的反向过程的输入输出,则需要在dump目录下查找命名包含Functional_conv2d_1、backward和input字段的.npy文件。 | 否 | -| input_output_mode | dump数据过滤。可取值"all"、"forward"、"backward"、"input"和"output",表示仅保存dump的数据中文件名包含"forward"、"backward"、"input"和"output"的前向、反向、输入或输出的.npy文件。参数示例input_output_mode=["backward"]或input_output_mode=["forward", "backward"]。默认为all,即保存所有dump的数据。除了all参数只能单独配置外,其他参数可以自由组合。 | 否 | -| summary_only | dump npy文件过滤,可取值True或False,配置为True后仅dump保存API统计信息的pkl文件,参数示例:summary_only=False,默认为False。 | 否 | -| summary_mode | 控制dump文件输出的模式,可取值md5(dump仅输出包含md5值的pkl文件,用于验证数据的完整性)、summary(dump仅输出包含API统计信息的pkl文件)、all(dump输出包含API统计信息的pkl文件以及具体的npy文件),参数示例:summary_mode=md5,默认为all。summary_only=True时,不允许配置该参数。 | 否 | -| overflow_nums | 控制溢出次数,表示第N次溢出时,停止训练,过程中检测到溢出API对应ACL数据均dump。参数示例:overflow_nums=3。配置overflow_check时可配置,默认不配置,即检测到1次溢出,训练停止,配置为-1时,表示持续检测溢出直到训练结束。 | 否 | -| need_replicate | 过程dump数据生成开关,执行溢出检测时,dump目录下会生成forward_real_data和backward_real_data的过程dump数据目录,可取值True(生成)或False(不生成),默认不生成。 | 否 | +| mode | dump模式。可取值"all"、"list"、"range"、"stack"、"acl"、"api_list"、"api_stack",各参数含义请参见本节的“**函数示例**”。参数示例:mode="list"。默认为"api_stack"。该参数配置值将作为dump数据文件名的前缀,详情请参见“**dump数据存盘说明**”。数据类型:str。 | 否 | +| scope或api_list | dump范围。根据model配置的模式选择dump的API范围,mode="api_list"时,需要配置api_list=[],其他模式有需要时配置scope=[]。参数示例:scope=["Tensor_permute_1_forward", "Tensor_transpose_2_forward"]、api_list=["relu"]。默认为空。数据类型:List[str]。 | 否 | +| filter_switch | dump bool和整型的tensor以及浮点、bool和整型的标量的过滤开关。可取值"ON"(表示开启过滤,即不dump)或"OFF"(表示关闭过滤)。参数示例:filter_switch="ON"。默认不配置,即filter_switch="OFF",表示dump上述数据。数据类型:str。 | 否 | +| acl_config | acl dump的配置文件。mode="acl"时,该参数必选;mode为其他值时,该参数不选。参数示例:acl_config='./dump.json'。dump.json配置文件详细介绍请参见“**dump.json配置文件说明**”。数据类型:str。 | 否 | +| backward_input | 该输入文件为首次运行训练dump得到反向API输入的.npy文件。例如若需要dump Functional_conv2d_1 API的反向过程的输入输出,则需要在dump目录下查找命名包含Functional_conv2d_1、backward和input字段的.npy文件。数据类型:str。 | 否 | +| input_output_mode | 
dump数据过滤。可取值"all"、"forward"、"backward"、"input"和"output",表示仅保存dump的数据中文件名包含"forward"、"backward"、"input"和"output"的前向、反向、输入或输出的.npy文件。参数示例input_output_mode=["backward"]或input_output_mode=["forward", "backward"]。默认为["all"],即保存所有dump的数据。除了all参数只能单独配置外,其他参数可以自由组合。数据类型:list。 | 否 | +| summary_only | dump npy文件过滤,可取值True或False,配置为True后仅dump保存API统计信息的pkl文件,参数示例:summary_only=False,默认为False。数据类型:bool。 | 否 | +| summary_mode | 控制dump文件输出的模式,可取值md5(dump仅输出包含md5值的pkl文件,用于验证数据的完整性)、summary(dump仅输出包含API统计信息的pkl文件)、all(dump输出包含API统计信息的pkl文件以及具体的npy文件),参数示例:summary_mode="md5",默认为"all"。summary_only=True时,不允许配置该参数。数据类型:str。 | 否 | +| overflow_nums | 控制溢出次数,表示第N次溢出时,停止训练,过程中检测到溢出API对应ACL数据均dump。参数示例:overflow_nums=3。配置overflow_check时可配置,默认不配置,即检测到1次溢出,训练停止,配置为-1时,表示持续检测溢出直到训练结束。数据类型:int。 | 否 | +| need_replicate | 过程dump数据生成开关,执行溢出检测时,dump目录下会生成forward_real_data和backward_real_data的过程dump数据目录,可取值True(生成)或False(不生成),默认不生成。数据类型:bool。 | 否 | **函数示例** @@ -1132,8 +1139,8 @@ seed_all(seed=1234, mode=False) | 参数名 | 说明 | 是否必选 | | ------ | ------------------------------------------------------------ | -------- | -| seed | 随机数种子。参数示例:seed=1000。默认值为:1234。 | 否 | -| mode | 确定性计算模式。可配置True或False。参数示例:mode=True。默认为False。
即使在相同的硬件和输入下,API多次执行的结果也可能不同,开启确定性计算是为了保证在相同的硬件和输入下,API多次执行的结果相同。
确定性计算会导致API执行性能降低,建议在发现模型多次执行结果不同的情况下开启。
rnn类算子、ReduceSum、ReduceMean等算子可能与确定性计算存在冲突,若开启确定性计算后多次执行的结果不相同,则考虑存在这些算子。 | 否 | +| seed | 随机数种子。参数示例:seed=1000。默认值为:1234。数据类型:int。 | 否 | +| mode | 确定性计算模式。可配置True或False。参数示例:mode=True。默认为False。数据类型:bool。
即使在相同的硬件和输入下,API多次执行的结果也可能不同,开启确定性计算是为了保证在相同的硬件和输入下,API多次执行的结果相同。
确定性计算会导致API执行性能降低,建议在发现模型多次执行结果不同的情况下开启。
rnn类算子、ReduceSum、ReduceMean等算子可能与确定性计算存在冲突,若开启确定性计算后多次执行的结果不相同,则考虑存在这些算子。 | 否 | **函数示例** @@ -1207,8 +1214,8 @@ set_dump_path(fpath=None, dump_tag='ptdbg_dump') | 参数名 | 说明 | 是否必选 | | -------- | ------------------------------------------------------------ | -------- | -| fpath | 设置数据目录路径。参数示例:'./dump_path'。
默认在dump_path目录下生成`ptdbg_dump_{version}`目录,并在该目录下生成`dump.pkl`文件以及`dump`数据文件保存目录。
当set_dump_switch函数配置了mode参数时,`dump.pkl`文件以及`dump`数据文件保存目录名称添加mode参数值为前缀,详情请参见“**dump数据存盘说明**”。
未配置fpath时,也可以通过环境变量ASCEND_WORK_PATH配置dump路径,此时数据将落盘在${ASCEND_WORK_PATH}/dump_data下,自定义配置dump_path优先级高于环境变量,fpath和环境变量需要二选一。 | 否 | -| dump_tag | 设置数据目录名称。参数示例:dump_tag='dump_conv2d'。默认数据目录命名为ptdbg_dump_{version}。
{version}为当前安装ptdbg_ascend工具版本。目录结构参见“**dump数据存盘说明**”。
配置该参数会将生成的`ptdbg_dump_{version}`目录名称变更为dump_tag配置的值,如`dump_conv2d_{version}`。 | 否 | +| fpath | 设置数据目录路径。参数示例:'./dump_path'。数据类型:str。
默认在dump_path目录下生成`ptdbg_dump_{version}`目录,并在该目录下生成`dump.pkl`文件以及`dump`数据文件保存目录。
当set_dump_switch函数配置了mode参数时,`dump.pkl`文件以及`dump`数据文件保存目录名称添加mode参数值为前缀,详情请参见“**dump数据存盘说明**”。
未配置fpath时,也可以通过环境变量ASCEND_WORK_PATH配置dump路径,此时数据将落盘在${ASCEND_WORK_PATH}/dump_data下,自定义配置fpath优先级高于环境变量,fpath和环境变量需要二选一。 | 否 | +| dump_tag | 设置数据目录名称。参数示例:dump_tag='dump_conv2d'。默认数据目录命名为ptdbg_dump_{version}。数据类型:str。
{version}为当前安装ptdbg_ascend工具版本。目录结构参见“**dump数据存盘说明**”。
配置该参数会将生成的`ptdbg_dump_{version}`目录名称变更为dump_tag配置的值,如`dump_conv2d_{version}`。 | 否 | **函数示例** @@ -1245,11 +1252,11 @@ register_hook(model, hook, overflow_nums=overflow_nums, dump_mode=dump_mode, dum | 参数名 | 说明 | 是否必选 | | ------------- | ------------------------------------------------------------ | -------- | -| model | 传入网络模型实例化的对象。参数示例: model=net,net为网络模型实例化的对象名称。 | 是 | -| hook | 注册工具的dump和溢出检测钩子。可取值overflow_check(表示溢出检测)和acc_cmp_dump(表示dump数据),二选一。 | 是 | -| overflow_nums | 控制溢出次数,表示第N次溢出时,停止训练,过程中检测到溢出API对应ACL数据均dump。参数示例:overflow_nums=3。配置overflow_check时可配置,默认不配置,即检测到1次溢出,训练停止,配置为-1时,表示持续检测溢出直到训练结束。 | 否 | -| dump_mode | 控制针对溢出API的dump模式,可取值"acl"或"api"。配置acl时,表示dump ACL级别的溢出数据,此时set_dump_path参数不生效,dump数据目录由dump_config的.json文件配置。参数示例:dump_mode="acl"。默认不配置,即dump API级别的溢出数据。 | 否 | -| dump_config | acl dump的配置文件。dump_mode="acl"时,该参数必选;dump_mode="api"时,该参数不选。参数示例:dump_config='./dump.json'。 | 否 | +| model | 传入网络模型实例化的对象。参数示例: model=net,net为网络模型实例化的对象名称。数据类型:torch.nn.Module。 | 是 | +| hook | 注册工具的dump和溢出检测钩子。可取值overflow_check(表示溢出检测)和acc_cmp_dump(表示dump数据),二选一。数据类型:Callable。 | 是 | +| overflow_nums | 控制溢出次数,表示第N次溢出时,停止训练,过程中检测到溢出API对应ACL数据均dump。参数示例:overflow_nums=3。配置overflow_check时可配置,默认不配置,即检测到1次溢出,训练停止,配置为-1时,表示持续检测溢出直到训练结束。数据类型:int。 | 否 | +| dump_mode | 控制针对溢出API的dump模式,可取值"acl"或"api"。配置acl时,表示dump ACL级别的溢出数据,此时set_dump_path参数不生效,dump数据目录由dump_config的.json文件配置。参数示例:dump_mode="acl"。默认不配置,即dump API级别的溢出数据。数据类型:str。 | 否 | +| dump_config | acl dump的配置文件。dump_mode="acl"时,该参数必选;dump_mode="api"时,该参数不选。参数示例:dump_config='./dump.json'。数据类型:str。 | 否 | **函数示例** @@ -1309,12 +1316,12 @@ def set_dump_switch(switch, mode="all", scope=[], api_list=[], filter_switch="OF | 参数名 | 说明 | 是否必选 | | --------------- | ------------------------------------------------------------ | -------- | -| switch | dump开关。可取值"ON"或"OFF"。须在选定dump开始的位置配置set_dump_switch("ON");dump结束的位置设置set_dump_switch("OFF")。 | 是 | -| mode | dump模式。可取值"all"、"list"、"range"、"stack"、"acl"、"api_list"、"api_stack",各参数含义请参见本节的“**函数示例**”。参数示例:mode="list"。默认为all。该参数配置值将作为dump数据文件名的前缀,详情请参见“**dump数据存盘说明**”。 | 否 | -| scope或api_list | dump范围。根据model配置的模式选择dump的API范围。参数示例:scope=["Tensor_permute_1_forward", "Tensor_transpose_2_forward"]、api_list=["relu"]。默认为空。 | 否 | -| filter_switch | dump bool和整型的tensor以及浮点、bool和整型的标量的过滤开关。可取值"ON"或"OFF"。参数示例:filter_switch="ON"。默认不配置,即filter_switch="OFF",表示dump上述数据。 | 否 | -| dump_mode | dump数据过滤。可取值"all"、"forward"、"backward"、"input"和"output",表示仅保存dump的数据中文件名包含"forward"、"backward"、"input"和"output"的前向、反向、输入或输出的.npy文件。参数示例dump_mode=["backward"]或dump_mode=["forward", "backward"]。默认为all,即保存所有dump的数据。除了all参数只能单独配置外,其他参数可以自由组合。 | 否 | -| summary_only | dump npy文件过滤,可取值True或False,配置为True后仅dump保存API统计信息的pkl文件,参数示例:summary_only=False,默认为False。 | 否 | +| switch | dump开关。可取值"ON"或"OFF"。须在选定dump开始的位置配置set_dump_switch("ON");dump结束的位置设置set_dump_switch("OFF")。数据类型:str。 | 是 | +| mode | dump模式。可取值"all"、"list"、"range"、"stack"、"acl"、"api_list"、"api_stack",各参数含义请参见本节的“**函数示例**”。参数示例:mode="list"。默认为"all"。该参数配置值将作为dump数据文件名的前缀,详情请参见“**dump数据存盘说明**”。数据类型:str。 | 否 | +| scope或api_list | dump范围。根据model配置的模式选择dump的API范围。参数示例:scope=["Tensor_permute_1_forward", "Tensor_transpose_2_forward"]、api_list=["relu"]。默认为空。数据类型:List[str]。 | 否 | +| filter_switch | dump bool和整型的tensor以及浮点、bool和整型的标量的过滤开关。可取值"ON"或"OFF"。参数示例:filter_switch="ON"。默认不配置,即filter_switch="OFF",表示dump上述数据。数据类型:str。 | 否 | +| dump_mode | 
dump数据过滤。可取值"all"、"forward"、"backward"、"input"和"output",表示仅保存dump的数据中文件名包含"forward"、"backward"、"input"和"output"的前向、反向、输入或输出的.npy文件。参数示例dump_mode=["backward"]或dump_mode=["forward", "backward"]。默认为all,即保存所有dump的数据。除了all参数只能单独配置外,其他参数可以自由组合。数据类型:List[str]。 | 否 | +| summary_only | dump npy文件过滤,可取值True或False,配置为True后仅dump保存API统计信息的pkl文件,参数示例:summary_only=False,默认为False。数据类型:bool。 | 否 | **推荐配置** @@ -1463,8 +1470,8 @@ set_overflow_check_switch(switch, filter_switch='OFF') | 参数名 | 说明 | 是否必选 | | ------------- | ------------------------------------------------------------ | -------- | -| switch, | 检测开关。可取值"ON"或"OFF"。如果只在特定的step溢出检测,则在期望溢出检测的step位置开始前插入set_overflow_check_switch("ON"),在step结束的位置插入set_overflow_check_switch("OFF")。 | 是 | -| filter_switch | dump bool和整型的tensor以及浮点、bool和整型的标量的过滤开关。可取值"ON"或"OFF"。参数示例:filter_switch="ON"。默认不配置,即filter_switch="OFF",表示dump上述数据。 | 否 | +| switch, | 检测开关。可取值"ON"或"OFF"。如果只在特定的step溢出检测,则在期望溢出检测的step位置开始前插入set_overflow_check_switch("ON"),在step结束的位置插入set_overflow_check_switch("OFF")。数据类型:str。 | 是 | +| filter_switch | dump bool和整型的tensor以及浮点、bool和整型的标量的过滤开关。可取值"ON"或"OFF"。参数示例:filter_switch="ON"。默认不配置,即filter_switch="OFF",表示dump上述数据。数据类型:str。 | 否 | **函数示例** @@ -1510,7 +1517,7 @@ set_backward_input(backward_input) | 参数名 | 说明 | 是否必选 | | -------------- | ------------------------------------------------------------ | -------- | -| backward_input | 该输入文件为首次运行训练dump得到反向API输入的.npy文件。例如若需要dump Functional_conv2d_1 API的反向过程的输入输出,则需要在dump目录下查找命名包含Functional_conv2d_1、backward和input字段的.npy文件。 | 是 | +| backward_input | 该输入文件为首次运行训练dump得到反向API输入的.npy文件。例如若需要dump Functional_conv2d_1 API的反向过程的输入输出,则需要在dump目录下查找命名包含Functional_conv2d_1、backward和input字段的.npy文件。数据类型:str。 | 是 | **函数示例** @@ -1597,8 +1604,8 @@ module_dump(module, module_name) | 参数名 | 说明 | 是否必选 | | ----------- | ------------------------------------------------------------ | -------- | -| module | 网络中实例化好的nn.Module类模块的model对象。 | 是 | -| module_name | 用户自定义的该model名称。主要用于dump数据文件的命名,便于在比对时识别模块级数据。 | 是 | +| module | 网络中实例化好的nn.Module类对象。数据类型:torch.nn.Module。 | 是 | +| module_name | 用户自定义的该model名称。主要用于dump数据文件的命名,便于在比对时识别模块级数据。数据类型:str。 | 是 | ### module_dump_end @@ -1689,7 +1696,7 @@ dump过程中,npy文件在对应算子或者模块被执行后就会落盘, 精度比对dump场景的结果如下: -* dump.pkl文件:包含dump数据的API名称(命名格式为:`{api_type}_{api_name}_{API调用次数}_{前向反向}_{input/output}.{参数序号}`)、dtype、 shape、各数据的max、min、mean、L2norm统计信息以及当配置summary_mode=md5时的md5数据。 +* dump.pkl文件:包含dump数据的API名称(命名格式为:`{api_type}_{api_name}_{API调用次数}_{前向反向}_{input/output}.{参数序号}`)、dtype、 shape、各数据的max、min、mean、L2norm统计信息以及当配置summary_mode="md5"时的md5数据。 其中,“参数序号”表示该API下的第n个参数,例如1,则为第一个参数,若该参数为list格式,则根据list继续排序,例如1.1,表示该API的第1个参数的第1个子参数;L2norm表示2范数(平方根)。 @@ -1767,9 +1774,9 @@ compare_distributed(npu_dump_dir, bench_dump_dir, output_path, **kwargs) | 参数名 | 说明 | 是否必选 | | -------------- | ------------------------------------------------------------ | -------- | -| npu_dump_dir | 配置NPU环境下的dump目录。dump数据目录须指定到step级。参数示例:'./npu_dump/ptdbg_dump_v4.0/step0'。register_hook方式可通过set_dump_path函数的dump_tag参数修改该目录名称。 | 是 | -| bench_dump_dir | 配置CPU、GPU或NPU环境下的dump目录。参数示例:'./gpu_dump/ptdbg_dump_v4.0/step0'。register_hook方式可通过set_dump_path函数的dump_tag参数修改该目录名称。 | 是 | -| output_path | 配置比对结果csv文件存盘目录。需要预先创建output_path目录。参数示例:'./output'。文件名称基于时间戳自动生成,格式为:`compare_result_rank{npu_ID}-rank{cpu/gpu/npu_ID}_{timestamp}.csv`。 | 是 | +| npu_dump_dir | 配置NPU环境下的dump目录。dump数据目录须指定到step级。参数示例:'./npu_dump/ptdbg_dump_v4.0/step0'。register_hook方式可通过set_dump_path函数的dump_tag参数修改该目录名称。数据类型:str。 | 是 | +| bench_dump_dir | 
配置CPU、GPU或NPU环境下的dump目录。参数示例:'./gpu_dump/ptdbg_dump_v4.0/step0'。register_hook方式可通过set_dump_path函数的dump_tag参数修改该目录名称。数据类型:str。 | 是 | +| output_path | 配置比对结果csv文件存盘目录。需要预先创建output_path目录。参数示例:'./output'。文件名称基于时间戳自动生成,格式为:`compare_result_rank{npu_ID}-rank{cpu/gpu/npu_ID}_{timestamp}.csv`。数据类型:str。 | 是 | | **kwargs | 支持compare的所有可选参数。 | 否 | **函数示例** @@ -1799,11 +1806,11 @@ compare(input_param, output_path, stack_mode=False, auto_analyze=True, fuzzy_mat | 参数名 | 说明 | 是否必选 | | ------------ | ------------------------------------------------------------ | -------- | -| input_param | 配置dump数据文件及目录。配置参数包括:
- "npu_pkl_path":指定NPU dump目录下的.pkl文件。参数示例:"npu_pkl_path": "./npu_dump/ptdbg_dump_v4.0/step0/rank0/api_stack_dump.pkl"。必选。
- "bench_pkl_path":指定CPU、GPU或NPU dump目录下的.pkl文件。参数示例:"bench_pkl_path": "./gpu_dump/ptdbg_dump_v4.0/step0/rank0/api_stack_dump.pkl"。必选。
- "npu_dump_data_dir":"指定NPU dump目录下的dump数据目录。参数示例:"npu_dump_data_dir": "./npu_dump/ptdbg_dump_v4.0/step0/rank0/api_stack_dump"。可选,仅比对pkl文件时不选。
- "bench_dump_data_dir":"指定CPU、GPU或NPU dump目录下的dump数据目录。参数示例:"npu_dump_data_dir": "./gpu_dump/ptdbg_dump_v4.0/step0/rank0/api_stack_dump"。可选,仅比对pkl文件时不选。
- "is_print_compare_log":配置是否开启日志打屏。可取值True或False。可选。 | 是 | -| output_path | 配置比对结果csv文件存盘目录。参数示例:'./output'。文件名称基于时间戳自动生成,格式为:`compare_result_{timestamp}.csv`。 | 是 | -| stack_mode | 配置stack_mode的开关。仅当dump数据时配置debugger.configure_hook或set_dump_switch的mode="api_stack"时需要开启。参数示例:stack_mode=True,默认为False。 | 否 | -| auto_analyze | 自动精度分析,开启后工具自动针对比对结果进行分析,识别到第一个精度不达标节点(在比对结果文件中的“Accuracy Reached or Not”列显示为No),并给出问题可能产生的原因(打屏展示并生成advisor_{timestamp}.txt文件)。可取值True或False,参数示例:auto_analyze=False,默认为True。 | 否 | -| fuzzy_match | 模糊匹配。开启后,对于网络中同一层级且命名仅调用次数不同的API,可匹配并进行比对。可取值True或False,参数示例:fuzzy_match=True,默认为False。 | 否 | +| input_param | 配置dump数据文件及目录。数据类型:dict。配置参数包括:
- "npu_pkl_path":指定NPU dump目录下的.pkl文件。参数示例:"npu_pkl_path": "./npu_dump/ptdbg_dump_v4.0/step0/rank0/api_stack_dump.pkl"。必选。
- "bench_pkl_path":指定CPU、GPU或NPU dump目录下的.pkl文件。参数示例:"bench_pkl_path": "./gpu_dump/ptdbg_dump_v4.0/step0/rank0/api_stack_dump.pkl"。必选。
- "npu_dump_data_dir":"指定NPU dump目录下的dump数据目录。参数示例:"npu_dump_data_dir": "./npu_dump/ptdbg_dump_v4.0/step0/rank0/api_stack_dump"。可选,仅比对pkl文件时不选。
- "bench_dump_data_dir":"指定CPU、GPU或NPU dump目录下的dump数据目录。参数示例:"npu_dump_data_dir": "./gpu_dump/ptdbg_dump_v4.0/step0/rank0/api_stack_dump"。可选,仅比对pkl文件时不选。
- "is_print_compare_log":配置是否开启日志打屏。可取值True或False。可选。 | 是 | +| output_path | 配置比对结果csv文件存盘目录。参数示例:'./output'。文件名称基于时间戳自动生成,格式为:`compare_result_{timestamp}.csv`。数据类型:str。 | 是 | +| stack_mode | 配置stack_mode的开关。仅当dump数据时配置debugger.configure_hook或set_dump_switch的mode="api_stack"时需要开启。可取值True或False,参数示例:stack_mode=True,默认为False。数据类型:bool。 | 否 | +| auto_analyze | 自动精度分析,开启后工具自动针对比对结果进行分析,识别到第一个精度不达标节点(在比对结果文件中的“Accuracy Reached or Not”列显示为No),并给出问题可能产生的原因(打屏展示并生成advisor_{timestamp}.txt文件)。可取值True或False,参数示例:auto_analyze=False,默认为True。数据类型:bool。 | 否 | +| fuzzy_match | 模糊匹配。开启后,对于网络中同一层级且命名仅调用次数不同的API,可匹配并进行比对。可取值True或False,参数示例:fuzzy_match=True,默认为False。数据类型:bool。 | 否 | **函数示例** @@ -1846,16 +1853,15 @@ compare(dump_result_param, output_path="./output", stack_mode=True) **函数原型** ```python -parse(pkl_file, moudule_name_prefix) +parse(pkl_file, module_name_prefix) ``` **参数说明** -| 参数名 | 说明 | 是否必选 | -| ------------------- | ------------------------------------------------------------ | -------- | -| pkl_file | 指定dump数据文件中的pkl文件名。参数示例:"./npu_dump/ptdbg_dump_v4.0/step0/rank0/dump.pkl"。 | 是 | -| moudule_name_prefix | 指定待提取的API接口前缀。参数示例:"Torch_norm_1_forward"。 | 是 | - +| 参数名 | 说明 | 是否必选 | +| ------------------ | ------------------------------------------------------------ | -------- | +| pkl_file | 指定dump数据文件中的pkl文件名。参数示例:"./npu_dump/ptdbg_dump_v4.0/step0/rank0/dump.pkl"。数据类型:str。 | 是 | +| module_name_prefix | 指定待提取的API接口前缀。参数示例:"Torch_norm_1_forward"。数据类型:str。 | 是 | **函数示例** 创建堆栈信息及数据统计信息提取脚本,例如parse.py,拷贝如下代码,具体参数请根据实际环境修改。 @@ -1992,14 +1998,15 @@ Parse >>> cad -m /home/xxx/my_dump_path/20000124003856/0 输入以下比对命令进行数据比对。 ```bash -vc -m my_dump_path -g golden_dump_path [-out output_path] +vc -m my_dump_path -g golden_dump_path [-out output_path] [-cmp_path msaccucmp_path] ``` -| 参数名称 | 说明 | 是否必选 | -| -------- | ------------------------------------------------------------ | -------- | -| -m | 待比对ACL dump数据目录。如果比对单个算子,需要指定到ACL dump数据的model_id级目录;如果批量比对,则指定到cad转换后的timestamp级目录。 | 是 | -| -g | 标杆ACL dump数据目录。如果比对单个算子,需要指定到ACL dump数据的model_id级目录;如果批量比对,则指定到cad转换后的timestamp级目录。 | 是 | -| -out | 结果输出目录,须指定已存在的目录,默认为./parse_data/acl_batch_comapre。未指定时保存在默认路径下,比对结束后会打印log提示输出结果存放路径。 | 否 | +| 参数名称 | 说明 | 是否必选 | +| --------- | ------------------------------------------------------------ | -------- | +| -m | 待比对ACL dump数据目录。如果比对单个算子,需要指定到ACL dump数据的model_id级目录;如果批量比对,则指定到cad转换后的timestamp级目录。 | 是 | +| -g | 标杆ACL dump数据目录。如果比对单个算子,需要指定到ACL dump数据的model_id级目录;如果批量比对,则指定到cad转换后的timestamp级目录。 | 是 | +| -out | 结果输出目录,须指定已存在的目录,默认为./parse_data/acl_batch_comapre。未指定时保存在默认路径下,比对结束后会打印log提示输出结果存放路径。 | 否 | +| -cmp_path | 指定msaccucmp路径,默认路径为:/usr/local/Ascend/ascend-toolkit/latest/tools/operator_cmp/compare/msaccucmp.py | 否 | 输出结果:batch_compare_{timestamp}.csv文件。 diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/common/utils.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/common/utils.py index 01759056764959d8ac9bbef3b2b19f41ddb961fa..3c2e5f02d9b3cb4f93e7885c75a4f4c7078aa76c 100644 --- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/common/utils.py +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/common/utils.py @@ -713,6 +713,8 @@ def parameter_adapter(func): else: res = [input_tensor[tensor_index] for tensor_index in indices] return getattr(torch._C._VariableFunctionsClass, "stack")(res, 0) + if self.op_name_ == "__eq__" and args[1] is None: + return False return func(self, *args, **kwargs) return inner diff --git 
a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/compare/acc_compare.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/compare/acc_compare.py index 68fe6c0d7ea6442bc01dfea19cbc97295e174016..0885ebb0ccc1b6bc280ca8f8f3e4ebbd6de11610 100644 --- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/compare/acc_compare.py +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/compare/acc_compare.py @@ -273,6 +273,9 @@ def read_op(ops_queue, pkl_file_handle, stack_mode): continue if len(tensor_line) != 0: tensor_data = json.loads(tensor_line) + if not isinstance(tensor_data, list): + print_error_log(f"This data is not a list, please check the dump data pkl file. {tensor_data}") + raise CompareException(CompareException.INVALID_DATA_ERROR) read_output_flag["last_line"] = read_output_flag.get("curr_line") read_output_flag["curr_line"] = True if tensor_data[0].find(end_flag) != -1 else False diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/api_registry.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/api_registry.py index 064103aead22e0f417794e047bbb83557e1b7020..cf21fe86bb541e64101dbdd360739a136f898d71 100644 --- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/api_registry.py +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/api_registry.py @@ -60,12 +60,23 @@ class ApiRegistry: @staticmethod def store_ori_attr(ori_api_group, api_list, api_ori_attr): for api in api_list: - api_ori_attr[api] = getattr(ori_api_group, api) + if '.' in api: + sub_module_name, sub_op = api.rsplit('.', 1) + sub_module = getattr(ori_api_group, sub_module_name) + api_ori_attr[api] = getattr(sub_module, sub_op) + else: + api_ori_attr[api] = getattr(ori_api_group, api) @staticmethod def set_api_attr(api_group, attr_dict): for api, api_attr in attr_dict.items(): - setattr(api_group, api, api_attr) + if '.' 
in api: + sub_module_name, sub_op = api.rsplit('.', 1) + sub_module = getattr(api_group, sub_module_name, None) + if sub_module is not None: + setattr(sub_module, sub_op, api_attr) + else: + setattr(api_group, api, api_attr) def api_modularity(self): self.set_api_attr(torch.Tensor, self.tensor_hook_attr) diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/support_wrap_ops.yaml b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/support_wrap_ops.yaml index de9fac5dbbbbd1a008f2c61ace8ea7fcabd7efe7..92096fc4bb336928b2ddf9c3e8eba33dca71a12c 100644 --- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/support_wrap_ops.yaml +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/support_wrap_ops.yaml @@ -560,12 +560,45 @@ tensor: - xlogy_ torch: + - linalg.norm + - linalg.vector_norm + - linalg.matrix_norm + - linalg.diagonal + - linalg.det + - linalg.slogdet + - linalg.cond + - linalg.matrix_rank + - linalg.qr + - linalg.lu + - linalg.lu_factor + - linalg.svd + - linalg.svdvals + - linalg.solve + - linalg.lstsq + - linalg.inv + - linalg.pinv + - linalg.matrix_exp + - linalg.matrix_power + - linalg.cross + - linalg.matmul + - linalg.vecdot + - linalg.multi_dot + - linalg.householder_product + - linalg.tensorsolve + - linalg.vander + - linalg.cholesky_ex + - linalg.inv_ex + - linalg.solve_ex + - linalg.lu_factor_ex + - linalg.ldl_factor + - linalg.ldl_factor_ex - _adaptive_avg_pool2d - _add_relu - _add_relu_ - _aminmax - _batch_norm_impl_index - _convolution + - _foreach_norm - _softmax_backward_data - abs - abs_ diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/wrap_torch.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/wrap_torch.py index 5dcc41b1c8c23c90a6bbad1fe764fef389595661..e3a4af7a850397989fad1e810181eaa7d6fccfb1 100644 --- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/wrap_torch.py +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/wrap_torch.py @@ -32,12 +32,29 @@ with FileOpen(yaml_path, 'r') as f: def get_torch_ops(): global WrapTorchOps - _torch_ops = dir(torch) - return set(WrapTorchOps) & set(_torch_ops) + _torch_ops = [] + for operation in WrapTorchOps: + if '.' in operation: + operation_sub_module_name, operation_sub_op = operation.rsplit('.', 1) + operation_sub_module = getattr(torch, operation_sub_module_name) + if operation_sub_op in dir(operation_sub_module): + _torch_ops.append(operation) + else: + if hasattr(torch, operation): + _torch_ops.append(operation) + return set(_torch_ops) + + +TorchOps = {} +for op in get_torch_ops(): + if '.' 
in op: + sub_module_name, sub_op = op.rsplit('.', 1) + sub_module = getattr(torch, sub_module_name) + TorchOps[op] = getattr(sub_module, sub_op) + else: + TorchOps[op] = getattr(torch, op) -TorchOps = {op: getattr(torch, op) for op in get_torch_ops()} - class HOOKTorchOP(object): pass
diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/setup.py b/debug/accuracy_tools/ptdbg_ascend/src/python/setup.py index fb5b8ff0007ca9e25b99720c0a30d6c5f01975cf..36082bfdef2711aafc85e8d83565431f37e0a71a 100644 --- a/debug/accuracy_tools/ptdbg_ascend/src/python/setup.py +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/setup.py @@ -20,7 +20,7 @@ import stat from pathlib import Path import setuptools -VERSION = '5.0.T4' +VERSION = '5.0' def generate_ptdbg_ascend_version():
diff --git a/debug/weight_convert/README.md b/debug/weight_convert/README.md new file mode 100644 index 0000000000000000000000000000000000000000..6cc4e2481fbee3460fa79d600bd680721db4e6b7 --- /dev/null +++ b/debug/weight_convert/README.md @@ -0,0 +1,90 @@ +## 训推一体权重转换工具 + +推理场景基于Huggingface的权重进行推理,Huggingface主要基于GPU训练,而昇腾主要在NPU上进行训练,不同硬件平台对应的模型权重格式存在差异。需要支持在NPU和GPU上训练好的模型转成Huggingface格式safetensors文件,用于推理使用。 + + +#### 前提条件 +准备以下权重: +1. 复训前huggingface权重 +2. 复训后权重 + +##### 依赖安装 + +如下命令如果使用非root用户安装,需要在安装命令后加上--user,例如:**pip3 install xxx** **--user**,安装命令可在任意路径下执行。 + +```shell +# python=3.8 +pip install torch-2.1.0-cp38-cp38m-linux_aarch64.whl +pip install torch_npu-2.1.0.post5_XXXXXX-cp38-cp38m-linux_aarch64.whl +source /path/to/Ascend/ascend-toolkit/set_env.sh + +git clone https://gitee.com/ascend/ModelLink.git +git clone https://gitee.com/ascend/AscendSpeed.git + +cd AscendSpeed +pip3 install -r requirements.txt +pip3 install -e . +cd .. +cd ModelLink +pip3 install -r requirements.txt +export PYTHONPATH=`pwd`:$PYTHONPATH +cd .. +``` + +##### 代码获取 + +```shell +git clone https://gitee.com/Ascend/att.git +cd att +git checkout develop + +cd ../ModelLink +git reset --hard c566ce4fa99cf3ea179b163355fca2c2aedfc471 +cp ../att/debug/weight_convert/diff.patch . +git apply --check diff.patch +git apply diff.patch +cd ../att/debug/weight_convert/ +``` + +#### 启动工具 + +1. 参考表1 参数说明配置信息,执行如下命令启动转换任务。转换后权重会保存在`原始huggingface权重存放位置/mg2hg`下。 + +```shell +python3 convert_ckpt.py -i 待转换权重路径 -o 原始huggingface权重存放位置 -m 模型类型,可选项:llama/bloom\ + [--target-tensor-parallel-size 张量并行数 \ + --target-pipeline-parallel-size 流水线并行数\ + --embed-layernorm] +``` + + **表1 参数说明** + + | 参数 | 参数说明 | 取值示例 | + | ---------------------------------- | -------------------------------------- | ------------------------------------------------------------ | + | -i
<br>--input-model-dir | **必选** 待转换权重文件的存放位置 | /home/*xxx*/*input_weight* |
+ | -o<br>--output-model-dir | **必选** 导出权重文件的存放位置(要求目录下有原始huggingface权重) | /home/*xxx*/*output_weight* |
+ | -m<br>--model | **必选** 转换的模型类型 | llama(默认)<br>bloom |
+ | --target-tensor-parallel-size | 转换后张量并行数 | 1 |
+ | --target-pipeline-parallel-size | 转换后流水线并行数 | 1 |
+ | --embed-layernorm | 模型中是否存在embedding layernorm结构 | False(默认)<br>True |
+ | -h<br>
--help | 显示帮助信息。 | - | + + +2. 模型转换命令参考 + + **Llama 7/13/65B**、 **Llama2 7/13/70B** +```shell +python3 convert_ckpt.py -o "your huggingface checkpoint output path" \ + -i "your megatron checkpoint path" \ + --model llama +``` + + **Bloom 7B** +```shell +python3 convert_ckpt.py -o "your huggingface checkpoint output path" \ + -i "your megatron checkpoint path" \ + --model bloom +``` + + +3. 分析完成后,进入输出路径,查看转换结果。 \ No newline at end of file diff --git a/debug/weight_convert/bloom.py b/debug/weight_convert/bloom.py new file mode 100644 index 0000000000000000000000000000000000000000..d884d451f3282be197df967ebdd979a20c6775c1 --- /dev/null +++ b/debug/weight_convert/bloom.py @@ -0,0 +1,526 @@ +# Copyright 2024 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the License); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import argparse +from collections.abc import Mapping +import concurrent.futures +import os +import gc +import sys +import shutil +import torch +import torch_npu + + +def add_arguments(parser): + group = parser.add_argument_group(title='Megatron saver') + group.add_argument('--target-tensor-parallel-size', type=int, + help='Target tensor model parallel size, defaults to the tensor parallel size ' + 'in the input checkpoint if provided by the loader, otherwise to 1') + group.add_argument('--target-pipeline-parallel-size', type=int, + help='Target tensor model parallel size, default to the pipeline parall size ' + 'in the input checkpoint if provided by the loader, otherwise to 1') + group.add_argument("--w-pack", type=bool, + help='True is w_pack weight for llm', + default=False) + + +def save_huggingface_bloom(args, model, model_args): + hf2mg_map = {} + for name_param_m in model.named_parameters(): + print("name_param_m", name_param_m[0]) + layer_num = name_param_m[0].split(".")[3] if len(name_param_m[0].split(".")) > 3 else name_param_m[0].split(".")[1] + nh = model_args.num_attention_heads + ng = ( + model_args.checkpoint_args.num_query_groups + if model_args.checkpoint_args.group_query_attention + else model_args.num_attention_heads + ) + repeats = nh // ng + # word embedding + if name_param_m[0] == "language_model.embedding.word_embeddings.weight": + hf2mg_map["word_embeddings.weight"] = name_param_m[1] + continue + if name_param_m[0] == "language_model.embedding.word_embeddings.norm.weight": + hf2mg_map["word_embeddings_layernorm.weight"] = name_param_m[1] + continue + if name_param_m[0] == "language_model.embedding.word_embeddings.norm.bias": + hf2mg_map["word_embeddings_layernorm.bias"] = name_param_m[1] + continue + + # input layernorm + if name_param_m[0] == f"language_model.encoder.layers.{layer_num}.input_norm.weight": + hf2mg_map[f"h.{layer_num}.input_layernorm.weight"] = name_param_m[1] + continue + if name_param_m[0] == f"language_model.encoder.layers.{layer_num}.input_norm.bias": + hf2mg_map[f"h.{layer_num}.input_layernorm.bias"] = name_param_m[1] + continue + + # qkv + if name_param_m[0] == f"language_model.encoder.layers.{layer_num}.self_attention.query_key_value.weight": + 
hf2mg_map[f"h.{layer_num}.self_attention.query_key_value.weight"] = name_param_m[1] + continue + if name_param_m[0] == f"language_model.encoder.layers.{layer_num}.self_attention.query_key_value.bias": + hf2mg_map[f"h.{layer_num}.self_attention.query_key_value.bias"] = name_param_m[1] + continue + + # post attention norm + if name_param_m[0] == f"language_model.encoder.layers.{layer_num}.post_attention_norm.weight": + hf2mg_map[f"h.{layer_num}.post_attention_layernorm.weight"] = name_param_m[1] + continue + if name_param_m[0] == f"language_model.encoder.layers.{layer_num}.post_attention_norm.bias": + hf2mg_map[f"h.{layer_num}.post_attention_layernorm.bias"] = name_param_m[1] + continue + + # dense + if name_param_m[0] == f"language_model.encoder.layers.{layer_num}.self_attention.dense.weight": + hf2mg_map[f"h.{layer_num}.self_attention.dense.weight"] = name_param_m[ + 1 + ] + continue + if name_param_m[0] == f"language_model.encoder.layers.{layer_num}.self_attention.dense.bias": + hf2mg_map[f"h.{layer_num}.self_attention.dense.bias"] = name_param_m[1] + continue + # mlp + if name_param_m[0] == f"language_model.encoder.layers.{layer_num}.mlp.dense_h_to_4h.weight": + hf2mg_map[f"h.{layer_num}.mlp.dense_h_to_4h.weight"] = name_param_m[1] + continue + if name_param_m[0] == f"language_model.encoder.layers.{layer_num}.mlp.dense_h_to_4h.bias": + hf2mg_map[f"h.{layer_num}.mlp.dense_h_to_4h.bias"] = name_param_m[1] + continue + if name_param_m[0] == f"language_model.encoder.layers.{layer_num}.mlp.dense_4h_to_h.weight": + hf2mg_map[f"h.{layer_num}.mlp.dense_4h_to_h.weight"] = name_param_m[1] + continue + if name_param_m[0] == f"language_model.encoder.layers.{layer_num}.mlp.dense_4h_to_h.bias": + hf2mg_map[f"h.{layer_num}.mlp.dense_4h_to_h.bias"] = name_param_m[1] + continue + # final norm + if name_param_m[0] == "language_model.encoder.final_norm.weight": + hf2mg_map[f"ln_f.weight"] = name_param_m[1] + continue + if name_param_m[0] == "language_model.encoder.final_norm.bias": + hf2mg_map[f"ln_f.bias"] = name_param_m[1] + continue + print('hf2mg_map.keys', hf2mg_map.keys()) + + gc.collect() + file_format = "pytorch_model-{:05d}-of-{:05d}.bin" + file_list = os.listdir(args.output_model_dir) + output_mg2hg_path = os.path.join(args.output_model_dir, 'mg2hg') + os.makedirs(output_mg2hg_path, exist_ok=True) + for filename in file_list: + if filename.startswith("pytorch_model-") and filename.endswith(".bin"): + try: + start_index = len("pytorch_model-") + end_index = filename.index("-of-") + index1 = int(filename[start_index:end_index]) + index2 = int(filename[end_index + len("-of-"): -len(".bin")]) + if 0 <= index1 <= index2 <= 99999: + formatted_filename = file_format.format(index1, index2) + file_path = os.path.join(args.output_model_dir, formatted_filename) + hf_model = torch.load(file_path, map_location=torch.device('cpu')) + for key in hf_model.keys(): + print(key) + if key in hf2mg_map.keys(): + hf_model[key].data.copy_(hf2mg_map[key]) + output_file_path = os.path.join(output_mg2hg_path, formatted_filename) + print(f'save weight to {output_file_path}') + torch.save(hf_model, output_file_path) + else: + print("Invalid file format:", filename) + except ValueError: + print("Unable to save file:", filename) + elif filename.endswith('.json') or filename.endswith('.model'): + source_file = os.path.join(args.output_model_dir, filename) + destination_file = os.path.join(output_mg2hg_path, filename) + shutil.copyfile(source_file, destination_file) + output_model_dir = os.path.join(args.output_model_dir, 
'mg2hg') + print(f'save weight to {output_model_dir}') + + +def save_model_checkpoint(queue, args): + + # Search in directory above this + # sys.path.append(os.path.abspath( + # os.path.join(os.path.dirname(__file__), + # os.path.pardir, + # os.path.pardir))) + + from pretrain_gpt import model_provider + from ascendspeed import megatron_adaptor + from megatron.arguments import validate_args + from modellink.utils import parse_args + from megatron.checkpointing import save_checkpoint + from megatron.global_vars import set_global_variables, get_args + from megatron.core.enums import ModelType + from megatron.tokenizer.tokenizer import _vocab_size_with_padding + from megatron import fused_kernels + from megatron.core import mpu + + def queue_get(name=None): + val = queue.get() + if val == "exit": + raise RuntimeError('Loader exited, exiting saver') + if name is not None and args.checking and val["name"] != name: + val_name = val["name"] + raise RuntimeError(f'Unexpected message. Expecting "{name}" but got "{val_name}". Exiting saver.') + if name is not None: + print(f"received {name}") + return val + + def check_message(msg): + if not args.checking: + return + msg_name = msg.pop("name") + if len(msg.keys()) > 0: + print(f"Unexpected values in {msg_name}:") + for key in msg.keys(): + print(f" {key}") + raise RuntimeError(f"Exiting. If you want to ignore this, use the argument --no-checking.") + + + md = queue_get() + + if args.target_tensor_parallel_size is None: + if hasattr(md, 'previous_tensor_parallel_size'): + args.target_tensor_parallel_size = md.previous_tensor_parallel_size + else: + print("loader did not provide a tensor parallel size and --target-tensor-parallel-size not provided on command line. " + "Default to 1.") + args.target_tensor_parallel_size = 1 + + if args.target_pipeline_parallel_size is None: + if hasattr(md, 'previous_pipeline_parallel_size'): + args.target_pipeline_parallel_size = md.previous_pipeline_parallel_size + else: + print("loader did not provide a pipeline parallel size and --target-pipeline-parallel-size not provided on command line. 
" + "Default to 1.") + args.target_pipeline_parallel_size = 1 + + + # Arguments do sanity checks on the world size, but we don't care, + # so trick it into thinking we are plenty of processes + if args.target_tensor_parallel_size is not None and args.target_pipeline_parallel_size is not None: + os.environ["WORLD_SIZE"] = f'{args.target_tensor_parallel_size * args.target_pipeline_parallel_size}' + + # We want all arguments to come from us + sys.argv = ['script.py', + '--num-layers', str(md.num_layers), + '--hidden-size', str(md.hidden_size), + '--seq-length', str(md.seq_length), + '--num-attention-heads', str(md.num_attention_heads), + '--max-position-embeddings', str(md.max_position_embeddings), + '--position-embedding-type', str(md.position_embedding_type), + '--tokenizer-type', str(md.tokenizer_type), + '--tensor-model-parallel-size', str(args.target_tensor_parallel_size), + '--pipeline-model-parallel-size', str(args.target_pipeline_parallel_size), + '--no-masked-softmax-fusion', + '--no-bias-gelu-fusion', + '--no-bias-dropout-fusion', + '--no-async-tensor-model-parallel-allreduce', + '--use-cpu-initialization', + '--micro-batch-size', '1', + '--no-load-optim', + '--no-load-rng', + '--no-save-optim', + '--no-save-rng', + '--no-initialization', + '--save-interval', '1', + '--save', args.output_model_dir, + '--fp16' + ] + + if md.make_vocab_size_divisible_by is not None: + sys.argv.extend(['--make-vocab-size-divisible-by', str(md.make_vocab_size_divisible_by)]) + if md.output_layer: + sys.argv.append('--untie-embeddings-and-output-weights') + if not md.linear_bias: + sys.argv.append('--disable-bias-linear') + + margs = parse_args() + margs.w_pack = args.w_pack + + + if hasattr(md, 'checkpoint_args'): + # These are arguments that we are either changing, or cause problems for validation if they are set + # Note that some of these deal with T5 so will need to be changed if we support T5. 
+ args_to_keep = ['tensor_model_parallel_size', 'pipeline_model_parallel_size', 'world_size', 'params_dtype', + 'num_layers_per_virtual_pipeline_stage', 'virtual_pipeline_model_parallel_size', + 'masked_softmax_fusion', 'bias_gelu_fusion', 'bias_dropout_fusion', + 'sequence_parallel', 'async_tensor_model_parallel_allreduce', + 'no_load_optim', 'no_load_rng', 'no_save_optim', 'no_save_rng', + 'vocab_file', 'tokenizer_model', + 'save_interval', 'save', + 'perform_initialization', 'use_cpu_initialization', + 'recompute_granularity', 'recompute_num_layers', 'recompute_method', + 'encoder_num_layers', 'encoder_seq_length', + 'distribute_saved_activations', + 'train_iters', 'lr_decay_iters', 'lr_warmup_iters', 'lr_warmup_fraction', + 'start_weight_decay', 'end_weight_decay'] + + + for arg, value in vars(md.checkpoint_args).items(): + if arg in args_to_keep: + continue + if not hasattr(margs, arg): + print(f"Checkpoint had argument {arg} but new arguments does not have this.") + continue + if getattr(margs, arg) != value: + print(f"Overwriting default {arg} value {getattr(margs, arg)} with value from checkpoint {value}.") + setattr(margs, arg, value) + + validate_args(margs) + + set_global_variables(margs, build_tokenizer=False) + + # margs = megatron args + margs = get_args() + + margs.model_type = ModelType.encoder_or_decoder + + if hasattr(md, 'consumed_train_samples'): + margs.consumed_train_samples = md.consumed_train_samples + margs.consumed_valid_samples = md.consumed_valid_samples + print(f"Setting consumed_train_samples to {margs.consumed_train_samples}" + f" and consumed_valid_samples to {margs.consumed_valid_samples}") + else: + print("consumed_train_samples not provided.") + + def get_models(count, dtype, pre_process, post_process): + models = [model_provider(pre_process, post_process).to(dtype) for _ in range(count)] + return models + + # fake initializing distributed + mpu.set_tensor_model_parallel_world_size(args.target_tensor_parallel_size) + mpu.set_pipeline_model_parallel_world_size(args.target_pipeline_parallel_size) + mpu.set_tensor_model_parallel_rank(0) + mpu.set_pipeline_model_parallel_rank(0) + # Embeddings + #----------- + embeddings_msg = queue_get("embeddings") + + pos_embed = None + if md.position_embedding_type == 'learned_absolute': + pos_embed = embeddings_msg.pop("position embeddings") + orig_word_embed = embeddings_msg.pop("word embeddings") + orig_word_embed_n_w, orig_word_embed_n_b = None, None + if "word embeddings norm_w" in embeddings_msg and "word embeddings norm_b" in embeddings_msg: + orig_word_embed_n_w = embeddings_msg.pop("word embeddings norm_w") + orig_word_embed_n_b = embeddings_msg.pop("word embeddings norm_b") + check_message(embeddings_msg) + + # Deal with padding + if md.true_vocab_size is not None: + # figure out what our padded vocab size is + orig_vocab_size = orig_word_embed.shape[0] + margs.padded_vocab_size = _vocab_size_with_padding(md.true_vocab_size, margs) + + # Cut out extra padding we don't need + if orig_vocab_size > margs.padded_vocab_size: + full_word_embed = orig_word_embed[0:margs.padded_vocab_size, :] + + # Expanding embedding to larger size by replicating final entry + elif orig_vocab_size < margs.padded_vocab_size: + padding_size = margs.padded_vocab_size - orig_vocab_size + + full_word_embed = torch.cat(( + orig_word_embed, + orig_word_embed[-1].unsqueeze(0).expand(padding_size, -1))) + + # Same size! + else: + full_word_embed = orig_word_embed + else: + print("Original vocab size not specified, leaving embedding table as-is. 
" + "If you've changed the tensor parallel size this could cause problems.") + margs.padded_vocab_size = orig_word_embed.shape[0] + full_word_embed = orig_word_embed + + # Split into new tensor model parallel sizes + out_word_embed = torch.chunk(full_word_embed, args.target_tensor_parallel_size, dim=0) + + # Make models for first pipeline stage and fill in embeddings + mpu.set_pipeline_model_parallel_rank(0) + post_process = args.target_pipeline_parallel_size == 1 + models = get_models(args.target_tensor_parallel_size, md.params_dtype, True, post_process) + for tp_rank, model in enumerate(models): + model.language_model.embedding.word_embeddings.weight.data.copy_(out_word_embed[tp_rank]) + if orig_word_embed_n_w is not None: + model.language_model.embedding.word_embeddings.norm.weight.data.copy_(orig_word_embed_n_w) + model.language_model.embedding.word_embeddings.norm.bias.data.copy_(orig_word_embed_n_b) + if pos_embed is not None: + model.language_model.embedding.position_embeddings.weight.data.copy_(pos_embed) + else: + if hasattr(model.language_model.embedding, 'position_embeddings'): + raise ValueError("model should have position_embeddings") + + # Transformer layers + #------------------- + total_layer_num = 0 + for pp_rank in range(args.target_pipeline_parallel_size): + # For later pipeline parallel ranks, make the new models + if pp_rank > 0: + mpu.set_pipeline_model_parallel_rank(pp_rank) + post_process = pp_rank == args.target_pipeline_parallel_size - 1 + models = get_models(args.target_tensor_parallel_size, md.params_dtype, False, post_process) + + encoder_layer_num = len(models[0].language_model.encoder.layers) + for layer in range(encoder_layer_num): + msg = queue_get(f"transformer layer {total_layer_num}") + + # duplicated tensors + input_norm_weight = msg.pop("input norm weight") + if md.norm_has_bias: + input_norm_bias = msg.pop("input norm bias") + post_norm_weight = msg.pop("post norm weight") + if md.norm_has_bias: + post_norm_bias = msg.pop("post norm bias") + if md.linear_bias: + dense_bias = msg.pop("dense bias") + mlp_l1_bias = msg.pop("mlp l1 bias") + + if args.add_qkv_bias: + qkv_bias = torch.chunk(msg.pop("qkv bias"), args.target_tensor_parallel_size, dim=0) + if args.add_dense_bias: + dense_bias = msg.pop("dense bias") + + qkv_org = msg.pop("qkv weight") + qkv_weight = torch.chunk(qkv_org, args.target_tensor_parallel_size, dim=0) + + # Split up the parallel tensors + dense_weight = torch.chunk(msg.pop("dense weight"), args.target_tensor_parallel_size, dim=1) + mlp_l1_weight = torch.chunk(msg.pop("mlp l1 weight"), args.target_tensor_parallel_size, dim=1) + + # Special handling for swiglu + if md.swiglu: + mlp_l0_weight_W = torch.chunk(msg.pop("mlp l0 weight W"), args.target_tensor_parallel_size, dim=0) + mlp_l0_weight_V = torch.chunk(msg.pop("mlp l0 weight V"), args.target_tensor_parallel_size, dim=0) + mlp_l0_weight = [torch.cat(weights, dim=0) for weights in zip(mlp_l0_weight_W, mlp_l0_weight_V)] + else: + mlp_l0_weight = torch.chunk(msg.pop("mlp l0 weight"), args.target_tensor_parallel_size, dim=0) + + if md.linear_bias: + qkv_bias = torch.chunk(msg.pop("qkv bias"), args.target_tensor_parallel_size, dim=0) + if md.swiglu: + mlp_l0_bias_W = torch.chunk(msg.pop("mlp l0 bias W"), args.target_tensor_parallel_size, dim=0) + mlp_l0_bias_V = torch.chunk(msg.pop("mlp l0 bias V"), args.target_tensor_parallel_size, dim=0) + mlp_l0_weight = [] + for weights in zip(mlp_l0_weight_W, mlp_l0_weight_V): + mlp_l0_weight.append(torch.cat(weights, dim=0)) + else: + mlp_l0_bias = 
torch.chunk(msg.pop("mlp l0 bias"), args.target_tensor_parallel_size, dim=0) + + # Save them to the model + for tp_rank in range(args.target_tensor_parallel_size): + layer_encoder = models[tp_rank].language_model.encoder.layers[layer] + layer_encoder.input_norm.weight.data.copy_(input_norm_weight) + if md.norm_has_bias: + layer_encoder.input_norm.bias.data.copy_(input_norm_bias) + layer_encoder.self_attention.query_key_value.weight.data.copy_(qkv_weight[tp_rank]) + layer_encoder.self_attention.dense.weight.data.copy_(dense_weight[tp_rank]) + layer_encoder.post_attention_norm.weight.data.copy_(post_norm_weight) + if md.norm_has_bias: + layer_encoder.post_attention_norm.bias.data.copy_(post_norm_bias) + layer_encoder.mlp.dense_h_to_4h.weight.data.copy_(mlp_l0_weight[tp_rank]) + layer_encoder.mlp.dense_4h_to_h.weight.data.copy_(mlp_l1_weight[tp_rank]) + if md.linear_bias: + layer_encoder.self_attention.query_key_value.bias.data.copy_(qkv_bias[tp_rank]) + layer_encoder.self_attention.dense.bias.data.copy_(dense_bias) + layer_encoder.mlp.dense_h_to_4h.bias.data.copy_(mlp_l0_bias[tp_rank]) + layer_encoder.mlp.dense_4h_to_h.bias.data.copy_(mlp_l1_bias) + if args.add_qkv_bias: + layer_encoder.self_attention.query_key_value.bias.data.copy_(qkv_bias[tp_rank]) + if args.add_dense_bias: + layer_encoder.self_attention.dense.bias.data.copy_(dense_bias) + + total_layer_num = total_layer_num + 1 + check_message(msg) + + if post_process: + msg = queue_get("final norm") + final_norm_weight = msg.pop("weight") + if md.norm_has_bias: + final_norm_bias = msg.pop("bias") + for tp_rank in range(args.target_tensor_parallel_size): + models[tp_rank].language_model.encoder.final_norm.weight.data.copy_(final_norm_weight) + if md.norm_has_bias: + models[tp_rank].language_model.encoder.final_norm.bias.data.copy_(final_norm_bias) + if pp_rank != 0 and not md.output_layer: + # Copy word embeddings to final pipeline rank + models[tp_rank].word_embeddings.weight.data.copy_(out_word_embed[tp_rank]) + del final_norm_weight + if md.norm_has_bias: + del final_norm_bias + check_message(msg) + + if md.output_layer: + msg = queue_get("output layer") + if not hasattr(models[0].language_model, 'output_layer'): + raise RuntimeError("ERROR: got an output layer, but model does not have one") + output_layer_weight = torch.chunk(msg.pop("weight"), args.target_tensor_parallel_size, dim=0) + for tp_rank in range(args.target_tensor_parallel_size): + models[tp_rank].language_model.output_layer.weight.data.copy_(output_layer_weight[tp_rank]) + del output_layer_weight + check_message(msg) + + msg = queue_get() + if msg != "done" and msg["name"] == "pooler": + if not hasattr(models[0].language_model, 'pooler'): + raise RuntimeError("ERROR: got a pooler, but model does not have one") + print("received pooler") + pooler_weight = msg.pop("weight") + pooler_bias = msg.pop("bias") + for tp_rank in range(args.target_tensor_parallel_size): + models[tp_rank].language_model.pooler.dense.weight.data.copy_(pooler_weight) + models[tp_rank].language_model.pooler.dense.bias.data.copy_(pooler_bias) + del pooler_weight + del pooler_bias + check_message(msg) + msg = queue_get() + + if msg != "done" and msg["name"] == "lm head": + if not hasattr(models[0], 'lm_head'): + raise RuntimeError("ERROR: got an lm head, but model does not have one") + print("received lm head") + lm_head_dense_weight = msg.pop("dense weight") + lm_head_dense_bias = msg.pop("dense bias") + lm_head_norm_weight = msg.pop("norm weight") + if md.norm_has_bias: + lm_head_norm_bias = 
msg.pop("norm bias") + for tp_rank in range(args.target_tensor_parallel_size): + models[tp_rank].lm_head.dense.weight.data.copy_(lm_head_dense_weight) + models[tp_rank].lm_head.dense.bias.data.copy_(lm_head_dense_bias) + models[tp_rank].lm_head.norm.weight.data.copy_(lm_head_norm_weight) + if md.norm_has_bias: + models[tp_rank].lm_head.norm.bias.data.copy_(lm_head_norm_bias) + check_message(msg) + msg = queue_get() + + if msg != "done" and msg["name"] == "binary head": + if not hasattr(models[0], 'binary_head'): + raise RuntimeError("ERROR: got a binary head, but model does not have one") + print("received binary head") + binary_head_weight = msg.pop("weight") + binary_head_bias = msg.pop("bias") + for tp_rank in range(args.target_tensor_parallel_size): + models[tp_rank].binary_head.weight.data.copy_(binary_head_weight) + models[tp_rank].binary_head.bias.data.copy_(binary_head_bias) + check_message(msg) + msg = queue_get() + + if msg != "done": + print("ERROR: got some more data but was expecting to be done") + + for tp_rank in range(args.target_tensor_parallel_size): + mpu.set_tensor_model_parallel_rank(tp_rank) + save_huggingface_bloom(args, models[tp_rank], md) diff --git a/debug/weight_convert/convert_ckpt.py b/debug/weight_convert/convert_ckpt.py new file mode 100644 index 0000000000000000000000000000000000000000..b88a73c361f5dbb96574fdbd509bfd57a887f27a --- /dev/null +++ b/debug/weight_convert/convert_ckpt.py @@ -0,0 +1,90 @@ +# Copyright 2024 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the License); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
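+"""Weight conversion entry point.
+
+The loader (load_utils.load_checkpoint) runs in the main process and streams
+weights through a multiprocessing queue to a saver subprocess selected via
+--model; the saver writes Huggingface-format files under <output-model-dir>/mg2hg.
+
+Example invocation (paths below are placeholders):
+    python3 convert_ckpt.py -i ./megatron_ckpt -o ./hf_original_weights -m llama
+"""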
+import os +import sys +import argparse +import importlib +import torch.multiprocessing as mp + + +def check_and_convert_weight(args): + import torch + from transformers import AutoModelForCausalLM + try: + output_mg2hg_path = os.path.join(args.output_model_dir, 'mg2hg') + hf_model = AutoModelForCausalLM.from_pretrained( + output_mg2hg_path, device_map="cpu", torch_dtype=torch.float16) + hf_model.save_pretrained(output_mg2hg_path, safe_serialization=True) + except ModuleNotFoundError as e: + print('failed to convert bin to safetensors') + raise e + + +def load_model(model_name): + module_name = f"{model_name}" + try: + converter = importlib.import_module(module_name) + except ModuleNotFoundError as e: + raise e + return converter + + +def main(): + parser = argparse.ArgumentParser( + description="convert weight to huggingface format") + + parser.add_argument('-m', '--model', type=str, required=True, + choices=['llama', 'bloom', 'gptneox'], + help='Type of the model') + parser.add_argument('-i', '--input-model-dir', type=str, required=True, + help='Directory to load model checkpoint from') + parser.add_argument('-o', '--output-model-dir', type=str, required=True, + help='Directory to save model checkpoint to') + parser.add_argument('--no-checking', action='store_false', + help='Do not perform checking on the name and ordering of weights', + dest='checking') + parser.add_argument('--convert-to-safetensors', action='store_false', + help='convert .bin to safetensors') + + known_args, _ = parser.parse_known_args() + loader = importlib.import_module('load_utils') + saver = load_model(known_args.model) + + loader.add_arguments(parser) + saver.add_arguments(parser) + + args = parser.parse_args() + + queue = mp.Queue(maxsize=50) + + print("Starting saver...") + saver_proc = mp.Process( + target=saver.save_model_checkpoint, args=(queue, args)) + saver_proc.start() + + print("Starting loader...") + loader.load_checkpoint(queue, args) + + print("Waiting for saver to complete...") + saver_proc.join() + + if args.convert_to_safetensors: + print("converting .bin to safetensors...") + check_and_convert_weight(args) + + print("Done!") + + +if __name__ == '__main__': + main()
diff --git a/debug/weight_convert/diff.patch b/debug/weight_convert/diff.patch new file mode 100644 index 0000000000000000000000000000000000000000..134890cee45c7a7373187b46c0888f2552c1861d --- /dev/null +++ b/debug/weight_convert/diff.patch @@ -0,0 +1,76 @@ +diff --git a/modellink/__init__.py b/modellink/__init__.py +index ceae8c4..837386e 100644 +--- a/modellink/__init__.py ++++ b/modellink/__init__.py +@@ -22,7 +22,7 @@ except Exception as e: + logging.warning("Warning: You did not install torch_npu") + + +-from .model_adaptor import exe_adaptor ++# from .model_adaptor import exe_adaptor + + +-exe_adaptor() ++# exe_adaptor() +diff --git a/modellink/model_adaptor.py b/modellink/model_adaptor.py +index 7c1bda1..7376b98 100644 +--- a/modellink/model_adaptor.py ++++ b/modellink/model_adaptor.py +@@ -26,32 +26,32 @@ from .utils import ALL_MODULE_WRAPPER_CLASSNAMES + from .checkpointing import _load_base_checkpoint_wrapper, load_checkpoint_wrapper + + +-def exe_adaptor(): +- import megatron +- megatron.utils.ALL_MODULE_WRAPPER_CLASSNAMES = ALL_MODULE_WRAPPER_CLASSNAMES +- megatron.initialize.parse_args = parse_args_decorator(megatron.initialize.parse_args) +- megatron.arguments.parse_args = parse_args_decorator(megatron.arguments.parse_args) +- megatron.global_vars.build_tokenizer = build_tokenizer ++# def exe_adaptor(): 
++# import megatron ++# megatron.utils.ALL_MODULE_WRAPPER_CLASSNAMES = ALL_MODULE_WRAPPER_CLASSNAMES ++# megatron.initialize.parse_args = parse_args_decorator(megatron.initialize.parse_args) ++# megatron.arguments.parse_args = parse_args_decorator(megatron.arguments.parse_args) ++# megatron.global_vars.build_tokenizer = build_tokenizer + +- import megatron.training +- megatron.training.get_model = get_model_wrapper(megatron.training.get_model) +- megatron.training.build_pretraining_data_loader = build_pretraining_data_loader ++# import megatron.training ++# megatron.training.get_model = get_model_wrapper(megatron.training.get_model) ++# megatron.training.build_pretraining_data_loader = build_pretraining_data_loader + +- megatron.model.GPTModel = GPTModel +- megatron.model.transformer.SwitchMLP = SwitchMLP +- megatron.model.transformer.ParallelTransformer.__init__ = parallel_transformer_init +- megatron.model.transformer.ParallelTransformer.state_dict_for_save_checkpoint \ +- = state_dict_for_save_checkpoint_wrapper( +- megatron.model.transformer.ParallelTransformer.state_dict_for_save_checkpoint) +- megatron.model.language_model.TransformerLanguageModel.forward = (seq_length_wrapper( +- megatron.model.language_model.TransformerLanguageModel.forward)) ++# megatron.model.GPTModel = GPTModel ++# megatron.model.transformer.SwitchMLP = SwitchMLP ++# megatron.model.transformer.ParallelTransformer.__init__ = parallel_transformer_init ++# megatron.model.transformer.ParallelTransformer.state_dict_for_save_checkpoint \ ++# = state_dict_for_save_checkpoint_wrapper( ++# megatron.model.transformer.ParallelTransformer.state_dict_for_save_checkpoint) ++# megatron.model.language_model.TransformerLanguageModel.forward = (seq_length_wrapper( ++# megatron.model.language_model.TransformerLanguageModel.forward)) + +- megatron.core.tensor_parallel.layers.VocabParallelEmbedding.forward = vocab_embedding_wrapper( +- megatron.core.tensor_parallel.layers.VocabParallelEmbedding.forward) +- megatron.core.tensor_parallel.layers.VocabParallelEmbedding.__init__ = norm_wrapper( +- megatron.core.tensor_parallel.layers.VocabParallelEmbedding.__init__) ++# megatron.core.tensor_parallel.layers.VocabParallelEmbedding.forward = vocab_embedding_wrapper( ++# megatron.core.tensor_parallel.layers.VocabParallelEmbedding.forward) ++# megatron.core.tensor_parallel.layers.VocabParallelEmbedding.__init__ = norm_wrapper( ++# megatron.core.tensor_parallel.layers.VocabParallelEmbedding.__init__) + +- megatron.checkpointing._load_base_checkpoint = _load_base_checkpoint_wrapper( +- megatron.checkpointing._load_base_checkpoint) +- megatron.training.load_checkpoint = load_checkpoint_wrapper( +- megatron.checkpointing.load_checkpoint) ++# megatron.checkpointing._load_base_checkpoint = _load_base_checkpoint_wrapper( ++# megatron.checkpointing._load_base_checkpoint) ++# megatron.training.load_checkpoint = load_checkpoint_wrapper( ++# megatron.checkpointing.load_checkpoint) diff --git a/debug/weight_convert/llama.py b/debug/weight_convert/llama.py new file mode 100644 index 0000000000000000000000000000000000000000..0ae2173c6cd5a1b1ffcb8fd1ee56e58aa05fe646 --- /dev/null +++ b/debug/weight_convert/llama.py @@ -0,0 +1,560 @@ +# Copyright 2024 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the License); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import argparse +from collections.abc import Mapping +import concurrent.futures +import os +import gc +import sys +import shutil +import torch +import torch_npu + + +def add_arguments(parser): + group = parser.add_argument_group(title='Megatron saver') + group.add_argument('--target-tensor-parallel-size', type=int, + help='Target tensor model parallel size, defaults to the tensor parallel size ' + 'in the input checkpoint if provided by the loader, otherwise to 1') + group.add_argument('--target-pipeline-parallel-size', type=int, + help='Target tensor model parallel size, default to the pipeline parall size ' + 'in the input checkpoint if provided by the loader, otherwise to 1') + group.add_argument("--w-pack", type=bool, + help='True is w_pack weight for llm', + default=False) + + +def save_huggingface_llama(args, model, model_args): + hf2mg_map = {} + for name_param_m in model.named_parameters(): + layer_num = name_param_m[0].split(".")[3] if len( + name_param_m[0].split(".")) > 3 else name_param_m[0].split(".")[1] + nh = model_args.num_attention_heads + ng = ( + model_args.checkpoint_args.num_query_groups + if model_args.checkpoint_args.group_query_attention + else model_args.num_attention_heads + ) + repeats = nh // ng + if name_param_m[0] == "language_model.embedding.word_embeddings.weight": + hf2mg_map["model.embed_tokens.weight"] = name_param_m[1] + continue + if name_param_m[0] == f"language_model.encoder.layers.{layer_num}.post_attention_norm.weight": + hf2mg_map[f"model.layers.{layer_num}.post_attention_layernorm.weight"] = name_param_m[1] + continue + if name_param_m[0] == f"language_model.encoder.layers.{layer_num}.input_norm.weight": + hf2mg_map[f"model.layers.{layer_num}.input_layernorm.weight"] = name_param_m[1] + continue + if name_param_m[0] == f"language_model.encoder.layers.{layer_num}.post_attention_norm.weight": + hf2mg_map[f"model.layers.{layer_num}.post_attention_layernorm.weight"] = name_param_m[1] + continue + if name_param_m[0] == f"language_model.encoder.layers.{layer_num}.self_attention.query_key_value.weight": + qkv_weight = name_param_m[1].reshape( + ng, + repeats + 2, + name_param_m[1].shape[0] // ng // (repeats + 2), + name_param_m[1].shape[1], + ) + w = qkv_weight.shape[-1] + qw = qkv_weight[:, :repeats, ...].reshape(-1, w) + kw = qkv_weight[:, repeats: repeats + 1, ...].reshape(-1, w) + vw = qkv_weight[:, repeats + 1:, ...].reshape(-1, w) + if args.w_pack: + qkv = torch.cat((qw, kw, vw), dim=0) + hf2mg_map[f"model.layers.{layer_num}.self_attn.W_pack.weight"] = qkv + else: + hf2mg_map[f"model.layers.{layer_num}.self_attn.q_proj.weight"] = qw + hf2mg_map[f"model.layers.{layer_num}.self_attn.k_proj.weight"] = kw + hf2mg_map[f"model.layers.{layer_num}.self_attn.v_proj.weight"] = vw + continue + if name_param_m[0] == f"language_model.encoder.layers.{layer_num}.self_attention.query_key_value.bias": + bias_weight = name_param_m[1].reshape( + ng, repeats + + 2, name_param_m[1].shape[0] // ng // (repeats + 2) + ) + w = bias_weight.shape[-1] + qw = bias_weight[:, :repeats, ...].reshape(-1) + kw = bias_weight[:, repeats: repeats + 1, ...].reshape(-1) + vw = 
bias_weight[:, repeats + 1:, ...].reshape(-1) + hf2mg_map[f"model.layers.{layer_num}.self_attn.q_proj.bias"] = qw + hf2mg_map[f"model.layers.{layer_num}.self_attn.k_proj.bias"] = kw + hf2mg_map[f"model.layers.{layer_num}.self_attn.v_proj.bias"] = vw + continue + if name_param_m[0] == f"language_model.encoder.layers.{layer_num}.self_attention.dense.bias": + hf2mg_map[f"model.layers.{layer_num}.self_attn.dense.bias"] = name_param_m[1] + continue + if name_param_m[0] == f"language_model.encoder.layers.{layer_num}.self_attention.dense.weight": + hf2mg_map[f"model.layers.{layer_num}.self_attn.o_proj.weight"] = name_param_m[1] + continue + if name_param_m[0] == f"language_model.encoder.layers.{layer_num}.mlp.dense_h_to_4h.weight": + proj_read_h_half = name_param_m[1].shape[0] // 2 + hf2mg_map[f"model.layers.{layer_num}.mlp.gate_proj.weight"] = name_param_m[1][:proj_read_h_half, ...] + hf2mg_map[f"model.layers.{layer_num}.mlp.up_proj.weight"] = name_param_m[1][proj_read_h_half:, ...] + continue + if name_param_m[0] == f"language_model.encoder.layers.{layer_num}.mlp.dense_4h_to_h.weight": + hf2mg_map[f"model.layers.{layer_num}.mlp.down_proj.weight"] = name_param_m[1] + continue + if name_param_m[0] == "language_model.encoder.final_norm.weight": + hf2mg_map[f"model.norm.weight"] = name_param_m[1] + continue + if name_param_m[0] == "language_model.output_layer.weight": + hf2mg_map[f"lm_head.weight"] = name_param_m[1] + continue + + gc.collect() + file_format = "pytorch_model-{:05d}-of-{:05d}.bin" + file_list = os.listdir(args.output_model_dir) + output_mg2hg_path = os.path.join(args.output_model_dir, 'mg2hg') + os.makedirs(output_mg2hg_path, exist_ok=True) + for filename in file_list: + if filename.startswith("pytorch_model-") and filename.endswith(".bin"): + try: + start_index = len("pytorch_model-") + end_index = filename.index("-of-") + index1 = int(filename[start_index:end_index]) + index2 = int(filename[end_index + len("-of-"): -len(".bin")]) + if 0 <= index1 <= index2 <= 99999: + formatted_filename = file_format.format(index1, index2) + file_path = os.path.join( + args.output_model_dir, formatted_filename) + hf_model = torch.load( + file_path, map_location=torch.device('cpu')) + for key in hf_model.keys(): + if key in hf2mg_map.keys(): + hf_model[key].data.copy_(hf2mg_map[key]) + output_file_path = os.path.join( + output_mg2hg_path, formatted_filename) + print(f'save weight to {output_file_path}') + torch.save(hf_model, output_file_path) + else: + print("Invalid file format:", filename) + except ValueError: + print("Unable to save file:", filename) + elif (filename.endswith('.json') or filename.endswith('.mode')) and 'safetensors' not in filename: + source_file = os.path.join(args.output_model_dir, filename) + destination_file = os.path.join(output_mg2hg_path, filename) + shutil.copyfile(source_file, destination_file) + + +def save_model_checkpoint(queue, args): + from pretrain_gpt import model_provider + from ascendspeed import megatron_adaptor + from megatron.arguments import validate_args + from modellink.utils import parse_args + from megatron.checkpointing import save_checkpoint + from megatron.global_vars import set_global_variables, get_args + from megatron.core.enums import ModelType + from megatron.tokenizer.tokenizer import _vocab_size_with_padding + from megatron import fused_kernels + from megatron.core import mpu + + def queue_get(name=None): + val = queue.get() + if val == "exit": + raise RuntimeError('Loader exited, exiting saver') + if name is not None and args.checking and 
val["name"] != name: + val_name = val["name"] + raise RuntimeError(f'Unexpected message. Expecting "{name}" but got "{val_name}". Exiting saver.') + if name is not None: + print(f"received {name}") + return val + + def check_message(msg): + if not args.checking: + return + msg_name = msg.pop("name") + if len(msg.keys()) > 0: + print(f"Unexpected values in {msg_name}:") + for key in msg.keys(): + print(f" {key}") + raise RuntimeError(f"Exiting. If you want to ignore this, use the argument --no-checking.") + + md = queue_get() + + if args.target_tensor_parallel_size is None: + if hasattr(md, 'previous_tensor_parallel_size'): + args.target_tensor_parallel_size = md.previous_tensor_parallel_size + else: + print("loader did not provide a tensor parallel size and --target-tensor-parallel-size not provided on command line. " + "Default to 1.") + args.target_tensor_parallel_size = 1 + + if args.target_pipeline_parallel_size is None: + if hasattr(md, 'previous_pipeline_parallel_size'): + args.target_pipeline_parallel_size = md.previous_pipeline_parallel_size + else: + print("loader did not provide a pipeline parallel size and --target-pipeline-parallel-size not provided on command line. " + "Default to 1.") + args.target_pipeline_parallel_size = 1 + + # Arguments do sanity checks on the world size, but we don't care, + # so trick it into thinking we are plenty of processes + if args.target_tensor_parallel_size is not None and args.target_pipeline_parallel_size is not None: + os.environ["WORLD_SIZE"] = f'{args.target_tensor_parallel_size * args.target_pipeline_parallel_size}' + + # We want all arguments to come from us + sys.argv = ['script.py', + '--num-layers', str(md.num_layers), + '--hidden-size', str(md.hidden_size), + '--seq-length', str(md.seq_length), + '--num-attention-heads', str(md.num_attention_heads), + '--max-position-embeddings', str(md.max_position_embeddings), + '--position-embedding-type', str(md.position_embedding_type), + '--tokenizer-type', str(md.tokenizer_type), + '--tensor-model-parallel-size', str( + args.target_tensor_parallel_size), + '--pipeline-model-parallel-size', str( + args.target_pipeline_parallel_size), + '--no-masked-softmax-fusion', + '--no-bias-gelu-fusion', + '--no-bias-dropout-fusion', + '--no-async-tensor-model-parallel-allreduce', + '--use-cpu-initialization', + '--micro-batch-size', '1', + '--no-load-optim', + '--no-load-rng', + '--no-save-optim', + '--no-save-rng', + '--no-initialization', + '--save-interval', '1', + '--save', args.output_model_dir, + '--fp16' + ] + + if md.make_vocab_size_divisible_by is not None: + sys.argv.extend(['--make-vocab-size-divisible-by', + str(md.make_vocab_size_divisible_by)]) + if md.output_layer: + sys.argv.append('--untie-embeddings-and-output-weights') + if not md.linear_bias: + sys.argv.append('--disable-bias-linear') + + margs = parse_args() + margs.w_pack = args.w_pack + + if hasattr(md, 'checkpoint_args'): + # These are arguments that we are either changing, or cause problems for validation if they are set + # Note that some of these deal with T5 so will need to be changed if we support T5. 
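+        # Every other argument recorded in the checkpoint is copied onto the freshly
+        # parsed args below; the entries listed in args_to_keep (parallel sizes, save
+        # path, init/recompute switches, etc.) keep the values chosen for this run.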
+ args_to_keep = ['tensor_model_parallel_size', 'pipeline_model_parallel_size', 'world_size', 'params_dtype', + 'num_layers_per_virtual_pipeline_stage', 'virtual_pipeline_model_parallel_size', + 'masked_softmax_fusion', 'bias_gelu_fusion', 'bias_dropout_fusion', + 'sequence_parallel', 'async_tensor_model_parallel_allreduce', + 'no_load_optim', 'no_load_rng', 'no_save_optim', 'no_save_rng', + 'vocab_file', 'tokenizer_model', + 'save_interval', 'save', + 'perform_initialization', 'use_cpu_initialization', + 'recompute_granularity', 'recompute_num_layers', 'recompute_method', + 'encoder_num_layers', 'encoder_seq_length', + 'distribute_saved_activations', + 'train_iters', 'lr_decay_iters', 'lr_warmup_iters', 'lr_warmup_fraction', + 'start_weight_decay', 'end_weight_decay'] + + for arg, value in vars(md.checkpoint_args).items(): + if arg in args_to_keep: + continue + if not hasattr(margs, arg): + print( + f"Checkpoint had argument {arg} but new arguments does not have this.") + continue + if getattr(margs, arg) != value: + print( + f"Overwriting default {arg} value {getattr(margs, arg)} with value from checkpoint {value}.") + setattr(margs, arg, value) + + validate_args(margs) + + set_global_variables(margs, build_tokenizer=False) + + # margs means megatron args + margs = get_args() + + margs.model_type = ModelType.encoder_or_decoder + + if hasattr(md, 'consumed_train_samples'): + margs.consumed_train_samples = md.consumed_train_samples + margs.consumed_valid_samples = md.consumed_valid_samples + print(f"Setting consumed_train_samples to {margs.consumed_train_samples}" + f" and consumed_valid_samples to {margs.consumed_valid_samples}") + else: + print("consumed_train_samples not provided.") + + def get_models(count, dtype, pre_process, post_process): + models = [model_provider(pre_process, post_process).to( + dtype) for _ in range(count)] + return models + + # fake initializing distributed + mpu.set_tensor_model_parallel_world_size(args.target_tensor_parallel_size) + mpu.set_pipeline_model_parallel_world_size( + args.target_pipeline_parallel_size) + mpu.set_tensor_model_parallel_rank(0) + mpu.set_pipeline_model_parallel_rank(0) + # Embeddings + # ----------- + embeddings_msg = queue_get("embeddings") + + pos_embed = None + if md.position_embedding_type == 'learned_absolute': + pos_embed = embeddings_msg.pop("position embeddings") + orig_word_embed = embeddings_msg.pop("word embeddings") + orig_word_embed_n_w, orig_word_embed_n_b = None, None + if "word embeddings norm_w" in embeddings_msg and "word embeddings norm_b" in embeddings_msg: + orig_word_embed_n_w = embeddings_msg.pop("word embeddings norm_w") + orig_word_embed_n_b = embeddings_msg.pop("word embeddings norm_b") + check_message(embeddings_msg) + + # Deal with padding + if md.true_vocab_size is not None: + # figure out what our padded vocab size is + orig_vocab_size = orig_word_embed.shape[0] + margs.padded_vocab_size = _vocab_size_with_padding( + md.true_vocab_size, margs) + + # Cut out extra padding we don't need + if orig_vocab_size > margs.padded_vocab_size: + full_word_embed = orig_word_embed[0:margs.padded_vocab_size, :] + + # Expanding embedding to larger size by replicating final entry + elif orig_vocab_size < margs.padded_vocab_size: + padding_size = margs.padded_vocab_size - orig_vocab_size + + full_word_embed = torch.cat(( + orig_word_embed, + orig_word_embed[-1].unsqueeze(0).expand(padding_size, -1))) + + # Same size! 
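+        # Vocab size already matches the padded size, so the embedding table is reused unchanged.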
+ else: + full_word_embed = orig_word_embed + else: + print("Original vocab size not specified, leaving embedding table as-is. " + "If you've changed the tensor parallel size this could cause problems.") + margs.padded_vocab_size = orig_word_embed.shape[0] + full_word_embed = orig_word_embed + + # Split into new tensor model parallel sizes + out_word_embed = torch.chunk( + full_word_embed, args.target_tensor_parallel_size, dim=0) + + # Make models for first pipeline stage and fill in embeddings + mpu.set_pipeline_model_parallel_rank(0) + post_process = args.target_pipeline_parallel_size == 1 + models = get_models(args.target_tensor_parallel_size, + md.params_dtype, True, post_process) + for tp_rank, model in enumerate(models): + model.language_model.embedding.word_embeddings.weight.data.copy_( + out_word_embed[tp_rank]) + if orig_word_embed_n_w is not None: + model.language_model.embedding.word_embeddings.norm.weight.data.copy_( + orig_word_embed_n_w) + model.language_model.embedding.word_embeddings.norm.bias.data.copy_( + orig_word_embed_n_b) + if pos_embed is not None: + model.language_model.embedding.position_embeddings.weight.data.copy_( + pos_embed) + else: + if hasattr(model.language_model.embedding, 'position_embeddings'): + raise ValueError("model should have position_embeddings") + + # Transformer layers + # ------------------- + total_layer_num = 0 + for pp_rank in range(args.target_pipeline_parallel_size): + # For later pipeline parallel ranks, make the new models + if pp_rank > 0: + mpu.set_pipeline_model_parallel_rank(pp_rank) + post_process = pp_rank == args.target_pipeline_parallel_size - 1 + models = get_models(args.target_tensor_parallel_size, + md.params_dtype, False, post_process) + + encoder_layer_num = len(models[0].language_model.encoder.layers) + for layer in range(encoder_layer_num): + msg = queue_get(f"transformer layer {total_layer_num}") + + # duplicated tensors + input_norm_weight = msg.pop("input norm weight") + if md.norm_has_bias: + input_norm_bias = msg.pop("input norm bias") + post_norm_weight = msg.pop("post norm weight") + if md.norm_has_bias: + post_norm_bias = msg.pop("post norm bias") + if md.linear_bias: + dense_bias = msg.pop("dense bias") + mlp_l1_bias = msg.pop("mlp l1 bias") + + if args.add_qkv_bias: + qkv_bias = torch.chunk( + msg.pop("qkv bias"), args.target_tensor_parallel_size, dim=0) + if args.add_dense_bias: + dense_bias = msg.pop("dense bias") + + qkv_org = msg.pop("qkv weight") + qkv_weight = torch.chunk( + qkv_org, args.target_tensor_parallel_size, dim=0) + + # Split up the parallel tensors + dense_weight = torch.chunk( + msg.pop("dense weight"), args.target_tensor_parallel_size, dim=1) + mlp_l1_weight = torch.chunk( + msg.pop("mlp l1 weight"), args.target_tensor_parallel_size, dim=1) + + # Special handling for swiglu + if md.swiglu: + mlp_l0_weight_W = torch.chunk( + msg.pop("mlp l0 weight W"), args.target_tensor_parallel_size, dim=0) + mlp_l0_weight_V = torch.chunk( + msg.pop("mlp l0 weight V"), args.target_tensor_parallel_size, dim=0) + mlp_l0_weight = [] + for weights in zip(mlp_l0_weight_W, mlp_l0_weight_V): + mlp_l0_weight.append(torch.cat(weights, dim=0)) + else: + mlp_l0_weight = torch.chunk( + msg.pop("mlp l0 weight"), args.target_tensor_parallel_size, dim=0) + + if md.linear_bias: + qkv_bias = torch.chunk( + msg.pop("qkv bias"), args.target_tensor_parallel_size, dim=0) + if md.swiglu: + mlp_l0_bias_W = torch.chunk( + msg.pop("mlp l0 bias W"), args.target_tensor_parallel_size, dim=0) + mlp_l0_bias_V = torch.chunk( + msg.pop("mlp 
l0 bias V"), args.target_tensor_parallel_size, dim=0) + mlp_l0_weight = [] + for weights in zip(mlp_l0_weight_W, mlp_l0_weight_V): + mlp_l0_weight.append(torch.cat(weights, dim=0)) + else: + mlp_l0_bias = torch.chunk( + msg.pop("mlp l0 bias"), args.target_tensor_parallel_size, dim=0) + + # Save them to the model + for tp_rank in range(args.target_tensor_parallel_size): + layer_encoder = models[tp_rank].language_model.encoder.layers[layer] + layer_encoder.input_norm.weight.data.copy_(input_norm_weight) + if md.norm_has_bias: + layer_encoder.input_norm.bias.data.copy_(input_norm_bias) + layer_encoder.self_attention.query_key_value.weight.data.copy_( + qkv_weight[tp_rank]) + layer_encoder.self_attention.dense.weight.data.copy_(dense_weight[tp_rank]) + layer_encoder.post_attention_norm.weight.data.copy_(post_norm_weight) + if md.norm_has_bias: + layer_encoder.post_attention_norm.bias.data.copy_(post_norm_bias) + layer_encoder.mlp.dense_h_to_4h.weight.data.copy_(mlp_l0_weight[tp_rank]) + layer_encoder.mlp.dense_4h_to_h.weight.data.copy_(mlp_l1_weight[tp_rank]) + if md.linear_bias: + layer_encoder.self_attention.query_key_value.bias.data.copy_( + qkv_bias[tp_rank]) + layer_encoder.self_attention.dense.bias.data.copy_(dense_bias) + layer_encoder.mlp.dense_h_to_4h.bias.data.copy_(mlp_l0_bias[tp_rank]) + layer_encoder.mlp.dense_4h_to_h.bias.data.copy_(mlp_l1_bias) + if args.add_qkv_bias: + layer_encoder.self_attention.query_key_value.bias.data.copy_( + qkv_bias[tp_rank]) + if args.add_dense_bias: + layer_encoder.self_attention.dense.bias.data.copy_(dense_bias) + + total_layer_num = total_layer_num + 1 + check_message(msg) + + if post_process: + msg = queue_get("final norm") + final_norm_weight = msg.pop("weight") + if md.norm_has_bias: + final_norm_bias = msg.pop("bias") + for tp_rank in range(args.target_tensor_parallel_size): + models[tp_rank].language_model.encoder.final_norm.weight.data.copy_( + final_norm_weight) + if md.norm_has_bias: + models[tp_rank].language_model.encoder.final_norm.bias.data.copy_( + final_norm_bias) + if pp_rank != 0 and not md.output_layer: + # Copy word embeddings to final pipeline rank + models[tp_rank].word_embeddings.weight.data.copy_( + out_word_embed[tp_rank]) + del final_norm_weight + if md.norm_has_bias: + del final_norm_bias + check_message(msg) + + if md.output_layer: + msg = queue_get("output layer") + if not hasattr(models[0].language_model, 'output_layer'): + raise AttributeError( + "ERROR: got an output layer, but model does not have one") + output_layer_weight = torch.chunk( + msg.pop("weight"), args.target_tensor_parallel_size, dim=0) + for tp_rank in range(args.target_tensor_parallel_size): + models[tp_rank].language_model.output_layer.weight.data.copy_( + output_layer_weight[tp_rank]) + del output_layer_weight + check_message(msg) + + msg = queue_get() + if msg != "done" and msg["name"] == "pooler": + if not hasattr(models[0].language_model, 'pooler'): + raise AttributeError( + "ERROR: got a pooler, but model does not have one") + print("received pooler") + pooler_weight = msg.pop("weight") + pooler_bias = msg.pop("bias") + for tp_rank in range(args.target_tensor_parallel_size): + models[tp_rank].language_model.pooler.dense.weight.data.copy_( + pooler_weight) + models[tp_rank].language_model.pooler.dense.bias.data.copy_( + pooler_bias) + del pooler_weight + del pooler_bias + check_message(msg) + msg = queue_get() + + if msg != "done" and msg["name"] == "lm head": + if not hasattr(models[0], 'lm_head'): + raise RuntimeError("ERROR: got an lm head, but 
+
+        if post_process:
+            msg = queue_get("final norm")
+            final_norm_weight = msg.pop("weight")
+            if md.norm_has_bias:
+                final_norm_bias = msg.pop("bias")
+            for tp_rank in range(args.target_tensor_parallel_size):
+                models[tp_rank].language_model.encoder.final_norm.weight.data.copy_(
+                    final_norm_weight)
+                if md.norm_has_bias:
+                    models[tp_rank].language_model.encoder.final_norm.bias.data.copy_(
+                        final_norm_bias)
+                if pp_rank != 0 and not md.output_layer:
+                    # Copy word embeddings to final pipeline rank
+                    models[tp_rank].word_embeddings.weight.data.copy_(
+                        out_word_embed[tp_rank])
+            del final_norm_weight
+            if md.norm_has_bias:
+                del final_norm_bias
+            check_message(msg)
+
+            if md.output_layer:
+                msg = queue_get("output layer")
+                if not hasattr(models[0].language_model, 'output_layer'):
+                    raise AttributeError(
+                        "ERROR: got an output layer, but model does not have one")
+                output_layer_weight = torch.chunk(
+                    msg.pop("weight"), args.target_tensor_parallel_size, dim=0)
+                for tp_rank in range(args.target_tensor_parallel_size):
+                    models[tp_rank].language_model.output_layer.weight.data.copy_(
+                        output_layer_weight[tp_rank])
+                del output_layer_weight
+                check_message(msg)
+
+            msg = queue_get()
+            if msg != "done" and msg["name"] == "pooler":
+                if not hasattr(models[0].language_model, 'pooler'):
+                    raise AttributeError(
+                        "ERROR: got a pooler, but model does not have one")
+                print("received pooler")
+                pooler_weight = msg.pop("weight")
+                pooler_bias = msg.pop("bias")
+                for tp_rank in range(args.target_tensor_parallel_size):
+                    models[tp_rank].language_model.pooler.dense.weight.data.copy_(
+                        pooler_weight)
+                    models[tp_rank].language_model.pooler.dense.bias.data.copy_(
+                        pooler_bias)
+                del pooler_weight
+                del pooler_bias
+                check_message(msg)
+                msg = queue_get()
+
+            if msg != "done" and msg["name"] == "lm head":
+                if not hasattr(models[0], 'lm_head'):
+                    raise RuntimeError("ERROR: got an lm head, but model does not have one")
+                print("received lm head")
+                lm_head_dense_weight = msg.pop("dense weight")
+                lm_head_dense_bias = msg.pop("dense bias")
+                lm_head_norm_weight = msg.pop("norm weight")
+                if md.norm_has_bias:
+                    lm_head_norm_bias = msg.pop("norm bias")
+                for tp_rank in range(args.target_tensor_parallel_size):
+                    models[tp_rank].lm_head.dense.weight.data.copy_(
+                        lm_head_dense_weight)
+                    models[tp_rank].lm_head.dense.bias.data.copy_(
+                        lm_head_dense_bias)
+                    models[tp_rank].lm_head.norm.weight.data.copy_(
+                        lm_head_norm_weight)
+                    if md.norm_has_bias:
+                        models[tp_rank].lm_head.norm.bias.data.copy_(
+                            lm_head_norm_bias)
+                check_message(msg)
+                msg = queue_get()
+
+            if msg != "done" and msg["name"] == "binary head":
+                if not hasattr(models[0], 'binary_head'):
+                    raise RuntimeError("ERROR: got a binary head, but model does not have one")
+                print("received binary head")
+                binary_head_weight = msg.pop("weight")
+                binary_head_bias = msg.pop("bias")
+                for tp_rank in range(args.target_tensor_parallel_size):
+                    models[tp_rank].binary_head.weight.data.copy_(
+                        binary_head_weight)
+                    models[tp_rank].binary_head.bias.data.copy_(
+                        binary_head_bias)
+                check_message(msg)
+                msg = queue_get()
+
+            if msg != "done":
+                print("ERROR: got some more data but was expecting to be done")
+
+        for tp_rank in range(args.target_tensor_parallel_size):
+            mpu.set_tensor_model_parallel_rank(tp_rank)
+            save_huggingface_llama(args, models[tp_rank], md)
diff --git a/debug/weight_convert/load_utils.py b/debug/weight_convert/load_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..a041d61cff122cc0a7218f80d19ff81c4512f7b7
--- /dev/null
+++ b/debug/weight_convert/load_utils.py
@@ -0,0 +1,371 @@
+# Copyright 2024 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the License);
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+import sys
+import types
+import json
+import argparse
+import importlib
+import torch
+import torch.multiprocessing as mp
+
+
+def add_arguments(parser):
+    group = parser.add_argument_group(title='Megatron loader')
+
+    group.add_argument('--true-vocab-size', type=int, default=None,
+                       help='original size of vocab, if specified will trim padding from embedding table.')
+    group.add_argument('--vocab-file', type=str, default=None,
+                       help='Path to the vocab file. 
If specified will use this to get vocab size and ' + 'trim padding from the embedding table.') + parser.add_argument('--add-qkv-bias', action='store_true', + help='Add bias for attention qkv', default=False, + ) + parser.add_argument('--add-dense-bias', action='store_true', + help='Add bias for attention dense', default=False, + ) + parser.add_argument('--embed-layernorm', action='store_true', + help='Add embed layernorm for word embedding', default=False, + ) + parser.add_argument('--params-dtype', type=str, + help='Set weight dtype', default='fp16', + ) + + +def _load_checkpoint(queue, args): + + # Search in directory above this + sys.path.append(os.path.abspath( + os.path.join(os.path.dirname(__file__), + os.path.pardir))) + try: + from pretrain_gpt import model_provider + from ascendspeed import megatron_adaptor + from megatron.arguments import validate_args + from modellink.utils import parse_args + from megatron.global_vars import set_args, set_global_variables + from megatron.checkpointing import load_args_from_checkpoint + from megatron.checkpointing import load_checkpoint as load_checkpoint_mg + from megatron.model import module + from megatron.core import mpu + from megatron.core.enums import ModelType + except ModuleNotFoundError: + print("Unable to import Megatron, please specify the path to Megatron using --megatron-path. Exiting.") + queue.put("exit") + if args.input_model_dir: + print(f">>>{args.input_model_dir}") + else: + print("NO") + # We want all arguments to come from us + sys.argv = ['script.py', + '--no-masked-softmax-fusion', + '--no-bias-gelu-fusion', + '--no-bias-dropout-fusion', + '--no-async-tensor-model-parallel-allreduce', + '--use-cpu-initialization', + '--micro-batch-size', '1', + '--no-load-optim', + '--no-load-rng', + '--no-save-optim', + '--no-save-rng', + '--no-initialization', + '--load', args.input_model_dir + ] + margs = parse_args() + margs.embed_layernorm = args.embed_layernorm + margs, checkpoint_args = load_args_from_checkpoint(margs) + margs.add_qkv_bias = args.add_qkv_bias + margs.add_dense_bias = args.add_dense_bias + margs.fp16 = True + if args.add_dense_bias: + margs.skip_bias_add = False + + # Arguments do sanity checks on the world size, but we don't care, + # so trick it into thinking we are plenty of processes + margs.world_size = margs.tensor_model_parallel_size * \ + margs.pipeline_model_parallel_size + + margs = validate_args(margs) + + def check_for_arg(arg_name, default=None): + if getattr(margs, arg_name, None) is None: + if default is not None: + setattr(margs, arg_name, default) + else: + print( + f"Checkpoint does not specify the argument {arg_name}. 
Exiting.") + print(f"Arguments: {margs}") + queue.put("exit") + + check_for_arg('tensor_model_parallel_size') + check_for_arg('pipeline_model_parallel_size') + check_for_arg('num_layers') + check_for_arg('hidden_size') + check_for_arg('seq_length') + check_for_arg('num_attention_heads') + check_for_arg('max_position_embeddings') + check_for_arg('position_embedding_type') + check_for_arg('tokenizer_type') + check_for_arg('iteration') + check_for_arg('bert_binary_head') + check_for_arg('disable_bias_linear', False) + check_for_arg('params_dtype') + check_for_arg('swiglu', False) + + margs.model_type = ModelType.encoder_or_decoder + # supress warning about torch.distributed not being initialized + module.MegatronModule.embedding_warning_printed = True + + consumed_train_samples = None + consumed_valid_samples = None + + def get_models(count, dtype): + nonlocal consumed_train_samples + nonlocal consumed_valid_samples + model_array_len = margs.virtual_pipeline_model_parallel_size + if model_array_len is None: + model_array_len = 1 + models = [[] for _ in range(model_array_len)] + pre_process = mpu.is_pipeline_first_stage() + post_process = mpu.is_pipeline_last_stage() + for rank in range(count): + mpu.set_tensor_model_parallel_rank(rank) + if margs.virtual_pipeline_model_parallel_size is not None: + model_ = [] + for i in range(margs.virtual_pipeline_model_parallel_size): + mpu.set_virtual_pipeline_model_parallel_rank(i) + # Set pre_process and post_process only after virtual rank is set. + pre_process = mpu.is_pipeline_first_stage() + post_process = mpu.is_pipeline_last_stage() + this_model = model_provider( + pre_process=pre_process, + post_process=post_process + ).to(dtype) + model_.append(this_model) + else: + pre_process = mpu.is_pipeline_first_stage() + post_process = mpu.is_pipeline_last_stage() + model_rank = 0 + model_ = [model_provider(pre_process, post_process).to(dtype)] + margs.consumed_train_samples = 0 + margs.consumed_valid_samples = 0 + load_checkpoint_mg(model_, None, None) + + if consumed_train_samples is not None: + if margs.consumed_train_samples != consumed_train_samples: + return None + else: + consumed_train_samples = margs.consumed_train_samples + if consumed_valid_samples is not None: + if margs.consumed_valid_samples != consumed_valid_samples: + return None + else: + consumed_valid_samples = margs.consumed_valid_samples + for vp_rank in range(model_array_len): + models[vp_rank].append(model_[vp_rank]) + return models + + set_global_variables(margs, build_tokenizer=False) + mpu.set_tensor_model_parallel_world_size(margs.tensor_model_parallel_size) + mpu.set_pipeline_model_parallel_world_size( + margs.pipeline_model_parallel_size) + mpu.set_virtual_pipeline_model_parallel_world_size( + margs.virtual_pipeline_model_parallel_size) + + # Get true (non-padded) vocab size + if args.true_vocab_size is not None: + true_vocab_size = args.true_vocab_size + elif args.vocab_file is not None: + vb_file = open(args.vocab_file) + vocab = json.load(vb_file) + true_vocab_size = len(vocab) + if args.true_vocab_size is not None and true_vocab_size != args.true_vocab_size: + print( + "Both --true-vocab-size and --vocab-file specified and the vocab size does not match, aborting.") + queue.put("exit") + vb_file.close() + else: + true_vocab_size = None + + # short aliases + tp_size = margs.tensor_model_parallel_size + pp_size = margs.pipeline_model_parallel_size + vp_size = margs.virtual_pipeline_model_parallel_size + if vp_size is None: + vp_size = 1 + + # Layernorm has bias; RMSNorm does 
not. + if hasattr(checkpoint_args, 'normalization'): + norm_has_bias = checkpoint_args.normalization == "LayerNorm" + else: + # older models only supported LayerNorm + norm_has_bias = True + + # metadata + md = types.SimpleNamespace() + md.model_type = 'GPT' + md.num_layers = margs.num_layers + md.hidden_size = margs.hidden_size + md.seq_length = margs.seq_length + md.num_attention_heads = margs.num_attention_heads + md.max_position_embeddings = margs.max_position_embeddings + md.tokenizer_type = margs.tokenizer_type + md.iteration = margs.iteration + md.params_dtype = margs.params_dtype + md.bert_binary_head = margs.bert_binary_head + md.output_layer = margs.untie_embeddings_and_output_weights + md.position_embedding_type = margs.position_embedding_type + md.linear_bias = margs.add_bias_linear + md.norm_has_bias = norm_has_bias + md.swiglu = margs.swiglu + md.previous_tensor_parallel_size = margs.tensor_model_parallel_size + md.previous_pipeline_parallel_size = margs.pipeline_model_parallel_size + md.true_vocab_size = true_vocab_size + md.make_vocab_size_divisible_by = margs.make_vocab_size_divisible_by + md.checkpoint_args = checkpoint_args + md.embed_layernorm = margs.embed_layernorm + + # Get first pipe stage + mpu.set_pipeline_model_parallel_rank(0) + all_models = [get_models(tp_size, md.params_dtype)] + models = all_models[0][0] + + md.consumed_train_samples = consumed_train_samples + md.consumed_valid_samples = consumed_valid_samples + queue.put(md) + + def queue_put(name, msg): + print(f"sending {name}") + msg["name"] = name + queue.put(msg) + + # Send embeddings + message_word_embedding = [] + for tp_rank in range(tp_size): + message_word_embedding.append(models[tp_rank].language_model.embedding.word_embeddings.weight.data) + message = {"word embeddings": torch.cat(message_word_embedding, dim=0)} + if md.position_embedding_type == 'learned_absolute': + message["position embeddings"] = models[0].language_model.embedding.position_embeddings.weight.data + if md.embed_layernorm: + message["word embeddings norm_w"] = models[0].language_model.embedding.word_embeddings.norm.weight.data + message["word embeddings norm_b"] = models[0].language_model.embedding.word_embeddings.norm.bias.data + queue_put("embeddings", message) + + total_layer_num = 0 + for vp_rank in range(vp_size): + mpu.set_virtual_pipeline_model_parallel_rank(vp_rank) + for pp_rank in range(pp_size): + if pp_rank > 0: + mpu.set_pipeline_model_parallel_rank(pp_rank) + if vp_rank == 0: + all_models.append(get_models(tp_size, md.params_dtype)) + models = all_models[pp_rank][vp_rank] + for layer_num, _ in enumerate(models[0].language_model.encoder.layers): + message = {} + + # Get non-parallel tensors from tp_rank 0 + layer = models[0].language_model.encoder.layers[layer_num] + message["input norm weight"] = layer.input_norm.weight.data + if norm_has_bias: + message["input norm bias"] = layer.input_norm.bias.data + message["post norm weight"] = layer.post_attention_norm.weight.data + if norm_has_bias: + message["post norm bias"] = layer.post_attention_norm.bias.data + if md.linear_bias: + message["dense bias"] = layer.self_attention.dense.bias.data + message["mlp l1 bias"] = layer.mlp.dense_4h_to_h.bias.data + if args.add_dense_bias: + message["dense bias"] = layer.self_attention.dense.bias.data + + # Grab all parallel tensors for this layer + qkv_weight = [] + qkv_bias = [] + dense_weight = [] + mlp_l0_weight = [] + mlp_l0_bias = [] + mlp_l1_weight = [] + for tp_rank, model in enumerate(models): + layer = 
model.language_model.encoder.layers[layer_num] + qkv_weight.append( + layer.self_attention.query_key_value.weight.data) + dense_weight.append(layer.self_attention.dense.weight.data) + mlp_l0_weight.append(layer.mlp.dense_h_to_4h.weight.data) + mlp_l1_weight.append(layer.mlp.dense_4h_to_h.weight.data) + if md.linear_bias: + qkv_bias.append( + layer.self_attention.query_key_value.bias.data) + mlp_l0_bias.append(layer.mlp.dense_h_to_4h.bias.data) + if args.add_qkv_bias: + qkv_bias.append( + layer.self_attention.query_key_value.bias.data) + + # Handle gated linear units + if md.swiglu: + # concat all the first halves ('W's) and all the second halves ('V's) + for tp_rank in range(tp_size): + mlp_l0_weight[tp_rank] = torch.chunk( + mlp_l0_weight[tp_rank], 2, dim=0) + message["mlp l0 weight W"] = torch.cat( + [w[0] for w in mlp_l0_weight], dim=0) + message["mlp l0 weight V"] = torch.cat( + [w[1] for w in mlp_l0_weight], dim=0) + else: + message["mlp l0 weight"] = torch.cat(mlp_l0_weight, dim=0) + + # simple concat of the rest + message["qkv weight"] = torch.cat(qkv_weight, dim=0) + message["dense weight"] = torch.cat(dense_weight, dim=1) + message["mlp l1 weight"] = torch.cat(mlp_l1_weight, dim=1) + if md.linear_bias: + message["qkv bias"] = torch.cat(qkv_bias, dim=0) + if md.swiglu: + for tp_rank in range(tp_size): + mlp_l0_bias[tp_rank] = torch.chunk( + mlp_l0_bias[tp_rank], 2, dim=0) + message["mlp l0 bias W"] = torch.cat( + [b[0] for b in mlp_l0_bias], dim=0) + message["mlp l0 bias V"] = torch.cat( + [b[1] for b in mlp_l0_bias], dim=0) + else: + message["mlp l0 bias"] = torch.cat(mlp_l0_bias, dim=0) + if args.add_qkv_bias: + message["qkv bias"] = torch.cat(qkv_bias, dim=0) + + queue_put(f"transformer layer {total_layer_num}", message) + + total_layer_num = total_layer_num + 1 + + # Send final norm from tp_rank 0 + message = { + "weight": models[0].language_model.encoder.final_norm.weight.data, + } + if norm_has_bias: + message["bias"] = models[0].language_model.encoder.final_norm.bias.data + queue_put("final norm", message) + + if md.output_layer: + message_weight = [] + for tp_rank in range(tp_size): + message_weight.append(models[tp_rank].language_model.output_layer.weight.data) + message = {"weight": torch.cat(message_weight, dim=0)} + queue_put("output layer", message) + + queue.put("done") + + +def load_checkpoint(queue, args): + try: + _load_checkpoint(queue, args) + except: + queue.put("exit") + raise
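Throughout load_utils.py the loader talks to the saver only through the queue: a metadata namespace first, then one named dict per weight group, and finally a "done" sentinel ("exit" on failure). A minimal single-process sketch of that protocol, using a plain queue.Queue in place of the torch.multiprocessing queue and strings in place of tensors:

```python
import types
import queue

q = queue.Queue()

# Loader side: metadata first, then named tensor dicts, then the sentinel.
md = types.SimpleNamespace(model_type='GPT', num_layers=2)
q.put(md)
q.put({"name": "embeddings", "word embeddings": "tensor..."})
q.put({"name": "transformer layer 0", "qkv weight": "tensor..."})
q.put("done")

# Saver side: read the metadata, then consume messages until the sentinel.
received_md = q.get()
while True:
    msg = q.get()
    if msg == "done":
        break
    print(f"received {msg['name']}")
```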