diff --git a/debug/accuracy_tools/api_accuracy_checker/bench_functions/apply_adam_w.py b/debug/accuracy_tools/api_accuracy_checker/bench_functions/apply_adam_w.py
index 4772fe21b4bf3ab395f4dff852150096a730abec..d68363a6d3c264cdf3e8d853dc9903dc6e8ccfd4 100644
--- a/debug/accuracy_tools/api_accuracy_checker/bench_functions/apply_adam_w.py
+++ b/debug/accuracy_tools/api_accuracy_checker/bench_functions/apply_adam_w.py
@@ -16,11 +16,15 @@ def npu_apply_adam_w(beta1_power, beta2_power, lr, weight_decay,
     beta2_power_out = beta2_power * beta2
     if amsgrad:
         max_grad_norm_out = torch.max(max_grad_norm, v_out)
+        if (1 - beta2_power_out) == 0:
+            beta2_power_out -= eps
         denom = torch.sqrt(torch.div(max_grad_norm_out, (1 - beta2_power_out))) + eps
     else:
         vraintain = torch.div(v_out, (1 - beta2_power_out))
         denom = torch.sqrt(vraintain) + eps
+    if (1 - beta1_power_out) == 0:
+        beta1_power_out -= eps
     var_out = var_t + torch.div(-lr * m_out, (1 - beta1_power_out)).div(denom)
-    return var_out, m_out, v_out
+    return var_out.cpu(), m_out.cpu(), v_out.cpu()
 
 
diff --git a/debug/accuracy_tools/api_accuracy_checker/bench_functions/confusion_transpose.py b/debug/accuracy_tools/api_accuracy_checker/bench_functions/confusion_transpose.py
index d7323f272b63f469930f95937e6a23390419f63d..95bd7f0cf9458d8805b8fd8fbf12dbb31d728222 100644
--- a/debug/accuracy_tools/api_accuracy_checker/bench_functions/confusion_transpose.py
+++ b/debug/accuracy_tools/api_accuracy_checker/bench_functions/confusion_transpose.py
@@ -8,7 +8,7 @@ def npu_confusion_transpose(data, perm, shape, transpose_first):
         output = data.permute(*perm).contiguous().view(shape)
     else:
         output = data.view(shape).permute(*perm)
-    return output
+    return output.cpu()
 
 
 @npu_custom_grad_functions
@@ -22,4 +22,4 @@ def npu_confusion_transpose_backward(grad, perm, shape, transpose_first):
         result = grad.permute(*perm_cal).reshape(shape_cal)
     else:
         result = grad.reshape(shape_cal).permute(*perm_cal)
-    return result
+    return result.cpu()
diff --git a/debug/accuracy_tools/api_accuracy_checker/bench_functions/fast_gelu.py b/debug/accuracy_tools/api_accuracy_checker/bench_functions/fast_gelu.py
index 3a04a24b6f3522b320a5e882e24f6eb9d9fb7338..57a495a92dd9314791f98cc33792408af8fe328b 100644
--- a/debug/accuracy_tools/api_accuracy_checker/bench_functions/fast_gelu.py
+++ b/debug/accuracy_tools/api_accuracy_checker/bench_functions/fast_gelu.py
@@ -21,7 +21,7 @@ def fast_gelu(input0):
     div_down_rec = torch.reciprocal(div_down)
     result = div_up * div_down_rec
 
-    return result
+    return result.cpu()
 
 
 @npu_custom_grad_functions
@@ -55,4 +55,4 @@ def npu_fast_gelu_backward(grad, input_x):
     result_temp = div_up * div_down_rec
     result = grad * result_temp
 
-    return result
+    return result.cpu()
diff --git a/debug/accuracy_tools/api_accuracy_checker/bench_functions/layer_norm_eval.py b/debug/accuracy_tools/api_accuracy_checker/bench_functions/layer_norm_eval.py
index f257c356695f62cd38ff4185cbee93f553084534..28dd5ef909b80ec1e0bfa4b4aa9d13f322fd3c54 100644
--- a/debug/accuracy_tools/api_accuracy_checker/bench_functions/layer_norm_eval.py
+++ b/debug/accuracy_tools/api_accuracy_checker/bench_functions/layer_norm_eval.py
@@ -5,4 +5,4 @@ from api_accuracy_checker.common.function_factory import npu_custom_functions
 @npu_custom_functions
 def npu_layer_norm_eval(data, normalized_shape):
     result = torch.nn.functional.layer_norm(data, normalized_shape)
-    return result
+    return result.cpu()
diff --git a/debug/accuracy_tools/api_accuracy_checker/bench_functions/linear.py b/debug/accuracy_tools/api_accuracy_checker/bench_functions/linear.py
index 48bb08c4f3c5dc3f7b978933cbd7dcc02ee9d01f..5aef70b6dad438d3ac8899b671ad9e3942b8ef7b 100644
--- a/debug/accuracy_tools/api_accuracy_checker/bench_functions/linear.py
+++ b/debug/accuracy_tools/api_accuracy_checker/bench_functions/linear.py
@@ -5,11 +5,11 @@ from api_accuracy_checker.common.function_factory import npu_custom_functions, n
 @npu_custom_functions
 def npu_linear(x, weight, bias):
     output = torch.nn.functional.linear(x, weight, bias)
-    return output
+    return output.cpu()
 
 
 @npu_custom_grad_functions
 def npu_linear_backward(grad, input_data, weight):
     input_grad = torch.matmul(grad, weight)
     weight_grad = torch.matmul(grad.t(), input_data)
-    return input_grad, weight_grad
+    return input_grad.cpu(), weight_grad.cpu()
diff --git a/debug/accuracy_tools/api_accuracy_checker/bench_functions/matmul_backward.py b/debug/accuracy_tools/api_accuracy_checker/bench_functions/matmul_backward.py
index c7685090505a185aa8c922a17c824f952fc3c3ea..4a05b3e13059d714b79a7bee049f98aca78015fc 100644
--- a/debug/accuracy_tools/api_accuracy_checker/bench_functions/matmul_backward.py
+++ b/debug/accuracy_tools/api_accuracy_checker/bench_functions/matmul_backward.py
@@ -48,4 +48,4 @@ def matmul_backward(grad, self, other, mask):
         grad_self = torch.matmul(grad, other.transpose(-1, -2)) if mask[0] else grad_self
         grad_other = torch.matmul(self.transpose(-1, -2), grad) if mask[1] else grad_other
 
-    return grad_self, grad_other
+    return grad_self.cpu(), grad_other.cpu()
diff --git a/debug/accuracy_tools/api_accuracy_checker/bench_functions/rms_norm.py b/debug/accuracy_tools/api_accuracy_checker/bench_functions/rms_norm.py
index bdf7ea616a2e0434dd30969875110fa2e28214b0..75e03e314c6b09e16661777ba9b8e173e0337e15 100644
--- a/debug/accuracy_tools/api_accuracy_checker/bench_functions/rms_norm.py
+++ b/debug/accuracy_tools/api_accuracy_checker/bench_functions/rms_norm.py
@@ -6,7 +6,7 @@ from api_accuracy_checker.common.function_factory import npu_custom_functions, n
 def npu_rms_norm(x, gamma, eps=1e-5):
     rstd = torch.rsqrt(torch.mean(torch.pow(x, 2), axis=-1, keepdim=True) + eps)
     res = x * rstd * gamma
-    return res, rstd
+    return res.cpu(), rstd.cpu()
 
 
 @npu_custom_grad_functions
@@ -14,5 +14,5 @@ def npu_rms_norm_backward(grad, x, gamma, rstd):
     mean_gy = (grad * x * gamma * rstd).mean(dim=-1, keepdim=True)
     grad_x = (grad * gamma - x * rstd * mean_gy) * rstd
     grad_gamma = x * grad * rstd
-    return grad_x, grad_gamma
+    return grad_x.cpu(), grad_gamma.cpu()
 
diff --git a/debug/accuracy_tools/api_accuracy_checker/bench_functions/rotary_mul.py b/debug/accuracy_tools/api_accuracy_checker/bench_functions/rotary_mul.py
index cad5459c767fef0f4de2c158f40bbb44c9ea12d2..b61924b3ce49b1c9dd8d72909577ad256eed419f 100644
--- a/debug/accuracy_tools/api_accuracy_checker/bench_functions/rotary_mul.py
+++ b/debug/accuracy_tools/api_accuracy_checker/bench_functions/rotary_mul.py
@@ -7,11 +7,14 @@ def npu_rotary_mul(x, r1, r2):
     x1, x2 = torch.chunk(x, 2, -1)
     x_new = torch.cat((-x2, x1), dim=-1)
     output = r1 * x + r2 * x_new
-    return output
+    return output.cpu()
 
 
 @npu_custom_grad_functions
 def npu_rotary_mul_backward(dy_tensor, x, r1, r2):
+    x.requires_grad = True
+    r1.requires_grad = True
+    r2.requires_grad = True
     # golden
     x1, x2 = torch.chunk(x, 2, -1)
     x_new = torch.cat((-x2, x1), dim=-1)
@@ -49,4 +52,4 @@ def npu_rotary_mul_backward(dy_tensor, x, r1, r2):
         for j in range(x_shape[2]):
             r2_grad[:, 0, 0, :] += (x_new2[:, i, j, :] * grad[:, i, j, :])
             r1_grad[:, 0, 0, :] += (h[:, i, j, :] * grad[:, i, j, :])
-    return x.grad, r1_grad, r2_grad
+    return x.grad.cpu(), r1_grad.cpu(), r2_grad.cpu()
diff --git a/debug/accuracy_tools/api_accuracy_checker/bench_functions/scaled_mask_softmax.py b/debug/accuracy_tools/api_accuracy_checker/bench_functions/scaled_mask_softmax.py
index 8e99ab2008f278eac0dcbe32f5bbd325c443792e..22ba1842da7e40d02bb1eacc65d92f7bb565ac91 100644
--- a/debug/accuracy_tools/api_accuracy_checker/bench_functions/scaled_mask_softmax.py
+++ b/debug/accuracy_tools/api_accuracy_checker/bench_functions/scaled_mask_softmax.py
@@ -11,7 +11,7 @@ def npu_scaled_masked_softmax(x, mask, scale, fixed_triu_mask):
     x = x - torch.max(x, dim=-1, keepdims=True)[0]
     x = torch.exp(x.float())
     y = torch.div(x, torch.sum(x, dim=-1, keepdims=True))
-    return y.to(dtype)
+    return y.to(dtype).cpu()
 
 
 @npu_custom_grad_functions
@@ -26,4 +26,4 @@ def npu_scaled_masked_softmax_backward(y_grad, y, mask, scale, fixed_triu_mask):
     x_grad = x_grad * y
     x_grad = x_grad * scale
     x_grad = x_grad.masked_fill(mask, value=0)
-    return x_grad.to(dtype)
+    return x_grad.to(dtype).cpu()
diff --git a/debug/accuracy_tools/api_accuracy_checker/bench_functions/swiglu.py b/debug/accuracy_tools/api_accuracy_checker/bench_functions/swiglu.py
index 6685b5f47be0fd4c84fb0bbc82190d6efdc6896d..bd2a059b85ca86f1f4cd9f0203b5ebb7ce0b001e 100644
--- a/debug/accuracy_tools/api_accuracy_checker/bench_functions/swiglu.py
+++ b/debug/accuracy_tools/api_accuracy_checker/bench_functions/swiglu.py
@@ -16,7 +16,7 @@ def npu_swiglu(x, dim=-1):
     tensor_out_float = torch.nn.functional.silu(tensor_self_float).type(tensor_dtype).type(
         torch.float32) * tensor_other_float
     output_data = tensor_out_float.type(tensor_dtype)
-    return output_data
+    return output_data.cpu()
 
 
 @npu_custom_grad_functions
@@ -46,7 +46,7 @@ def npu_swiglu_backward(grad, x, dim=-1):
         tensor_out1 = torch.mul(torch.mul(in_tensors[1], swish_grad(1.0, in_tensors[0])), tensor_grad_out)
         tensor_out2 = torch.mul(tensor_grad_out, swish(1.0, in_tensors[0]))
         output = torch.cat((tensor_out1, tensor_out2), dim)
-    return output
+    return output.cpu()
 
 
 def swish_grad(beta, x):
diff --git a/debug/accuracy_tools/api_accuracy_checker/common/config.py b/debug/accuracy_tools/api_accuracy_checker/common/config.py
index 8001257e8ed1758ef4ed6fed7b1c9dc9bd108370..8a5ec88c59213b7d693fe7fa0a08bd90c32b68fa 100644
--- a/debug/accuracy_tools/api_accuracy_checker/common/config.py
+++ b/debug/accuracy_tools/api_accuracy_checker/common/config.py
@@ -25,7 +25,7 @@ class Config:
             'jit_compile': bool,
            'precision': int,
             'is_online': bool,
-            'is_golden': bool,
+            'is_benchmark_device': bool,
             'host': str,
             'port': int,
             'rank_list': list
@@ -62,7 +62,7 @@ class Config:
         return '\n'.join(f"{key}={value}" for key, value in self.config.items())
 
     def update_config(self, dump_path=None, real_data=None, target_iter=None, white_list=None, enable_dataloader=None,
-                      is_online=None, port=None, host=None, rank_list=None):
+                      is_online=None, is_benchmark_device=True, port=None, host=None, rank_list=None):
         args = {
             "dump_path": dump_path if dump_path is not None else self.config.get("dump_path", './'),
             "real_data": real_data if real_data is not None else self.config.get("real_data", False),
@@ -71,9 +71,9 @@ class Config:
             "enable_dataloader": enable_dataloader if enable_dataloader is not None
             else self.config.get("enable_dataloader", False),
             "is_online": is_online if is_online is not None else self.config.get("is_online", False),
-            "is_golden": False if is_online and host is not None else True,
-            "host": host if host is not None else self.config.get("host", "127.0.0.1"),
-            "port": port if port is not None else self.config.get("port", 30001),
+            "is_benchmark_device": is_benchmark_device,
+            "host": host if host is not None else self.config.get("host", ""),
+            "port": port if port is not None else self.config.get("port", -1),
             "rank_list": rank_list if rank_list is not None else self.config.get("rank_list", [0])
         }
         for key, value in args.items():
diff --git a/debug/accuracy_tools/api_accuracy_checker/config.yaml b/debug/accuracy_tools/api_accuracy_checker/config.yaml
index 24cd96997e3cc0a662966c0d8cc285a9df6eaf5e..8241cbe5066acfaf3437cfa85af2c9b1f6e5d778 100644
--- a/debug/accuracy_tools/api_accuracy_checker/config.yaml
+++ b/debug/accuracy_tools/api_accuracy_checker/config.yaml
@@ -7,8 +7,8 @@ error_data_path: './'
 jit_compile: True
 precision: 14
 is_online: False
-is_golden: True
-host: "127.0.0.1"
-port: 30001
+is_benchmark_device: True
+host: ""
+port: -1
 rank_list: [0]
\ No newline at end of file
diff --git a/debug/accuracy_tools/api_accuracy_checker/dump/dump.py b/debug/accuracy_tools/api_accuracy_checker/dump/dump.py
index 535b912dd24960e1470506a2b0b9489c47dc6503..4ebbd488fd3312d2464f5204171909f182d722e8 100644
--- a/debug/accuracy_tools/api_accuracy_checker/dump/dump.py
+++ b/debug/accuracy_tools/api_accuracy_checker/dump/dump.py
@@ -69,13 +69,12 @@ class DumpUtil(object):
     call_num = 0
     phase = "all"
     rank_list = msCheckerConfig.rank_list
-    attl_config = None
     attl = None
-    if msCheckerConfig.is_online:
-        attl_config = ATTLConfig(msCheckerConfig.is_golden, connect_ip=msCheckerConfig.host,
+    if msCheckerConfig.is_online and not msCheckerConfig.is_benchmark_device:
+        attl_config = ATTLConfig(False, connect_ip=msCheckerConfig.host,
                                  connect_port=msCheckerConfig.port)
         need_dump = dist.get_rank() in msCheckerConfig.rank_list if dist.is_initialized() else True
-        attl = ATTL('gpu' if msCheckerConfig.is_golden else 'npu', attl_config, need_dump=need_dump)
+        attl = ATTL('npu', attl_config, need_dump=need_dump)
 
     @staticmethod
     def set_dump_switch(switch):
diff --git a/debug/accuracy_tools/api_accuracy_checker/run_ut/run_ut.py b/debug/accuracy_tools/api_accuracy_checker/run_ut/run_ut.py
index e45c56732bccf4947d049ec0c5385218d2a02653..bfc3451664487ac6a2b1ddaa238bdf10d50acfc6 100644
--- a/debug/accuracy_tools/api_accuracy_checker/run_ut/run_ut.py
+++ b/debug/accuracy_tools/api_accuracy_checker/run_ut/run_ut.py
@@ -316,6 +316,9 @@ def run_torch_api_online(api_full_name, api_data, backward_content):
 
     device_out = exec_api(api_type, api_name, args, kwargs)
     device_out = device_out.cpu() if hasattr(device_out, "cpu") else device_out
+    if "torch.return_types" in str(type(device_out)):
+        device_out = tuple(device_out)
+        out = tuple(out)
     return UtDataInfo(None, None, out, device_out, None, in_fwd_data_list, None, rank=api_data.rank)
 
 
@@ -384,7 +387,7 @@ def get_validated_details_csv_path(validated_result_csv_path):
 
 
 def init_attl():
-    attl = ATTL('gpu', ATTLConfig(is_golden=True, connect_port=msCheckerConfig.port))
+    attl = ATTL('gpu', ATTLConfig(is_benchmark_device=True, connect_ip=msCheckerConfig.host, connect_port=msCheckerConfig.port))
     return attl
 
 
diff --git a/debug/accuracy_tools/api_accuracy_checker/tensor_transport_layer/attl.py b/debug/accuracy_tools/api_accuracy_checker/tensor_transport_layer/attl.py
index fcf93b58a49eb1fc38286f632dcf5dd47b04630c..5ed386c4c53cd0a8afac1756d50b192e16b5be7f 100644
--- a/debug/accuracy_tools/api_accuracy_checker/tensor_transport_layer/attl.py
+++ b/debug/accuracy_tools/api_accuracy_checker/tensor_transport_layer/attl.py
@@ -1,5 +1,6 @@
 import io
 import time
+import re
 from multiprocessing import Queue
 from typing import Optional, Union, Dict, Any
 from collections import namedtuple
@@ -20,9 +21,9 @@ BufferType = Union[ApiData, Dict[str, Any], str] # Union[Tensor, Tuple[Optional
 @dataclass
 class ATTLConfig:
     # net_config: dict
-    is_golden: bool
-    connect_ip: str = "127.0.0.1"
-    connect_port: int = 8006
+    is_benchmark_device: bool
+    connect_ip: str
+    connect_port: int
     # storage_config
     check_sum: bool = True
     queue_size: int = 50
@@ -38,7 +39,8 @@ class ATTL:
         self.dequeue_list = []
         self.message_end = False
         self.kill_progress = False
-        if self.session_config.is_golden:
+        self.check_attl_config()
+        if self.session_config.is_benchmark_device:
             self.socket_manager = TCPServer(self.session_config.connect_port,
                                             self.data_queue,
                                             self.session_config.check_sum)
@@ -49,6 +51,14 @@ class ATTL:
                                             self.session_config.check_sum)
             self.socket_manager.start()
 
+    def check_attl_config(self):
+        ipv4_pattern = "([1-9]?\d|1\d{2}|2[0-4]\d|25[0-5])(\.([1-9]?\d|1\d{2}|2[0-4]\d|25[0-5])){3}$"
+        if not re.match(ipv4_pattern, self.session_config.connect_ip):
+            raise Exception(f"host {self.session_config.connect_ip} is invalid.")
+        if not (0 < self.session_config.connect_port <= 65535):
+            raise Exception(f"port {self.session_config.connect_port} is invalid.")
+
+
     def stop_serve(self):
         if isinstance(self.socket_manager, TCPServer):
             self.socket_manager.stop()
@@ -137,14 +147,9 @@ def move2target_device(buffer: ApiData, target_device):
     new_kwargs = move2device_exec(buffer.kwargs, target_device)
 
     # handle result
-    new_results = []
-    res = buffer.result[0] if isinstance(buffer.result, (tuple, list)) else buffer.result
-    if isinstance(res, torch.Tensor) and res.device.type != target_device:
-        new_results.append(res.detach().to(target_device))
-    else:
-        new_results.append(res)
+    new_results = move2device_exec(buffer.result, target_device)
 
     if target_device == torch.device('cpu') or target_device == "cpu":
-        return ApiData(buffer.name, tuple(new_args), new_kwargs, new_results[0], buffer.step, buffer.rank)
+        return ApiData(buffer.name, tuple(new_args), new_kwargs, new_results, buffer.step, buffer.rank)
     else:
         return ApiData(buffer.name, tuple(new_args), new_kwargs, buffer.result, buffer.step, buffer.rank)