diff --git a/debug/accuracy_tools/api_accuracy_checker/bench_functions/apply_adam_w.py b/debug/accuracy_tools/api_accuracy_checker/bench_functions/apply_adam_w.py
index 4772fe21b4bf3ab395f4dff852150096a730abec..d68363a6d3c264cdf3e8d853dc9903dc6e8ccfd4 100644
--- a/debug/accuracy_tools/api_accuracy_checker/bench_functions/apply_adam_w.py
+++ b/debug/accuracy_tools/api_accuracy_checker/bench_functions/apply_adam_w.py
@@ -16,11 +16,15 @@ def npu_apply_adam_w(beta1_power, beta2_power, lr, weight_decay,
     beta2_power_out = beta2_power * beta2
     if amsgrad:
         max_grad_norm_out = torch.max(max_grad_norm, v_out)
+        if (1 - beta2_power_out) == 0:
+            beta2_power_out -= eps
         denom = torch.sqrt(torch.div(max_grad_norm_out, (1 - beta2_power_out))) + eps
     else:
         vraintain = torch.div(v_out, (1 - beta2_power_out))
         denom = torch.sqrt(vraintain) + eps
+    if (1 - beta1_power_out) == 0:
+        beta1_power_out -= eps
     var_out = var_t + torch.div(-lr * m_out, (1 - beta1_power_out)).div(denom)
-    return var_out, m_out, v_out
+    return var_out.cpu(), m_out.cpu(), v_out.cpu()
 
 
diff --git a/debug/accuracy_tools/api_accuracy_checker/bench_functions/confusion_transpose.py b/debug/accuracy_tools/api_accuracy_checker/bench_functions/confusion_transpose.py
index d7323f272b63f469930f95937e6a23390419f63d..95bd7f0cf9458d8805b8fd8fbf12dbb31d728222 100644
--- a/debug/accuracy_tools/api_accuracy_checker/bench_functions/confusion_transpose.py
+++ b/debug/accuracy_tools/api_accuracy_checker/bench_functions/confusion_transpose.py
@@ -8,7 +8,7 @@ def npu_confusion_transpose(data, perm, shape, transpose_first):
         output = data.permute(*perm).contiguous().view(shape)
     else:
         output = data.view(shape).permute(*perm)
-    return output
+    return output.cpu()
 
 
 @npu_custom_grad_functions
@@ -22,4 +22,4 @@ def npu_confusion_transpose_backward(grad, perm, shape, transpose_first):
         result = grad.permute(*perm_cal).reshape(shape_cal)
     else:
         result = grad.reshape(shape_cal).permute(*perm_cal)
-    return result
+    return result.cpu()
diff --git a/debug/accuracy_tools/api_accuracy_checker/bench_functions/fast_gelu.py b/debug/accuracy_tools/api_accuracy_checker/bench_functions/fast_gelu.py
index 3a04a24b6f3522b320a5e882e24f6eb9d9fb7338..57a495a92dd9314791f98cc33792408af8fe328b 100644
--- a/debug/accuracy_tools/api_accuracy_checker/bench_functions/fast_gelu.py
+++ b/debug/accuracy_tools/api_accuracy_checker/bench_functions/fast_gelu.py
@@ -21,7 +21,7 @@ def fast_gelu(input0):
     div_down_rec = torch.reciprocal(div_down)
     result = div_up * div_down_rec
 
-    return result
+    return result.cpu()
 
 
 @npu_custom_grad_functions
@@ -55,4 +55,4 @@ def npu_fast_gelu_backward(grad, input_x):
     result_temp = div_up * div_down_rec
     result = grad * result_temp
 
-    return result
+    return result.cpu()
diff --git a/debug/accuracy_tools/api_accuracy_checker/bench_functions/layer_norm_eval.py b/debug/accuracy_tools/api_accuracy_checker/bench_functions/layer_norm_eval.py
index f257c356695f62cd38ff4185cbee93f553084534..28dd5ef909b80ec1e0bfa4b4aa9d13f322fd3c54 100644
--- a/debug/accuracy_tools/api_accuracy_checker/bench_functions/layer_norm_eval.py
+++ b/debug/accuracy_tools/api_accuracy_checker/bench_functions/layer_norm_eval.py
@@ -5,4 +5,4 @@ from api_accuracy_checker.common.function_factory import npu_custom_functions
 @npu_custom_functions
 def npu_layer_norm_eval(data, normalized_shape):
     result = torch.nn.functional.layer_norm(data, normalized_shape)
-    return result
+    return result.cpu()
diff --git a/debug/accuracy_tools/api_accuracy_checker/bench_functions/linear.py b/debug/accuracy_tools/api_accuracy_checker/bench_functions/linear.py
index 48bb08c4f3c5dc3f7b978933cbd7dcc02ee9d01f..5aef70b6dad438d3ac8899b671ad9e3942b8ef7b 100644
--- a/debug/accuracy_tools/api_accuracy_checker/bench_functions/linear.py
+++ b/debug/accuracy_tools/api_accuracy_checker/bench_functions/linear.py
@@ -5,11 +5,11 @@ from api_accuracy_checker.common.function_factory import npu_custom_functions, n
 @npu_custom_functions
 def npu_linear(x, weight, bias):
     output = torch.nn.functional.linear(x, weight, bias)
-    return output
+    return output.cpu()
 
 
 @npu_custom_grad_functions
 def npu_linear_backward(grad, input_data, weight):
     input_grad = torch.matmul(grad, weight)
     weight_grad = torch.matmul(grad.t(), input_data)
-    return input_grad, weight_grad
+    return input_grad.cpu(), weight_grad.cpu()
diff --git a/debug/accuracy_tools/api_accuracy_checker/bench_functions/matmul_backward.py b/debug/accuracy_tools/api_accuracy_checker/bench_functions/matmul_backward.py
index c7685090505a185aa8c922a17c824f952fc3c3ea..4a05b3e13059d714b79a7bee049f98aca78015fc 100644
--- a/debug/accuracy_tools/api_accuracy_checker/bench_functions/matmul_backward.py
+++ b/debug/accuracy_tools/api_accuracy_checker/bench_functions/matmul_backward.py
@@ -48,4 +48,4 @@ def matmul_backward(grad, self, other, mask):
         grad_self = torch.matmul(grad, other.transpose(-1, -2)) if mask[0] else grad_self
         grad_other = torch.matmul(self.transpose(-1, -2), grad) if mask[1] else grad_other
 
-    return grad_self, grad_other
+    return grad_self.cpu(), grad_other.cpu()
diff --git a/debug/accuracy_tools/api_accuracy_checker/bench_functions/rms_norm.py b/debug/accuracy_tools/api_accuracy_checker/bench_functions/rms_norm.py
index bdf7ea616a2e0434dd30969875110fa2e28214b0..75e03e314c6b09e16661777ba9b8e173e0337e15 100644
--- a/debug/accuracy_tools/api_accuracy_checker/bench_functions/rms_norm.py
+++ b/debug/accuracy_tools/api_accuracy_checker/bench_functions/rms_norm.py
@@ -6,7 +6,7 @@ from api_accuracy_checker.common.function_factory import npu_custom_functions, n
 def npu_rms_norm(x, gamma, eps=1e-5):
     rstd = torch.rsqrt(torch.mean(torch.pow(x, 2), axis=-1, keepdim=True) + eps)
     res = x * rstd * gamma
-    return res, rstd
+    return res.cpu(), rstd.cpu()
 
 
 @npu_custom_grad_functions
@@ -14,5 +14,5 @@ def npu_rms_norm_backward(grad, x, gamma, rstd):
     mean_gy = (grad * x * gamma * rstd).mean(dim=-1, keepdim=True)
     grad_x = (grad * gamma - x * rstd * mean_gy) * rstd
     grad_gamma = x * grad * rstd
-    return grad_x, grad_gamma
+    return grad_x.cpu(), grad_gamma.cpu()
 
diff --git a/debug/accuracy_tools/api_accuracy_checker/bench_functions/rotary_mul.py b/debug/accuracy_tools/api_accuracy_checker/bench_functions/rotary_mul.py
index cad5459c767fef0f4de2c158f40bbb44c9ea12d2..b61924b3ce49b1c9dd8d72909577ad256eed419f 100644
--- a/debug/accuracy_tools/api_accuracy_checker/bench_functions/rotary_mul.py
+++ b/debug/accuracy_tools/api_accuracy_checker/bench_functions/rotary_mul.py
@@ -7,11 +7,14 @@ def npu_rotary_mul(x, r1, r2):
     x1, x2 = torch.chunk(x, 2, -1)
     x_new = torch.cat((-x2, x1), dim=-1)
     output = r1 * x + r2 * x_new
-    return output
+    return output.cpu()
 
 
 @npu_custom_grad_functions
 def npu_rotary_mul_backward(dy_tensor, x, r1, r2):
+    x.requires_grad = True
+    r1.requires_grad = True
+    r2.requires_grad = True
     # golden
     x1, x2 = torch.chunk(x, 2, -1)
     x_new = torch.cat((-x2, x1), dim=-1)
@@ -49,4 +52,4 @@ def npu_rotary_mul_backward(dy_tensor, x, r1, r2):
         for j in range(x_shape[2]):
             r2_grad[:, 0, 0, :] += (x_new2[:, i, j, :] * grad[:, i, j, :])
             r1_grad[:, 0, 0, :] += (h[:, i, j, :] * grad[:, i, j, :])
-    return x.grad, r1_grad, r2_grad
+    return x.grad.cpu(), r1_grad.cpu(), r2_grad.cpu()
diff --git a/debug/accuracy_tools/api_accuracy_checker/bench_functions/scaled_mask_softmax.py b/debug/accuracy_tools/api_accuracy_checker/bench_functions/scaled_mask_softmax.py
index 8e99ab2008f278eac0dcbe32f5bbd325c443792e..22ba1842da7e40d02bb1eacc65d92f7bb565ac91 100644
--- a/debug/accuracy_tools/api_accuracy_checker/bench_functions/scaled_mask_softmax.py
+++ b/debug/accuracy_tools/api_accuracy_checker/bench_functions/scaled_mask_softmax.py
@@ -11,7 +11,7 @@ def npu_scaled_masked_softmax(x, mask, scale, fixed_triu_mask):
     x = x - torch.max(x, dim=-1, keepdims=True)[0]
     x = torch.exp(x.float())
     y = torch.div(x, torch.sum(x, dim=-1, keepdims=True))
-    return y.to(dtype)
+    return y.to(dtype).cpu()
 
 
 @npu_custom_grad_functions
@@ -26,4 +26,4 @@ def npu_scaled_masked_softmax_backward(y_grad, y, mask, scale, fixed_triu_mask):
     x_grad = x_grad * y
     x_grad = x_grad * scale
     x_grad = x_grad.masked_fill(mask, value=0)
-    return x_grad.to(dtype)
+    return x_grad.to(dtype).cpu()
diff --git a/debug/accuracy_tools/api_accuracy_checker/bench_functions/swiglu.py b/debug/accuracy_tools/api_accuracy_checker/bench_functions/swiglu.py
index 6685b5f47be0fd4c84fb0bbc82190d6efdc6896d..bd2a059b85ca86f1f4cd9f0203b5ebb7ce0b001e 100644
--- a/debug/accuracy_tools/api_accuracy_checker/bench_functions/swiglu.py
+++ b/debug/accuracy_tools/api_accuracy_checker/bench_functions/swiglu.py
@@ -16,7 +16,7 @@ def npu_swiglu(x, dim=-1):
     tensor_out_float = torch.nn.functional.silu(tensor_self_float).type(tensor_dtype).type(
         torch.float32) * tensor_other_float
     output_data = tensor_out_float.type(tensor_dtype)
-    return output_data
+    return output_data.cpu()
 
 
 @npu_custom_grad_functions
@@ -46,7 +46,7 @@ def npu_swiglu_backward(grad, x, dim=-1):
         tensor_out1 = torch.mul(torch.mul(in_tensors[1], swish_grad(1.0, in_tensors[0])), tensor_grad_out)
         tensor_out2 = torch.mul(tensor_grad_out, swish(1.0, in_tensors[0]))
         output = torch.cat((tensor_out1, tensor_out2), dim)
-    return output
+    return output.cpu()
 
 
 def swish_grad(beta, x):
diff --git a/debug/accuracy_tools/api_accuracy_checker/common/config.py b/debug/accuracy_tools/api_accuracy_checker/common/config.py
index 8001257e8ed1758ef4ed6fed7b1c9dc9bd108370..8a5ec88c59213b7d693fe7fa0a08bd90c32b68fa 100644
--- a/debug/accuracy_tools/api_accuracy_checker/common/config.py
+++ b/debug/accuracy_tools/api_accuracy_checker/common/config.py
@@ -25,7 +25,7 @@ class Config:
             'jit_compile': bool,
            'precision': int,
             'is_online': bool,
-            'is_golden': bool,
+            'is_benchmark_device': bool,
             'host': str,
             'port': int,
             'rank_list': list
@@ -62,7 +62,7 @@ class Config:
         return '\n'.join(f"{key}={value}" for key, value in self.config.items())
 
     def update_config(self, dump_path=None, real_data=None, target_iter=None, white_list=None, enable_dataloader=None,
-                      is_online=None, port=None, host=None, rank_list=None):
+                      is_online=None, is_benchmark_device=True, port=None, host=None, rank_list=None):
         args = {
             "dump_path": dump_path if dump_path is not None else self.config.get("dump_path", './'),
             "real_data": real_data if real_data is not None else self.config.get("real_data", False),
@@ -71,9 +71,9 @@ class Config:
             "enable_dataloader": enable_dataloader if enable_dataloader is not None
             else self.config.get("enable_dataloader", False),
             "is_online": is_online if is_online is not None else self.config.get("is_online", False),
-            "is_golden": False if is_online and host is not None else True,
-            "host": host if host is not None else self.config.get("host", "127.0.0.1"),
-            "port": port if port is not None else self.config.get("port", 30001),
+            "is_benchmark_device": is_benchmark_device,
+            "host": host if host is not None else self.config.get("host", ""),
+            "port": port if port is not None else self.config.get("port", -1),
             "rank_list": rank_list if rank_list is not None else self.config.get("rank_list", [0])
         }
         for key, value in args.items():
diff --git a/debug/accuracy_tools/api_accuracy_checker/config.yaml b/debug/accuracy_tools/api_accuracy_checker/config.yaml
index 24cd96997e3cc0a662966c0d8cc285a9df6eaf5e..8241cbe5066acfaf3437cfa85af2c9b1f6e5d778 100644
--- a/debug/accuracy_tools/api_accuracy_checker/config.yaml
+++ b/debug/accuracy_tools/api_accuracy_checker/config.yaml
@@ -7,8 +7,8 @@ error_data_path: './'
 jit_compile: True
 precision: 14
 is_online: False
-is_golden: True
-host: "127.0.0.1"
-port: 30001
+is_benchmark_device: True
+host: ""
+port: -1
 rank_list: [0]
\ No newline at end of file
diff --git a/debug/accuracy_tools/api_accuracy_checker/dump/dump.py b/debug/accuracy_tools/api_accuracy_checker/dump/dump.py
index 535b912dd24960e1470506a2b0b9489c47dc6503..4ebbd488fd3312d2464f5204171909f182d722e8 100644
--- a/debug/accuracy_tools/api_accuracy_checker/dump/dump.py
+++ b/debug/accuracy_tools/api_accuracy_checker/dump/dump.py
@@ -69,13 +69,12 @@ class DumpUtil(object):
     call_num = 0
     phase = "all"
     rank_list = msCheckerConfig.rank_list
-    attl_config = None
     attl = None
-    if msCheckerConfig.is_online:
-        attl_config = ATTLConfig(msCheckerConfig.is_golden, connect_ip=msCheckerConfig.host,
+    if msCheckerConfig.is_online and not msCheckerConfig.is_benchmark_device:
+        attl_config = ATTLConfig(False, connect_ip=msCheckerConfig.host,
                                  connect_port=msCheckerConfig.port)
         need_dump = dist.get_rank() in msCheckerConfig.rank_list if dist.is_initialized() else True
-        attl = ATTL('gpu' if msCheckerConfig.is_golden else 'npu', attl_config, need_dump=need_dump)
+        attl = ATTL('npu', attl_config, need_dump=need_dump)
 
     @staticmethod
     def set_dump_switch(switch):
diff --git a/debug/accuracy_tools/api_accuracy_checker/run_ut/run_ut.py b/debug/accuracy_tools/api_accuracy_checker/run_ut/run_ut.py
index e45c56732bccf4947d049ec0c5385218d2a02653..bfc3451664487ac6a2b1ddaa238bdf10d50acfc6 100644
--- a/debug/accuracy_tools/api_accuracy_checker/run_ut/run_ut.py
+++ b/debug/accuracy_tools/api_accuracy_checker/run_ut/run_ut.py
@@ -316,6 +316,9 @@ def run_torch_api_online(api_full_name, api_data, backward_content):
 
     device_out = exec_api(api_type, api_name, args, kwargs)
     device_out = device_out.cpu() if hasattr(device_out, "cpu") else device_out
+    if "torch.return_types" in str(type(device_out)):
+        device_out = tuple(device_out)
+        out = tuple(out)
     return UtDataInfo(None, None, out, device_out, None, in_fwd_data_list, None, rank=api_data.rank)
 
 
@@ -384,7 +387,7 @@ def get_validated_details_csv_path(validated_result_csv_path):
 
 
 def init_attl():
-    attl = ATTL('gpu', ATTLConfig(is_golden=True, connect_port=msCheckerConfig.port))
+    attl = ATTL('gpu', ATTLConfig(is_benchmark_device=True, connect_ip=msCheckerConfig.host, connect_port=msCheckerConfig.port))
     return attl
 
 
diff --git a/debug/accuracy_tools/api_accuracy_checker/tensor_transport_layer/attl.py b/debug/accuracy_tools/api_accuracy_checker/tensor_transport_layer/attl.py
index fcf93b58a49eb1fc38286f632dcf5dd47b04630c..5ed386c4c53cd0a8afac1756d50b192e16b5be7f 100644
--- a/debug/accuracy_tools/api_accuracy_checker/tensor_transport_layer/attl.py
+++ b/debug/accuracy_tools/api_accuracy_checker/tensor_transport_layer/attl.py
@@ -1,5 +1,6 @@
 import io
 import time
+import re
 from multiprocessing import Queue
 from typing import Optional, Union, Dict, Any
 from collections import namedtuple
@@ -20,9 +21,9 @@ BufferType = Union[ApiData, Dict[str, Any], str] # Union[Tensor, Tuple[Optional
 @dataclass
 class ATTLConfig:
     # net_config: dict
-    is_golden: bool
-    connect_ip: str = "127.0.0.1"
-    connect_port: int = 8006
+    is_benchmark_device: bool
+    connect_ip: str
+    connect_port: int
     # storage_config
     check_sum: bool = True
     queue_size: int = 50
@@ -38,7 +39,8 @@ class ATTL:
         self.dequeue_list = []
         self.message_end = False
         self.kill_progress = False
-        if self.session_config.is_golden:
+        self.check_attl_config()
+        if self.session_config.is_benchmark_device:
             self.socket_manager = TCPServer(self.session_config.connect_port,
                                             self.data_queue,
                                             self.session_config.check_sum)
@@ -49,6 +51,14 @@ class ATTL:
                                             self.session_config.check_sum)
             self.socket_manager.start()
 
+    def check_attl_config(self):
+        ipv4_pattern = "([1-9]?\d|1\d{2}|2[0-4]\d|25[0-5])(\.([1-9]?\d|1\d{2}|2[0-4]\d|25[0-5])){3}$"
+        if not re.match(ipv4_pattern, self.session_config.connect_ip):
+            raise Exception(f"host {self.session_config.connect_ip} is invalid.")
+        if not (0 < self.session_config.connect_port <= 65535):
+            raise Exception(f"port {self.session_config.connect_port} is invalid.")
+
+
     def stop_serve(self):
         if isinstance(self.socket_manager, TCPServer):
             self.socket_manager.stop()
@@ -137,14 +147,9 @@ def move2target_device(buffer: ApiData, target_device):
     new_kwargs = move2device_exec(buffer.kwargs, target_device)
 
     # handle result
-    new_results = []
-    res = buffer.result[0] if isinstance(buffer.result, (tuple, list)) else buffer.result
-    if isinstance(res, torch.Tensor) and res.device.type != target_device:
-        new_results.append(res.detach().to(target_device))
-    else:
-        new_results.append(res)
+    new_results = move2device_exec(buffer.result, target_device)
 
     if target_device == torch.device('cpu') or target_device == "cpu":
-        return ApiData(buffer.name, tuple(new_args), new_kwargs, new_results[0], buffer.step, buffer.rank)
+        return ApiData(buffer.name, tuple(new_args), new_kwargs, new_results, buffer.step, buffer.rank)
     else:
         return ApiData(buffer.name, tuple(new_args), new_kwargs, buffer.result, buffer.step, buffer.rank)