diff --git a/debug/accuracy_tools/api_accuracy_checker/common/utils.py b/debug/accuracy_tools/api_accuracy_checker/common/utils.py
index a7922eadfcdd30930cf6d37c95071249885fc6a0..94907f01493272f95641047074c908c96a9449b8 100644
--- a/debug/accuracy_tools/api_accuracy_checker/common/utils.py
+++ b/debug/accuracy_tools/api_accuracy_checker/common/utils.py
@@ -79,14 +79,17 @@ class Const:
     API_PATTERN = r"^[A-Za-z0-9]+[_]+([A-Za-z0-9]+[_]*[A-Za-z0-9]+)[_]+[0-9]+[_]+[A-Za-z0-9]+"
     WRITE_FLAGS = os.O_WRONLY | os.O_CREAT
     WRITE_MODES = stat.S_IWUSR | stat.S_IRUSR
-
+
+    RAISE_PRECISION = {
+        "torch.float16" : "torch.float32",
+        "torch.bfloat16" : "torch.float32",
+        "torch.float32" : "torch.float64"
+    }
     CONVERT = {
-        "fp16_to_fp32": ["torch.float16", "torch.float32"],
-        "int32_to_int64": ["torch.int32", "torch.int64"]
+        "int32_to_int64": ["torch.int32", "torch.int64"],
     }
 
     CONVERT_API = {
-        "fp16_to_fp32": ["conv2d", "batch_norm", "relu", "max_pool2d", "interpolate", "group_norm", "layer_norm", "bmm", "tanh", "cross_entropy", "linear", "numel"],
         "int32_to_int64": ["cross_entropy"]
     }
 
diff --git a/debug/accuracy_tools/api_accuracy_checker/compare/algorithm.py b/debug/accuracy_tools/api_accuracy_checker/compare/algorithm.py
index 7caf8a7b5224b0194de7b50c8ecb095a1ffb3fc1..b0b1aaf605937f0cbf932d83811e6d2735d5a289 100644
--- a/debug/accuracy_tools/api_accuracy_checker/compare/algorithm.py
+++ b/debug/accuracy_tools/api_accuracy_checker/compare/algorithm.py
@@ -150,11 +150,11 @@ def flatten_compare_result(result):
 def compare_core(bench_out, npu_out, alg):
     msg = ""
     if not isinstance(bench_out, type(npu_out)):
-        return CompareConst.NAN, False, "bench and npu output type is different.", CompareConst.NAN, CompareConst.NAN
+        return [(CompareConst.NAN, "bench and npu output type is different.")], False, CompareConst.NA, CompareConst.NA
     if isinstance(bench_out, (list, tuple)):
         compare_result, test_success, bench_dtype, npu_dtype = [], True, [], []
         if len(bench_out) != len(npu_out):
-            return CompareConst.NAN, False, "bench and npu output structure is different", CompareConst.NAN, CompareConst.NAN
+            return [(CompareConst.NAN, "bench and npu output structure is different")], False, CompareConst.NA, CompareConst.NA
         for b_out_i, n_out_i in zip(bench_out, npu_out):
             compare_result_i, test_success_i, bench_dtype_i, npu_dtype_i = compare_core(b_out_i, n_out_i, alg)
             compare_result.append(compare_result_i)
@@ -164,15 +164,14 @@ def compare_core(bench_out, npu_out, alg):
     elif isinstance(bench_out, dict):
         b_keys, n_keys = set(bench_out.keys()), set(npu_out.keys())
         if b_keys != n_keys:
-            compare_result, test_success, msg = CompareConst.NAN, False, "bench and npu output dict keys are different", \
-                CompareConst.NAN, CompareConst.NAN
+            compare_result, test_success, bench_dtype, npu_dtype = [(CompareConst.NAN, "bench and npu output dict keys are different")], False, \
+                CompareConst.NA, CompareConst.NA
         compare_result, test_success, bench_dtype, npu_dtype = compare_core(list(bench_out.values()), list(npu_out.values()), alg)
     elif isinstance(bench_out, torch.Tensor):
         bench_dtype = str(bench_out.dtype)
         npu_dtype = str(npu_out.dtype)
-        if bench_out.dtype == torch.bfloat16:
-            bench_out = bench_out.to(torch.float32)
-            npu_out = npu_out.to(torch.float32)
+        if bench_out.dtype in [torch.float32, torch.float64] and bench_out.dtype != npu_out.dtype:
+            npu_out = npu_out.type(bench_out.dtype)
         compare_result, test_success, msg = compare_torch_tensor(bench_out.detach().numpy(), npu_out.detach().cpu().numpy(), alg)
     elif isinstance(bench_out, (bool, int, float, str)):
         compare_result, test_success, msg = compare_builtin_type(bench_out, npu_out)
diff --git a/debug/accuracy_tools/api_accuracy_checker/run_ut/run_ut.py b/debug/accuracy_tools/api_accuracy_checker/run_ut/run_ut.py
index 813d0cb58618eae048ea4ab2057bdc1f50185500..27efa4bd3dae3ef0e010bfa92e2212038faa401f 100644
--- a/debug/accuracy_tools/api_accuracy_checker/run_ut/run_ut.py
+++ b/debug/accuracy_tools/api_accuracy_checker/run_ut/run_ut.py
@@ -7,7 +7,7 @@ import torch
 from tqdm import tqdm
 from api_accuracy_checker.run_ut.data_generate import gen_api_params, gen_args
 from api_accuracy_checker.common.utils import print_info_log, print_warn_log, get_json_contents, api_info_preprocess, \
-    print_error_log, check_file_or_directory_path, initialize_save_path
+    print_error_log, check_file_or_directory_path, initialize_save_path, Const
 from api_accuracy_checker.compare.compare import Comparator
 from api_accuracy_checker.hook_module.wrap_tensor import TensorOPTemplate
 from api_accuracy_checker.hook_module.wrap_functional import FunctionalOPTemplate
@@ -44,7 +44,7 @@ def exec_api(api_type, api_name, args, kwargs):
     return out
 
 
-def generate_npu_params(cpu_args, cpu_kwargs, need_backward):
+def generate_npu_params(input_args, input_kwargs, need_backward):
     def recursive_arg_to_npu(arg_in):
         if isinstance(arg_in, (list, tuple)):
             return type(arg_in)(recursive_arg_to_npu(arg) for arg in arg_in)
@@ -60,10 +60,34 @@ def generate_npu_params(cpu_args, cpu_kwargs, need_backward):
         else:
             return arg_in
 
-    npu_args = recursive_arg_to_npu(cpu_args)
-    npu_kwargs = {key: recursive_arg_to_npu(value) for key, value in cpu_kwargs.items()}
+    npu_args = recursive_arg_to_npu(input_args)
+    npu_kwargs = {key: recursive_arg_to_npu(value) for key, value in input_kwargs.items()}
     return npu_args, npu_kwargs
 
+def generate_cpu_params(input_args, input_kwargs, need_backward):
+    def recursive_arg_to_cpu(arg_in):
+        if isinstance(arg_in, (list, tuple)):
+            return type(arg_in)(recursive_arg_to_cpu(arg) for arg in arg_in)
+        elif isinstance(arg_in, torch.Tensor):
+            if need_backward and arg_in.requires_grad:
+                if str(arg_in.dtype) in Const.RAISE_PRECISION.keys():
+                    arg_in = arg_in.clone().type(eval(Const.RAISE_PRECISION[str(arg_in.dtype)])).detach().requires_grad_()
+                else:
+                    arg_in = arg_in.clone().detach().requires_grad_()
+                temp_arg_in = arg_in * 1
+                arg_in = temp_arg_in.type_as(arg_in)
+                arg_in.retain_grad()
+                return arg_in
+            else:
+                if str(arg_in.dtype) in Const.RAISE_PRECISION.keys():
+                    return arg_in.clone().type(eval(Const.RAISE_PRECISION[str(arg_in.dtype)])).detach()
+                return arg_in.clone().detach()
+        else:
+            return arg_in
+
+    cpu_args = recursive_arg_to_cpu(input_args)
+    cpu_kwargs = {key: recursive_arg_to_cpu(value) for key, value in input_kwargs.items()}
+    return cpu_args, cpu_kwargs
 
 def run_ut(forward_file, backward_file, out_path, save_error_data):
     print_info_log("start UT test")
@@ -81,10 +105,7 @@ def run_ut(forward_file, backward_file, out_path, save_error_data):
             do_save_error_data(api_full_name, data_info, is_fwd_success, is_bwd_success)
         except Exception as err:
             [_, api_name, _] = api_full_name.split("*")
-            if "not implemented for 'Half'" in str(err):
-                print_warn_log(f"API {api_name} not support half tensor in CPU, please add {api_name} to CONVERT_API "
-                               f"'fp16_to_fp32' list in accuracy_tools/api_accuracy_check/common/utils.py file.")
-            elif "expected scalar type Long" in str(err):
+            if "expected scalar type Long" in str(err):
                 print_warn_log(f"API {api_name} not support int32 tensor in CPU, please add {api_name} to CONVERT_API "
                                f"'int32_to_int64' list in accuracy_tools/api_accuracy_check/common/utils.py file.")
             else:
@@ -118,11 +139,12 @@ def run_torch_api(api_full_name, api_setting_dict, backward_content, api_info_di
     need_backward = need_backward and need_grad
     if inplace or not need_grad:
         print_warn_log("%s involves in-place operations, skip backward" % api_full_name)
+    cpu_args, cpu_kwargs = generate_cpu_params(args, kwargs, need_backward)
     npu_args, npu_kwargs = generate_npu_params(args, kwargs, need_backward)
     grad_out, npu_grad_out = None, None
     if kwargs.get("device"):
         del kwargs["device"]
-    out = exec_api(api_type, api_name, args, kwargs)
+    out = exec_api(api_type, api_name, cpu_args, cpu_kwargs)
     npu_out = exec_api(api_type, api_name, npu_args, npu_kwargs)
     grad_input_index = api_setting_dict.get(api_name)
     grad_index = None
@@ -131,7 +153,7 @@ def run_torch_api(api_full_name, api_setting_dict, backward_content, api_info_di
         grad_index = grad_input_index.get('grad_index')
 
     if need_backward:
-        grad_out, npu_grad_out, grad, npu_grad = run_backward(api_full_name, args, backward_content, grad_index, npu_args,
+        grad_out, npu_grad_out, grad, npu_grad = run_backward(api_full_name, cpu_args, backward_content, grad_index, npu_args,
                                                               npu_out, out)
     if grad_index is not None:
         return UtDataInfo(grad_out, npu_grad_out, npu_out[grad_index], out[grad_index], grad, in_fwd_data_list)
@@ -153,12 +175,13 @@ def get_api_info(api_info_dict, api_name):
 def run_backward(api_full_name, args, backward_content, grad_index, npu_args, npu_out, out):
     backward_args = backward_content[api_full_name]
     grad = gen_args(backward_args)[0]
+    cpu_grad, _ = generate_cpu_params(grad, {}, False)
     if grad_index is not None:
-        out[grad_index].backward(grad)
+        out[grad_index].backward(cpu_grad)
     elif isinstance(out, (list, tuple)):
         raise NotImplementedError("Multiple backward is not supported.")
     else:
-        out.backward(grad)
+        out.backward(cpu_grad)
     args_grad = []
     for arg in args:
         if isinstance(arg, torch.Tensor):
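The change replaces the old per-API fp16_to_fp32 whitelist with a general precision-raising step: before the CPU reference pass, generate_cpu_params upcasts inputs according to Const.RAISE_PRECISION (fp16/bf16 to fp32, fp32 to fp64), which is also why the "not implemented for 'Half'" warning branch becomes unnecessary. Below is a minimal standalone sketch of that idea, not the project code verbatim: the function and variable names are illustrative, and the mapping uses torch dtype objects directly instead of eval() on dtype strings.

```python
import torch

# Mirrors the intent of Const.RAISE_PRECISION: raise each floating dtype one level.
RAISE_PRECISION = {
    torch.float16: torch.float32,
    torch.bfloat16: torch.float32,
    torch.float32: torch.float64,
}

def raise_precision(arg, need_backward=False):
    """Recursively clone tensors at the next-higher precision to build a CPU baseline."""
    if isinstance(arg, (list, tuple)):
        return type(arg)(raise_precision(a, need_backward) for a in arg)
    if isinstance(arg, torch.Tensor):
        target = RAISE_PRECISION.get(arg.dtype, arg.dtype)
        out = arg.clone().detach().to(target)
        if need_backward and arg.requires_grad:
            out.requires_grad_()  # keep the leaf trainable so backward can be compared too
        return out
    return arg

# A half-precision input becomes a float32 CPU baseline.
x_half = torch.randn(4, 4, dtype=torch.float16)
x_cpu = raise_precision(x_half)
print(x_cpu.dtype)  # torch.float32
```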
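On the comparison side, compare_core no longer special-cases bfloat16; since the CPU baseline may now be fp32 or fp64, it casts the NPU result to the bench dtype before converting both to numpy. A hedged sketch of that alignment step, with illustrative names only:

```python
import torch

def align_for_compare(bench_out, npu_out):
    # Cast the device result up to the bench dtype when the bench side is fp32/fp64,
    # so the element-wise comparison runs at the baseline's precision.
    if bench_out.dtype in (torch.float32, torch.float64) and bench_out.dtype != npu_out.dtype:
        npu_out = npu_out.type(bench_out.dtype)
    return bench_out.detach().numpy(), npu_out.detach().cpu().numpy()

bench = torch.randn(3, dtype=torch.float64)   # raised-precision CPU baseline
device_out = bench.to(torch.float16)          # stand-in for an NPU result
b_np, n_np = align_for_compare(bench, device_out)
print(b_np.dtype, n_np.dtype)                 # float64 float64
```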