From 400103573bc15b0683c983451a9fcf277d783145 Mon Sep 17 00:00:00 2001
From: Yuanhao Ji <jiyuanhao@apache.org>
Date: Mon, 29 Apr 2024 11:23:07 +0800
Subject: [PATCH 1/2] fix: replace KERNEL_PRIVATEUSEONE2 with
 KERNEL_PRIVATEUSEONE

---
 torch_npu/csrc/aten/AutoCastOps.cpp | 38 ++++++++++++++---------------
 1 file changed, 19 insertions(+), 19 deletions(-)

diff --git a/torch_npu/csrc/aten/AutoCastOps.cpp b/torch_npu/csrc/aten/AutoCastOps.cpp
index 92bd295d8c0..9bad40a54b2 100644
--- a/torch_npu/csrc/aten/AutoCastOps.cpp
+++ b/torch_npu/csrc/aten/AutoCastOps.cpp
@@ -22,15 +22,15 @@ TORCH_LIBRARY_IMPL(_, AutocastPrivateUse1, m) {
 
 TORCH_LIBRARY_IMPL(aten, AutocastPrivateUse1, m) {
   // lower_precision_fp
-  KERNEL_PRIVATEUSEONE2(_convolution, deprecated, lower_precision_fp)
+  KERNEL_PRIVATEUSEONE(_convolution, deprecated, lower_precision_fp)
   KERNEL_PRIVATEUSEONE(_convolution, lower_precision_fp)
   KERNEL_PRIVATEUSEONE(conv1d, lower_precision_fp)
   KERNEL_PRIVATEUSEONE(conv2d, lower_precision_fp)
   KERNEL_PRIVATEUSEONE(conv3d, lower_precision_fp)
   KERNEL_PRIVATEUSEONE(conv_tbc, lower_precision_fp)
   KERNEL_PRIVATEUSEONE(conv_transpose1d, lower_precision_fp)
-  KERNEL_PRIVATEUSEONE2(conv_transpose2d, input, lower_precision_fp)
-  KERNEL_PRIVATEUSEONE2(conv_transpose3d, input, lower_precision_fp)
+  KERNEL_PRIVATEUSEONE(conv_transpose2d, input, lower_precision_fp)
+  KERNEL_PRIVATEUSEONE(conv_transpose3d, input, lower_precision_fp)
   KERNEL_PRIVATEUSEONE(convolution, lower_precision_fp)
   KERNEL_PRIVATEUSEONE(cudnn_convolution, lower_precision_fp)
   KERNEL_PRIVATEUSEONE(cudnn_convolution_transpose, lower_precision_fp)
@@ -72,16 +72,16 @@ TORCH_LIBRARY_IMPL(aten, AutocastPrivateUse1, m) {
   KERNEL_PRIVATEUSEONE(rsqrt, fp32)
   KERNEL_PRIVATEUSEONE(sinh, fp32)
   KERNEL_PRIVATEUSEONE(tan, fp32)
-  KERNEL_PRIVATEUSEONE2(pow, Tensor_Scalar, fp32)
-  KERNEL_PRIVATEUSEONE2(pow, Tensor_Tensor, fp32)
-  KERNEL_PRIVATEUSEONE2(pow, Scalar, fp32)
+  KERNEL_PRIVATEUSEONE(pow, Tensor_Scalar, fp32)
+  KERNEL_PRIVATEUSEONE(pow, Tensor_Tensor, fp32)
+  KERNEL_PRIVATEUSEONE(pow, Scalar, fp32)
   KERNEL_PRIVATEUSEONE(softplus, fp32)
   KERNEL_PRIVATEUSEONE(layer_norm, fp32)
   KERNEL_PRIVATEUSEONE(native_layer_norm, fp32)
   KERNEL_PRIVATEUSEONE(group_norm, fp32)
-  KERNEL_PRIVATEUSEONE2(frobenius_norm, dim, fp32)
+  KERNEL_PRIVATEUSEONE(frobenius_norm, dim, fp32)
   KERNEL_PRIVATEUSEONE(nuclear_norm, fp32)
-  KERNEL_PRIVATEUSEONE2(nuclear_norm, dim, fp32)
+  KERNEL_PRIVATEUSEONE(nuclear_norm, dim, fp32)
   KERNEL_PRIVATEUSEONE(cosine_similarity, fp32)
   KERNEL_PRIVATEUSEONE(poisson_nll_loss, fp32)
   KERNEL_PRIVATEUSEONE(cosine_embedding_loss, fp32)
@@ -106,24 +106,24 @@ TORCH_LIBRARY_IMPL(aten, AutocastPrivateUse1, m) {
   KERNEL_PRIVATEUSEONE(logsumexp, fp32)
   // fp32_set_opt_dtype
   KERNEL_PRIVATEUSEONE(prod, fp32_set_opt_dtype)
-  KERNEL_PRIVATEUSEONE2(prod, dim_int, fp32_set_opt_dtype)
-  KERNEL_PRIVATEUSEONE2(prod, dim_Dimname, fp32_set_opt_dtype)
-  KERNEL_PRIVATEUSEONE2(softmax, int, fp32_set_opt_dtype)
-  KERNEL_PRIVATEUSEONE2(softmax, Dimname, fp32_set_opt_dtype)
-  KERNEL_PRIVATEUSEONE2(log_softmax, int, fp32_set_opt_dtype)
-  KERNEL_PRIVATEUSEONE2(log_softmax, Dimname, fp32_set_opt_dtype)
+  KERNEL_PRIVATEUSEONE(prod, dim_int, fp32_set_opt_dtype)
+  KERNEL_PRIVATEUSEONE(prod, dim_Dimname, fp32_set_opt_dtype)
+  KERNEL_PRIVATEUSEONE(softmax, int, fp32_set_opt_dtype)
+  KERNEL_PRIVATEUSEONE(softmax, Dimname, fp32_set_opt_dtype)
+  KERNEL_PRIVATEUSEONE(log_softmax, int, fp32_set_opt_dtype)
+  KERNEL_PRIVATEUSEONE(log_softmax, Dimname, fp32_set_opt_dtype)
   KERNEL_PRIVATEUSEONE(cumprod, fp32_set_opt_dtype)
-  KERNEL_PRIVATEUSEONE2(cumprod, dimname, fp32_set_opt_dtype)
+  KERNEL_PRIVATEUSEONE(cumprod, dimname, fp32_set_opt_dtype)
   KERNEL_PRIVATEUSEONE(cumsum, fp32_set_opt_dtype)
-  KERNEL_PRIVATEUSEONE2(cumsum, dimname, fp32_set_opt_dtype)
+  KERNEL_PRIVATEUSEONE(cumsum, dimname, fp32_set_opt_dtype)
   KERNEL_PRIVATEUSEONE(linalg_vector_norm, fp32_set_opt_dtype)
   KERNEL_PRIVATEUSEONE(linalg_matrix_norm, fp32_set_opt_dtype)
-  KERNEL_PRIVATEUSEONE2(linalg_matrix_norm, str_ord, fp32_set_opt_dtype)
+  KERNEL_PRIVATEUSEONE(linalg_matrix_norm, str_ord, fp32_set_opt_dtype)
   // commenting these out because they accept an explicit (not-optional) dtype, and we shouldn't try to flip that even
   // when autocasting.
   KERNEL_PRIVATEUSEONE(sum, fp32_set_opt_dtype)
-  KERNEL_PRIVATEUSEONE2(sum, dim_IntList, fp32_set_opt_dtype)
-  KERNEL_PRIVATEUSEONE2(sum, dim_DimnameList, fp32_set_opt_dtype)
+  KERNEL_PRIVATEUSEONE(sum, dim_IntList, fp32_set_opt_dtype)
+  KERNEL_PRIVATEUSEONE(sum, dim_DimnameList, fp32_set_opt_dtype)
   // fp32_append_dtype
   // The fp32_append_dtype wrapper overrides implicit promotion behavior.
   // norm does not implicitly promote, but be aware when adding new ops to this policy.
-- 
Gitee


From 327204b42fd069e746bc5277324a933e65843b13 Mon Sep 17 00:00:00 2001
From: Yuanhao Ji <jiyuanhao@apache.org>
Date: Mon, 29 Apr 2024 14:27:21 +0800
Subject: [PATCH 2/2] style: indent four spaces at a time

---
 torch_npu/csrc/aten/AutoCastOps.cpp | 260 ++++++++++++++--------------
 1 file changed, 130 insertions(+), 130 deletions(-)

diff --git a/torch_npu/csrc/aten/AutoCastOps.cpp b/torch_npu/csrc/aten/AutoCastOps.cpp
index 9bad40a54b2..ad641e4cb57 100644
--- a/torch_npu/csrc/aten/AutoCastOps.cpp
+++ b/torch_npu/csrc/aten/AutoCastOps.cpp
@@ -21,138 +21,138 @@ TORCH_LIBRARY_IMPL(_, AutocastPrivateUse1, m) {
 }
 
 TORCH_LIBRARY_IMPL(aten, AutocastPrivateUse1, m) {
-  // lower_precision_fp
-  KERNEL_PRIVATEUSEONE(_convolution, deprecated, lower_precision_fp)
-  KERNEL_PRIVATEUSEONE(_convolution, lower_precision_fp)
-  KERNEL_PRIVATEUSEONE(conv1d, lower_precision_fp)
-  KERNEL_PRIVATEUSEONE(conv2d, lower_precision_fp)
-  KERNEL_PRIVATEUSEONE(conv3d, lower_precision_fp)
-  KERNEL_PRIVATEUSEONE(conv_tbc, lower_precision_fp)
-  KERNEL_PRIVATEUSEONE(conv_transpose1d, lower_precision_fp)
-  KERNEL_PRIVATEUSEONE(conv_transpose2d, input, lower_precision_fp)
-  KERNEL_PRIVATEUSEONE(conv_transpose3d, input, lower_precision_fp)
-  KERNEL_PRIVATEUSEONE(convolution, lower_precision_fp)
-  KERNEL_PRIVATEUSEONE(cudnn_convolution, lower_precision_fp)
-  KERNEL_PRIVATEUSEONE(cudnn_convolution_transpose, lower_precision_fp)
-  KERNEL_PRIVATEUSEONE(prelu, lower_precision_fp)
-  KERNEL_PRIVATEUSEONE(addmm, lower_precision_fp)
-  KERNEL_PRIVATEUSEONE(addmv, lower_precision_fp)
-  KERNEL_PRIVATEUSEONE(addr, lower_precision_fp)
-  KERNEL_PRIVATEUSEONE(matmul, lower_precision_fp)
-  KERNEL_PRIVATEUSEONE(einsum, lower_precision_fp)
-  KERNEL_PRIVATEUSEONE(mm, lower_precision_fp)
-  KERNEL_PRIVATEUSEONE(mv, lower_precision_fp)
-  KERNEL_PRIVATEUSEONE(linear, lower_precision_fp)
-  KERNEL_PRIVATEUSEONE(addbmm, lower_precision_fp)
-  KERNEL_PRIVATEUSEONE(baddbmm, lower_precision_fp)
-  KERNEL_PRIVATEUSEONE(bmm, lower_precision_fp)
-  KERNEL_PRIVATEUSEONE(chain_matmul, lower_precision_fp)
-  KERNEL_PRIVATEUSEONE(linalg_multi_dot, lower_precision_fp)
-  KERNEL_PRIVATEUSEONE(_thnn_fused_lstm_cell, lower_precision_fp)
-  KERNEL_PRIVATEUSEONE(_thnn_fused_gru_cell, lower_precision_fp)
-  KERNEL_PRIVATEUSEONE(lstm_cell, lower_precision_fp)
-  KERNEL_PRIVATEUSEONE(gru_cell, lower_precision_fp)
-  KERNEL_PRIVATEUSEONE(rnn_tanh_cell, lower_precision_fp)
-  KERNEL_PRIVATEUSEONE(rnn_relu_cell, lower_precision_fp)
-  KERNEL_PRIVATEUSEONE(_scaled_dot_product_flash_attention, lower_precision_fp)
-  KERNEL_PRIVATEUSEONE(scaled_dot_product_attention, lower_precision_fp)
+    // lower_precision_fp
+    KERNEL_PRIVATEUSEONE(_convolution, deprecated, lower_precision_fp)
+    KERNEL_PRIVATEUSEONE(_convolution, lower_precision_fp)
+    KERNEL_PRIVATEUSEONE(conv1d, lower_precision_fp)
+    KERNEL_PRIVATEUSEONE(conv2d, lower_precision_fp)
+    KERNEL_PRIVATEUSEONE(conv3d, lower_precision_fp)
+    KERNEL_PRIVATEUSEONE(conv_tbc, lower_precision_fp)
+    KERNEL_PRIVATEUSEONE(conv_transpose1d, lower_precision_fp)
+    KERNEL_PRIVATEUSEONE(conv_transpose2d, input, lower_precision_fp)
+    KERNEL_PRIVATEUSEONE(conv_transpose3d, input, lower_precision_fp)
+    KERNEL_PRIVATEUSEONE(convolution, lower_precision_fp)
+    KERNEL_PRIVATEUSEONE(cudnn_convolution, lower_precision_fp)
+    KERNEL_PRIVATEUSEONE(cudnn_convolution_transpose, lower_precision_fp)
+    KERNEL_PRIVATEUSEONE(prelu, lower_precision_fp)
+    KERNEL_PRIVATEUSEONE(addmm, lower_precision_fp)
+    KERNEL_PRIVATEUSEONE(addmv, lower_precision_fp)
+    KERNEL_PRIVATEUSEONE(addr, lower_precision_fp)
+    KERNEL_PRIVATEUSEONE(matmul, lower_precision_fp)
+    KERNEL_PRIVATEUSEONE(einsum, lower_precision_fp)
+    KERNEL_PRIVATEUSEONE(mm, lower_precision_fp)
+    KERNEL_PRIVATEUSEONE(mv, lower_precision_fp)
+    KERNEL_PRIVATEUSEONE(linear, lower_precision_fp)
+    KERNEL_PRIVATEUSEONE(addbmm, lower_precision_fp)
+    KERNEL_PRIVATEUSEONE(baddbmm, lower_precision_fp)
+    KERNEL_PRIVATEUSEONE(bmm, lower_precision_fp)
+    KERNEL_PRIVATEUSEONE(chain_matmul, lower_precision_fp)
+    KERNEL_PRIVATEUSEONE(linalg_multi_dot, lower_precision_fp)
+    KERNEL_PRIVATEUSEONE(_thnn_fused_lstm_cell, lower_precision_fp)
+    KERNEL_PRIVATEUSEONE(_thnn_fused_gru_cell, lower_precision_fp)
+    KERNEL_PRIVATEUSEONE(lstm_cell, lower_precision_fp)
+    KERNEL_PRIVATEUSEONE(gru_cell, lower_precision_fp)
+    KERNEL_PRIVATEUSEONE(rnn_tanh_cell, lower_precision_fp)
+    KERNEL_PRIVATEUSEONE(rnn_relu_cell, lower_precision_fp)
+    KERNEL_PRIVATEUSEONE(_scaled_dot_product_flash_attention, lower_precision_fp)
+    KERNEL_PRIVATEUSEONE(scaled_dot_product_attention, lower_precision_fp)
 
-  // fp32
-  KERNEL_PRIVATEUSEONE(acos, fp32)
-  KERNEL_PRIVATEUSEONE(asin, fp32)
-  KERNEL_PRIVATEUSEONE(cosh, fp32)
-  KERNEL_PRIVATEUSEONE(erfinv, fp32)
-  KERNEL_PRIVATEUSEONE(exp, fp32)
-  KERNEL_PRIVATEUSEONE(expm1, fp32)
-  KERNEL_PRIVATEUSEONE(log, fp32)
-  KERNEL_PRIVATEUSEONE(log10, fp32)
-  KERNEL_PRIVATEUSEONE(log2, fp32)
-  KERNEL_PRIVATEUSEONE(log1p, fp32)
-  KERNEL_PRIVATEUSEONE(reciprocal, fp32)
-  KERNEL_PRIVATEUSEONE(rsqrt, fp32)
-  KERNEL_PRIVATEUSEONE(sinh, fp32)
-  KERNEL_PRIVATEUSEONE(tan, fp32)
-  KERNEL_PRIVATEUSEONE(pow, Tensor_Scalar, fp32)
-  KERNEL_PRIVATEUSEONE(pow, Tensor_Tensor, fp32)
-  KERNEL_PRIVATEUSEONE(pow, Scalar, fp32)
-  KERNEL_PRIVATEUSEONE(softplus, fp32)
-  KERNEL_PRIVATEUSEONE(layer_norm, fp32)
-  KERNEL_PRIVATEUSEONE(native_layer_norm, fp32)
-  KERNEL_PRIVATEUSEONE(group_norm, fp32)
-  KERNEL_PRIVATEUSEONE(frobenius_norm, dim, fp32)
-  KERNEL_PRIVATEUSEONE(nuclear_norm, fp32)
-  KERNEL_PRIVATEUSEONE(nuclear_norm, dim, fp32)
-  KERNEL_PRIVATEUSEONE(cosine_similarity, fp32)
-  KERNEL_PRIVATEUSEONE(poisson_nll_loss, fp32)
-  KERNEL_PRIVATEUSEONE(cosine_embedding_loss, fp32)
-  KERNEL_PRIVATEUSEONE(nll_loss, fp32)
-  KERNEL_PRIVATEUSEONE(nll_loss2d, fp32)
-  KERNEL_PRIVATEUSEONE(hinge_embedding_loss, fp32)
-  KERNEL_PRIVATEUSEONE(kl_div, fp32)
-  KERNEL_PRIVATEUSEONE(l1_loss, fp32)
-  KERNEL_PRIVATEUSEONE(smooth_l1_loss, fp32)
-  KERNEL_PRIVATEUSEONE(huber_loss, fp32)
-  KERNEL_PRIVATEUSEONE(mse_loss, fp32)
-  KERNEL_PRIVATEUSEONE(margin_ranking_loss, fp32)
-  KERNEL_PRIVATEUSEONE(multilabel_margin_loss, fp32)
-  KERNEL_PRIVATEUSEONE(soft_margin_loss, fp32)
-  KERNEL_PRIVATEUSEONE(triplet_margin_loss, fp32)
-  KERNEL_PRIVATEUSEONE(multi_margin_loss, fp32)
-  KERNEL_PRIVATEUSEONE(binary_cross_entropy_with_logits, fp32)
-  KERNEL_PRIVATEUSEONE(dist, fp32)
-  KERNEL_PRIVATEUSEONE(pdist, fp32)
-  KERNEL_PRIVATEUSEONE(cdist, fp32)
-  KERNEL_PRIVATEUSEONE(renorm, fp32)
-  KERNEL_PRIVATEUSEONE(logsumexp, fp32)
-  // fp32_set_opt_dtype
-  KERNEL_PRIVATEUSEONE(prod, fp32_set_opt_dtype)
-  KERNEL_PRIVATEUSEONE(prod, dim_int, fp32_set_opt_dtype)
-  KERNEL_PRIVATEUSEONE(prod, dim_Dimname, fp32_set_opt_dtype)
-  KERNEL_PRIVATEUSEONE(softmax, int, fp32_set_opt_dtype)
-  KERNEL_PRIVATEUSEONE(softmax, Dimname, fp32_set_opt_dtype)
-  KERNEL_PRIVATEUSEONE(log_softmax, int, fp32_set_opt_dtype)
-  KERNEL_PRIVATEUSEONE(log_softmax, Dimname, fp32_set_opt_dtype)
-  KERNEL_PRIVATEUSEONE(cumprod, fp32_set_opt_dtype)
-  KERNEL_PRIVATEUSEONE(cumprod, dimname, fp32_set_opt_dtype)
-  KERNEL_PRIVATEUSEONE(cumsum, fp32_set_opt_dtype)
-  KERNEL_PRIVATEUSEONE(cumsum, dimname, fp32_set_opt_dtype)
-  KERNEL_PRIVATEUSEONE(linalg_vector_norm, fp32_set_opt_dtype)
-  KERNEL_PRIVATEUSEONE(linalg_matrix_norm, fp32_set_opt_dtype)
-  KERNEL_PRIVATEUSEONE(linalg_matrix_norm, str_ord, fp32_set_opt_dtype)
-  // commenting these out because they accept an explicit (not-optional) dtype, and we shouldn't try to flip that even
-  // when autocasting.
-  KERNEL_PRIVATEUSEONE(sum, fp32_set_opt_dtype)
-  KERNEL_PRIVATEUSEONE(sum, dim_IntList, fp32_set_opt_dtype)
-  KERNEL_PRIVATEUSEONE(sum, dim_DimnameList, fp32_set_opt_dtype)
-  // fp32_append_dtype
-  // The fp32_append_dtype wrapper overrides implicit promotion behavior.
-  // norm does not implicitly promote, but be aware when adding new ops to this policy.
-  KERNEL_DIFFERENT_REDISPATCH_SIGNATURE_PRIVATEUSEONE(ADD_NS(norm),
-      "norm.Scalar", at::Tensor (const at::Tensor &, const c10::Scalar&),
-      at::Tensor (const at::Tensor &, const c10::optional<c10::Scalar>&, at::ScalarType),
-      fp32_append_dtype)
-  KERNEL_DIFFERENT_REDISPATCH_SIGNATURE_PRIVATEUSEONE(ADD_NS(norm), "norm.ScalarOpt_dim",
-      at::Tensor (const at::Tensor &, const c10::optional<c10::Scalar>&, at::IntArrayRef, bool),
-      at::Tensor (const at::Tensor &, const c10::optional<c10::Scalar>&, at::IntArrayRef, bool, at::ScalarType),
-      fp32_append_dtype)
-  KERNEL_DIFFERENT_REDISPATCH_SIGNATURE_PRIVATEUSEONE(ADD_NS(norm), "norm.names_ScalarOpt_dim",
-  at::Tensor (const at::Tensor &, const c10::optional<c10::Scalar>&, at::DimnameList, bool),
-      at::Tensor (const at::Tensor &, const c10::optional<c10::Scalar>&, at::DimnameList, bool, at::ScalarType),
-      fp32_append_dtype)
-  // promote
-  KERNEL_PRIVATEUSEONE(addcdiv, promote)
-  KERNEL_PRIVATEUSEONE(addcmul, promote)
-  KERNEL_PRIVATEUSEONE(atan2, promote)
-  KERNEL_PRIVATEUSEONE(bilinear, promote)
-  KERNEL_PRIVATEUSEONE(cross, promote)
-  KERNEL_PRIVATEUSEONE(dot, promote)
-  KERNEL_PRIVATEUSEONE(grid_sampler, promote)
-  KERNEL_PRIVATEUSEONE(index_put, promote)
-  KERNEL_PRIVATEUSEONE(tensordot, promote)
-  KERNEL_PRIVATEUSEONE(scatter_add, promote)
+    // fp32
+    KERNEL_PRIVATEUSEONE(acos, fp32)
+    KERNEL_PRIVATEUSEONE(asin, fp32)
+    KERNEL_PRIVATEUSEONE(cosh, fp32)
+    KERNEL_PRIVATEUSEONE(erfinv, fp32)
+    KERNEL_PRIVATEUSEONE(exp, fp32)
+    KERNEL_PRIVATEUSEONE(expm1, fp32)
+    KERNEL_PRIVATEUSEONE(log, fp32)
+    KERNEL_PRIVATEUSEONE(log10, fp32)
+    KERNEL_PRIVATEUSEONE(log2, fp32)
+    KERNEL_PRIVATEUSEONE(log1p, fp32)
+    KERNEL_PRIVATEUSEONE(reciprocal, fp32)
+    KERNEL_PRIVATEUSEONE(rsqrt, fp32)
+    KERNEL_PRIVATEUSEONE(sinh, fp32)
+    KERNEL_PRIVATEUSEONE(tan, fp32)
+    KERNEL_PRIVATEUSEONE(pow, Tensor_Scalar, fp32)
+    KERNEL_PRIVATEUSEONE(pow, Tensor_Tensor, fp32)
+    KERNEL_PRIVATEUSEONE(pow, Scalar, fp32)
+    KERNEL_PRIVATEUSEONE(softplus, fp32)
+    KERNEL_PRIVATEUSEONE(layer_norm, fp32)
+    KERNEL_PRIVATEUSEONE(native_layer_norm, fp32)
+    KERNEL_PRIVATEUSEONE(group_norm, fp32)
+    KERNEL_PRIVATEUSEONE(frobenius_norm, dim, fp32)
+    KERNEL_PRIVATEUSEONE(nuclear_norm, fp32)
+    KERNEL_PRIVATEUSEONE(nuclear_norm, dim, fp32)
+    KERNEL_PRIVATEUSEONE(cosine_similarity, fp32)
+    KERNEL_PRIVATEUSEONE(poisson_nll_loss, fp32)
+    KERNEL_PRIVATEUSEONE(cosine_embedding_loss, fp32)
+    KERNEL_PRIVATEUSEONE(nll_loss, fp32)
+    KERNEL_PRIVATEUSEONE(nll_loss2d, fp32)
+    KERNEL_PRIVATEUSEONE(hinge_embedding_loss, fp32)
+    KERNEL_PRIVATEUSEONE(kl_div, fp32)
+    KERNEL_PRIVATEUSEONE(l1_loss, fp32)
+    KERNEL_PRIVATEUSEONE(smooth_l1_loss, fp32)
+    KERNEL_PRIVATEUSEONE(huber_loss, fp32)
+    KERNEL_PRIVATEUSEONE(mse_loss, fp32)
+    KERNEL_PRIVATEUSEONE(margin_ranking_loss, fp32)
+    KERNEL_PRIVATEUSEONE(multilabel_margin_loss, fp32)
+    KERNEL_PRIVATEUSEONE(soft_margin_loss, fp32)
+    KERNEL_PRIVATEUSEONE(triplet_margin_loss, fp32)
+    KERNEL_PRIVATEUSEONE(multi_margin_loss, fp32)
+    KERNEL_PRIVATEUSEONE(binary_cross_entropy_with_logits, fp32)
+    KERNEL_PRIVATEUSEONE(dist, fp32)
+    KERNEL_PRIVATEUSEONE(pdist, fp32)
+    KERNEL_PRIVATEUSEONE(cdist, fp32)
+    KERNEL_PRIVATEUSEONE(renorm, fp32)
+    KERNEL_PRIVATEUSEONE(logsumexp, fp32)
+    // fp32_set_opt_dtype
+    KERNEL_PRIVATEUSEONE(prod, fp32_set_opt_dtype)
+    KERNEL_PRIVATEUSEONE(prod, dim_int, fp32_set_opt_dtype)
+    KERNEL_PRIVATEUSEONE(prod, dim_Dimname, fp32_set_opt_dtype)
+    KERNEL_PRIVATEUSEONE(softmax, int, fp32_set_opt_dtype)
+    KERNEL_PRIVATEUSEONE(softmax, Dimname, fp32_set_opt_dtype)
+    KERNEL_PRIVATEUSEONE(log_softmax, int, fp32_set_opt_dtype)
+    KERNEL_PRIVATEUSEONE(log_softmax, Dimname, fp32_set_opt_dtype)
+    KERNEL_PRIVATEUSEONE(cumprod, fp32_set_opt_dtype)
+    KERNEL_PRIVATEUSEONE(cumprod, dimname, fp32_set_opt_dtype)
+    KERNEL_PRIVATEUSEONE(cumsum, fp32_set_opt_dtype)
+    KERNEL_PRIVATEUSEONE(cumsum, dimname, fp32_set_opt_dtype)
+    KERNEL_PRIVATEUSEONE(linalg_vector_norm, fp32_set_opt_dtype)
+    KERNEL_PRIVATEUSEONE(linalg_matrix_norm, fp32_set_opt_dtype)
+    KERNEL_PRIVATEUSEONE(linalg_matrix_norm, str_ord, fp32_set_opt_dtype)
+    // commenting these out because they accept an explicit (not-optional) dtype, and we shouldn't try to flip that even
+    // when autocasting.
+    KERNEL_PRIVATEUSEONE(sum, fp32_set_opt_dtype)
+    KERNEL_PRIVATEUSEONE(sum, dim_IntList, fp32_set_opt_dtype)
+    KERNEL_PRIVATEUSEONE(sum, dim_DimnameList, fp32_set_opt_dtype)
+    // fp32_append_dtype
+    // The fp32_append_dtype wrapper overrides implicit promotion behavior.
+    // norm does not implicitly promote, but be aware when adding new ops to this policy.
+    KERNEL_DIFFERENT_REDISPATCH_SIGNATURE_PRIVATEUSEONE(ADD_NS(norm),
+        "norm.Scalar", at::Tensor (const at::Tensor &, const c10::Scalar&),
+        at::Tensor (const at::Tensor &, const c10::optional<c10::Scalar>&, at::ScalarType),
+        fp32_append_dtype)
+    KERNEL_DIFFERENT_REDISPATCH_SIGNATURE_PRIVATEUSEONE(ADD_NS(norm), "norm.ScalarOpt_dim",
+        at::Tensor (const at::Tensor &, const c10::optional<c10::Scalar>&, at::IntArrayRef, bool),
+        at::Tensor (const at::Tensor &, const c10::optional<c10::Scalar>&, at::IntArrayRef, bool, at::ScalarType),
+        fp32_append_dtype)
+    KERNEL_DIFFERENT_REDISPATCH_SIGNATURE_PRIVATEUSEONE(ADD_NS(norm), "norm.names_ScalarOpt_dim",
+    at::Tensor (const at::Tensor &, const c10::optional<c10::Scalar>&, at::DimnameList, bool),
+        at::Tensor (const at::Tensor &, const c10::optional<c10::Scalar>&, at::DimnameList, bool, at::ScalarType),
+        fp32_append_dtype)
+    // promote
+    KERNEL_PRIVATEUSEONE(addcdiv, promote)
+    KERNEL_PRIVATEUSEONE(addcmul, promote)
+    KERNEL_PRIVATEUSEONE(atan2, promote)
+    KERNEL_PRIVATEUSEONE(bilinear, promote)
+    KERNEL_PRIVATEUSEONE(cross, promote)
+    KERNEL_PRIVATEUSEONE(dot, promote)
+    KERNEL_PRIVATEUSEONE(grid_sampler, promote)
+    KERNEL_PRIVATEUSEONE(index_put, promote)
+    KERNEL_PRIVATEUSEONE(tensordot, promote)
+    KERNEL_PRIVATEUSEONE(scatter_add, promote)
 
-  m.impl(TORCH_SELECTIVE_NAME("aten::binary_cross_entropy"),
-         TORCH_FN((&binary_cross_entropy_banned)));
+    m.impl(TORCH_SELECTIVE_NAME("aten::binary_cross_entropy"),
+           TORCH_FN((&binary_cross_entropy_banned)));
 }
 
 }
-- 
Gitee