diff --git a/tf_adapter/interface_spec/api_hccl_ops.pyh b/tf_adapter/interface_spec/api_hccl_ops.pyh
index e5f8c54a2aebb50b7c1dcc4b27affbbe6136807d..853de9060c36121f4ed4d1077857f19dee18677b 100644
--- a/tf_adapter/interface_spec/api_hccl_ops.pyh
+++ b/tf_adapter/interface_spec/api_hccl_ops.pyh
@@ -1,7 +1,7 @@
 # source file:./python/npu_bridge/hccl/hccl_ops.py
 def allreduce(tensor, reduction, fusion=1, fusion_id=-1, group="hccl_world_group"):
 def allgather(tensor, rank_size, group="hccl_world_group"):
-def broadcast(tensor, root_rank, fusion=0, fusion_id=-1, group="hccl_world_group"):
+def broadcast(tensor, root_rank, fusion=2, fusion_id=0, group="hccl_world_group"):
 def reduce(tensor, reduction, root_rank, fusion=0, fusion_id=-1, group="hccl_world_group"):
 def reduce_scatter(tensor, reduction, rank_size, group="hccl_world_group"):
 def send(tensor, sr_tag, dest_rank, group="hccl_world_group"):
diff --git a/tf_adapter/optimizers/gradient_fusion_optimizer.cc b/tf_adapter/optimizers/gradient_fusion_optimizer.cc
index 0235497e071eee143e04b7b6c94e2695c1af7743..4de208b4f0745e5fc0cbb9788ff232228fdea6d1 100644
--- a/tf_adapter/optimizers/gradient_fusion_optimizer.cc
+++ b/tf_adapter/optimizers/gradient_fusion_optimizer.cc
@@ -242,65 +242,10 @@ int64 GradFusionOptimizer::GetFusionTensorSize() {
 }
 
 Status GradFusionOptimizer::Optimize(Cluster *cluster, const GrapplerItem &item, GraphDef *optimizedGraph) {
+  LOG(INFO) << "INFO: GradFusionOptimizer::Optimize begin.";
   REQUIRES_NOT_NULL(optimizedGraph);
-  const int64 fusionTensorSize = GetFusionTensorSize();
-  GraphDef graphOrigin;
-  std::map<std::pair<string, DataType>, std::vector<NodeDef>> fusionHcomOps;
-  std::map<std::pair<string, DataType>, int64_t> currentGradSumSize;
   *optimizedGraph = item.graph;
-  LOG(INFO) << "INFO: GradFusionOptimizer::Optimize begin, OriginNodeNum: " << item.graph.node_size();
-  LOG(INFO) << "INFO: FUSION_TENSOR_SIZE: " << fusionTensorSize;
-
-  if (fusionTensorSize < 0) { return errors::InvalidArgument("FUSION_TENSOR_SIZE is invalid"); }
-
-  REQUIRES_STATUS_OK(TopologicalSort(optimizedGraph));
-  nodeMap_.reset(new (std::nothrow) NodeMap(optimizedGraph));
-  REQUIRES_NOT_NULL(nodeMap_);
-  fusionOpInfo_.clear();
-  fusionOpPool_.clear();
-  graphOrigin = *optimizedGraph;
-  for (const auto &nodeDef : graphOrigin.node()) { nameToNode_[nodeDef.name()] = nodeDef; }
-
-  for (const auto &nodeDef : graphOrigin.node()) {
-    if (IsHcomOp(nodeDef)) {
-      DataType dType;
-      auto attrMap = nodeDef.attr();
-      auto iter = attrMap.find(DATA_TYPE_ATTR);
-      if (iter != attrMap.end()) {
-        dType = iter->second.list().type(0);
-      } else {
-        LOG(INFO) << "INFO: Use default dataType: DT_FLOAT";
-        dType = DT_FLOAT;
-      }
-      std::pair<string, DataType> key = std::make_pair(nodeDef.op(), dType);
-
-      fusionHcomOps[key].push_back(nodeDef);
-      int64_t inputTensorSize = 0;
-      NodeDef tmpNode = nodeDef;
-      TF_RETURN_IF_ERROR(GetInputTensorSize(tmpNode, inputTensorSize));
-      if (currentGradSumSize.count(key) != 0) {
-        if (INT64_MAX - inputTensorSize < currentGradSumSize[key]) {
-          return errors::InvalidArgument("input tensor size is overflow");
-        }
-        currentGradSumSize[key] += inputTensorSize;
-      } else {
-        currentGradSumSize[key] = inputTensorSize;
-      }
-      if (currentGradSumSize[key] >= fusionTensorSize) {
-        if (fusionHcomOps[key].size() > 1) { TF_RETURN_IF_ERROR(FusionOp(fusionHcomOps[key], optimizedGraph)); }
-        fusionHcomOps[key].clear();
-        currentGradSumSize[key] = 0;
-      }
-    }
-  }
-
-  for (auto iter : fusionHcomOps) {
-    if (!iter.second.empty()) {
-      if (iter.second.size() > 1) { TF_RETURN_IF_ERROR(FusionOp(iter.second, optimizedGraph)); }
-      iter.second.clear();
-    }
-  }
-  LOG(INFO) << "INFO: GradFusionOptimizer::Optimize end, finalNodeNum: " << optimizedGraph->node_size();
+  LOG(INFO) << "INFO: GradFusionOptimizer::Optimize end.";
   return Status::OK();
 }
 
diff --git a/tf_adapter/python/npu_bridge/hccl/hccl_ops.py b/tf_adapter/python/npu_bridge/hccl/hccl_ops.py
index 3926ac2c6a2799bb1d39e3aba9df662fb012d211..f9e60bf73c4ce07728bda7e56c225f6312b2f495 100644
--- a/tf_adapter/python/npu_bridge/hccl/hccl_ops.py
+++ b/tf_adapter/python/npu_bridge/hccl/hccl_ops.py
@@ -52,7 +52,7 @@ def allgather(tensor, rank_size, group="hccl_world_group"):
 # @param fusion_id int type, operator fusion index; operators with the same fusion_id will be fused.
 # @param group string type, group name; either a user-defined group or "hccl_world_group".
 # @return the result tensor after the broadcast operation has been applied to the input tensor
-def broadcast(tensor, root_rank, fusion=0, fusion_id=-1, group="hccl_world_group"):
+def broadcast(tensor, root_rank, fusion=2, fusion_id=0, group="hccl_world_group"):
     result = gen_hccl_ops.hcom_broadcast(
         input=tensor,
         fusion=fusion,
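
For reference, a minimal usage sketch of the changed Python API (not part of the patch). It assumes an NPU-enabled TensorFlow 1.x environment with the npu_bridge package installed; the variable name and shape are illustrative, and passing the tensor as a list follows the underlying hcom_broadcast op, which is an assumption here. It only illustrates that broadcast now defaults to fusion=2, fusion_id=0, while the old behaviour can still be requested explicitly.

```python
# Usage sketch (assumption: NPU-enabled TF 1.x with npu_bridge installed).
import tensorflow as tf
from npu_bridge.hccl import hccl_ops

# Illustrative variable to broadcast from rank 0 to the other ranks in the group.
weight = tf.get_variable("w", shape=[1024], initializer=tf.ones_initializer())

# New defaults: equivalent to fusion=2, fusion_id=0.
bcast_default = hccl_ops.broadcast([weight], root_rank=0)

# Previous defaults, now opt-in: pass fusion=0, fusion_id=-1 explicitly.
bcast_unfused = hccl_ops.broadcast([weight], root_rank=0, fusion=0, fusion_id=-1)
```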