From 1b8cbcee18ab7687c6369f33a5fc47d34730ba9f Mon Sep 17 00:00:00 2001 From: xuepeng Date: Tue, 1 Nov 2022 20:20:36 +0800 Subject: [PATCH] bugfix for cycle for mix compute mode --- .../core/optimizers/runtime/node_placer.cpp | 79 ++++++++++--------- .../core/optimizers/runtime/node_placer.h | 12 +-- 2 files changed, 47 insertions(+), 44 deletions(-) diff --git a/tf_adapter_2.x/npu_device/core/optimizers/runtime/node_placer.cpp b/tf_adapter_2.x/npu_device/core/optimizers/runtime/node_placer.cpp index 3b5550a2c..5bb840ec1 100644 --- a/tf_adapter_2.x/npu_device/core/optimizers/runtime/node_placer.cpp +++ b/tf_adapter_2.x/npu_device/core/optimizers/runtime/node_placer.cpp @@ -24,8 +24,8 @@ namespace npu { const std::string kSharedGroup = "_shared_group_id"; -Cluster::Cluster(const NodePlacer *placer, tensorflow::Node *node, uint64_t topo, Placement place) - : min_topo(topo), max_topo(topo), placement(place), placer_(placer) { +Cluster::Cluster(const NodePlacer *placer, tensorflow::Node *node, Placement place) + : id(node->id()), placement(place), placer_(placer) { static std::atomic_int64_t index{0}; name = kPlacementString[placement] + "_cluster_" + std::to_string(index.fetch_add(1)); DLOG() << "Create cluster " << name << " for " << node->name(); @@ -49,7 +49,6 @@ bool Cluster::Merge(tensorflow::Node *node) { (void)out_nodes.insert(n); } } - UpdateTopo(placer_->Topo(node)); return true; } @@ -59,14 +58,6 @@ void Cluster::Merge(const std::shared_ptr other) { } } -void Cluster::UpdateTopo(uint64_t topo) { - if (topo > max_topo) { - max_topo = topo; - } else if (topo < min_topo) { - min_topo = topo; - } -} - tensorflow::Status NodePlacer::Apply(size_t depth) { const static size_t kMaxRecursionDepth = 16; NPU_REQUIRES(depth <= kMaxRecursionDepth, tensorflow::errors::Unimplemented( @@ -82,12 +73,36 @@ tensorflow::Status NodePlacer::Apply(size_t depth) { NPU_REQUIRES_OK(PlaceCpuNodeSubgraphs(depth)); return tensorflow::Status::OK(); } + void NodePlacer::InitNodeTopo() { uint64_t topo = 0U; auto leave = [this, &topo](const tensorflow::Node *node) { node_topo_[node] = topo++; }; tensorflow::ReverseDFS(*graph_, {}, leave); } +void NodePlacer::ResetNodeMask() { + node_mask_.resize(graph_->num_node_ids()); + (void)memset(node_mask_.data(), 0, node_mask_.size() * sizeof(uint8_t)); +} + +bool NodePlacer::FetchSetMask(const NodeOrCluster &node_or_cluster) { + auto &mask = node_mask_[node_or_cluster.Id()]; + if (mask == 0) { + mask = 1; + return false; + } + return true; +} + +bool NodePlacer::FetchClearMask(const NodeOrCluster &node_or_cluster) { + auto &mask = node_mask_[node_or_cluster.Id()]; + if (mask != 0) { + mask = 0; + return true; + } + return false; +} + tensorflow::Status NodePlacer::CopyShareableNode() { std::vector shared_nodes; for (auto node : graph_->nodes()) { @@ -670,46 +685,32 @@ bool NodePlacer::VisitPathNodes(tensorflow::Node *start, tensorflow::Node *end, std::swap(start, end); } - NodeOrCluster s = GetNodeOrCluster(start); - NodeOrCluster e = GetNodeOrCluster(end); - auto min_topo = s.IsCluster() ? s.GetCluster()->min_topo : node_topo_[start]; - auto max_topo = e.IsCluster() ? e.GetCluster()->max_topo : node_topo_[end]; + ResetNodeMask(); // Clear all masks - std::unordered_set f_vst; - std::unordered_set b_vst; std::queue q; - q.push(s); + auto collect_seen_nodes = [this, &q](tensorflow::Node *n) { // Visitor for collect nodes + q.push(GetNodeOrCluster(n)); + }; + + GetNodeOrCluster(start).VisitOutNodes(collect_seen_nodes); // Collect start nodes while (!q.empty()) { auto node = q.front(); q.pop(); - if (!f_vst.insert(node).second) { - continue; + if (!FetchSetMask(node)) { // Visit node at first masking + node.VisitOutNodes(collect_seen_nodes); } - node.VisitOutNodes([this, &max_topo, &q](tensorflow::Node *n) { - if (node_topo_[n] <= max_topo) { - q.push(GetNodeOrCluster(n)); - } - }); } - (void)f_vst.erase(s); - (void)f_vst.erase(e); - q.push(e); + + GetNodeOrCluster(end).VisitInNodes(collect_seen_nodes); while (!q.empty()) { auto node = q.front(); q.pop(); - if (!b_vst.insert(node).second) { - continue; - } - if (f_vst.count(node) > 0) { + if (FetchClearMask(node)) { // Access masked node only once if (!node.VisitNodes(visitor)) { return false; } + node.VisitInNodes(collect_seen_nodes); } - node.VisitInNodes([this, &min_topo, &q](tensorflow::Node *n) { - if (node_topo_[n] >= min_topo) { - q.push(GetNodeOrCluster(n)); - } - }); } return true; } @@ -719,7 +720,7 @@ std::shared_ptr NodePlacer::GetOrCreateConcreteCluster(tensorflow::Node if (iter != concrete_clusters_.end()) { return iter->second; } - auto cluster = std::make_shared(this, node, node_topo_[node], Placement::WHEREVER); + auto cluster = std::make_shared(this, node, Placement::WHEREVER); concrete_clusters_[node] = cluster; return cluster; } @@ -729,7 +730,7 @@ std::shared_ptr NodePlacer::GetOrCreateNpuCluster(tensorflow::Node *nod if (iter != npu_clusters_.end()) { return iter->second; } - auto cluster = std::make_shared(this, node, node_topo_[node], Placement::NPU); + auto cluster = std::make_shared(this, node, Placement::NPU); npu_clusters_[node] = cluster; auto concrete_nodes = GetConcreteNodes(node); for (auto &n : concrete_nodes) { diff --git a/tf_adapter_2.x/npu_device/core/optimizers/runtime/node_placer.h b/tf_adapter_2.x/npu_device/core/optimizers/runtime/node_placer.h index 4e407c5bc..b9c75bd89 100644 --- a/tf_adapter_2.x/npu_device/core/optimizers/runtime/node_placer.h +++ b/tf_adapter_2.x/npu_device/core/optimizers/runtime/node_placer.h @@ -28,16 +28,14 @@ static std::map kPlacementString = { class NodePlacer; struct Cluster { - explicit Cluster(const NodePlacer *placer, tensorflow::Node *node, uint64_t topo, Placement place); + explicit Cluster(const NodePlacer *placer, tensorflow::Node *node, Placement place); bool Merge(tensorflow::Node *node); void Merge(const std::shared_ptr other); - void UpdateTopo(uint64_t topo); std::set nodes; std::unordered_set in_nodes; std::unordered_set out_nodes; std::string name; - uint64_t min_topo; - uint64_t max_topo; + int32_t id; Placement placement; private: @@ -51,6 +49,7 @@ struct NodeOrCluster { void VisitOutNodes(const std::function &visitor) const; bool VisitNodes(const std::function &visitor) const; size_t Hash() const { return (is_cluster_ ? reinterpret_cast(cluster_) : reinterpret_cast(node_)); } + int32_t Id() const { return (is_cluster_ ? cluster_->id : node_->id()); } bool operator==(const NodeOrCluster &other) const { return (is_cluster_ ? cluster_ == other.cluster_ : node_ == other.node_); } @@ -73,6 +72,9 @@ class NodePlacer { : context_(context), graph_(graph), device_(device) {} tensorflow::Status Apply(size_t depth = 0); void InitNodeTopo(); + void ResetNodeMask(); + bool FetchSetMask(const NodeOrCluster &node_or_cluster); + bool FetchClearMask(const NodeOrCluster &node_or_cluster); tensorflow::Status PlaceCpuNodeSubgraphs(size_t depth) const; tensorflow::Status BuildNpuOp(); tensorflow::Status CopyShareableNode(); @@ -96,7 +98,6 @@ class NodePlacer { std::set> GetNpuClusters(); void Concrete(tensorflow::Node *src, tensorflow::Node *dst); bool ColocateNpu(tensorflow::Node *src, tensorflow::Node *dst); - uint64_t Topo(tensorflow::Node *node) const { return node_topo_.at(node); } private: static bool IsNpuMeaningLessNode(const tensorflow::Node *node); @@ -114,6 +115,7 @@ class NodePlacer { TFE_Context *context_; // not owned tensorflow::Graph *graph_; // not owned NpuDevice *device_; // not owned + std::vector node_mask_; std::map node_topo_; std::map node_placement_; // Just npu or cpu, never store wherever here -- Gitee