diff --git a/runtime/mem/gc/epsilon-g1/epsilon-g1.cpp b/runtime/mem/gc/epsilon-g1/epsilon-g1.cpp index 0964dc314552fd57556309a083fa09051ad307aa..607e10fad6505f01557fbd1c43065c5f18879097 100644 --- a/runtime/mem/gc/epsilon-g1/epsilon-g1.cpp +++ b/runtime/mem/gc/epsilon-g1/epsilon-g1.cpp @@ -49,10 +49,9 @@ void EpsilonG1GC::InitializeImpl() this->CreateCardTable(allocator, PoolManager::GetMmapMemPool()->GetMinObjectAddress(), PoolManager::GetMmapMemPool()->GetTotalObjectSize()); - auto barrier_set = - allocator->New(allocator, &PreWrbFuncEntrypoint, &PostWrbUpdateCardFuncEntrypoint, - panda::helpers::math::GetIntLog2(this->GetG1ObjectAllocator()->GetRegionSize()), - this->GetCardTable(), this->updated_refs_queue_, &this->queue_lock_); + auto barrier_set = allocator->New( + allocator, &PreWrbFuncEntrypoint, &PostWrbUpdateCardFuncEntrypoint, + panda::helpers::math::GetIntLog2(this->GetG1ObjectAllocator()->GetRegionSize()), this->GetCardTable()); ASSERT(barrier_set != nullptr); this->SetGCBarrierSet(barrier_set); diff --git a/runtime/mem/gc/g1/g1-gc.cpp b/runtime/mem/gc/g1/g1-gc.cpp index 5d79ca0a23c982bacbd3eb7bd8b836b8fc2c405f..91c7909268643be319772ede6ac39e34a7a827fd 100644 --- a/runtime/mem/gc/g1/g1-gc.cpp +++ b/runtime/mem/gc/g1/g1-gc.cpp @@ -144,7 +144,6 @@ G1GC::G1GC(ObjectAllocatorBase *object_allocator, const GCSettin InternalAllocatorPtr allocator = this->GetInternalAllocator(); this->SetType(GCType::G1_GC); this->SetTLABsSupported(); - updated_refs_queue_ = allocator->New(); auto *first_ref_vector = allocator->New(); first_ref_vector->reserve(MAX_REFS); unique_refs_from_remsets_.push_back(first_ref_vector); @@ -160,7 +159,6 @@ G1GC::~G1GC() allocator->Delete(obj_vector); } } - allocator->Delete(updated_refs_queue_); ASSERT(unique_refs_from_remsets_.size() == 1); allocator->Delete(unique_refs_from_remsets_.front()); unique_refs_from_remsets_.clear(); @@ -642,15 +640,18 @@ void G1GC::WorkerTaskProcessing(GCWorkersTask *task, [[maybe_unu case GCWorkersTaskTypes::TASK_ENQUEUE_REMSET_REFS: { auto *moved_objects_range = task->Cast>()->GetMovedObjectsRange(); auto *task_updated_refs_queue = - this->GetInternalAllocator()->template New(); + this->GetInternalAllocator()->template New>(); EnqueueRemsetRefUpdater ref_updater(this->GetCardTable(), task_updated_refs_queue, region_size_bits_); DoUpdateReferencesToMovedObjectsRange(moved_objects_range, ref_updater); { os::memory::LockHolder lock(gc_worker_queue_lock_); - updated_refs_queue_->insert(updated_refs_queue_->end(), task_updated_refs_queue->begin(), - task_updated_refs_queue->end()); + auto *barriers = GetG1BarrierSet(); + for (auto card : *task_updated_refs_queue) { + // do not mark + barriers->EnqueueToOld(card); + } } this->GetInternalAllocator()->Delete(moved_objects_range); this->GetInternalAllocator()->Delete(task_updated_refs_queue); @@ -918,10 +919,10 @@ bool G1GC::HaveGarbageRegions(const PandaPriorityQueue -void G1GC::ProcessDirtyCards() +void G1GC::ProcessDirtyCards(bool process_old_cards) { ScopedTiming t(__FUNCTION__, *this->GetTiming()); - update_remset_thread_->GCProcessCards(); + update_remset_thread_->GCProcessCards(process_old_cards); } template @@ -933,10 +934,9 @@ void G1GC::InitializeImpl() PoolManager::GetMmapMemPool()->GetTotalObjectSize()); // TODO(dtrubenkov): initialize barriers - auto barrier_set = - allocator->New(allocator, &PreWrbFuncEntrypoint, &PostWrbUpdateCardFuncEntrypoint, - panda::helpers::math::GetIntLog2(this->GetG1ObjectAllocator()->GetRegionSize()), - this->GetCardTable(), updated_refs_queue_, &queue_lock_); + auto barrier_set = allocator->New( + allocator, &PreWrbFuncEntrypoint, &PostWrbUpdateCardFuncEntrypoint, + panda::helpers::math::GetIntLog2(this->GetG1ObjectAllocator()->GetRegionSize()), this->GetCardTable()); ASSERT(barrier_set != nullptr); this->SetGCBarrierSet(barrier_set); @@ -952,14 +952,12 @@ void G1GC::InitializeImpl() ASSERT(gc_task_pool != nullptr); this->SetWorkersPool(gc_task_pool); } - { - // to make TSAN happy because we access updated_refs_queue_ inside constructor of UpdateRemsetThread - os::memory::LockHolder lock(queue_lock_); - update_remset_thread_ = allocator->template New>( - this, this->GetPandaVm(), updated_refs_queue_, &queue_lock_, this->GetG1ObjectAllocator()->GetRegionSize(), - this->GetSettings()->G1EnableConcurrentUpdateRemset(), this->GetSettings()->G1MinConcurrentCardsToProcess(), - this->GetCardTable()); - } + + update_remset_thread_ = allocator->template New>( + this, this->GetPandaVm(), barrier_set, this->GetG1ObjectAllocator()->GetRegionSize(), + this->GetSettings()->G1EnableConcurrentUpdateRemset(), this->GetSettings()->G1MinConcurrentCardsToProcess(), + this->GetCardTable()); + ASSERT(update_remset_thread_ != nullptr); LOG_DEBUG_GC << "G1GC initialized"; } @@ -1161,7 +1159,7 @@ MemRange G1GC::MixedMarkAndCacheRefs(const GCTask &task, const C analytics_.ReportMarkingEnd(panda::time::GetCurrentTimeInNanos()); // HandleReferences could write a new barriers - so we need to handle them before moving - ProcessDirtyCards(); + ProcessDirtyCards(true); return dirty_cards_range; } @@ -1248,17 +1246,14 @@ bool G1GC::CollectAndMove(const CollectionSet &collection_set) moved_objects_container = &mixed_marked_objects_; } - { - os::memory::LockHolder lock(queue_lock_); - analytics_.ReportUpdateRefsStart(panda::time::GetCurrentTimeInNanos()); - if (this->GetSettings()->ParallelRefUpdatingEnabled()) { - UpdateRefsToMovedObjects(moved_objects_container); - } else { - UpdateRefsToMovedObjects(moved_objects_container); - } - analytics_.ReportUpdateRefsEnd(panda::time::GetCurrentTimeInNanos()); - ActualizeRemSets(); + analytics_.ReportUpdateRefsStart(panda::time::GetCurrentTimeInNanos()); + if (this->GetSettings()->ParallelRefUpdatingEnabled()) { + UpdateRefsToMovedObjects(moved_objects_container); + } else { + UpdateRefsToMovedObjects(moved_objects_container); } + analytics_.ReportUpdateRefsEnd(panda::time::GetCurrentTimeInNanos()); + ActualizeRemSets(); VerifyCollectAndMove(std::move(collect_verifier), collection_set); SweepRegularVmRefs(); @@ -1302,14 +1297,28 @@ bool G1GC::CollectAndMove(const CollectionSet &collection_set) } template -template -std::conditional_t, EnqueueRemsetRefUpdater> -G1GC::CreateRefUpdater([[maybe_unused]] GCG1BarrierSet::ThreadLocalCardQueues *updated_ref_queue) const +template +std::conditional_t, EnqueueRemsetRefUpdater> +G1GC::CreateConcurrentRefUpdater(PandaVector *queue) const { if constexpr (FULL_GC) { - return UpdateRemsetRefUpdater(region_size_bits_); + return UpdateRemsetRefUpdater(region_size_bits_); } else { - return EnqueueRemsetRefUpdater(this->GetCardTable(), updated_ref_queue, region_size_bits_); + return EnqueueRemsetRefUpdater(this->GetCardTable(), queue, region_size_bits_); + } +} + +template +template +std::conditional_t, + SharedEnqueueRemsetRefUpdater> +G1GC::CreateRefUpdater() const +{ + if constexpr (FULL_GC) { + return UpdateRemsetRefUpdater(region_size_bits_); + } else { + return SharedEnqueueRemsetRefUpdater(this->GetCardTable(), GetG1BarrierSet(), + region_size_bits_); } } @@ -1320,17 +1329,14 @@ void G1GC::UpdateRefsToMovedObjects(MovedObjectsContainer scope(__FUNCTION__, this); // Currently lock for RemSet too much influences for pause, so don't use workers on FULL-GC constexpr bool ENABLE_WORKERS = USE_WORKERS && !FULL_GC; - auto internal_allocator = this->GetInternalAllocator(); - auto *updated_ref_queue = (ENABLE_WORKERS) - ? internal_allocator->template New() - : updated_refs_queue_; - auto ref_updater = this->CreateRefUpdater(updated_ref_queue); - // update reference from objects which were moved while garbage collection - LOG_DEBUG_GC << "=== Update ex-cset -> ex-cset references. START. ==="; - { - ScopedTiming t("UpdateMovedObjectsReferences", *this->GetTiming()); - for (auto *moved_objects : *moved_objects_container) { - if constexpr (ENABLE_WORKERS) { + + if constexpr (ENABLE_WORKERS) { + auto internal_allocator = this->GetInternalAllocator(); + // update reference from objects which were moved while garbage collection + LOG_DEBUG_GC << "=== Update ex-cset -> ex-cset references. START. ==="; + { + ScopedTiming t("UpdateMovedObjectsReferences", *this->GetTiming()); + for (auto *moved_objects : *moved_objects_container) { auto range_begin = moved_objects->begin(); auto range_end = range_begin; while (range_begin != moved_objects->end()) { @@ -1350,33 +1356,58 @@ void G1GC::UpdateRefsToMovedObjects(MovedObjectsContainerWorkerTaskProcessing(&gc_worker_task, nullptr); } - } else { // GC workers are not used - typename GCUpdateRefsWorkersTask::MovedObjectsRange moved_objects_range(moved_objects->begin(), - moved_objects->end()); - DoUpdateReferencesToMovedObjectsRange( - &moved_objects_range, ref_updater); } } - } - LOG_DEBUG_GC << "=== Update ex-cset -> ex-cset references. END. ==="; + LOG_DEBUG_GC << "=== Update ex-cset -> ex-cset references. END. ==="; + + // update references from objects which are not part of collection set + LOG_DEBUG_GC << "=== Update non ex-cset -> ex-cset references. START. ==="; + + auto *updated_ref_queue = internal_allocator->template New>(); + auto ref_updater = this->CreateConcurrentRefUpdater(updated_ref_queue); + + if constexpr (FULL_GC) { + UpdateRefsFromRemSets(ref_updater); + } else { + VisitRemSets(ref_updater); + } + LOG_DEBUG_GC << "=== Update non ex-cset -> ex-cset references. END. ==="; - // update references from objects which are not part of collection set - LOG_DEBUG_GC << "=== Update non ex-cset -> ex-cset references. START. ==="; - if constexpr (FULL_GC) { - UpdateRefsFromRemSets(ref_updater); - } else { - VisitRemSets(ref_updater); - } - LOG_DEBUG_GC << "=== Update non ex-cset -> ex-cset references. END. ==="; - if constexpr (ENABLE_WORKERS) { { os::memory::LockHolder lock(gc_worker_queue_lock_); - updated_refs_queue_->insert(updated_refs_queue_->end(), updated_ref_queue->begin(), - updated_ref_queue->end()); + auto *barriers = GetG1BarrierSet(); + for (auto *card : *updated_ref_queue) { + barriers->EnqueueToShared(card); + } this->GetInternalAllocator()->Delete(updated_ref_queue); } this->GetWorkersPool()->WaitUntilTasksEnd(); + } else { + auto ref_updater = this->CreateRefUpdater(); + // update reference from objects which were moved while garbage collection + LOG_DEBUG_GC << "=== Update ex-cset -> ex-cset references. START. ==="; + { + ScopedTiming t("UpdateMovedObjectsReferences", *this->GetTiming()); + for (auto *moved_objects : *moved_objects_container) { + typename GCUpdateRefsWorkersTask::MovedObjectsRange moved_objects_range(moved_objects->begin(), + moved_objects->end()); + DoUpdateReferencesToMovedObjectsRange( + &moved_objects_range, ref_updater); + } + } + LOG_DEBUG_GC << "=== Update ex-cset -> ex-cset references. END. ==="; + + // update references from objects which are not part of collection set + LOG_DEBUG_GC << "=== Update non ex-cset -> ex-cset references. START. ==="; + if constexpr (FULL_GC) { + UpdateRefsFromRemSets(ref_updater); + } else { + VisitRemSets(ref_updater); + } + LOG_DEBUG_GC << "=== Update non ex-cset -> ex-cset references. END. ==="; } + + GetG1BarrierSet()->ClearOldCards(); this->CommonUpdateRefsToMovedObjects(); } @@ -1433,7 +1464,7 @@ void G1GC::FullMarking(panda::GCTask &task) } } // Force card updater here, after swapping bitmap, to skip dead objects - ProcessDirtyCards(); + ProcessDirtyCards(true); auto garbage_regions = GetG1ObjectAllocator()->template GetTopGarbageRegions(); auto empty_tenured_regions = GetEmptyTenuredRegularRegionsFromQueue(std::move(garbage_regions)); CollectEmptyRegions(task, &empty_tenured_regions); @@ -1609,7 +1640,7 @@ void G1GC::Remark(panda::GCTask const &task) } } // Force card updater here, after swapping bitmap, to skip dead objects - ProcessDirtyCards(); + ProcessDirtyCards(true); } template @@ -1758,7 +1789,6 @@ CollectionSet G1GC::GetFullCollectionSet() { ASSERT(this->IsFullGC()); // FillRemSet should be always finished before GetCollectibleRegions - ASSERT(update_remset_thread_->GetQueueSize() == 0); auto g1_allocator = this->GetG1ObjectAllocator(); g1_allocator->ClearCurrentTenuredRegion(); CollectionSet collection_set(g1_allocator->GetYoungRegions()); @@ -1943,20 +1973,26 @@ template void G1GC::HandlePendingDirtyCards() { ScopedTiming t(__FUNCTION__, *this->GetTiming()); - update_remset_thread_->DrainAllCards(&dirty_cards_); - std::for_each(dirty_cards_.cbegin(), dirty_cards_.cend(), [](auto card) { card->Clear(); }); + update_remset_thread_->DrainAllCards(&dirty_cards_, &old_dirty_cards_); + std::for_each(dirty_cards_.cbegin(), dirty_cards_.cend(), [](auto *card) { card->Clear(); }); + std::for_each(old_dirty_cards_.cbegin(), old_dirty_cards_.cend(), [](auto *card) { card->Clear(); }); } template void G1GC::ReenqueueDirtyCards() { ScopedTiming t(__FUNCTION__, *this->GetTiming()); - os::memory::LockHolder lock(queue_lock_); - std::for_each(dirty_cards_.cbegin(), dirty_cards_.cend(), [this](auto card) { + auto *barriers = GetG1BarrierSet(); + std::for_each(dirty_cards_.cbegin(), dirty_cards_.cend(), [barriers](auto *card) { card->Mark(); - updated_refs_queue_->push_back(card); + barriers->EnqueueToShared(card); + }); + std::for_each(old_dirty_cards_.cbegin(), old_dirty_cards_.cend(), [barriers](auto *card) { + // do not mark old cards + barriers->EnqueueToOld(card); }); dirty_cards_.clear(); + old_dirty_cards_.clear(); } template @@ -2064,6 +2100,9 @@ MemRange G1GC::CacheRefsFromRemsets(const MemRangeRefsChecker &r if (!this->IsFullGC()) { CacheRefsFromDirtyCards(visitor); + if (collection_set_.size() > collection_set_.Young().size()) { + CacheRefsFromOldDirtyCards(visitor); + } #ifndef NDEBUG unique_cards_initialized_ = true; #endif // NDEBUG @@ -2079,9 +2118,24 @@ template void G1GC::CacheRefsFromDirtyCards(Visitor visitor) { ScopedTiming t(__FUNCTION__, *this->GetTiming()); + CacheRefsFromDirtyCards(visitor, &dirty_cards_); +} + +template +template +void G1GC::CacheRefsFromOldDirtyCards(Visitor visitor) +{ + ScopedTiming t(__FUNCTION__, *this->GetTiming()); + CacheRefsFromDirtyCards(visitor, &old_dirty_cards_); +} + +template +template +void G1GC::CacheRefsFromDirtyCards(Visitor visitor, PandaUnorderedSet *dirty_cards) +{ auto card_table = this->GetCardTable(); constexpr size_t MEM_SIZE = DEFAULT_REGION_SIZE / RemSet<>::Bitmap::GetNumBits(); - for (auto it = dirty_cards_.cbegin(); it != dirty_cards_.cend();) { + for (auto it = dirty_cards->cbegin(); it != dirty_cards->cend();) { auto range = card_table->GetMemoryRange(*it); auto addr = range.GetStartAddress(); ASSERT_DO(IsHeapSpace(PoolManager::GetMmapMemPool()->GetSpaceTypeForAddr(ToVoidPtr(addr))), @@ -2089,7 +2143,7 @@ void G1GC::CacheRefsFromDirtyCards(Visitor visitor) auto end_addr = range.GetEndAddress(); auto region = panda::mem::AddrToRegion(ToVoidPtr(addr)); if (!RemsetRegionPredicate(region)) { - it = dirty_cards_.erase(it); + it = dirty_cards->erase(it); continue; } @@ -2101,7 +2155,7 @@ void G1GC::CacheRefsFromDirtyCards(Visitor visitor) addr += MEM_SIZE; } if (all_cross_region_refs_processed) { - it = dirty_cards_.erase(it); + it = dirty_cards->erase(it); continue; } ++it; diff --git a/runtime/mem/gc/g1/g1-gc.h b/runtime/mem/gc/g1/g1-gc.h index fa0a2649f7c731b90cca449af992d1f395734817..6a64e82378ae8e2bbd12fcf1fb75c1b268c1bd38 100644 --- a/runtime/mem/gc/g1/g1-gc.h +++ b/runtime/mem/gc/g1/g1-gc.h @@ -154,15 +154,12 @@ protected: } // NOLINTBEGIN(misc-non-private-member-variables-in-classes) - /// Queue with updated refs info - GCG1BarrierSet::ThreadLocalCardQueues *updated_refs_queue_ {nullptr}; - os::memory::Mutex queue_lock_; os::memory::Mutex gc_worker_queue_lock_; // NOLINTEND(misc-non-private-member-variables-in-classes) private: void WaitForUpdateRemsetThread(); - void ProcessDirtyCards(); + void ProcessDirtyCards(bool process_old_cads); bool HaveGarbageRegions(); bool HaveGarbageRegions(const PandaPriorityQueue> ®ions); @@ -195,6 +192,12 @@ private: template void CacheRefsFromDirtyCards(Visitor visitor); + template + void CacheRefsFromOldDirtyCards(Visitor visitor); + + template + void CacheRefsFromDirtyCards(Visitor visitor, PandaUnorderedSet *dirty_cards); + void InitializeImpl() override; bool NeedFullGC(const panda::GCTask &task); @@ -293,10 +296,14 @@ private: void VerifyCollectAndMove(HeapVerifierIntoGC &&collect_verifier, const CollectionSet &collection_set); - template - std::conditional_t, - EnqueueRemsetRefUpdater> - CreateRefUpdater(GCG1BarrierSet::ThreadLocalCardQueues *updated_ref_queue) const; + template + std::conditional_t, EnqueueRemsetRefUpdater> + CreateConcurrentRefUpdater(PandaVector *queue) const; + + template + std::conditional_t, + SharedEnqueueRemsetRefUpdater> + CreateRefUpdater() const; /// Update all refs to moved objects template @@ -448,6 +455,7 @@ private: // Dirty cards which are not fully processed before collection. // These cards are processed later. PandaUnorderedSet dirty_cards_; + PandaUnorderedSet old_dirty_cards_; #ifndef NDEBUG bool unique_cards_initialized_ = false; #endif // NDEBUG diff --git a/runtime/mem/gc/g1/ref_updater.h b/runtime/mem/gc/g1/ref_updater.h index e2f8797192365c14d2b4e1a4b3deb40bd5a58199..2e8e2f3dd51cdf6e10035f530fd2a986e370fa70 100644 --- a/runtime/mem/gc/g1/ref_updater.h +++ b/runtime/mem/gc/g1/ref_updater.h @@ -62,31 +62,65 @@ protected: }; template -class EnqueueRemsetRefUpdater : public BaseRefUpdater { +class BaseEnqueueRemsetRefUpdater : public BaseRefUpdater { public: - EnqueueRemsetRefUpdater(CardTable *card_table, GCG1BarrierSet::ThreadLocalCardQueues *updated_refs_queue, - uint32_t region_size_bits) - : BaseRefUpdater(region_size_bits), - card_table_(card_table), - updated_refs_queue_(updated_refs_queue) + BaseEnqueueRemsetRefUpdater(CardTable *card_table, uint32_t region_size_bits) + : BaseRefUpdater(region_size_bits), card_table_(card_table) { } protected: - void Process(ObjectHeader *object, size_t offset, ObjectHeader *ref) const override + void Process(ObjectHeader *object, size_t offset, ObjectHeader *ref) const override final { if (!this->IsSameRegion(object, ref)) { auto *card = card_table_->GetCardPtr(ToUintPtr(object) + offset); if (card->IsClear()) { card->Mark(); - updated_refs_queue_->push_back(card); + Enqueue(card); } } } + virtual void Enqueue(CardTable::CardPtr card) const = 0; + private: CardTable *card_table_; - GCG1BarrierSet::ThreadLocalCardQueues *updated_refs_queue_; +}; + +template +class EnqueueRemsetRefUpdater : public BaseEnqueueRemsetRefUpdater { +public: + EnqueueRemsetRefUpdater(CardTable *card_table, PandaVector *queue, uint32_t region_size_bits) + : BaseEnqueueRemsetRefUpdater(card_table, region_size_bits), queue_(queue) + { + } + +protected: + void Enqueue(CardTable::CardPtr card) const override + { + queue_->push_back(card); + } + +private: + PandaVector *queue_; +}; + +template +class SharedEnqueueRemsetRefUpdater : public BaseEnqueueRemsetRefUpdater { +public: + SharedEnqueueRemsetRefUpdater(CardTable *card_table, GCG1BarrierSet *barriers, uint32_t region_size_bits) + : BaseEnqueueRemsetRefUpdater(card_table, region_size_bits), barriers_(barriers) + { + } + +protected: + void Enqueue(CardTable::CardPtr card) const override + { + barriers_->EnqueueToOld(card); + } + +private: + GCG1BarrierSet *barriers_; }; } // namespace panda::mem diff --git a/runtime/mem/gc/g1/update_remset_thread.cpp b/runtime/mem/gc/g1/update_remset_thread.cpp index a07ac0b32b28a4aee5163bbdc2d33264f674dde7..f7f1c9d6774ca40faf14a7345cb5eb180eda76de 100644 --- a/runtime/mem/gc/g1/update_remset_thread.cpp +++ b/runtime/mem/gc/g1/update_remset_thread.cpp @@ -29,16 +29,13 @@ namespace panda::mem { static constexpr size_t PREALLOCATED_SET_SIZE = 256; template -UpdateRemsetThread::UpdateRemsetThread(GC *gc, PandaVM *vm, - GCG1BarrierSet::ThreadLocalCardQueues *queue, - os::memory::Mutex *queue_lock, size_t region_size, - bool update_concurrent, size_t min_concurrent_cards_to_process, - CardTable *card_table) +UpdateRemsetThread::UpdateRemsetThread(GC *gc, PandaVM *vm, GCG1BarrierSet *barriers, + size_t region_size, bool update_concurrent, + size_t min_concurrent_cards_to_process, CardTable *card_table) : gc_(gc), vm_(vm), + barriers_(barriers), card_table_(card_table), - queue_(queue), - queue_lock_(queue_lock), update_concurrent_(update_concurrent), region_size_bits_(panda::helpers::math::GetIntLog2(region_size)), min_concurrent_cards_to_process_(min_concurrent_cards_to_process) @@ -126,7 +123,7 @@ void UpdateRemsetThread::WaitUntilTasksEnd() while (pause_thread_) { // runtime is destroying, handle all refs anyway for now if (stop_thread_ || update_thread_ == nullptr) { - ProcessAllCards(); // Process all cards inside gc + ProcessAllCards(false); // Process all cards inside gc pause_thread_ = false; break; } @@ -134,17 +131,15 @@ void UpdateRemsetThread::WaitUntilTasksEnd() thread_cond_var_.Wait(&loop_lock_); } thread_cond_var_.Signal(); - ASSERT(GetQueueSize() == 0); } else { os::memory::LockHolder holder(loop_lock_); // we will handle all remsets even when thread is stopped (we are trying to destroy Runtime, but it's the last // GC), try to eliminate it in the future for faster shutdown - ProcessAllCards(); // Process all cards inside gc + ProcessAllCards(false); // Process all cards inside gc pause_thread_ = false; } stats_.PrintStats(); stats_.Reset(); - ASSERT(GetQueueSize() == 0); ASSERT(!pause_thread_); } @@ -167,7 +162,7 @@ void UpdateRemsetThread::ThreadLoop() // gc is waiting for us to handle all updates // possible improvements: let GC thread to help us to handle elements in queue in parallel, instead of // waiting - ProcessAllCards(); // Process all cards inside gc + ProcessAllCards(false); // Process all cards inside gc pause_thread_ = false; thread_cond_var_.Signal(); // notify GC thread that we processed all updates thread_cond_var_.Wait(&loop_lock_); // let WaitUntilTasksEnd to finish @@ -188,7 +183,7 @@ void UpdateRemsetThread::ThreadLoop() continue; } ASSERT(!paused_by_gc_thread_); - auto processed_cards = ProcessAllCards(); + auto processed_cards = ProcessAllCards(false); if (processed_cards < min_concurrent_cards_to_process_) { Sleep(); @@ -201,7 +196,6 @@ void UpdateRemsetThread::ThreadLoop() template void UpdateRemsetThread::FillFromDefered(PandaUnorderedSet *cards) { - os::memory::LockHolder holder(*queue_lock_); std::copy(cards_.begin(), cards_.end(), std::inserter(*cards, cards->end())); cards_.clear(); } @@ -209,9 +203,13 @@ void UpdateRemsetThread::FillFromDefered(PandaUnorderedSet void UpdateRemsetThread::FillFromQueue(PandaUnorderedSet *cards) { - os::memory::LockHolder holder(*queue_lock_); - std::copy(queue_->begin(), queue_->end(), std::inserter(*cards, cards->end())); - queue_->clear(); + barriers_->Dump(cards); +} + +template +void UpdateRemsetThread::FillFromOldQueue(PandaUnorderedSet *cards) +{ + barriers_->DumpOld(cards); } template @@ -262,7 +260,7 @@ void UpdateRemsetThread::FillFromPostBarrierBuffer( } template -void UpdateRemsetThread::FillFromPostBarrierBuffer(GCG1BarrierSet::ThreadLocalCardQueues *post_wrb, +void UpdateRemsetThread::FillFromPostBarrierBuffer(PandaVector *post_wrb, PandaUnorderedSet *cards) { while (!post_wrb->empty()) { @@ -303,17 +301,30 @@ private: }; template -size_t UpdateRemsetThread::ProcessAllCards() +size_t UpdateRemsetThread::ProcessAllCards(bool process_all_cards) { FillFromQueue(&cards_); FillFromThreads(&cards_); FillFromPostBarrierBuffers(&cards_); + if (process_all_cards) { + barriers_->DumpOld(&cards_); + } if (!cards_.empty()) { LOG(DEBUG, GC) << "Remset thread started process: " << cards_.size() << " cards"; } size_t cards_size = 0; RemsetCardHandler card_handler(card_table_, region_size_bits_, defer_cards_); + for (auto it = cards_.begin(); it != cards_.end();) { + if (!card_handler.Handle(*it)) { + return cards_size; + } + cards_size++; + + it = cards_.erase(it); + } + + barriers_->DumpOld(&cards_); for (auto it = cards_.begin(); it != cards_.end();) { if (!card_handler.Handle(*it)) { break; @@ -322,11 +333,19 @@ size_t UpdateRemsetThread::ProcessAllCards() it = cards_.erase(it); } + + for (auto it = cards_.begin(); it != cards_.end(); ++it) { + barriers_->EnqueueToOld(*it); + } + + cards_.clear(); + return cards_size; } template -void UpdateRemsetThread::DrainAllCards(PandaUnorderedSet *cards) +void UpdateRemsetThread::DrainAllCards(PandaUnorderedSet *cards, + PandaUnorderedSet *old_cards) { pause_thread_ = true; // Atomic with relaxed order reason: memory order is not required @@ -337,6 +356,7 @@ void UpdateRemsetThread::DrainAllCards(PandaUnorderedSet::SuspendThread() } template -void UpdateRemsetThread::GCProcessCards() +void UpdateRemsetThread::GCProcessCards(bool process_all_cards) { ASSERT(gc_pause_thread_); os::memory::LockHolder holder(loop_lock_); - ProcessAllCards(); + ProcessAllCards(process_all_cards); } template diff --git a/runtime/mem/gc/g1/update_remset_thread.h b/runtime/mem/gc/g1/update_remset_thread.h index c98a505c600f024825eb1de91a0ee23316a5d503..4e44161fdf1ef293e601230be911f3d36ba33b60 100644 --- a/runtime/mem/gc/g1/update_remset_thread.h +++ b/runtime/mem/gc/g1/update_remset_thread.h @@ -87,9 +87,8 @@ private: template class UpdateRemsetThread { public: - explicit UpdateRemsetThread(GC *gc, PandaVM *vm, GCG1BarrierSet::ThreadLocalCardQueues *queue, - os::memory::Mutex *queue_lock, size_t region_size, bool update_concurrent, - size_t min_concurrent_cards_to_process, CardTable *card_table); + explicit UpdateRemsetThread(GC *gc, PandaVM *vm, GCG1BarrierSet *barriers, size_t region_size, + bool update_concurrent, size_t min_concurrent_cards_to_process, CardTable *card_table); ~UpdateRemsetThread(); NO_COPY_SEMANTIC(UpdateRemsetThread); NO_MOVE_SEMANTIC(UpdateRemsetThread); @@ -105,13 +104,6 @@ public: void ThreadLoop(); - // only debug purpose - size_t GetQueueSize() const - { - os::memory::LockHolder holder(*queue_lock_); - return queue_->size(); - } - void SetUpdateConcurrent(bool value) { os::memory::LockHolder holder(loop_lock_); @@ -140,7 +132,7 @@ public: } // Interrupts card processing and returns all unprocessed cards - void DrainAllCards(PandaUnorderedSet *cards); + void DrainAllCards(PandaUnorderedSet *cards, PandaUnorderedSet *old_cards); /// Suspend UpdateRemsetThread to reduce CPU usage void SuspendThread(); @@ -150,7 +142,7 @@ public: * Process all cards in the GC thread. * Can be called only if UpdateRemsetThread is suspended */ - void GCProcessCards(); + void GCProcessCards(bool process_all_cards); /** * Invalidate regions in the GC thread * Can be called only if UpdateRemsetThread is suspended @@ -160,14 +152,15 @@ public: private: void FillFromDefered(PandaUnorderedSet *cards) REQUIRES(loop_lock_); void FillFromQueue(PandaUnorderedSet *cards) REQUIRES(loop_lock_); + void FillFromOldQueue(PandaUnorderedSet *cards) REQUIRES(loop_lock_); void FillFromThreads(PandaUnorderedSet *cards) REQUIRES(loop_lock_); void FillFromPostBarrierBuffers(PandaUnorderedSet *cards); void FillFromPostBarrierBuffer(GCG1BarrierSet::G1PostBarrierRingBufferType *post_wrb, PandaUnorderedSet *cards); - void FillFromPostBarrierBuffer(GCG1BarrierSet::ThreadLocalCardQueues *post_wrb, + void FillFromPostBarrierBuffer(PandaVector *post_wrb, PandaUnorderedSet *cards); - size_t ProcessAllCards() REQUIRES(loop_lock_); + size_t ProcessAllCards(bool process_old_cards) REQUIRES(loop_lock_); void Sleep() REQUIRES(loop_lock_) { @@ -177,12 +170,12 @@ private: GC *gc_ {nullptr}; PandaVM *vm_ {nullptr}; + GCG1BarrierSet *barriers_; CardTable *card_table_ {nullptr}; - GCG1BarrierSet::ThreadLocalCardQueues *queue_ GUARDED_BY(queue_lock_) {nullptr}; - os::memory::Mutex *queue_lock_ {nullptr}; PandaUnorderedSet cards_; + PandaUnorderedSet old_region_cards_; PandaVector *invalidate_regions_ GUARDED_BY(loop_lock_) {nullptr}; - PandaVector post_barrier_buffers_ GUARDED_BY(post_barrier_buffers_lock_); + PandaVector *> post_barrier_buffers_ GUARDED_BY(post_barrier_buffers_lock_); os::memory::Mutex post_barrier_buffers_lock_; /* diff --git a/runtime/mem/gc/gc_barrier_set.cpp b/runtime/mem/gc/gc_barrier_set.cpp index 2d7a927b6f26f37b73fd037161b36146fc44335b..4ac6e36de49ad546a90db94eac3f4a476181e49b 100644 --- a/runtime/mem/gc/gc_barrier_set.cpp +++ b/runtime/mem/gc/gc_barrier_set.cpp @@ -178,10 +178,7 @@ void GCG1BarrierSet::Invalidate(const void *begin, const void *last) void GCG1BarrierSet::Enqueue(CardTable::CardPtr card) { auto *thread = ManagedThread::GetCurrent(); - if (thread == nullptr) { // slow path via shared-queue for VM threads: gc/compiler/etc - os::memory::LockHolder lock(*queue_lock_); - updated_refs_queue_->push_back(card); - } else { + if (thread != nullptr) { // general fast-path for mutators ASSERT(thread->GetPreBuff() != nullptr); // write barrier cant be called after Terminate auto *buffer = thread->GetG1PostBarrierBuffer(); @@ -195,8 +192,41 @@ void GCG1BarrierSet::Enqueue(CardTable::CardPtr card) } // After 2 unsuccessfull pushing, we see that current buffer still full // so, reuse shared buffer - os::memory::LockHolder lock(*queue_lock_); - updated_refs_queue_->push_back(card); + } + + // slow path via shared-queue for VM threads: gc/compiler/etc + os::memory::LockHolder lock(shared_cards_queue_lock_); + EnqueueToShared(card); +} + +void GCG1BarrierSet::EnqueueToShared(CardTable::CardPtr card) +{ + shared_cards_queue_.push_back(card); +} + +void GCG1BarrierSet::EnqueueToOld(CardTable::CardPtr card) +{ + old_region_cards_queue_.push_back(card); +} + +void GCG1BarrierSet::Dump(PandaUnorderedSet *cards) +{ + os::memory::LockHolder lock(shared_cards_queue_lock_); + std::copy(shared_cards_queue_.begin(), shared_cards_queue_.end(), std::inserter(*cards, cards->end())); + shared_cards_queue_.clear(); +} + +void GCG1BarrierSet::DumpOld(PandaUnorderedSet *cards) +{ + os::memory::LockHolder lock(shared_cards_queue_lock_); + std::copy(old_region_cards_queue_.begin(), old_region_cards_queue_.end(), std::inserter(*cards, cards->end())); + old_region_cards_queue_.clear(); +} + +void GCG1BarrierSet::ClearOldCards() +{ + for (auto *card : old_region_cards_queue_) { + card->Clear(); } } } // namespace panda::mem diff --git a/runtime/mem/gc/gc_barrier_set.h b/runtime/mem/gc/gc_barrier_set.h index 50348c540c47e1d99804f8637beb557fb81ac7c3..10c303aef5078e6f009a8fb778e350203ec0ba00 100644 --- a/runtime/mem/gc/gc_barrier_set.h +++ b/runtime/mem/gc/gc_barrier_set.h @@ -199,7 +199,7 @@ private: class GCG1BarrierSet : public GCBarrierSet { public: - using ThreadLocalCardQueues = PandaVector; + using SharedCardsQueue = PandaVector; static constexpr size_t G1_POST_BARRIER_RING_BUFFER_SIZE = 1024 * 8; using G1PostBarrierRingBufferType = mem::LockFreeBuffer; @@ -207,16 +207,13 @@ public: // PRE ARGS: ObjRefProcessFunc pre_store_func, // POST ARGS: - ObjTwoRefProcessFunc post_func, uint8_t region_size_bits_count, CardTable *card_table, - ThreadLocalCardQueues *updated_refs_queue, os::memory::Mutex *queue_lock) + ObjTwoRefProcessFunc post_func, uint8_t region_size_bits_count, CardTable *card_table) : GCBarrierSet(allocator, BarrierType::PRE_SATB_BARRIER, BarrierType::POST_INTERREGION_BARRIER), pre_store_func_(pre_store_func), post_func_(post_func), region_size_bits_count_(region_size_bits_count), card_table_(card_table), - min_addr_(ToVoidPtr(card_table->GetMinAddress())), - updated_refs_queue_(updated_refs_queue), - queue_lock_(queue_lock) + min_addr_(ToVoidPtr(card_table->GetMinAddress())) { ASSERT(pre_store_func_ != nullptr); ASSERT(post_func_ != nullptr); @@ -250,6 +247,16 @@ public: void Enqueue(CardTable::CardPtr card); + void EnqueueToShared(CardTable::CardPtr card); + + void EnqueueToOld(CardTable::CardPtr card); + + void Dump(PandaUnorderedSet *cards); + + void DumpOld(PandaUnorderedSet *cards); + + void ClearOldCards(); + ~GCG1BarrierSet() override = default; CardTable *GetCardTable() const @@ -275,8 +282,9 @@ private: CardTable *card_table_ {nullptr}; /// Minimal address used by VM. Used as a base for card index calculation void *min_addr_ {nullptr}; - ThreadLocalCardQueues *updated_refs_queue_; - os::memory::Mutex *queue_lock_; + os::memory::Mutex shared_cards_queue_lock_; + SharedCardsQueue shared_cards_queue_; + SharedCardsQueue old_region_cards_queue_; }; } // namespace panda::mem